In [26]:
import polars as pl
import time

start = time.time()

MAX_VALUE = 20
MAX_CHARACTERS = 500

try:
    # Read CSV with explicit schema
    df = pl.read_csv('dataset/original_dataset/kcc_dataset.csv',
        dtypes={
            'Year': pl.Int32,
            'Month': pl.Int32,
            'Day': pl.Int32,
            'Crop': pl.Utf8,
            'DistrictName': pl.Utf8,
            'QueryType': pl.Utf8,
            'Season': pl.Utf8,
            'Sector': pl.Utf8,
            'StateName': pl.Utf8,
            'QueryText': pl.Utf8,
            'KccAns': pl.Utf8,
            'Category': pl.Utf8,
            'BlockName': pl.Utf8
        },
        low_memory=True,
        infer_schema_length=10000
    ).drop(['BlockName', 'Category'])
    
    # Drop rows with missing values in QueryText or KccAns
    df_clean = df.drop_nulls(subset=['QueryText', 'KccAns'])
    
    # Print info about both dataframes
    print("\nOriginal DataFrame Info:")
    print(f"Number of rows: {len(df):,}")
    print(f"Memory usage: {df.estimated_size() / (1024**3):.2f} GB")
    
    print("\nCleaned DataFrame Info:")
    print(f"Number of rows: {len(df_clean):,}")
    print(f"Memory usage: {df_clean.estimated_size() / (1024**3):.2f} GB")
    print(f"Rows removed: {len(df) - len(df_clean):,}")
    print(f"Time taken: {time.time() - start:.2f} seconds")
    print("\nColumns:", df_clean.columns)

except Exception as e:
    print(f"Error: {e}")


# Get value counts for KccAns
top_answers_KccAns = (
    df_clean
    .select(pl.col('KccAns'))
    .group_by('KccAns')
    .count()
    .sort('count', descending=True)
    .limit(MAX_VALUE)
)

print("\nTop 10 most frequent answers in KccAns:")
print("=======================================")
total_rows = len(df_clean)
for row in top_answers_KccAns.iter_rows():
    answer, count = row
    percentage = (count / total_rows) * 100
    print(f"\nCount: {count:,} ({percentage:.2f}%)")
    # Print first 200 characters of answer to keep output readable
    print(f"Answer: {answer[:MAX_CHARACTERS]}...")



# Get value counts for KccAns
top_answers_QueryText = (
    df_clean
    .select(pl.col('QueryText'))
    .group_by('QueryText')
    .count()
    .sort('count', descending=True)
    .limit(MAX_VALUE)
)

print("\nTop 10 most frequent answers in QueryText:")
print("=======================================")
total_rows = len(df_clean)
for row in top_answers_QueryText.iter_rows():
    answer, count = row
    percentage = (count / total_rows) * 100
    print(f"\nCount: {count:,} ({percentage:.2f}%)")
    # Print first 200 characters of answer to keep output readable
    print(f"Answer: {answer[:MAX_CHARACTERS]}...")




# First, get the top values as lists
top_kcc_answers = [row[0] for row in top_answers_KccAns.iter_rows()]
top_queries = [row[0] for row in top_answers_QueryText.iter_rows()]

# Remove rows that contain any of the top answers or queries
df_filtered = df_clean.filter(
   (~pl.col('KccAns').is_in(top_kcc_answers)) & 
   (~pl.col('QueryText').is_in(top_queries))
)

# Print stats about filtered dataframe
print("\nDataframe after removing top answers and queries:")
print(f"Original number of rows: {len(df_clean):,}")
print(f"Rows after filtering: {len(df_filtered):,}")
print(f"Rows removed: {len(df_clean) - len(df_filtered):,}")
print(f"Memory usage: {df_filtered.estimated_size() / (1024**3):.2f} GB")

  df = pl.read_csv('dataset/original_dataset/kcc_dataset.csv',



Original DataFrame Info:
Number of rows: 41,987,874
Memory usage: 6.30 GB

Cleaned DataFrame Info:
Number of rows: 37,665,904
Memory usage: 5.87 GB
Rows removed: 4,321,970
Time taken: 1.95 seconds

Columns: ['Year', 'Month', 'Day', 'Crop', 'DistrictName', 'QueryType', 'Season', 'Sector', 'StateName', 'QueryText', 'KccAns']


  df_clean



Top 10 most frequent answers in KccAns:

Count: 180,385 (0.48%)
Answer: NO RAIN POSSIBILITY IN NEXT 3-4 DAYS BUT CLOUDY SKY...

Count: 138,012 (0.37%)
Answer: NO RAIN POSSIBILITY IN NEXT 5 DAYS BUT CLOUDY SKY...

Count: 121,635 (0.32%)
Answer: Some clouds and chance of rain fall today...

Count: 112,312 (0.30%)
Answer: some clouds and no chance of rainfall today...

Count: 104,046 (0.28%)
Answer: all information provided - thanks for calling in kisan call centre...

Count: 97,658 (0.26%)
Answer:          ...

Count: 94,204 (0.25%)
Answer: NO RAIN POSSIBILITY IN NEXT 5 DAYS...

Count: 76,080 (0.20%)
Answer: weather is cloudy but no chances of rainfall today...

Count: 73,729 (0.20%)
Answer:         ...

Count: 73,666 (0.20%)
Answer:                  ...

Count: 73,230 (0.19%)
Answer:       ...

Count: 72,960 (0.19%)
Answer:             ...

Count: 67,502 (0.18%)
Answer:            ...

Count: 67,281 (0.18%)
Answer: Your Registration process successfully completed...

Count: 66,642 (0.1

  df_clean



Top 10 most frequent answers in QueryText:

Count: 4,698,473 (12.47%)
Answer: Farmer asked query on Weather...

Count: 832,804 (2.21%)
Answer: TELL ME ABOUT WEATHER INFORMATION ...

Count: 375,914 (1.00%)
Answer: Asking about weather forecast...

Count: 213,078 (0.57%)
Answer: weather report...

Count: 199,469 (0.53%)
Answer: weather information...

Count: 193,288 (0.51%)
Answer: Asking about weather forecast ...

Count: 182,086 (0.48%)
Answer: WEATHER REPORT...

Count: 178,518 (0.47%)
Answer: TELL ME WEATHER INFORMATION...

Count: 177,374 (0.47%)
Answer: WEATHER INFORMATION...

Count: 176,098 (0.47%)
Answer: Weather information...

Count: 157,399 (0.42%)
Answer: information regarding weather forecasting...

Count: 136,110 (0.36%)
Answer: weather information ...

Count: 127,310 (0.34%)
Answer: weather...

Count: 99,458 (0.26%)
Answer: Asked About SMS Activation...

Count: 97,832 (0.26%)
Answer: Information regarding weather in ...

Count: 97,741 (0.26%)
Answer: Asking about weather fo

In [28]:
df_filtered.head()

Year,Month,Day,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns
i32,i32,i32,str,str,str,str,str,str,str,str
2006,1,17,"""1275""","""SAGAR""","""99""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control flower drop in …","""spray planofix4mlpump"""
2006,1,17,"""1279""","""SAGAR""","""76""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control fruit borer in …","""should be spray profenophos 35…"
2006,1,17,"""1064""","""SAGAR""","""3""","""RABI""","""AGRICULTURE""","""MADHYA PRADESH""","""how to control of yellow moisa…","""should be spray metasystox 35m…"
2006,1,17,"""1279""","""DAMOH""","""76""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control white fly in br…","""should be spray metasystox 35m…"
2006,1,17,"""Wheat""","""DAMOH""","""3""","""RABI""","""AGRICULTURE""","""MADHYA PRADESH""","""how to control termite in whea…","""use chlorpyrephos1lithactwith …"


In [36]:
df_filtered['Sector'].value_counts()

Sector,count
str,u32
"""AGRICULTURE""",20287938
"""825""",9747
"""9999""",280875
"""HORTICULTURE""",7490726
"""FISHERIES""",68049
"""ANIMAL HUSBANDRY""",360098
,41305


In [35]:
crop_counts = (
    df_filtered
    .select(pl.col('Crop'))
    .group_by('Crop')
    .count()
    .sort('count', descending=True)
)

print("\nCrop value counts:")
print("==================")
total_rows = len(df_filtered)
for row in crop_counts.iter_rows():
    crop, count = row
    percentage = (count / total_rows) * 100
    print(f"\nCrop: {crop}")
    print(f"Count: {count:,} ({percentage:.2f}%)")

  df_filtered



Crop value counts:

Crop: Others
Count: 9,363,229 (32.81%)

Crop: Paddy Dhan
Count: 2,592,974 (9.09%)

Crop: Wheat
Count: 2,177,374 (7.63%)

Crop: Cotton Kapas
Count: 1,129,973 (3.96%)

Crop: Onion
Count: 562,949 (1.97%)

Crop: Chillies
Count: 550,562 (1.93%)

Crop: Brinjal
Count: 461,336 (1.62%)

Crop: Tomato
Count: 448,988 (1.57%)

Crop: Sugarcane Noble Cane
Count: 436,388 (1.53%)

Crop: Bengal Gram GramChick PeaKabuliChana
Count: 419,572 (1.47%)

Crop: Soybean bhat
Count: 412,511 (1.45%)

Crop: Groundnut pea nutmung phalli
Count: 412,024 (1.44%)

Crop: Mustard
Count: 382,958 (1.34%)

Crop: Potato
Count: 357,824 (1.25%)

Crop: Green Gram Moong Bean Moong
Count: 332,282 (1.16%)

Crop: Mango
Count: 307,343 (1.08%)

Crop: Maize Makka
Count: 290,619 (1.02%)

Crop: 9999
Count: 280,875 (0.98%)

Crop: BhindiOkraLadysfinger
Count: 271,222 (0.95%)

Crop: 1137
Count: 254,153 (0.89%)

Crop: Black Gram urd bean
Count: 240,419 (0.84%)

Crop: Apple
Count: 226,151 (0.79%)

Crop: BovineCowBuffalo
C

In [2]:
import pathlib as Path 
print(f'Current working directory: {Path.Path.cwd()}')

import dask.dataframe as dd # type: ignore

Current working directory: /home/manimala/Documents/satyakama/paper-farmer-chatbot


In [3]:
import numpy as np

dtypes = {
    'Year': np.int64,
    'Month': np.int64,
    'Day': np.int64,
    'Crop': 'string[pyarrow]',
    'DistrictName': 'string[pyarrow]',
    'QueryType': 'string[pyarrow]',
    'Season': 'string[pyarrow]',
    'Sector': 'string[pyarrow]',
    'StateName': 'string[pyarrow]',
    'QueryText': 'string[pyarrow]',
    'KccAns': 'string[pyarrow]',
    'BlockName': 'string[pyarrow]',  # Added this
    'Category': 'string[pyarrow]'    # Added this
}

master_df = dd.read_csv(
    'dataset/original_dataset/kcc_dataset.csv',
    dtype=dtypes,
    blocksize='128MB',
    usecols=lambda col: col not in ['BlockName', 'Category']
)

# Now get the row count
row_count_master_df = master_df.shape[0].compute()

print(f'Original number of rows in master_df: {row_count_master_df}')

Original number of rows in master_df: 41987874


In [4]:
master_df_completeQApairs = master_df.dropna(subset=['QueryText', 'KccAns'])

print(f'Original number of rows in master_df_completeQApairs: {master_df_completeQApairs.shape[0].compute()}')

Original number of rows in master_df_completeQApairs: 37665904


In [None]:
# Alternative approach
try:
    # First materialize the column as a series
    query_series = master_df_completeQApairs['QueryText'].astype(str)  # ensure string type
    
    # Then get value counts
    top_queries = query_series.value_counts().nlargest(15).compute()
    
    print("\nTop 15 most frequent queries:")
    for query, count in top_queries.items():
        print(f"Count: {count}, Query: {query}")
except Exception as e:
    print(f"Error: {e}")


Top 10 most frequent queries:
Count: 4698473, Query: Farmer asked query on Weather
Count: 832804, Query: TELL ME ABOUT WEATHER INFORMATION 
Count: 375914, Query: Asking about weather forecast
Count: 213078, Query: weather report
Count: 199469, Query: weather information
Count: 193288, Query: Asking about weather forecast 
Count: 182086, Query: WEATHER REPORT
Count: 178518, Query: TELL ME WEATHER INFORMATION
Count: 177374, Query: WEATHER INFORMATION
Count: 176098, Query: Weather information
Count: 157399, Query: information regarding weather forecasting
Count: 136110, Query: weather information 
Count: 127310, Query: weather
Count: 99458, Query: Asked About SMS Activation
Count: 97832, Query: Information regarding weather in 


In [6]:
# Get the list of top 10 queries as a set for faster lookup
top_query_set = set(top_queries.index)

# Create a filter to exclude top queries
master_df_removeWeather = master_df_completeQApairs[~master_df_completeQApairs['QueryText'].isin(top_query_set)]

In [7]:
# Check the new size
print(f'Original number of rows: {master_df.shape[0].compute():,}')
print(f'Original number of rows in master_df_completeQApairs: {master_df_completeQApairs.shape[0].compute()}')
print(f'Rows after removing top queries: {master_df_removeWeather.shape[0].compute():,}')
# print(f'Rows removed: {(master_df_completeQApairs.shape[0].compute() - filtered_df.shape[0].compute()):,}')

Original number of rows: 41,987,874
Original number of rows in master_df_completeQApairs: 37665904
Rows after removing top queries: 29,820,693


In [12]:
import gc

# Delete previous large dataframes if they exist
if 'master_df' in locals():
    del master_df
if 'master_df_completeQApairs' in locals():
    del master_df_completeQApairs
if 'query_series' in locals():
    del query_series
if 'top_queries' in locals():
    del top_queries

# Force garbage collection
gc.collect()

857

In [13]:
try:
    # Set a smaller number of partitions to reduce memory overhead
    master_df_removeWeather = master_df_removeWeather.repartition(npartitions=100)
    
    # Process in chunks using map_partitions
    top_queries_KccAns = (
        master_df_removeWeather['KccAns']
        .map_partitions(lambda x: x.value_counts())
        .compute()
        .nlargest(10)
    )
    
    print("\nTop 10 most frequent answers in KccAns:")
    for answer, count in top_queries_KccAns.items():
        print(f"Count: {count:,}, Answer: {answer}")

except Exception as e:
    print(f"Error: {e}")

Error: Unable to allocate 78.5 TiB for an array with shape (10289501,) and data type <U2097814


In [10]:
# Alternative approach
try:
    # First materialize the column as a series
    query_series_KccAns = master_df_removeWeather['KccAns'].astype(str)  # ensure string type
    
    # Then get value counts
    top_queries_KccAns = query_series_KccAns.value_counts().nlargest(10).compute()
    
    print("\nTop 10 most frequent queries in KccAns:")
    for query, count in top_queries.items():
        print(f"Count: {count}, Query: {query}")
except Exception as e:
    print(f"Error: {e}")

Error: Unable to allocate 3.53 TiB for an array with shape (462138,) and data type <U2097814


In [8]:
master_df_removeWeather.columns

Index(['Year', 'Month', 'Day', 'Crop', 'DistrictName', 'QueryType', 'Season',
       'Sector', 'StateName', 'QueryText', 'KccAns'],
      dtype='object')

In [None]:
set(top_queries.index)

In [None]:
# Filter out rows containing 'Call Disconnected'
cleaned_df_completeQApairs_dropCallDisconnected = cleaned_df_completeQApairs[
    ~(cleaned_df_completeQApairs['QueryText'].str.contains('Call Disconnected', case=False, na=False)) &
    ~(cleaned_df_completeQApairs['KccAns'].str.contains('Call Disconnected', case=False, na=False))
]

# Check the row counts
original_count = len(cleaned_df_completeKccAns.compute())
new_count = len(cleaned_df_completeKccAns_dropCallDisconnected.compute())

print(f'Number of rows before filtering: {original_count}')
print(f'Number of rows after filtering: {new_count}')
print(f'Number of rows removed: {original_count - new_count}')
print(f'Percentage of rows removed: {((original_count - new_count) / original_count * 100):.2f}%')

In [None]:
# Alternative approach
try:
    # First materialize the column as a series
    query_series = cleaned_df_completeKccAns_dropCallDisconnected['QueryText'].astype(str)  # ensure string type
    
    # Then get value counts
    top_queries = query_series.value_counts().nlargest(10).compute()
    
    print("\nTop 10 most frequent queries:")
    for query, count in top_queries.items():
        print(f"Count: {count}, Query: {query}")
except Exception as e:
    print(f"Error: {e}")

In [None]:
cleaned_df_completeKccAns.head()

In [None]:
master_df = dd.read_csv('kcc_dataset.csv', dtype='object') 

print(f'Column names: {master_df.columns}')


# Calculate the percentage of NaN values
nan_percentage_kccAns = (master_df['KccAns'].isna().sum() / len(master_df) * 100).compute()

print(f'Percentage of NaN values in KccAns: {nan_percentage_kccAns:.2f}%')

In [None]:
# Drop column names
master_df = master_df.drop(columns=['BlockName', 'Category'])

In [None]:
# Count rows where any column has NaN
rows_with_nan = master_df.isna().any(axis=1).sum().compute()

# Get total number of rows
total_rows = len(master_df.compute())

# Calculate percentage
nan_percentage = (rows_with_nan / total_rows) * 100

print(f'Total number of rows: {total_rows}')
print(f'Number of rows with at least one NaN: {rows_with_nan}')
print(f'Percentage of rows with at least one NaN: {nan_percentage:.2f}%')

In [None]:
# Get NaN count for each column
column_nan_counts = master_df.isna().sum().compute()
column_nan_percentages = (column_nan_counts / total_rows * 100)

print("\nNaN distribution by column:")
for column in master_df.columns:
    count = column_nan_counts[column]
    percentage = column_nan_percentages[column]
    print(f'{column}: {count} NaN values ({percentage:.2f}%)')

In [None]:
master_df.head(25)

In [None]:
import dask
import dask.dataframe as dd

import os
import tqdm as tqdm


# Reading all columns as strings
master_df = dd.read_csv('kcc_dataset.csv', dtype='object') 

print(master_df.columns)

import os
from tqdm.notebook import tqdm  # For Jupyter notebook
# OR
# from tqdm import tqdm_notebook as tqdm  # Alternative import

# Create directory if it doesn't exist
if not os.path.exists('chat_by_state'):
    os.makedirs('chat_by_state')

# Get unique states and convert to list
states = list(master_df.StateName.unique().compute())

# Create separate CSV for each state with progress bar
for state in tqdm(states, desc="Creating state-wise CSV files"):
    # Filter data for the state
    state_df = master_df[master_df.StateName == state]
    
    # Create filename - replace spaces with underscores and convert to lowercase
    filename = f"chat_by_state/{state.replace(' ', '_').lower()}.csv"
    
    # Save to CSV
    state_df.compute().to_csv(filename, index=False)

print(f"\nCompleted! All state files have been saved in 'chat_by_state' directory")


In [None]:
import pandas as pd

In [None]:
wb = pd.read_csv('chat_by_state/west_bengal.csv', low_memory=False)

In [None]:
wb.shape

In [None]:
wb.head(50)

In [None]:
wb_agri = wb[wb['Sector']=='AGRICULTURE']

In [None]:
wb_agri.head()

In [None]:
xx = wb_agri[wb_agri['Crop']=='0']

In [None]:
xx.shape

In [None]:
xx.head()