In [None]:
# ! pip install "dask[dataframe]"
# ! pip install "dask[diagnostics]"

In [2]:
import pathlib as Path 
print(f'Current working directory: {Path.Path.cwd()}')

import dask.dataframe as dd # type: ignore

Current working directory: /home/manimala/Documents/satyakama/paper-farmer-chatbot


In [3]:
import numpy as np

dtypes = {
    'Year': np.int64,
    'Month': np.int64,
    'Day': np.int64,
    'Crop': 'string[pyarrow]',
    'DistrictName': 'string[pyarrow]',
    'QueryType': 'string[pyarrow]',
    'Season': 'string[pyarrow]',
    'Sector': 'string[pyarrow]',
    'StateName': 'string[pyarrow]',
    'QueryText': 'string[pyarrow]',
    'KccAns': 'string[pyarrow]',
    'BlockName': 'string[pyarrow]',  # Added this
    'Category': 'string[pyarrow]'    # Added this
}

master_df = dd.read_csv(
    'dataset/original_dataset/kcc_dataset.csv',
    dtype=dtypes,
    blocksize='128MB',
    usecols=lambda col: col not in ['BlockName', 'Category']
)

# Now get the row count
row_count_master_df = master_df.shape[0].compute()

print(f'Original number of rows in master_df: {row_count_master_df}')

Original number of rows in master_df: 41987874


In [4]:
master_df_completeQApairs = master_df.dropna(subset=['QueryText', 'KccAns'])

print(f'Original number of rows in master_df_completeQApairs: {master_df_completeQApairs.shape[0].compute()}')

Original number of rows in master_df_completeQApairs: 37665904


In [None]:
# Alternative approach
try:
    # First materialize the column as a series
    query_series = master_df_completeQApairs['QueryText'].astype(str)  # ensure string type
    
    # Then get value counts
    top_queries = query_series.value_counts().nlargest(15).compute()
    
    print("\nTop 15 most frequent queries:")
    for query, count in top_queries.items():
        print(f"Count: {count}, Query: {query}")
except Exception as e:
    print(f"Error: {e}")


Top 10 most frequent queries:
Count: 4698473, Query: Farmer asked query on Weather
Count: 832804, Query: TELL ME ABOUT WEATHER INFORMATION 
Count: 375914, Query: Asking about weather forecast
Count: 213078, Query: weather report
Count: 199469, Query: weather information
Count: 193288, Query: Asking about weather forecast 
Count: 182086, Query: WEATHER REPORT
Count: 178518, Query: TELL ME WEATHER INFORMATION
Count: 177374, Query: WEATHER INFORMATION
Count: 176098, Query: Weather information
Count: 157399, Query: information regarding weather forecasting
Count: 136110, Query: weather information 
Count: 127310, Query: weather
Count: 99458, Query: Asked About SMS Activation
Count: 97832, Query: Information regarding weather in 


In [6]:
# Get the list of top 10 queries as a set for faster lookup
top_query_set = set(top_queries.index)

# Create a filter to exclude top queries
master_df_removeWeather = master_df_completeQApairs[~master_df_completeQApairs['QueryText'].isin(top_query_set)]

In [7]:
# Check the new size
print(f'Original number of rows: {master_df.shape[0].compute():,}')
print(f'Original number of rows in master_df_completeQApairs: {master_df_completeQApairs.shape[0].compute()}')
print(f'Rows after removing top queries: {master_df_removeWeather.shape[0].compute():,}')
# print(f'Rows removed: {(master_df_completeQApairs.shape[0].compute() - filtered_df.shape[0].compute()):,}')

Original number of rows: 41,987,874
Original number of rows in master_df_completeQApairs: 37665904
Rows after removing top queries: 29,820,693


In [12]:
import gc

# Delete previous large dataframes if they exist
if 'master_df' in locals():
    del master_df
if 'master_df_completeQApairs' in locals():
    del master_df_completeQApairs
if 'query_series' in locals():
    del query_series
if 'top_queries' in locals():
    del top_queries

# Force garbage collection
gc.collect()

857

In [13]:
try:
    # Set a smaller number of partitions to reduce memory overhead
    master_df_removeWeather = master_df_removeWeather.repartition(npartitions=100)
    
    # Process in chunks using map_partitions
    top_queries_KccAns = (
        master_df_removeWeather['KccAns']
        .map_partitions(lambda x: x.value_counts())
        .compute()
        .nlargest(10)
    )
    
    print("\nTop 10 most frequent answers in KccAns:")
    for answer, count in top_queries_KccAns.items():
        print(f"Count: {count:,}, Answer: {answer}")

except Exception as e:
    print(f"Error: {e}")

Error: Unable to allocate 78.5 TiB for an array with shape (10289501,) and data type <U2097814


In [10]:
# Alternative approach
try:
    # First materialize the column as a series
    query_series_KccAns = master_df_removeWeather['KccAns'].astype(str)  # ensure string type
    
    # Then get value counts
    top_queries_KccAns = query_series_KccAns.value_counts().nlargest(10).compute()
    
    print("\nTop 10 most frequent queries in KccAns:")
    for query, count in top_queries.items():
        print(f"Count: {count}, Query: {query}")
except Exception as e:
    print(f"Error: {e}")

Error: Unable to allocate 3.53 TiB for an array with shape (462138,) and data type <U2097814


In [8]:
master_df_removeWeather.columns

Index(['Year', 'Month', 'Day', 'Crop', 'DistrictName', 'QueryType', 'Season',
       'Sector', 'StateName', 'QueryText', 'KccAns'],
      dtype='object')

In [None]:
set(top_queries.index)

In [None]:
# Filter out rows containing 'Call Disconnected'
cleaned_df_completeQApairs_dropCallDisconnected = cleaned_df_completeQApairs[
    ~(cleaned_df_completeQApairs['QueryText'].str.contains('Call Disconnected', case=False, na=False)) &
    ~(cleaned_df_completeQApairs['KccAns'].str.contains('Call Disconnected', case=False, na=False))
]

# Check the row counts
original_count = len(cleaned_df_completeKccAns.compute())
new_count = len(cleaned_df_completeKccAns_dropCallDisconnected.compute())

print(f'Number of rows before filtering: {original_count}')
print(f'Number of rows after filtering: {new_count}')
print(f'Number of rows removed: {original_count - new_count}')
print(f'Percentage of rows removed: {((original_count - new_count) / original_count * 100):.2f}%')

In [None]:
# Alternative approach
try:
    # First materialize the column as a series
    query_series = cleaned_df_completeKccAns_dropCallDisconnected['QueryText'].astype(str)  # ensure string type
    
    # Then get value counts
    top_queries = query_series.value_counts().nlargest(10).compute()
    
    print("\nTop 10 most frequent queries:")
    for query, count in top_queries.items():
        print(f"Count: {count}, Query: {query}")
except Exception as e:
    print(f"Error: {e}")

In [None]:
cleaned_df_completeKccAns.head()

In [None]:
master_df = dd.read_csv('kcc_dataset.csv', dtype='object') 

print(f'Column names: {master_df.columns}')


# Calculate the percentage of NaN values
nan_percentage_kccAns = (master_df['KccAns'].isna().sum() / len(master_df) * 100).compute()

print(f'Percentage of NaN values in KccAns: {nan_percentage_kccAns:.2f}%')

In [None]:
# Drop column names
master_df = master_df.drop(columns=['BlockName', 'Category'])

In [None]:
# Count rows where any column has NaN
rows_with_nan = master_df.isna().any(axis=1).sum().compute()

# Get total number of rows
total_rows = len(master_df.compute())

# Calculate percentage
nan_percentage = (rows_with_nan / total_rows) * 100

print(f'Total number of rows: {total_rows}')
print(f'Number of rows with at least one NaN: {rows_with_nan}')
print(f'Percentage of rows with at least one NaN: {nan_percentage:.2f}%')

In [None]:
# Get NaN count for each column
column_nan_counts = master_df.isna().sum().compute()
column_nan_percentages = (column_nan_counts / total_rows * 100)

print("\nNaN distribution by column:")
for column in master_df.columns:
    count = column_nan_counts[column]
    percentage = column_nan_percentages[column]
    print(f'{column}: {count} NaN values ({percentage:.2f}%)')

In [None]:
master_df.head(25)

In [None]:
import dask
import dask.dataframe as dd

import os
import tqdm as tqdm


# Reading all columns as strings
master_df = dd.read_csv('kcc_dataset.csv', dtype='object') 

print(master_df.columns)

import os
from tqdm.notebook import tqdm  # For Jupyter notebook
# OR
# from tqdm import tqdm_notebook as tqdm  # Alternative import

# Create directory if it doesn't exist
if not os.path.exists('chat_by_state'):
    os.makedirs('chat_by_state')

# Get unique states and convert to list
states = list(master_df.StateName.unique().compute())

# Create separate CSV for each state with progress bar
for state in tqdm(states, desc="Creating state-wise CSV files"):
    # Filter data for the state
    state_df = master_df[master_df.StateName == state]
    
    # Create filename - replace spaces with underscores and convert to lowercase
    filename = f"chat_by_state/{state.replace(' ', '_').lower()}.csv"
    
    # Save to CSV
    state_df.compute().to_csv(filename, index=False)

print(f"\nCompleted! All state files have been saved in 'chat_by_state' directory")


In [None]:
import pandas as pd

In [None]:
wb = pd.read_csv('chat_by_state/west_bengal.csv', low_memory=False)

In [None]:
wb.shape

In [None]:
wb.head(50)

In [None]:
wb_agri = wb[wb['Sector']=='AGRICULTURE']

In [None]:
wb_agri.head()

In [None]:
xx = wb_agri[wb_agri['Crop']=='0']

In [None]:
xx.shape

In [None]:
xx.head()