In [3]:
# ! pip install "dask[dataframe]"
# ! pip install "dask[diagnostics]"

In [1]:
# Import the Path class itself. Aliasing the whole `pathlib` module as `Path`
# forced the confusing `Path.Path.cwd()` spelling.
from pathlib import Path

# Display the working directory; relative paths such as 'kcc_dataset.csv'
# are resolved against it.
Path.cwd()

PosixPath('/home/satyakama/Documents/paper-farmer-chatbot')

In [3]:
import dask.dataframe as dd

In [4]:
# Read every column as 'object' (string) so no dtype inference is attempted.
master_df = dd.read_csv('kcc_dataset.csv', dtype='object')

print(f'Column names: {master_df.columns}')

# len() on a Dask DataFrame counts rows without building a single pandas
# frame. Calling .compute() first would pull the whole table into memory
# only to measure its length.
print(f'Original number of rows in masters_df: {len(master_df)}')

# Drop all rows in which KccAns is NaN
cleaned_df_completeKccAns = master_df.dropna(subset=['KccAns'])

print(f'Original number of rows in cleaned_df_completeKccAns: {len(cleaned_df_completeKccAns)}')

Column names: Index(['BlockName', 'Category', 'Year', 'Month', 'Day', 'Crop', 'DistrictName',
       'QueryType', 'Season', 'Sector', 'StateName', 'QueryText', 'KccAns'],
      dtype='object')
Original number of rows in masters_df: 41987874
Original number of rows in cleaned_df_completeKccAns: 37667462


In [5]:
# Filter out rows containing 'Call Disconnected' (case-insensitive) in either
# the query or the answer. na=False makes missing text count as "no match",
# so rows with a missing QueryText are kept.
cleaned_df_completeKccAns_dropCallDisconnected = cleaned_df_completeKccAns[
    ~(cleaned_df_completeKccAns['QueryText'].str.contains('Call Disconnected', case=False, na=False)) &
    ~(cleaned_df_completeKccAns['KccAns'].str.contains('Call Disconnected', case=False, na=False))
]

# Check the row counts.
# len() counts rows without loading each full frame into memory as one
# pandas DataFrame, which .compute() would do.
original_count = len(cleaned_df_completeKccAns)
new_count = len(cleaned_df_completeKccAns_dropCallDisconnected)

removed_count = original_count - new_count
# Guard the division so an empty input reports 0.00% instead of raising.
removed_percentage = (removed_count / original_count * 100) if original_count else 0.0

print(f'Number of rows before filtering: {original_count}')
print(f'Number of rows after filtering: {new_count}')
print(f'Number of rows removed: {removed_count}')
print(f'Percentage of rows removed: {removed_percentage:.2f}%')

Number of rows before filtering: 37667462
Number of rows after filtering: 37636895
Number of rows removed: 30567
Percentage of rows removed: 0.08%


In [None]:
# Ten most frequent queries after the 'Call Disconnected' rows were removed.
#
# Missing queries are dropped before the string cast: astype(str) can turn a
# missing value into the literal text 'nan', which would then be counted and
# ranked like a real query.
query_series = (
    cleaned_df_completeKccAns_dropCallDisconnected['QueryText']
    .dropna()
    .astype(str)  # ensure string type
)

# Everything above is still lazy; .compute() runs the count and returns the
# ten largest entries as a pandas Series.
# No blanket try/except here: a failure should surface with its full
# traceback rather than be reduced to a one-line message.
top_queries = query_series.value_counts().nlargest(10).compute()

print("\nTop 10 most frequent queries:")
for query, count in top_queries.items():
    print(f"Count: {count}, Query: {query}")

In [None]:
# Preview the first five rows of the frame with missing answers removed.
cleaned_df_completeKccAns.head(n=5)

In [None]:
# Reload the raw data with every column read as 'object'.
master_df = dd.read_csv('kcc_dataset.csv', dtype='object')

print(f'Column names: {master_df.columns}')


# Calculate the percentage of NaN values.
# The mean of the boolean isna() mask is the NaN fraction, obtained with a
# single .compute(). Dividing by len(master_df) instead triggers a separate
# full row count before the lazy sum is evaluated.
nan_percentage_kccAns = master_df['KccAns'].isna().mean().compute() * 100

print(f'Percentage of NaN values in KccAns: {nan_percentage_kccAns:.2f}%')

In [12]:
# Drop the BlockName and Category columns.
# master_df is rebound to the result, so re-running this cell would look for
# columns that are already gone. errors='ignore' makes the second run a no-op
# instead of a KeyError.
master_df = master_df.drop(columns=['BlockName', 'Category'], errors='ignore')

In [None]:
# Count rows where any column has NaN
rows_with_nan = master_df.isna().any(axis=1).sum().compute()

# Get total number of rows.
# len() counts rows without materialising the whole table as one pandas
# DataFrame, which len(master_df.compute()) would do.
total_rows = len(master_df)

# Calculate percentage
nan_percentage = (rows_with_nan / total_rows) * 100

print(f'Total number of rows: {total_rows}')
print(f'Number of rows with at least one NaN: {rows_with_nan}')
print(f'Percentage of rows with at least one NaN: {nan_percentage:.2f}%')

In [None]:
# Per-column NaN tally, and the same tally as a share of total_rows.
# total_rows must already be defined by the cell that counts all rows.
column_nan_counts = master_df.isna().sum().compute()
column_nan_percentages = column_nan_counts.div(total_rows).mul(100)

print("\nNaN distribution by column:")
for column in master_df.columns:
    count, percentage = column_nan_counts[column], column_nan_percentages[column]
    print(f'{column}: {count} NaN values ({percentage:.2f}%)')

In [None]:
# Preview the first 25 rows.
master_df.head(n=25)

In [None]:
import os

import dask
import dask.dataframe as dd
# Import the progress-bar class directly. The earlier `import tqdm as tqdm`
# bound the module to the same name and was immediately shadowed by this line.
from tqdm.notebook import tqdm  # For Jupyter notebook


# Reading all columns as strings
master_df = dd.read_csv('kcc_dataset.csv', dtype='object')

print(master_df.columns)

# Create directory if it doesn't exist. exist_ok=True replaces the separate
# exists() check and is safe to re-run.
os.makedirs('chat_by_state', exist_ok=True)

# Get unique states and convert to list.
# Missing values are dropped first: unique() can return NaN, which is a float,
# and the .replace() call below would raise AttributeError on it. Rows without
# a StateName are therefore not written to any file.
states = list(master_df.StateName.dropna().unique().compute())

# Create separate CSV for each state with progress bar.
# NOTE(review): master_df is lazy, so each .compute() below appears to re-read
# the source CSV once per state; confirm this cost is acceptable.
for state in tqdm(states, desc="Creating state-wise CSV files"):
    # Filter data for the state
    state_df = master_df[master_df.StateName == state]

    # Create filename - replace spaces with underscores and convert to lowercase
    filename = f"chat_by_state/{state.replace(' ', '_').lower()}.csv"

    # Materialise this state's rows as pandas and save them without the index
    state_df.compute().to_csv(filename, index=False)

print("\nCompleted! All state files have been saved in 'chat_by_state' directory")


In [17]:
import pandas as pd

In [18]:
# Load the West Bengal extract; low_memory=False parses the file in one pass
# rather than in chunks.
wb = pd.read_csv(
    filepath_or_buffer='chat_by_state/west_bengal.csv',
    low_memory=False,
)

In [None]:
# (rows, columns) of the frame loaded from west_bengal.csv.
wb.shape

In [None]:
# Preview the first 50 rows of the West Bengal extract.
wb.head(n=50)

In [None]:
# Keep only the rows whose Sector is exactly 'AGRICULTURE'.
is_agriculture = wb['Sector'].eq('AGRICULTURE')
wb_agri = wb.loc[is_agriculture]

In [None]:
# Preview the first five agriculture rows.
wb_agri.head(n=5)

In [20]:
# Agriculture rows whose Crop field is the literal string '0'.
# NOTE(review): '0' looks like a placeholder for "no crop recorded" — confirm.
crop_is_zero = wb_agri['Crop'].eq('0')
xx = wb_agri.loc[crop_is_zero]

In [None]:
# (rows, columns) of the agriculture rows whose Crop is '0'.
xx.shape

In [None]:
# Preview the first five rows where Crop is '0'.
xx.head(n=5)