In [2]:
import pathlib as Path 
Path.Path.cwd()

PosixPath('/home/satyakama/Documents/paper-farmer-chatbot')

In [3]:
import polars as pl
import time

In [4]:
start = time.time()

try:
    # Read CSV with explicit schema
    df = pl.read_csv('dataset/original_dataset/kcc_dataset.csv',
        dtypes={
            'Year': pl.Int32,
            'Month': pl.Int32,
            'Day': pl.Int32,
            'Crop': pl.Utf8,
            'DistrictName': pl.Utf8,
            'QueryType': pl.Utf8,
            'Season': pl.Utf8,
            'Sector': pl.Utf8,
            'StateName': pl.Utf8,
            'QueryText': pl.Utf8,
            'KccAns': pl.Utf8,
            'Category': pl.Utf8,
            'BlockName': pl.Utf8
        },
        low_memory=True,
        infer_schema_length=10000,
        n_rows=1000  # Add this parameter to read only 1000 rows
    ).drop(['BlockName', 'Category'])


except Exception as e:
    print(f"Error: {e}")


  df = pl.read_csv('dataset/original_dataset/kcc_dataset.csv',


In [5]:
# Drop rows with missing values in QueryText or KccAns
df_clean = df.drop_nulls(subset=['QueryText', 'KccAns'])


In [6]:
df_clean.head()

Year,Month,Day,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns
i32,i32,i32,str,str,str,str,str,str,str,str
2006,1,17,"""1275""","""SAGAR""","""99""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control flower drop in …","""spray planofix4mlpump"""
2006,1,17,"""1279""","""SAGAR""","""76""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control fruit borer in …","""should be spray profenophos 35…"
2006,1,17,"""1064""","""SAGAR""","""3""","""RABI""","""AGRICULTURE""","""MADHYA PRADESH""","""how to control of yellow moisa…","""should be spray metasystox 35m…"
2006,1,17,"""1279""","""DAMOH""","""76""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control white fly in br…","""should be spray metasystox 35m…"
2006,1,17,"""Wheat""","""DAMOH""","""3""","""RABI""","""AGRICULTURE""","""MADHYA PRADESH""","""how to control termite in whea…","""use chlorpyrephos1lithactwith …"


In [47]:
df_clean['KccAns'][540]

'RECOMMENDED VARIETIES ARE MORDENDRSF-108DRSH-1'

In [14]:
import nltk
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
from nltk import word_tokenize, ne_chunk


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/satyakama/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/satyakama/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/satyakama/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/satyakama/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /home/satyakama/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import ollama
import re

def parse_agricultural_recommendation(text):
    """Parse agricultural recommendation text to extract chemical and dosage information"""
    # Ask Mistral to help structure the information
    prompt = f"""
    Parse this agricultural recommendation and extract the chemical/medicine name and dosage:
    "{text}"
    Return only a Python dictionary with keys 'chemical/medicine' and 'dosage'. 
    If it's about varieties, use 'variety' as the key instead.
    """
    
    response = ollama.generate(
        model='mistral',
        prompt=prompt,
        # temperature=0.1  # Lower temperature for more consistent outputs
    )
    
    try:
        # Clean up the response to get just the dictionary part
        # The model should return something like "{'chemical/medicine': 'dimethoate', 'dosage': '2mL'}"
        response_text = response['response']
        # Use regex to extract the dictionary-like structure
        dict_match = re.search(r'\{.*\}', response_text)
        if dict_match:
            # Safely evaluate the dictionary string
            result = eval(dict_match.group())
            return result
        else:
            return {'chemical/medicine': None, 'dosage': None}
    except Exception as e:
        print(f"Error parsing response for text '{text}': {e}")
        return {'chemical/medicine': None, 'dosage': None}

# Define your list of input texts
inputs = [
    "RECOMMENDED TO SPRAY DIMETHOATE 2MLLIT",
    "RECOMMENDED VARIETIES ARE MORDENDRSF-108DRSH-1",
    "should be spray metasystox 35mL",
    "spray planofix4mlpump"
]

# Process each input
parsed_results = []
for text in inputs:
    result = parse_agricultural_recommendation(text)
    parsed_results.append(result)

# Print the results
for input_text, result in zip(inputs, parsed_results):
    print(f"\nInput: {input_text}")
    print(f"Parsed: {result}")


Input: RECOMMENDED TO SPRAY DIMETHOATE 2MLLIT
Parsed: {'chemical/medicine': None, 'dosage': None}

Input: RECOMMENDED VARIETIES ARE MORDENDRSF-108DRSH-1
Parsed: {'chemical/medicine': None, 'dosage': None}

Input: should be spray metasystox 35mL
Parsed: {'chemical/medicine': None, 'dosage': None}

Input: spray planofix4mlpump
Parsed: {'chemical/medicine': None, 'dosage': None}


In [17]:
from transformers import pipeline
import re
from typing import Dict, Optional, List

class ChemicalDosageExtractor:
    def __init__(self):
        """Initialize the NER pipeline and compile regex patterns"""
        # Load the NER pipeline
        self.ner = pipeline("ner", model="Jean-Baptiste/roberta-large-ner-english")
        
        # Compile regex patterns
        self.dosage_pattern = re.compile(r'(\d+(?:\.\d+)?)\s*(?:ML|MLLIT|MLPUMP|ML/LIT)', re.IGNORECASE)
        
        # Common chemical/pesticide names to check
        self.common_chemicals = {
            'DIMETHOATE', 'METASYSTOX', 'PLANOFIX'
        }

    def extract_from_text(self, text: str) -> Dict[str, Optional[str]]:
        """
        Extract chemical names and dosages from text
        
        Args:
            text: Input text containing chemical and dosage information
            
        Returns:
            Dictionary with chemical name and dosage
        """
        # First try to find common chemicals
        chemical = None
        for chem in self.common_chemicals:
            if chem in text.upper():
                chemical = chem.capitalize()
                break
        
        # If no common chemical found, try NER
        if not chemical:
            entities = self.ner(text)
            for entity in entities:
                if entity['entity'] in ['B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']:
                    chemical = entity['word']
                    break
        
        # Extract dosage using regex
        dosage_match = self.dosage_pattern.search(text)
        dosage = dosage_match.group(0) if dosage_match else None
        
        return {
            "chemical/medicine": chemical,
            "dosage": dosage
        }

    def process_texts(self, texts: List[str]) -> List[Dict[str, Optional[str]]]:
        """
        Process multiple texts
        
        Args:
            texts: List of input texts
            
        Returns:
            List of dictionaries containing extracted information
        """
        return [self.extract_from_text(text) for text in texts]

# Example usage
if __name__ == "__main__":
    # Initialize the extractor
    extractor = ChemicalDosageExtractor()
    
    # Example inputs
    inputs = [
        "RECOMMENDED TO SPRAY DIMETHOATE 2MLLIT",
        "RECOMMENDED VARIETIES ARE MORDENDRSF-108DRSH-1",
        "should be spray metasystox 35mL",
        "spray planofix4mlpump"
    ]
    
    # Process texts
    results = extractor.process_texts(inputs)
    
    # Print results
    for text, result in zip(inputs, results):
        print(f"\nInput: {text}")
        print(f"Extracted: {result}")

config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0



Input: RECOMMENDED TO SPRAY DIMETHOATE 2MLLIT
Extracted: {'chemical/medicine': 'Dimethoate', 'dosage': '2ML'}

Input: RECOMMENDED VARIETIES ARE MORDENDRSF-108DRSH-1
Extracted: {'chemical/medicine': None, 'dosage': None}

Input: should be spray metasystox 35mL
Extracted: {'chemical/medicine': 'Metasystox', 'dosage': '35mL'}

Input: spray planofix4mlpump
Extracted: {'chemical/medicine': 'Planofix', 'dosage': '4ml'}


In [None]:
# Print info about both dataframes
print("="*50)
print("\nOriginal DataFrame Info:")
print(f"Number of rows: {len(df):,}")
print(f"Memory usage: {df.estimated_size() / (1024**3):.2f} GB")

print("="*50)
print("\nUnimportant columns dropped DataFrame Info:")
print(f"Number of rows: {len(df_clean):,}")
print(f"Memory usage: {df_clean.estimated_size() / (1024**3):.2f} GB")
print(f"Rows removed: {len(df) - len(df_clean):,}")
print(f"Time taken: {time.time() - start:.2f} seconds")
print("Columns:", df_clean.columns)
print("="*50)

In [None]:
# Get value counts for KccAns
top_answers_KccAns = (
    df_clean
    .select(pl.col('KccAns'))
    .group_by('KccAns')
    .count()
    .sort('count', descending=True)
    .limit(15)
)

print("\nTop 15 most frequent answers in KccAns:")
print("=======================================")
total_rows = len(df_clean)
for row in top_answers_KccAns.iter_rows():
    answer, count = row
    percentage = (count / total_rows) * 100
    print(f"\nCount: {count:,} ({percentage:.2f}%)")
    print(f"Answer: {answer[:200]}...")

    # # Get value counts for QueryText
    # top_answers_QueryText = (
    #     df_clean
    #     .select(pl.col('QueryText'))
    #     .group_by('QueryText')
    #     .count()
    #     .sort('count', descending=True)
    #     .limit(10)
    # )

    # print("\nTop 10 most frequent answers in QueryText:")
    # print("=======================================")
    # total_rows = len(df_clean)
    # for row in top_answers_QueryText.iter_rows():
    #     answer, count = row
    #     percentage = (count / total_rows) * 100
    #     print(f"\nCount: {count:,} ({percentage:.2f}%)")
    #     print(f"Answer: {answer[:200]}...")

In [3]:
import dask.dataframe as dd

In [None]:
master_df = dd.read_csv('kcc_dataset.csv', dtype='object') 

print(f'Column names: {master_df.columns}')

print(f'Original number of rows in masters_df: {len(master_df.compute())}')

cleaned_df_completeKccAns = master_df.dropna(subset=['KccAns'])

print(f'Original number of rows in cleaned_df_completeKccAns: {len(cleaned_df_completeKccAns.compute())}')

# Drop all rows in which KccAns is NaN

In [None]:
# Filter out rows containing 'Call Disconnected'
cleaned_df_completeKccAns_dropCallDisconnected = cleaned_df_completeKccAns[
    ~(cleaned_df_completeKccAns['QueryText'].str.contains('Call Disconnected', case=False, na=False)) &
    ~(cleaned_df_completeKccAns['KccAns'].str.contains('Call Disconnected', case=False, na=False))
]

# Check the row counts
original_count = len(cleaned_df_completeKccAns.compute())
new_count = len(cleaned_df_completeKccAns_dropCallDisconnected.compute())

print(f'Number of rows before filtering: {original_count}')
print(f'Number of rows after filtering: {new_count}')
print(f'Number of rows removed: {original_count - new_count}')
print(f'Percentage of rows removed: {((original_count - new_count) / original_count * 100):.2f}%')

In [None]:
# Alternative approach
try:
    # First materialize the column as a series
    query_series = cleaned_df_completeKccAns_dropCallDisconnected['QueryText'].astype(str)  # ensure string type
    
    # Then get value counts
    top_queries = query_series.value_counts().nlargest(10).compute()
    
    print("\nTop 10 most frequent queries:")
    for query, count in top_queries.items():
        print(f"Count: {count}, Query: {query}")
except Exception as e:
    print(f"Error: {e}")

In [None]:
cleaned_df_completeKccAns.head()

In [None]:
master_df = dd.read_csv('kcc_dataset.csv', dtype='object') 

print(f'Column names: {master_df.columns}')


# Calculate the percentage of NaN values
nan_percentage_kccAns = (master_df['KccAns'].isna().sum() / len(master_df) * 100).compute()

print(f'Percentage of NaN values in KccAns: {nan_percentage_kccAns:.2f}%')

In [12]:
# Drop column names
master_df = master_df.drop(columns=['BlockName', 'Category'])

In [None]:
# Count rows where any column has NaN
rows_with_nan = master_df.isna().any(axis=1).sum().compute()

# Get total number of rows
total_rows = len(master_df.compute())

# Calculate percentage
nan_percentage = (rows_with_nan / total_rows) * 100

print(f'Total number of rows: {total_rows}')
print(f'Number of rows with at least one NaN: {rows_with_nan}')
print(f'Percentage of rows with at least one NaN: {nan_percentage:.2f}%')

In [None]:
# Get NaN count for each column
column_nan_counts = master_df.isna().sum().compute()
column_nan_percentages = (column_nan_counts / total_rows * 100)

print("\nNaN distribution by column:")
for column in master_df.columns:
    count = column_nan_counts[column]
    percentage = column_nan_percentages[column]
    print(f'{column}: {count} NaN values ({percentage:.2f}%)')

In [None]:
master_df.head(25)

In [None]:
import dask
import dask.dataframe as dd

import os
import tqdm as tqdm


# Reading all columns as strings
master_df = dd.read_csv('kcc_dataset.csv', dtype='object') 

print(master_df.columns)

import os
from tqdm.notebook import tqdm  # For Jupyter notebook
# OR
# from tqdm import tqdm_notebook as tqdm  # Alternative import

# Create directory if it doesn't exist
if not os.path.exists('chat_by_state'):
    os.makedirs('chat_by_state')

# Get unique states and convert to list
states = list(master_df.StateName.unique().compute())

# Create separate CSV for each state with progress bar
for state in tqdm(states, desc="Creating state-wise CSV files"):
    # Filter data for the state
    state_df = master_df[master_df.StateName == state]
    
    # Create filename - replace spaces with underscores and convert to lowercase
    filename = f"chat_by_state/{state.replace(' ', '_').lower()}.csv"
    
    # Save to CSV
    state_df.compute().to_csv(filename, index=False)

print(f"\nCompleted! All state files have been saved in 'chat_by_state' directory")


In [17]:
import pandas as pd

In [18]:
wb = pd.read_csv('chat_by_state/west_bengal.csv', low_memory=False)

In [None]:
wb.shape

In [None]:
wb.head(50)

In [None]:
wb_agri = wb[wb['Sector']=='AGRICULTURE']

In [None]:
wb_agri.head()

In [20]:
xx = wb_agri[wb_agri['Crop']=='0']

In [None]:
xx.shape

In [None]:
xx.head()