### Garbage collection

In [1]:
# Clear IPython's global namespace
%reset -f

# Reimport gc module
import gc
# Run garbage collection
gc.collect()

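# Clear this cell's own output (clear_output only affects the current cell; wait=True defers the clear until new output arrives)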
from IPython.display import clear_output
clear_output(wait=True)

### Reading data

In [2]:
import os

# Switch to the project root so that the relative dataset paths used by later
# cells resolve. The location can be overridden with the KCC_PROJECT_ROOT
# environment variable; the fallback is the path this notebook was written with.
PROJECT_ROOT = os.environ.get(
    "KCC_PROJECT_ROOT",
    "/home/manimala/Documents/satyakama/paper-farmer-chatbot/",
)
os.chdir(PROJECT_ROOT)

In [3]:
import polars as pl 
pl.Config.set_tbl_rows(1000)  # Print up to 1000 rows when a DataFrame is displayed
pl.Config.set_tbl_cols(-1)  # Show all columns (-1 means no limit)
pl.Config.set_fmt_str_lengths(1000)  # Show up to 1000 characters of a string cell before truncating

polars.config.Config

In [4]:
def preprocess_kcc_dataset(file_path: str) -> pl.DataFrame:
    """
    Load the KCC CSV and return a cleaned, upper-cased DataFrame.

    Processing stages (row counts are printed after loading, after each
    filtering stage, and at the end):
      1. Read the twelve columns of interest.
      2. Collapse whitespace runs in QueryText / KccAns and trim both ends.
      3. Cast every column to string and upper-case it.
      4. Build a ``Date`` column from Day / Month / Year, then drop those three.
      5. Drop weather, "TEST CALL" and "BLANK CALL" queries, and rows whose
         question or answer contains no alphanumeric character.
      6. Drop rows with a digit in BlockName, Crop, QueryType or Sector.
      7. Drop rows whose question or answer is nothing but a number.
      8. Replace a null or "0" Season with "UNSPECIFIED" and drop every row
         that still holds a null in any column.

    Args:
        file_path (str): Path to the KCC dataset CSV file

    Returns:
        pl.DataFrame: Cleaned and preprocessed DataFrame
    """

    wanted_columns = [
        'Year', 'Month', 'Day', 'Crop', 'BlockName',
        'DistrictName', 'QueryType', 'Season', 'Sector',
        'StateName', 'QueryText', 'KccAns'
    ]
    frame = pl.read_csv(
        source=file_path,
        columns=wanted_columns,
        has_header=True,
        low_memory=True
    )

    print(f"Starting row count: {frame.height}")

    def _tidy_whitespace(name):
        # Collapse any whitespace run to a single space, then trim both ends.
        return pl.col(name).str.replace_all(r"\s+", " ").str.strip_chars().alias(name)

    # Free-text columns are tidied first; afterwards every column becomes an
    # upper-case string so that the comparisons below are case-insensitive.
    frame = (
        frame
        .with_columns([_tidy_whitespace("QueryText"), _tidy_whitespace("KccAns")])
        .with_columns([pl.all().cast(pl.Utf8).str.to_uppercase()])
    )

    # Assemble a zero-padded DD-MM-YYYY string and parse it into a Date.
    day_part = pl.col("Day").cast(pl.Utf8).str.zfill(2)
    month_part = pl.col("Month").cast(pl.Utf8).str.zfill(2)
    date_expr = (
        pl.format("{}-{}-{}", day_part, month_part, pl.col("Year"))
        .str.strptime(pl.Date, format="%d-%m-%Y")
        .alias("Date")
    )
    frame = frame.with_columns([date_expr]).drop(['Day', 'Month', 'Year'])

    query_text = pl.col("QueryText")
    kcc_answer = pl.col("KccAns")

    # Stage: content-based filtering of questions and answers.
    rows_before = frame.height
    keep_row = (
        # Both texts must contain at least one alphanumeric character
        query_text.str.contains(r"[a-zA-Z0-9]")
        & kcc_answer.str.contains(r"[a-zA-Z0-9]")
        # Weather queries and placeholder calls are excluded
        & (pl.col("QueryType") != "WEATHER")
        & (query_text != "TEST CALL")
        & (query_text != "BLANK CALL")
        & ~query_text.str.contains("WEATHER")
        # Whitespace-only entries are excluded
        & ~query_text.str.contains(r"^\s*$")
        & ~kcc_answer.str.contains(r"^\s*$")
    )
    frame = frame.filter(keep_row)
    print(f"Rows removed after initial filtering: {rows_before - frame.height}")

    # Stage: categorical columns must not contain digits. A row survives only
    # when every per-column check is true (a null check also removes the row,
    # exactly as successive filters would).
    rows_before = frame.height
    digit_free = [
        ~pl.col(name).str.contains(r"\d")
        for name in ['BlockName', 'Crop', 'QueryType', 'Sector']
    ]
    frame = frame.filter(pl.all_horizontal(digit_free))
    print(f"Rows removed after digit filtering: {rows_before - frame.height}")

    # Stage: neither text may consist solely of a (possibly signed/decimal) number.
    rows_before = frame.height
    numeric_pattern = r"^[-]?[0-9]*\.?[0-9]+$"
    is_numeric_only = (
        query_text.str.contains(numeric_pattern)
        | kcc_answer.str.contains(numeric_pattern)
    )
    frame = frame.filter(~is_numeric_only)
    print(f"Rows removed after numeric pattern filtering: {rows_before - frame.height}")

    # Stage: normalise a missing Season, then discard rows with any remaining null.
    season = pl.col("Season")
    season_missing = season.is_null() | (season == "0")
    season_expr = (
        pl.when(season_missing)
        .then(pl.lit("UNSPECIFIED"))
        .otherwise(season)
        .alias("Season")
    )
    frame = frame.with_columns([season_expr]).drop_nulls()

    print(f"Final row count: {frame.height}")

    return frame

# Usage example: the relative path is resolved against the working directory set with os.chdir earlier.
filtered_df = preprocess_kcc_dataset('dataset/original_dataset/kcc_dataset.csv')

Starting row count: 41987874
Rows removed after initial filtering: 21539071
Rows removed after digit filtering: 3267964
Rows removed after numeric pattern filtering: 395171
Final row count: 16785668


In [5]:
# Convert the cleaned polars frame to a pandas DataFrame for the pandas-based cells that follow.
pandas_df = filtered_df.to_pandas()

In [6]:
pandas_df.head()

Unnamed: 0,BlockName,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns,Date
0,MOHANPUR,COCONUT,SAMASTIPUR,FERTILIZER USE AND AVAILABILITY,KHARIF,HORTICULTURE,BIHAR,FERTILIZER DOSES OF COCONUT,FERTILIZER ARE NPK 1:2:2 KGPLANT,2007-01-05
1,DOLONGGHAT,BANANA,NAGAON,FERTILIZER USE AND AVAILABILITY,JAYAD,HORTICULTURE,ASSAM,ASKING ABOUT THE FERTILIZER SCHEDULE FOR BANAN...,SUGGESTED TO APPLY UREA242GRAMPLANTSSP206GRAMP...,2009-09-29
2,DANIYAWAN,WHEAT,PATNA,FERTILIZER USE AND AVAILABILITY,KHARIF,AGRICULTURE,BIHAR,ASKING ABOUT FERTILISER DOSE OF WHEAT,ASKING ABOUT FERTILISER DOSE OF WHEAT ARE 120K...,2009-12-23
3,AKHORIGOLA,CABBAGE,ROHTAS,CULTURAL PRACTICES,KHARIF,HORTICULTURE,BIHAR,EARLY CULTIVAR OF CABBAGE,PUSA DRUM HEAD,2009-02-22
4,HATHUA,GLADIOLUS,GOPALGANJ,CULTURAL PRACTICES,RABI,HORTICULTURE,BIHAR,METHOD OF GLADIOLUS CULTIVATION,ANSWER GIVEN IN DETAILS,2009-05-28


In [7]:
pandas_df.dtypes

BlockName               object
Crop                    object
DistrictName            object
QueryType               object
Season                  object
Sector                  object
StateName               object
QueryText               object
KccAns                  object
Date            datetime64[ms]
dtype: object

In [8]:
pandas_df.columns

Index(['BlockName', 'Crop', 'DistrictName', 'QueryType', 'Season', 'Sector',
       'StateName', 'QueryText', 'KccAns', 'Date'],
      dtype='object')

In [11]:
import pandas as pd

In [9]:
# First 10,000 rows of the frame.
# NOTE(review): sample_df is reassigned to pandas_df.head(1000) in a later cell before any use visible in this notebook.
sample_df = pandas_df.head(10000)

In [23]:
# Show the 20 most frequent crop labels together with their row counts
print("Sample of unique crop names in the data:")
print(pandas_df['Crop'].value_counts().head(20))

Sample of unique crop names in the data:
Crop
OTHERS                                  3885411
PADDY DHAN                              1855542
WHEAT                                   1227371
COTTON KAPAS                             829706
CHILLIES                                 468562
ONION                                    428789
BRINJAL                                  393518
TOMATO                                   354080
SUGARCANE NOBLE CANE                     351582
BENGAL GRAM GRAMCHICK PEAKABULICHANA     337553
MUSTARD                                  308006
GROUNDNUT PEA NUTMUNG PHALLI             301248
SOYBEAN BHAT                             299019
GREEN GRAM MOONG BEAN MOONG              280012
POTATO                                   262516
MANGO                                    249316
MAIZE MAKKA                              233462
BHINDIOKRALADYSFINGER                    226596
BLACK GRAM URD BEAN                      190922
BOVINECOWBUFFALO                         1

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Analyze common keywords in questions
def extract_common_patterns(df, column='QueryText', n_words=20):
    """Print the most frequent terms found in one text column.

    The column is tokenised with a ``CountVectorizer`` configured with the
    built-in English stop-word list; per-term counts are summed over all rows
    and the ``n_words`` highest totals are printed, largest first.

    Args:
        df: Frame holding the text column.
        column: Name of the column to analyse.
        n_words: How many of the top terms to print.

    Returns:
        None. The ranking is written to stdout only.
    """
    counter = CountVectorizer(stop_words='english')
    term_matrix = counter.fit_transform(df[column])
    vocabulary = counter.get_feature_names_out()
    # Column sums of the document-term matrix, flattened to a 1-D array.
    totals = term_matrix.sum(axis=0).A1

    # Ascending argsort: keep the tail, then reverse it to get descending order.
    ascending = totals.argsort()
    ranked = ascending[-n_words:][::-1]

    print(f"\nMost common words in {column}:")
    for position in ranked:
        term = vocabulary[position]
        count = totals[position]
        print(f"{term}: {count}")

# Sample analysis: quick look at the leading rows only
sample_df = pandas_df.head(1000)  # first 1,000 rows (head), not a random sample
extract_common_patterns(sample_df)


Most common words in QueryText:
cow: 287
wheat: 283
milk: 253
information: 247
regarding: 239
fertilizer: 205
dose: 181
improve: 179
crop: 124
problem: 101
deficiency: 98
asking: 91
control: 89
zinc: 78
low: 69
farmer: 55
asks: 51
manganese: 46
fertiliser: 45
management: 39


In [None]:
import torch
import networkx as nx
import numpy as np
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from collections import defaultdict

# Initialize model: use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the pretrained tokenizer and encoder by name and move the encoder to the chosen device
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

def get_embedding(text, model, tokenizer):
    """Encode ``text`` and return the hidden state at sequence position 0.

    The text is lower-cased, tokenised (truncated to at most 512 tokens) and
    moved to the module-level ``device`` before being run through ``model``
    with gradient tracking disabled.

    Args:
        text: String to embed.
        model: Callable encoder whose output exposes ``last_hidden_state``.
        tokenizer: Callable that turns the text into the model's keyword inputs.

    Returns:
        A NumPy array holding the position-0 hidden state (for BERT-style
        tokenizers this is the [CLS] token), with size-1 dimensions squeezed
        out and the data moved to the CPU.
    """
    encoded = tokenizer(
        text.lower(),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        first_token = hidden_states[:, 0, :]
        return first_token.squeeze().cpu().numpy()

def build_knowledge_graph(df, sample_size=None):
    """Build a question/answer graph from the rows of ``df``.

    Every row contributes two nodes joined by a ``has_answer`` edge:
    ``Q<index>`` (the question text, its embedding and the row's crop, season,
    state, sector and query-type metadata) and ``A<index>`` (the answer text
    and its embedding). Embeddings come from ``get_embedding`` using the
    module-level ``model`` and ``tokenizer``.

    Args:
        df: Frame with QueryText, KccAns, Crop, Season, StateName, Sector and
            QueryType columns.
        sample_size: When truthy, a random sample of this many rows (fixed
            seed 42) is used instead of the whole frame. Values larger than
            the frame are clamped to the frame length.

    Returns:
        networkx.Graph: The populated graph.

    Note:
        Node names are derived from the frame's index labels, so rows sharing
        an index label would overwrite one another.
    """
    if sample_size:
        # DataFrame.sample raises when asked for more rows than exist, so cap
        # the request at the number of available rows.
        df = df.sample(n=min(sample_size, len(df)), random_state=42)

    G = nx.Graph()

    print("Building knowledge graph...")
    # iterrows() is a generator with no length; passing total= lets tqdm show
    # a percentage and ETA instead of a bare iteration counter.
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Get embeddings
        q_emb = get_embedding(row['QueryText'], model, tokenizer)
        a_emb = get_embedding(row['KccAns'], model, tokenizer)

        # Add nodes
        G.add_node(f"Q{idx}",
                   type="question",
                   text=row['QueryText'],
                   embedding=q_emb,
                   metadata={
                       'crop': row['Crop'],
                       'season': row['Season'],
                       'state': row['StateName'],
                       'sector': row['Sector'],
                       'query_type': row['QueryType']
                   })

        G.add_node(f"A{idx}",
                   type="answer",
                   text=row['KccAns'],
                   embedding=a_emb)

        # Link each question to its own answer
        G.add_edge(f"Q{idx}", f"A{idx}", type="has_answer")

    return G

def format_answer(answer_text):
    """Clean and format agricultural answers.

    Steps:
      1. Keep only the text after the last ``"ANSWER:-"`` marker (removes a
         repeated question), trimmed of surrounding whitespace.
      2. Rewrite upper-case unit abbreviations: LIT / LITER(S) / LITRE(S)
         become ``liters``, GM / GMS become ``grams``, ML becomes ``ml``.
         A unit is only rewritten when it is not glued to another letter, so
         ordinary words that merely contain those letters (QUALITY, SEGMENT,
         LITTER, ...) are left untouched, while digit-attached forms such as
         ``500GM`` are still converted.
      3. For answers mentioning SPRAY, append " in water" and/or " per acre"
         when the answer does not already mention them.

    Args:
        answer_text (str): Raw answer text.

    Returns:
        str: The formatted answer.
    """
    import re  # local import so this definition does not depend on another cell

    # Remove question repetition in answer
    answer = answer_text.split("ANSWER:-")[-1].strip()

    # Format measurements and units. The lookarounds require that the unit is
    # not preceded or followed by a letter; plain substring replacement would
    # corrupt words that contain the abbreviation and would double the plural
    # ("LITERS" -> "literss").
    unit_rewrites = (
        (r"(?<![A-Za-z])(?:LITERS?|LITRES?|LIT)(?![A-Za-z])", "liters"),
        (r"(?<![A-Za-z])GMS?(?![A-Za-z])", "grams"),
        (r"(?<![A-Za-z])ML(?![A-Za-z])", "ml"),
    )
    for pattern, replacement in unit_rewrites:
        answer = re.sub(pattern, replacement, answer)

    # Add missing units or context
    if 'SPRAY' in answer:
        lowered = answer.lower()
        if 'water' not in lowered:
            answer += " in water"
        if 'per acre' not in lowered:
            answer += " per acre"

    return answer

def query_knowledge_base(query_text, G, crop=None, state=None, top_k=3,
                         season=None, direct_threshold=0.7, related_threshold=0.5):
    """Find stored questions that match a free-text query.

    The query is embedded with ``get_embedding`` (module-level ``model`` and
    ``tokenizer``) and compared by cosine similarity with every question node
    in ``G`` that has an answer neighbour.

    * Direct matches: similarity above ``direct_threshold`` and, when ``crop``
      is given, the same crop.
    * Related matches: similarity in ``(related_threshold, direct_threshold]``,
      prioritised as same crop, then same state, then same season, then any
      other question in that similarity band.

    Args:
        query_text (str): The user's question.
        G: Graph produced by ``build_knowledge_graph``.
        crop (str, optional): Crop label to match against node metadata.
        state (str, optional): State label used to prioritise related matches.
        top_k (int): Maximum number of entries returned in each list.
        season (str, optional): Season label used to prioritise related
            matches. When omitted, no season prioritisation takes place.
        direct_threshold (float): Similarity above which a match is direct.
        related_threshold (float): Similarity above which a match is related.

    Returns:
        dict: ``{'direct_matches': [...], 'related_matches': [...]}`` where each
        entry has ``question``, ``answer``, ``similarity`` and ``metadata``.
    """
    query_emb = get_embedding(query_text, model, tokenizer)
    # The query norm is loop-invariant, so compute it once.
    query_norm = np.linalg.norm(query_emb)

    # Separate storage for different types of matches
    direct_matches = []
    same_crop_related = []    # Related questions from same crop
    same_state_related = []   # Related questions from same state
    same_season_related = []  # Related questions from the requested season
    other_related = []        # Remaining related questions (lowest priority)

    for node in G.nodes():
        node_data = G.nodes[node]
        if node_data['type'] != 'question':
            continue

        # Skip questions without an attached answer
        answer_node = next(
            (n for n in G.neighbors(node) if G.nodes[n]['type'] == 'answer'),
            None,
        )
        if answer_node is None:
            continue

        node_metadata = node_data['metadata']
        node_emb = node_data['embedding']

        # Cosine similarity between the query and the stored question
        similarity = np.dot(query_emb, node_emb) / (query_norm * np.linalg.norm(node_emb))

        result = {
            'question': node_data['text'],
            'answer': G.nodes[answer_node]['text'],
            'similarity': similarity,
            'metadata': node_metadata
        }

        # Direct matches (high similarity + crop match if specified)
        if similarity > direct_threshold and (not crop or node_metadata['crop'] == crop):
            direct_matches.append(result)
            continue

        # Related questions categorization. The upper bound is inclusive so a
        # similarity exactly equal to direct_threshold is not lost between the
        # two bands.
        # NOTE(review): a question above direct_threshold whose crop differs
        # from the requested crop is still excluded from both lists, as before.
        if related_threshold < similarity <= direct_threshold:
            if crop and node_metadata['crop'] == crop:
                same_crop_related.append(result)
            elif state and node_metadata['state'] == state:
                same_state_related.append(result)
            elif season and node_metadata.get('season') == season:
                # Compare against the caller's season; comparing the node's
                # metadata with itself would always be true.
                same_season_related.append(result)
            else:
                other_related.append(result)

    by_similarity = lambda match: match['similarity']
    direct_matches.sort(key=by_similarity, reverse=True)

    # Combine related questions with priority, skipping exact duplicates
    related_matches = []
    for bucket in (same_crop_related, same_state_related, same_season_related, other_related):
        bucket.sort(key=by_similarity, reverse=True)
        for match in bucket:
            if len(related_matches) >= top_k:
                break
            if match not in related_matches:
                related_matches.append(match)

    return {
        'direct_matches': direct_matches[:top_k],
        'related_matches': related_matches[:top_k]
    }


# Build graph with initial sample (build_knowledge_graph draws the rows at random with a fixed seed of 42)
sample_size = 1000  # Start with small sample for testing
G = build_knowledge_graph(pandas_df, sample_size=sample_size)

# Test function
def test_query(query, crop=None, state=None):
    """Run ``query`` against the module-level graph ``G`` and print the results.

    Prints the query, any crop/state filter supplied, then the direct matches
    followed by the related questions returned by ``query_knowledge_base``
    (similarity to three decimals, crop, state, question and answer for each).

    Args:
        query (str): Free-text question.
        crop (str, optional): Crop filter forwarded to the lookup.
        state (str, optional): State filter forwarded to the lookup.
    """
    print(f"\nQuery: {query}")
    for label, value in (("Crop", crop), ("State", state)):
        if value:
            print(f"{label} filter: {value}")

    results = query_knowledge_base(query, G, crop=crop, state=state)

    # (heading, per-entry label, key in the results dict)
    sections = (
        ("\nDirect Matches:", "Match", 'direct_matches'),
        ("\nRelated Questions:", "Related", 'related_matches'),
    )
    for heading, entry_label, key in sections:
        print(heading)
        for rank, match in enumerate(results[key], 1):
            meta = match['metadata']
            print(f"\n{entry_label} {rank} (Similarity: {match['similarity']:.3f})")
            print(f"Crop: {meta['crop']}")
            print(f"State: {meta['state']}")
            print(f"Question: {match['question']}")
            print(f"Answer: {match['answer']}")

# Run several paddy-related queries, each restricted to the "PADDY DHAN" crop label
test_queries = [
    "how to control pests in paddy",
    "fertilizer schedule for paddy",
    "weed management in paddy"
]

for query in test_queries:
    test_query(query, crop="PADDY DHAN")

# Test example (repeats the first entry of test_queries, so its results are printed a second time)
test_query("how to control pests in paddy", crop="PADDY DHAN")

Using device: cuda
Building knowledge graph...


1000it [00:06, 164.94it/s]


Query: how to control pests in paddy
Crop filter: PADDY DHAN

Direct Matches:

Match 1 (Similarity: 0.965)
Crop: PADDY DHAN
State: HARYANA
Question: HOW TO CONTROL FUNJAL DISEASE IN PADDY
Answer: HOW TO CONTROL FUNJAL DISEASE IN PADDY SPRAY OF CARVENDAJIM BABASTIN500 grams IN 200 liters WATERACRE per acre

Match 2 (Similarity: 0.954)
Crop: PADDY DHAN
State: HARYANA
Question: HOW TO CONTROL WHITE YELLOW LEAF OF PADDY
Answer: SPRAY 2 -3 grams FARUS SALPHATE IN 1 liters WATER per acre

Match 3 (Similarity: 0.936)
Crop: PADDY DHAN
State: MADHYA PRADESH
Question: HOW TO CONTROL OF FUNGAL ATTACK IN PADDY CROP
Answer: RECOMMENDED FOR MANCOZEB 63 CARBENDAZIM 12 WP 40 GRAM AND STEPTROCYCLIN 2 GRAM AT 15 liters OF WATER

Related Questions:

Related 1 (Similarity: 0.665)
Crop: PADDY DHAN
State: CHHATTISGARH
Question: FARMER WANTS TO KNOW HOW TO CONTROL SUCKING PEST OF CROP
Answer: SPRAY IMEDACLORPID 5-7 ml AT 15 litersTER OF WATER per acre

Query: fertilizer schedule for paddy
Crop filter: PADDY


