### Garbage collection

In [36]:
# Clear IPython's global namespace
%reset -f

# Reimport gc module
import gc
# Run garbage collection
gc.collect()

# Clear all cell outputs
from IPython.display import clear_output
clear_output(wait=True)

### Reading data

In [37]:
import os
os.chdir('/home/manimala/Documents/satyakama/paper-farmer-chatbot/')

In [38]:
import polars as pl 
pl.Config.set_tbl_rows(1000)  # or whatever number of rows you want to see
pl.Config.set_tbl_cols(-1)  # Show all columns (-1 means no limit)
pl.Config.set_fmt_str_lengths(1000)  # Increase maximum string length

polars.config.Config

In [39]:
master_df = pl.read_csv(
    source= 'dataset/original_dataset/kcc_dataset.csv',
    columns= ['Year',
        'Month',
        'Day',
        'Crop',
        'BlockName',
        'DistrictName',
        'QueryType',
        'Season',
        'Sector',
        'StateName',
        'QueryText',
        'KccAns'],
    has_header= True,
    low_memory= True

    
)

# Convert all column values to uppercase
master_df = master_df.with_columns([
    pl.all().cast(pl.Utf8).str.to_uppercase()
])

# FILTER LOGIC 
# Creating a new column for Date
master_df = master_df.with_columns(
    pl.format("{}-{}-{}", 
        pl.col("Day").cast(pl.Utf8).str.zfill(2),
        pl.col("Month").cast(pl.Utf8).str.zfill(2),
        pl.col("Year")
    ).str.strptime(pl.Date, format="%d-%m-%Y").alias("Date")
)
# Drop the 3 redundant columns
master_df = master_df.drop(['Day', 'Month', 'Year'])
original_rows = master_df.shape[0]
print(f"Original rows: {original_rows}")

master_df = master_df[['QueryText', 'KccAns', 'Date']]

# Clean text data by removing extra leading and trailing and in-between whitespaces between words
# master_df = master_df.with_columns([
#     # First standardize all whitespace to single spaces and remove leading/trailing
#     pl.col("QueryText").cast(pl.Utf8)
#         .str.replace(r'\s+', ' ')  # convert multiple spaces to single space
#         .str.replace(r'^\s+', '')  # remove leading spaces
#         .str.replace(r'\s+$', '')  # remove trailing spaces
#         .alias("QueryText"),
        
#     pl.col("KccAns").cast(pl.Utf8)
#         .str.replace(r'\s+', ' ')  # convert multiple spaces to single space
#         .str.replace(r'^\s+', '')  # remove leading spaces
#         .str.replace(r'\s+$', '')  # remove trailing spaces
#         .alias("KccAns")
# ])

master_df = master_df.with_columns([
    pl.col("QueryText").cast(pl.Utf8)
        .str.replace(r':\s+', ':')        # remove spaces after colon
        .str.replace(r'\s+', ' ')         # first convert all multiple spaces to single space
        .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # remove spaces between numbers, first pass
        .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # second pass for remaining number pairs
        .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # third pass for any remaining
        .str.replace(r'^\s+', '')         # remove leading spaces
        .str.replace(r'\s+$', '')         # remove trailing spaces
        .alias("QueryText"),
        
    pl.col("KccAns").cast(pl.Utf8)
        .str.replace(r':\s+', ':')        # remove spaces after colon
        .str.replace(r'\s+', ' ')         # first convert all multiple spaces to single space
        .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # remove spaces between numbers, first pass
        .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # second pass for remaining number pairs
        .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # third pass for any remaining
        .str.replace(r'^\s+', '')         # remove leading spaces
        .str.replace(r'\s+$', '')         # remove trailing spaces
        .alias("KccAns")
])

Original rows: 41987874


In [40]:
master_df.tail(10)

QueryText,KccAns,Date
str,str,date
"""ASKED ABOUT TO FERTILIZER DOSE IN MANGO PLANT""","""4-625200-400500-700 350-700""",2025-01-18
"""FARMER ASKED QUERY ON WEATHER""",""":: 28 C 24 C 24""",2025-01-18
"""ASKED ABOUT HOW LONG AFTER PESTICIDE SPRAYING CAN IT RAIN""",""":: 6-8""",2025-01-18
"""ASKING ABOUT LEAF MINER MANAGEMENT IN GROUNDNUT""",""":: : 12""",2025-01-19
"""ASKED ABOUT LEAF CATERPILLAR MANAGEMENT FOR GROUNDNUT""",""":: : 12""",2025-01-19
"""ASKED ABOUT LEAF SPOT CONTROL IN GROUNDNUT""",""":: 12""",2025-01-19
"""FARMER ASKED QUERY ON WEATHER""",""":: 324 -2726""",2025-01-18
"""ASKED ABOUT BUTTON SHEDDING MANAGEMENT IN COCONUT""",""":: 2001""",2025-01-18
"""FARMER ASKED QUERY ON WEATHER""",""":: 272427""",2025-01-18
"""FARMER ASKED QUERY ON WEATHER""",""":: 28 -2318""",2025-01-18


In [None]:
master_df = master_df.tail(2000)

master_df.shape

In [45]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import IsolationForest
import numpy as np
from tqdm.notebook import tqdm
import polars as pl

def clean_responses(df):
    print("Starting cleaning process...")
    
    # 1. Basic cleaning
    valid_answers = df.filter(
        (pl.col("KccAns").is_not_null() & pl.col("KccAns").str.contains(r'[a-zA-Z]')) &
        (pl.col("QueryText").is_not_null() & pl.col("QueryText").str.contains(r'[a-zA-Z]'))
    )
    print(f"Rows after basic cleaning: {valid_answers.height}")
    
    # 2. Initialize model
    print("Loading SentenceTransformer model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # 3. Get embeddings for both columns with tqdm
    print("Generating embeddings for KccAns...")
    kcc_answers = valid_answers['KccAns'].to_list()
    kcc_embeddings = []
    for text in tqdm(kcc_answers, desc="Encoding KccAns"):
        kcc_embeddings.append(model.encode(text))
    
    print("Generating embeddings for QueryText...")
    query_texts = valid_answers['QueryText'].to_list()
    query_embeddings = []
    for text in tqdm(query_texts, desc="Encoding QueryText"):
        query_embeddings.append(model.encode(text))
    
    # 4. Combine embeddings
    combined_embeddings = np.concatenate([kcc_embeddings, query_embeddings], axis=1)
    
    # 5. Use Isolation Forest to detect outliers
    print("Detecting outliers...")
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    predictions = iso_forest.fit_predict(combined_embeddings)
    
    # 6. Keep only good responses
    cleaned_df = valid_answers.filter(pl.Series(predictions == 1))
    
    return cleaned_df

# Apply the cleaning with progress tracking
print(f"Starting with {master_df.height} rows")
cleaned_df = clean_responses(master_df)
print(f"\nFinal Results:")
print(f"Original rows: {master_df.height}")
print(f"Cleaned rows: {cleaned_df.height}")
print(f"Removed {master_df.height - cleaned_df.height} rows")

# Look at some examples
print("\nExample cleaned responses:")
print("\nKccAns examples:")
print(cleaned_df['KccAns'].head(3))
print("\nQueryText examples:")
print(cleaned_df['QueryText'].head(3))

Starting with 41987874 rows
Starting cleaning process...
Rows after basic cleaning: 28721963
Loading SentenceTransformer model...
Generating embeddings for KccAns...


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [44]:
! pip install ipywidgets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Downloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import IsolationForest
import numpy as np

def clean_responses(df):
    # 1. Basic cleaning
    valid_answers = df.filter(
        pl.col("KccAns").is_not_null() &
        pl.col("KccAns").str.contains(r'[a-zA-Z]')
    )
    
    # 2. Get embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(valid_answers['KccAns'].to_list())
    
    # 3. Use Isolation Forest to detect outliers
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    predictions = iso_forest.fit_predict(embeddings)
    
    # 4. Keep only good responses using filter instead of boolean indexing
    cleaned_df = valid_answers.filter(pl.Series(predictions == 1))
    
    return cleaned_df

# Apply the cleaning
cleaned_df = clean_responses(master_df)
print(f"Original rows: {master_df.height}")
print(f"Cleaned rows: {cleaned_df.height}")

# Look at some examples
print("\nExample cleaned responses:")
print(cleaned_df['KccAns'].head(5))

In [None]:
cleaned_df.tail(200)

In [None]:
[master_df['KccAns'].head(10)]

In [43]:
# Just get top 50 values and their counts
# Basic value counts with percentage
value_counts = (
    master_df.get_column("Season")
    .value_counts(parallel=True)
    .with_columns([
        (pl.col("count") / pl.col("count").sum() * 100).alias("percentage")  # Note: "count" not "counts"
    ])
    .sort("count", descending=True) 
    .head(50)
)

In [None]:
value_counts

In [None]:
filtered_df = master_df.filter(pl.col("Crop") == "Others")

filtered_df['QueryText', 'KccAns'].head(50)

In [None]:
master_df.columns