### Garbage collection

In [1]:
# Clear IPython's global namespace
%reset -f

# Reimport gc module
import gc
# Run garbage collection
gc.collect()

# Clear all cell outputs
from IPython.display import clear_output
clear_output(wait=True)

### Reading data

In [2]:
import os
os.chdir('/home/manimala/Documents/satyakama/paper-farmer-chatbot/')

In [3]:
import polars as pl 
pl.Config.set_tbl_rows(1000)  # or whatever number of rows you want to see
pl.Config.set_tbl_cols(-1)  # Show all columns (-1 means no limit)
pl.Config.set_fmt_str_lengths(1000)  # Increase maximum string length

polars.config.Config

In [46]:
master_df = pl.read_csv(
    source= 'dataset/original_dataset/kcc_dataset.csv',
    columns= ['Year',
        'Month',
        'Day',
        'Crop',
        'BlockName',
        'DistrictName',
        'QueryType',
        'Season',
        'Sector',
        'StateName',
        'QueryText',
        'KccAns'],
    has_header= True,
    low_memory= True

    
)

# Convert all column values to uppercase
master_df = master_df.with_columns([
    pl.all().cast(pl.Utf8).str.to_uppercase()
])

# FILTER LOGIC 
# Creating a new column for Date
master_df = master_df.with_columns(
    pl.format("{}-{}-{}", 
        pl.col("Day").cast(pl.Utf8).str.zfill(2),
        pl.col("Month").cast(pl.Utf8).str.zfill(2),
        pl.col("Year")
    ).str.strptime(pl.Date, format="%d-%m-%Y").alias("Date")
)
# Drop the 3 redundant columns
master_df = master_df.drop(['Day', 'Month', 'Year'])
original_rows = master_df.shape[0]
print(f"Original rows: {original_rows}")

# Drop rows where QueryType is 'WEATHER'
master_df = master_df.filter(pl.col("QueryType") != "WEATHER")

# Drop rows where QueryType is numeric
master_df = master_df.filter(~pl.col("QueryType").str.contains(r"^\d+$"))

# Drop rows where BlockName is numeric
master_df = master_df.filter(
    ~(pl.col("BlockName").str.contains(r"^\d+$")) & 
    (pl.col("BlockName") != "0") &
    (pl.col("BlockName") != " 0 ")
)

# Drop rows where Crop is numeric
master_df = master_df.filter(~pl.col("Crop").str.contains(r"^\d+$"))

# Drop rows where QueryText is numeric
master_df = master_df.filter(~pl.col("QueryText").str.contains(r"^\d+$"))

# Drop rows where KccAns is numeric
master_df = master_df.filter(~pl.col("KccAns").str.contains(r"^\d+$"))

# Drop rows where Sector is numeric
master_df = master_df.filter(~pl.col("Sector").str.contains(r"^\d+$"))

# Replace null or '0' values in Season column with 'UNSPECIFIED'
master_df = master_df.with_columns([
    pl.when(
        (pl.col("Season").is_null()) | 
        (pl.col("Season") == "0") | 
        (pl.col("Season") == "")
    ).then(
        pl.lit("UNSPECIFIED")
    ).otherwise(pl.col("Season")).alias("Season")
])

# Print results
print(f"Original rows: {original_rows}")
print(f"Rows after cleaning: {master_df.shape[0]}")
print(f"Rows removed: {original_rows - master_df.shape[0]}")




# Optionally, verify unique QueryTypes that remain
# print("\nUnique QueryTypes after cleaning:")
# print(master_df.select('QueryType').unique().sort('QueryType'))

# master_df = master_df[['QueryText', 'KccAns', 'Date']]

# Clean text data by removing extra leading and trailing and in-between whitespaces between words
# master_df = master_df.with_columns([
#     # First standardize all whitespace to single spaces and remove leading/trailing
#     pl.col("QueryText").cast(pl.Utf8)
#         .str.replace(r'\s+', ' ')  # convert multiple spaces to single space
#         .str.replace(r'^\s+', '')  # remove leading spaces
#         .str.replace(r'\s+$', '')  # remove trailing spaces
#         .alias("QueryText"),
        
#     pl.col("KccAns").cast(pl.Utf8)
#         .str.replace(r'\s+', ' ')  # convert multiple spaces to single space
#         .str.replace(r'^\s+', '')  # remove leading spaces
#         .str.replace(r'\s+$', '')  # remove trailing spaces
#         .alias("KccAns")
# ])

# master_df = master_df.with_columns([
#     pl.col("QueryText").cast(pl.Utf8)
#         .str.replace(r':\s+', ':')        # remove spaces after colon
#         .str.replace(r'\s+', ' ')         # first convert all multiple spaces to single space
#         .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # remove spaces between numbers, first pass
#         .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # second pass for remaining number pairs
#         .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # third pass for any remaining
#         .str.replace(r'^\s+', '')         # remove leading spaces
#         .str.replace(r'\s+$', '')         # remove trailing spaces
#         .alias("QueryText"),
        
#     pl.col("KccAns").cast(pl.Utf8)
#         .str.replace(r':\s+', ':')        # remove spaces after colon
#         .str.replace(r'\s+', ' ')         # first convert all multiple spaces to single space
#         .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # remove spaces between numbers, first pass
#         .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # second pass for remaining number pairs
#         .str.replace(r'(\d+)\s+(\d+)', '$1$2')  # third pass for any remaining
#         .str.replace(r'^\s+', '')         # remove leading spaces
#         .str.replace(r'\s+$', '')         # remove trailing spaces
#         .alias("KccAns")
# ])

Original rows: 41987874
Original rows: 41987874
Rows after cleaning: 20246038
Rows removed: 21741836


In [48]:
master_df.head()

BlockName,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns,Date
str,str,str,str,str,str,str,str,str,date
"""0 ""","""BOVINECOWBUFFALO""","""INDORE""","""DAIRY PRODUCTION""","""RABI""","""ANIMAL HUSBANDRY""","""MADHYA PRADESH""","""HOW TO INCREASE MILK PRODUCTION OF COW""","""YOU CAN GIVE GYLOX POWDER 100 GRAM IN A DAY FOR INCREASE MILK PRODUCTION OF COW""",2006-01-17
"""MOHANPUR""","""COCONUT""","""SAMASTIPUR""","""FERTILIZER USE AND AVAILABILITY""","""KHARIF""","""HORTICULTURE""","""BIHAR""","""FERTILIZER DOSES OF COCONUT""","""FERTILIZER ARE NPK 1:2:2 KGPLANT """,2007-01-05
"""0 ""","""ONION""","""AHMADNAGAR""","""AGRICULTURE MECHANIZATION""","""UNSPECIFIED""","""HORTICULTURE""","""MAHARASHTRA""","""BLIGHT ON ONION""","""COPPER OXYCLORIDE25ML10LIT OF WATER""",2007-01-01
"""0 ""","""ONION""","""AHMADNAGAR""","""FERTILIZER USE AND AVAILABILITY""","""KHARIF""","""HORTICULTURE""","""MAHARASHTRA""","""ASK FERTLIZERS DOSE OF ONION ""","""FERTLIZERS DOSE OF ONION 15:15:15 135 KG UREA 45 KG ACRE""",2007-01-13
"""0 ""","""ONION""","""AHMADNAGAR""","""FERTILIZER USE AND AVAILABILITY""","""UNSPECIFIED""","""HORTICULTURE""","""MAHARASHTRA""","""FERTILIZER DOSE FOR ONION""","""APPLY 135KG SUPHALA45KG UJJWALA PER ACRE""",2007-01-13


In [49]:
# # Function to analyze digit-only values in each column
# def analyze_digit_values(df):
#     results = []
    
#     for col in df.columns:
#         # Skip Date column as it's not a string
#         if col == 'Date':
#             continue
            
#         # Count rows where the column value contains only digits
#         digit_count = df.filter(
#             pl.col(col).str.contains(r"^\d+$")
#         ).height
        
#         # Calculate percentage
#         total_rows = df.height
#         percentage = (digit_count / total_rows) * 100 if total_rows > 0 else 0
        
#         if digit_count > 0:
#             results.append({
#                 'column': col,
#                 'digit_only_count': digit_count,
#                 'total_rows': total_rows,
#                 'percentage': percentage
#             })
    
#     return results

# # Analyze the dataframe
# analysis_results = analyze_digit_values(master_df)

# # Print results
# print("Columns containing digit-only values:")
# print("-" * 80)
# print(f"{'Column':<20} {'Digit-Only Count':<20} {'Total Rows':<15} {'Percentage':>10}")
# print("-" * 80)
# for result in analysis_results:
#     print(f"{result['column']:<20} {result['digit_only_count']:<20} {result['total_rows']:<15} {result['percentage']:>10.2f}%")

# # Optionally, show some examples from columns that have digit-only values
# if analysis_results:
#     print("\nExample rows with digit-only values:")
#     for result in analysis_results:
#         col = result['column']
#         print(f"\nExamples from {col}:")
#         examples = master_df.filter(
#             pl.col(col).str.contains(r"^\d+$")
#         ).select(col).head(5)
#         print(examples)

In [50]:
value_counts = (
    master_df.get_column("BlockName")
    .value_counts(parallel=True)
    .with_columns([
        (pl.col("count") / pl.col("count").sum() * 100).alias("percentage")  # Note: "count" not "counts"
    ])
    .sort("count", descending=True) 
    .head(100)
)

print(value_counts)

shape: (100, 3)
┌──────────────────┬────────┬────────────┐
│ BlockName        ┆ count  ┆ percentage │
│ ---              ┆ ---    ┆ ---        │
│ str              ┆ u32    ┆ f64        │
╞══════════════════╪════════╪════════════╡
│ 0                ┆ 281176 ┆ 1.388795   │
│ HANUMANGARH      ┆ 73966  ┆ 0.365336   │
│ RAJGARH          ┆ 71790  ┆ 0.354588   │
│ DELHI            ┆ 67955  ┆ 0.335646   │
│ NOHAR            ┆ 55065  ┆ 0.271979   │
│ BHADRA           ┆ 51079  ┆ 0.252291   │
│ JAISALMER        ┆ 38858  ┆ 0.191929   │
│ SARDARSHAHAR     ┆ 33382  ┆ 0.164882   │
│ PIRAWA SUNEL     ┆ 33096  ┆ 0.163469   │
│ JHALRAPATAN      ┆ 32886  ┆ 0.162432   │
│ BARMER           ┆ 32772  ┆ 0.161869   │
│ KALYANPUR        ┆ 32310  ┆ 0.159587   │
│ TARANAGAR        ┆ 31777  ┆ 0.156954   │
│ OSIAN            ┆ 31103  ┆ 0.153625   │
│ BATHINDA         ┆ 30142  ┆ 0.148879   │
│ NAGAUR           ┆ 29950  ┆ 0.14793    │
│ HISAR-I          ┆ 29526  ┆ 0.145836   │
│ SAGAR            ┆ 29208  ┆ 0.144265

In [51]:
value_counts[0][0]

BlockName,count,percentage
str,u32,f64
"""0 """,281176,1.388795


In [15]:
master_df.tail(50)

BlockName,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns,Date
str,str,str,str,str,str,str,str,str,date
"""VIKARABAD""","""OTHERS""","""VIKARABAD""","""GOVERNMENT SCHEMES""",,"""AGRICULTURE""","""TELANGANA""","""RAITHU BAROSA SCHEME ""","""CONTACT TO THE MAO """,2025-01-18
"""VIKARABAD""","""MAIZE MAKKA""","""VIKARABAD""","""PLANT PROTECTION""",,"""AGRICULTURE""","""TELANGANA""","""QUERY ON JASSIDS AND CATERPILLAR MANAGEMENT IN MAIZE""","""RECOMMENDED TO SPRAY EMAMECTIN BENZOATE 15 FIPRONIL 35 SCAPEX-50 200 ML 200 LITERS OF WATER ACRE """,2025-01-19
"""VIKARABAD""","""OTHERS""","""VIKARABAD""","""GOVERNMENT SCHEMES""",,"""AGRICULTURE""","""TELANGANA""","""FARMER ASKED QUERY ABOUT PM KISAN SAMMAN NIDHI STATUS""","""REASON OF INELIGIBLITY :UNTRACEABLE BENEFICIARYPAYMENT STATUS :WAITING FOR APPROVAL BY STATE""",2025-01-19
"""BANTWARAM""","""OIL PALM""","""VIKARABAD""","""GOVERNMENT SCHEMES""",,"""HORTICULTURE""","""TELANGANA""","""OIL PALM SCHEME DETAILS ""","""MASATTARVIKARABAD DHSO8977714194 HORTICULTURE OFFICERVIKARABAD 8977714463 """,2025-01-19
"""BANTWARAM""","""OTHERS""","""VIKARABAD""","""POWER ROADS ETC""",,"""AGRICULTURE""","""TELANGANA""","""VIKARABAD DISTRICT GROUND WATER OFFICER CONTACT NUMBER ""","""DISTRICT GROUND WATER OFFICERVIKARABAD7032982021DGWOVKBGMAILCOMOO THE DISTRICT GROUND WATER OFFICER GROUND WATER DEPARTMENT ROOM NOS16 II FLOOR INTEGRATED DISTRICT OFFICE COMPLEX ENNEPALLY PARGI ROAD VIKARABAD DISTRICT VIKARABAD-501101""",2025-01-19
"""SRIRANGARAJAPURAM""","""OTHERS""","""WANAPARTHY""","""GOVERNMENT SCHEMES""",,"""AGRICULTURE""","""TELANGANA""","""PM KISAN NIDHI INFORMATION -""","""YOU HAVE NOT COMPLETED EKYC PLEASE COMPLETE THE EKYC PROCESS""",2025-01-19
"""SRIRANGARAJAPURAM""","""OTHERS""","""WANAPARTHY""","""GOVERNMENT SCHEMES""",,"""AGRICULTURE""","""TELANGANA""","""PM KISAN SAMMAN NIDHI INFORMATION -""","""YOU HAVE NOT COMPLETED EKYC PLEASE COMPLETE THE EKYC PROCESS""",2025-01-19
"""REVALLY""","""MANGO""","""WANAPARTHY""","""PLANT PROTECTION""",,"""HORTICULTURE""","""TELANGANA""","""WHITE FLY CONTROL IN MANGO""","""RECOMMENDED TO SPRAY FIPRONIL REGENT 400 ML 200 LITRES OF WATER ACRE 400 200 """,2025-01-19
"""WARDHANNA PET""","""PADDY DHAN""","""WARANGAL RURAL""","""WEED MANAGEMENT""",,"""AGRICULTURE""","""TELANGANA""","""FARMER ASKED QUERY ON POST EMERGENCE WEED MANAGEMENT IN PADDY""",""" -100 200 USES OF PYRAZOSULFURON-ETHYL 70 WDG: IS A BROAD SPECTRUM HERBICIDE FOR PADDY WHICH EFFECTIVELY CONTROLS MAJOR BROADLEAF WEEDS SEDGES AND GRASSY WEEDS IT IS A PRE AND EARLY POSTEMERGENCE HERBICIDE WHICH GIVES THE LONGER DURATION OF CONTROL BY INHIBITING SHOOT GROWTH AND KILLING THE WEEDS""",2025-01-18
"""SHAYAMPET""","""BANANA""","""WARANGAL RURAL""","""PLANT PROTECTION""",,"""HORTICULTURE""","""TELANGANA""","""FARMER ASKED QUERY ON SUCKING PEST MANAGEMENT IN BANANA""","""RECOMMENDED TO SPRAY IMIDACLOPRID CONFIDOR 60 ML IN 200 LITERS OF WATER 1 ACRE 60 200 ORRECOMMENDED TO SPRAY ACEPHATE ARTHIN 300 GRAMS 200 LITRES OF WATER ACRE 300 200 """,2025-01-18


In [None]:
from transformers import pipeline
import polars as pl
from tqdm import tqdm
import torch

def clean_responses_with_zeroshot(df):
    print("Starting zero-shot classification cleaning process...")
    
    # Initialize the classifier with GPU
    device = 0 if torch.cuda.is_available() else -1
    print(f"Using {'GPU' if device == 0 else 'CPU'} for processing...")
    
    print("Loading BART model...")
    classifier = pipeline("zero-shot-classification", 
                        model="facebook/bart-large-mnli", 
                        device=device,
                        batch_size=128)
    
    # Define labels
    candidate_labels = ['agriculture related information', 'non agriculture related information']
    
    # Lists to store non-agriculture related texts and their confidence scores
    non_agri_answers = []
    
    def process_batch(texts, confidence_threshold=0.5):
        # Filter out None or empty strings
        valid_texts = []
        valid_indices = []
        
        for idx, text in enumerate(texts):
            if text and isinstance(text, str) and text.strip():
                valid_texts.append(text)
                valid_indices.append(idx)
        
        if not valid_texts:
            return [False] * len(texts), [0.0] * len(texts)
        
        # Process valid texts
        results = classifier(valid_texts, candidate_labels, batch_size=len(valid_texts))
        
        # Handle single text case
        if isinstance(results, dict):
            results = [results]
        
        # Initialize results for all texts (including invalid ones)
        batch_results = [False] * len(texts)
        batch_scores = [0.0] * len(texts)
        
        # Update results for valid texts
        for valid_idx, result in zip(valid_indices, results):
            is_agri = (result['labels'][0] == 'agriculture related information' and 
                      result['scores'][0] >= confidence_threshold)
            confidence = result['scores'][0]
            
            if not is_agri:
                non_agri_answers.append({
                    'text': texts[valid_idx],
                    'confidence': result['scores'][1]
                })
            
            batch_results[valid_idx] = is_agri
            batch_scores[valid_idx] = confidence
            
        return batch_results, batch_scores
    
    # Process KccAns column with progress tracking
    print("Classifying texts...")
    
    # Convert to list for processing
    kcc_texts = df['KccAns'].to_list()
    
    # Process KccAns in larger batches
    print("Processing KccAns...")
    kcc_results = []
    kcc_scores = []
    
    # Process in larger batches for GPU efficiency
    batch_size = 256
    for i in tqdm(range(0, len(kcc_texts), batch_size), desc="Classifying KccAns"):
        batch = kcc_texts[i:i + batch_size]
        batch_results, batch_scores = process_batch(batch)
        kcc_results.extend(batch_results)
        kcc_scores.extend(batch_scores)
    
    # Create a filter for rows where KccAns is agriculture related
    valid_rows = kcc_results
    
    # Add confidence scores to the dataframe
    cleaned_df = df.with_columns([
        pl.Series(name="KccAns_AgriConfidence", values=kcc_scores)
    ]).filter(pl.Series(valid_rows))
    
    return cleaned_df, non_agri_answers

# Apply the cleaning
print(f"Starting with {master_df.height} rows")
cleaned_df, non_agri_answers = clean_responses_with_zeroshot(master_df)

print(f"\nFinal Results:")
print(f"Original rows: {master_df.height}")
print(f"Cleaned rows: {cleaned_df.height}")
print(f"Removed {master_df.height - cleaned_df.height} rows")

# Look at some examples of cleaned data
print("\nExample cleaned responses with confidence scores:")
print(cleaned_df.select(['KccAns', 'KccAns_AgriConfidence']).head(3))

# Print some examples of non-agriculture related texts
print("\nExample non-agriculture related answers:")
for item in non_agri_answers[:3]:
    print(f"Text: {item['text']}")
    print(f"Confidence (non-agri): {item['confidence']:.3f}\n")

# Save non-agricultural texts to a file
print("\nSaving non-agricultural texts to file...")
with open('non_agricultural_texts.txt', 'w', encoding='utf-8') as f:
    for item in non_agri_answers:
        f.write(f"Text: {item['text']}\n")
        f.write(f"Confidence (non-agri): {item['confidence']:.3f}\n\n")

# Print statistics about non-agriculture texts
print(f"\nTotal non-agriculture answers found: {len(non_agri_answers)}")

In [None]:
os.getcwd()

In [None]:
from transformers import pipeline
import polars as pl
from tqdm import tqdm

def clean_responses_with_zeroshot(df):
    print("Starting zero-shot classification cleaning process...")
    
    # Initialize the classifier
    print("Loading BART model...")
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    
    # Define labels
    candidate_labels = ['agriculture related information', 'non agriculture related information']
    
    # Lists to store non-agriculture related texts and their confidence scores
    non_agri_queries = []
    non_agri_answers = []
    
    def is_agriculture_related(text, text_type, confidence_threshold=0.5):
        if not text or not isinstance(text, str):
            return False, 0.0
        
        result = classifier(text, candidate_labels)
        is_agri = (result['labels'][0] == 'agriculture related information' and 
                result['scores'][0] >= confidence_threshold)
        confidence = result['scores'][0]
        
        # Store non-agri texts
        if not is_agri:
            if text_type == 'query':
                non_agri_queries.append({
                    'text': text,
                    'confidence': result['scores'][1]
                })
            else:
                non_agri_answers.append({
                    'text': text,
                    'confidence': result['scores'][1]
                })
        
        return is_agri, confidence
    
    # Process both columns with progress tracking
    print("Classifying texts...")
    
    # Convert to lists for processing
    query_texts = df['QueryText'].to_list()
    kcc_texts = df['KccAns'].to_list()
    
    # Process QueryText
    print("Processing QueryText...")
    query_results = []
    query_scores = []
    for text in tqdm(query_texts, desc="Classifying QueryText"):
        is_agri, confidence = is_agriculture_related(text, 'query')
        query_results.append(is_agri)
        query_scores.append(confidence)
    
    # Process KccAns
    print("Processing KccAns...")
    kcc_results = []
    kcc_scores = []
    for text in tqdm(kcc_texts, desc="Classifying KccAns"):
        is_agri, confidence = is_agriculture_related(text, 'answer')
        kcc_results.append(is_agri)
        kcc_scores.append(confidence)
    
    # Create a filter for rows where both columns are agriculture related
    valid_rows = [q and k for q, k in zip(query_results, kcc_results)]
    
    # Add confidence scores to the dataframe
    cleaned_df = df.with_columns([
        pl.Series(name="QueryText_AgriConfidence", values=query_scores),
        pl.Series(name="KccAns_AgriConfidence", values=kcc_scores)
    ]).filter(pl.Series(valid_rows))
    
    return cleaned_df, non_agri_queries, non_agri_answers

# Apply the cleaning
print(f"Starting with {master_df.height} rows")
cleaned_df, non_agri_queries, non_agri_answers = clean_responses_with_zeroshot(master_df)

print(f"\nFinal Results:")
print(f"Original rows: {master_df.height}")
print(f"Cleaned rows: {cleaned_df.height}")
print(f"Removed {master_df.height - cleaned_df.height} rows")

# Look at some examples of cleaned data
print("\nExample cleaned responses with confidence scores:")
print(cleaned_df.select(['QueryText', 'QueryText_AgriConfidence', 
                        'KccAns', 'KccAns_AgriConfidence']).head(3))

# Print some examples of non-agriculture related texts
print("\nExample non-agriculture related queries:")
for item in non_agri_queries[:3]:
    print(f"Text: {item['text']}")
    print(f"Confidence (non-agri): {item['confidence']:.3f}\n")

print("\nExample non-agriculture related answers:")
for item in non_agri_answers[:3]:
    print(f"Text: {item['text']}")
    print(f"Confidence (non-agri): {item['confidence']:.3f}\n")

# Print statistics about non-agriculture texts
print(f"\nTotal non-agriculture queries found: {len(non_agri_queries)}")
print(f"Total non-agriculture answers found: {len(non_agri_answers)}")

In [None]:
cleaned_df.shape

In [None]:
cleaned_df

In [None]:
! pip install ipywidgets

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import IsolationForest
import numpy as np

def clean_responses(df):
    # 1. Basic cleaning
    valid_answers = df.filter(
        pl.col("KccAns").is_not_null() &
        pl.col("KccAns").str.contains(r'[a-zA-Z]')
    )
    
    # 2. Get embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(valid_answers['KccAns'].to_list())
    
    # 3. Use Isolation Forest to detect outliers
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    predictions = iso_forest.fit_predict(embeddings)
    
    # 4. Keep only good responses using filter instead of boolean indexing
    cleaned_df = valid_answers.filter(pl.Series(predictions == 1))
    
    return cleaned_df

# Apply the cleaning
cleaned_df = clean_responses(master_df)
print(f"Original rows: {master_df.height}")
print(f"Cleaned rows: {cleaned_df.height}")

# Look at some examples
print("\nExample cleaned responses:")
print(cleaned_df['KccAns'].head(5))

In [None]:
cleaned_df.tail(200)

In [None]:
[master_df['KccAns'].head(10)]

In [43]:
# Just get top 50 values and their counts
# Basic value counts with percentage
value_counts = (
    master_df.get_column("Season")
    .value_counts(parallel=True)
    .with_columns([
        (pl.col("count") / pl.col("count").sum() * 100).alias("percentage")  # Note: "count" not "counts"
    ])
    .sort("count", descending=True) 
    .head(50)
)

In [None]:
value_counts

In [None]:
filtered_df = master_df.filter(pl.col("Crop") == "Others")

filtered_df['QueryText', 'KccAns'].head(50)

In [None]:
master_df.columns