### Garbage collection

In [2]:
# Clear IPython's global namespace (-f skips the interactive confirmation prompt)
%reset -f

# Reimport gc module: the reset above removed every name from the namespace,
# so anything needed from here on has to be imported again
import gc
# Run garbage collection so objects released by the reset can be reclaimed now
gc.collect()

# Clear all cell outputs
# NOTE(review): wait=True is documented to defer the clear until new output
# arrives - confirm that is the intended effect when nothing is displayed after it.
from IPython.display import clear_output
clear_output(wait=True)

### Reading data

In [3]:
import os

# Project root that every relative dataset path in this notebook resolves against.
# Set the KCC_PROJECT_ROOT environment variable to run the notebook on another
# machine; when it is unset or empty the original location is used, so existing
# runs behave exactly as before.
PROJECT_ROOT = (
    os.environ.get('KCC_PROJECT_ROOT')
    or '/home/manimala/Documents/satyakama/paper-farmer-chatbot/'
)
os.chdir(PROJECT_ROOT)

In [4]:
import polars as pl

# Widen polars' table rendering so long free-text cells can be read in full.
# Each setter hands back the Config class itself, which lets the calls chain and
# leaves the same object as the cell's final value.
(
    pl.Config
    .set_tbl_rows(1000)          # or whatever number of rows you want to see
    .set_tbl_cols(-1)            # -1 means no limit: show all columns
    .set_fmt_str_lengths(1000)   # increase maximum displayed string length
)

polars.config.Config

In [37]:
def preprocess_kcc_dataset(file_path: str) -> pl.DataFrame:
    """
    Load the KCC dataset CSV and return a cleaned, null-free DataFrame.

    Steps, in order:
        1. Read the twelve columns used by this pipeline.
        2. Cast every column to string and upper-case it.
        3. Combine Day/Month/Year into a single ``Date`` column (the three
           source columns are dropped).
        4. Drop rows whose BlockName, Crop, QueryType or Sector contains a digit.
        5. Drop rows whose QueryText or KccAns consists only of a number.
        6. Replace null / "0" Season values with "UNSPECIFIED".
        7. Drop rows that still contain a null in any column.

    A row count is printed after each filtering step, followed by a per-column
    null summary of the final frame.

    Null cells are deliberately *not* removed by steps 4-5: a pattern match on a
    null cell is null, and ``filter`` discards null predicates, which would
    silently count missing values as "contains digits". The match is therefore
    coerced to False for nulls so those rows are removed - and reported - by
    step 7 instead. The returned frame is the same either way; only the
    per-step counts differ.

    Args:
        file_path (str): Path to the KCC dataset CSV file

    Returns:
        pl.DataFrame: Cleaned frame with the nine text columns plus ``Date``.

    Note:
        Date parsing uses ``str.strptime`` with its default settings.
        NOTE(review): the default is expected to be strict, i.e. a non-null but
        unparseable Day/Month/Year combination raises - confirm for the
        installed polars version.
    """

    # Read CSV with selected columns
    master_df = pl.read_csv(
        source=file_path,
        columns=[
            'Year', 'Month', 'Day', 'Crop', 'BlockName',
            'DistrictName', 'QueryType', 'Season', 'Sector',
            'StateName', 'QueryText', 'KccAns'
        ],
        has_header=True,
        low_memory=True
    )

    # Convert all column values to uppercase (every column becomes a string here)
    master_df = master_df.with_columns(
        pl.all().cast(pl.Utf8).str.to_uppercase()
    )

    # Create Date column and drop individual date columns.
    # Day/Month are already strings after the cast above, so no re-cast is needed.
    master_df = master_df.with_columns(
        pl.format(
            "{}-{}-{}",
            pl.col("Day").str.zfill(2),
            pl.col("Month").str.zfill(2),
            pl.col("Year"),
        ).str.strptime(pl.Date, format="%d-%m-%Y").alias("Date")
    ).drop(['Day', 'Month', 'Year'])

    print(f"Original rows: {master_df.shape[0]}")

    # (pattern, columns) pairs, applied in this order:
    #   - categorical columns must not contain any digit
    #   - free-text columns must not be a bare number
    digit_pattern = r"\d"
    numeric_pattern = r"^[-]?[0-9]*\.?[0-9]+$"
    cleaning_rules = (
        (digit_pattern, ['BlockName', 'Crop', 'QueryType', 'Sector']),
        (numeric_pattern, ['QueryText', 'KccAns']),
    )
    for pattern, columns in cleaning_rules:
        for col in columns:
            # fill_null(False): a null cell is "not a match", so it survives this
            # filter and is accounted for by drop_nulls() below.
            is_match = pl.col(col).str.contains(pattern).fill_null(False)
            master_df = master_df.filter(~is_match)
            print(f"Rows after cleaning {col}: {master_df.shape[0]}")

    # Replace null and "0" values in Season column with "UNSPECIFIED"
    master_df = master_df.with_columns(
        pl.when(pl.col("Season").is_null() | (pl.col("Season") == "0"))
        .then(pl.lit("UNSPECIFIED"))
        .otherwise(pl.col("Season"))
        .alias("Season")
    )

    # Remove rows with any null values
    initial_count = len(master_df)
    master_df = master_df.drop_nulls()
    print(f"Rows removed due to null values: {initial_count - len(master_df)}")

    # Verify null values statistics (expected to be all zero after drop_nulls)
    null_stats = (
        pl.DataFrame({
            "column": master_df.columns,
            "null_count": [master_df[col].null_count() for col in master_df.columns]
        })
        .with_columns(
            (pl.col("null_count") / len(master_df) * 100).round(2).alias("null_percentage")
        )
        .sort("null_percentage", descending=True)
    )

    print("\nNull value statistics after preprocessing:")
    print(null_stats)

    return master_df

# Usage example: run the preprocessing pipeline on the KCC dataset CSV.
# The path is relative, so it resolves against the working directory set by the
# earlier os.chdir(...) cell.
cleaned_df = preprocess_kcc_dataset('dataset/original_dataset/kcc_dataset.csv')

Original rows: 41987874
Rows after cleaning BlockName: 39237022
Rows after cleaning Crop: 38359698
Rows after cleaning QueryType: 36493030
Rows after cleaning Sector: 36493029
Rows after cleaning QueryText: 36484242
Rows after cleaning KccAns: 32641854
Rows removed due to null values: 0

Null value statistics after preprocessing:
shape: (10, 3)
┌──────────────┬────────────┬─────────────────┐
│ column       ┆ null_count ┆ null_percentage │
│ ---          ┆ ---        ┆ ---             │
│ str          ┆ i64        ┆ f64             │
╞══════════════╪════════════╪═════════════════╡
│ BlockName    ┆ 0          ┆ 0.0             │
│ Crop         ┆ 0          ┆ 0.0             │
│ DistrictName ┆ 0          ┆ 0.0             │
│ QueryType    ┆ 0          ┆ 0.0             │
│ Season       ┆ 0          ┆ 0.0             │
│ Sector       ┆ 0          ┆ 0.0             │
│ StateName    ┆ 0          ┆ 0.0             │
│ QueryText    ┆ 0          ┆ 0.0             │
│ KccAns       ┆ 0          ┆

In [38]:
# Step-by-step version of the KCC cleaning pipeline, run at cell level so the
# intermediate row counts can be inspected.
master_df = pl.read_csv(
    source='dataset/original_dataset/kcc_dataset.csv',
    columns=[
        'Year',
        'Month',
        'Day',
        'Crop',
        'BlockName',
        'DistrictName',
        'QueryType',
        'Season',
        'Sector',
        'StateName',
        'QueryText',
        'KccAns',
    ],
    has_header=True,
    low_memory=True,
)

# Convert all column values to uppercase (every column becomes a string here)
master_df = master_df.with_columns(
    pl.all().cast(pl.Utf8).str.to_uppercase()
)

# Creating a new column for Date.
# Day/Month are already strings after the cast above, so no re-cast is needed.
master_df = master_df.with_columns(
    pl.format(
        "{}-{}-{}",
        pl.col("Day").str.zfill(2),
        pl.col("Month").str.zfill(2),
        pl.col("Year"),
    ).str.strptime(pl.Date, format="%d-%m-%Y").alias("Date")
)
# Drop the 3 redundant columns
master_df = master_df.drop(['Day', 'Month', 'Year'])
print(f"Original rows: {master_df.shape[0]}")

# FILTER LOGIC
# One (column, pattern) entry per filter, applied in this order:
#   - categorical columns are rejected when they contain any digit
#   - free-text columns are rejected when they consist only of a number
contains_digit = r"\d"
only_a_number = r"^[-]?[0-9]*\.?[0-9]+$"
row_filters = [
    ("BlockName", contains_digit),
    ("Crop", contains_digit),
    ("QueryType", contains_digit),
    ("Sector", contains_digit),
    ("QueryText", only_a_number),
    ("KccAns", only_a_number),
]

for column_name, pattern in row_filters:
    # str.contains yields null for null cells and filter() discards null
    # predicates, which would silently count missing values as pattern matches.
    # Treat null as "no match" so those rows are removed - and counted - by
    # drop_nulls() below instead.
    is_match = pl.col(column_name).str.contains(pattern).fill_null(False)
    master_df = master_df.filter(~is_match)
    print(f"Reduced rows: {master_df.shape[0]}")

# Replace both null and "0" with "UNSPECIFIED"
master_df = master_df.with_columns(
    pl.when(pl.col("Season").is_null() | (pl.col("Season") == "0"))
    .then(pl.lit("UNSPECIFIED"))
    .otherwise(pl.col("Season"))
    .alias("Season")
)

# Drop rows with any null values
master_df = master_df.drop_nulls()

# Print the new row count
print(f"Row count after dropping nulls: {len(master_df)}")

# Verify that there are no more nulls
null_stats = (
    pl.DataFrame({
        "column": master_df.columns,
        "null_count": [master_df[col].null_count() for col in master_df.columns]
    })
    .with_columns(
        (pl.col("null_count") / len(master_df) * 100).round(2).alias("null_percentage")
    )
    .sort("null_percentage", descending=True)
)

print("\nNull value statistics after dropping:")
print(null_stats)


Original rows: 41987874
Reduced rows: 39237022
Reduced rows: 38359698
Reduced rows: 36493030
Reduced rows: 36493029
Reduced rows: 36484242
Reduced rows: 32641854
Row count after dropping nulls: 32641854

Null value statistics after dropping:
shape: (10, 3)
┌──────────────┬────────────┬─────────────────┐
│ column       ┆ null_count ┆ null_percentage │
│ ---          ┆ ---        ┆ ---             │
│ str          ┆ i64        ┆ f64             │
╞══════════════╪════════════╪═════════════════╡
│ BlockName    ┆ 0          ┆ 0.0             │
│ Crop         ┆ 0          ┆ 0.0             │
│ DistrictName ┆ 0          ┆ 0.0             │
│ QueryType    ┆ 0          ┆ 0.0             │
│ Season       ┆ 0          ┆ 0.0             │
│ Sector       ┆ 0          ┆ 0.0             │
│ StateName    ┆ 0          ┆ 0.0             │
│ QueryText    ┆ 0          ┆ 0.0             │
│ KccAns       ┆ 0          ┆ 0.0             │
│ Date         ┆ 0          ┆ 0.0             │
└──────────────┴───────

In [31]:
# Preview the first rows of the cleaned frame to sanity-check columns and dtypes
master_df.head()

BlockName,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns,Date
str,str,str,str,str,str,str,str,str,date
"""MOHANPUR""","""COCONUT""","""SAMASTIPUR""","""FERTILIZER USE AND AVAILABILITY""","""KHARIF""","""HORTICULTURE""","""BIHAR""","""FERTILIZER DOSES OF COCONUT""","""FERTILIZER ARE NPK 1:2:2 KGPLANT """,2007-01-05
"""DOLONGGHAT""","""BANANA""","""NAGAON""","""FERTILIZER USE AND AVAILABILITY""","""JAYAD""","""HORTICULTURE""","""ASSAM""","""ASKING ABOUT THE FERTILIZER SCHEDULE FOR BANANA CULTIVATION""","""SUGGESTED TO APPLY UREA242GRAMPLANTSSP206GRAMPLANTMOP551GRAMPLANT AND COMPOST12KGPLANT IN TRENCH METHOD""",2009-09-29
"""DANIYAWAN""","""WHEAT""","""PATNA""","""FERTILIZER USE AND AVAILABILITY""","""KHARIF""","""AGRICULTURE""","""BIHAR""","""ASKING ABOUT FERTILISER DOSE OF WHEAT""","""ASKING ABOUT FERTILISER DOSE OF WHEAT ARE 120KG N60KG P40KG KHECT FOR SOWING STAGE""",2009-12-23
"""AKHORIGOLA""","""CABBAGE""","""ROHTAS""","""CULTURAL PRACTICES""","""KHARIF""","""HORTICULTURE""","""BIHAR""","""EARLY CULTIVAR OF CABBAGE""","""PUSA DRUM HEAD""",2009-02-22
"""HATHUA""","""GLADIOLUS""","""GOPALGANJ""","""CULTURAL PRACTICES""","""RABI""","""HORTICULTURE""","""BIHAR""","""METHOD OF GLADIOLUS CULTIVATION""","""ANSWER GIVEN IN DETAILS """,2009-05-28


In [32]:
# Frequency of each Season value with its share of all rows, most frequent
# first, capped at the top 100 entries.
value_counts = (
    master_df["Season"]
    .value_counts(parallel=True)
    .sort("count", descending=True)
    # Note: value_counts names its tally column "count" not "counts".
    # The percentage is computed before head() so it is relative to all rows.
    .with_columns(
        percentage=pl.col("count") / pl.col("count").sum() * 100
    )
    .head(100)
)

print(value_counts)

shape: (4, 3)
┌─────────────┬──────────┬────────────┐
│ Season      ┆ count    ┆ percentage │
│ ---         ┆ ---      ┆ ---        │
│ str         ┆ u32      ┆ f64        │
╞═════════════╪══════════╪════════════╡
│ UNSPECIFIED ┆ 21306600 ┆ 65.273866  │
│ KHARIF      ┆ 5159857  ┆ 15.807488  │
│ RABI        ┆ 3923597  ┆ 12.020141  │
│ JAYAD       ┆ 2251800  ┆ 6.898505   │
└─────────────┴──────────┴────────────┘


In [25]:
# Inspect rows where the column contains ONLY a number (optional leading minus,
# optional decimal point).
your_column_name = "QueryText"
numeric_only_pattern = r"^[-]?[0-9]*\.?[0-9]+$"

# Filter once and reuse the result for both the examples and the count, instead
# of scanning the full frame a second time.
filtered_df = master_df.filter(
    pl.col(your_column_name).str.contains(numeric_only_pattern)
)

# Print examples of rows with only numeric values
# NOTE(review): the matches in QueryText look like phone numbers - consider
# clearing this cell's output before sharing the notebook.
print("Examples of cells containing only numbers:")
print(filtered_df.select(your_column_name).unique().head(10))

# Print count of such rows
print(f"Number of rows with only numeric values: {filtered_df.shape[0]}")

Examples of cells containing only numbers:
shape: (10, 1)
┌─────────────┐
│ QueryText   │
│ ---         │
│ str         │
╞═════════════╡
│ 8762612075  │
│ 9813125938  │
│ 9858758341  │
│ 9449167750  │
│ 9461555859  │
│ 04872375855 │
│ 8197500150  │
│ 9989623809  │
│ 08023086100 │
│ 9893295091  │
└─────────────┘
Number of rows with only numeric values: 1634
