In [1]:
# Run a full garbage-collection pass before anything is loaded; the number
# displayed under the cell is the value returned by gc.collect().
import gc
gc.collect()

6

In [2]:
import polars as pl
import psutil
import os
import gc
from typing import Dict, Optional, Union, List
from pathlib import Path

In [3]:
def clear_memory(df_to_remove: Optional[Union[pl.DataFrame, List[pl.DataFrame]]] = None) -> None:
    """
    Release DataFrames, run garbage collection, and print memory usage statistics.

    Args:
        df_to_remove: A DataFrame, or a list of DataFrames, to discard. Every
            global name bound to each frame (matched by identity) is removed
            from this module's globals. When a list is passed it is emptied
            in place so that the list itself no longer keeps the frames alive
            while memory is measured.

    Notes:
        - ``globals()`` is the namespace this function is defined in, so only
          names living in that namespace are removed.
        - References held elsewhere (other containers, or the caller's own
          argument reference when a single frame is passed directly) are not
          released here; in that case the drop in memory may only become
          visible after the call returns.
        - Any failure is reported with a printed message instead of being raised.
    """
    try:
        # Print initial state
        print("\nInitial memory state:")
        process = psutil.Process(os.getpid())
        initial_memory_mb = process.memory_info().rss / 1024 / 1024
        print(f"Current Memory Usage: {initial_memory_mb:.2f} MB ({initial_memory_mb/1024:.2f} GB)")

        # Remove the requested DataFrame(s), if any
        if df_to_remove is not None:
            is_list = isinstance(df_to_remove, list)
            frames = df_to_remove if is_list else [df_to_remove]
            namespace = globals()

            for frame in frames:
                if frame is None:
                    # Matching None by identity would delete unrelated globals
                    continue
                # Snapshot the matching names first: the dict must not change
                # size while it is being iterated. All aliases are removed,
                # not just the first one found, so no global keeps the frame alive.
                bound_names = [
                    var_name for var_name, var_val in namespace.items() if var_val is frame
                ]
                for var_name in bound_names:
                    namespace.pop(var_name, None)

            # Drop every reference this function still holds so the frames can
            # actually be reclaimed before memory is measured again.
            frame = None
            if is_list:
                df_to_remove.clear()
            frames = None
            df_to_remove = None

        # Force garbage collection
        gc.collect()

        # Get new memory info
        new_memory_mb = process.memory_info().rss / 1024 / 1024
        memory_freed = initial_memory_mb - new_memory_mb
        system_memory = psutil.virtual_memory()

        print(f"\nCurrent Memory Usage: {new_memory_mb:.2f} MB ({new_memory_mb/1024:.2f} GB)")
        print(f"Available System Memory: {system_memory.available / 1024 / 1024 / 1024:.2f} GB")
        print(f"Memory Utilization: {system_memory.percent}%")

        if memory_freed > 0:
            print(f"Memory freed: {memory_freed:.2f} MB")

    except Exception as e:
        # Best-effort helper: report the problem but never break the notebook run
        print(f"Memory cleanup failed: {e}")

In [4]:
# Define load_data: read a CSV with Polars and report memory usage before and after
def load_data(
    file_path: str,
    schema_overrides: Optional[Dict] = None,
    columns_to_drop: Optional[List[str]] = None,
    low_memory: bool = True
) -> Optional[pl.DataFrame]:
    """
    Read a CSV file into a Polars DataFrame, reporting memory usage around the load.

    Args:
        file_path: Location of the CSV file on disk
        schema_overrides: Mapping of column name -> dtype passed to the reader
            (an empty mapping is used when nothing is supplied)
        columns_to_drop: Columns removed from the frame after it has been read
        low_memory: Forwarded to the reader's ``low_memory`` option

    Returns:
        The loaded DataFrame, or None when anything goes wrong (a missing
        file included); the error is printed rather than raised.
    """
    try:
        # A missing file is reported through the same error path as any other failure
        csv_path = Path(file_path)
        if not csv_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        print("Initial memory state:")
        clear_memory()

        print("\nLoading CSV file...")
        overrides = schema_overrides if schema_overrides else {}
        frame = pl.read_csv(file_path, schema_overrides=overrides, low_memory=low_memory)

        # Unwanted columns are discarded only after the full read
        if columns_to_drop:
            frame = frame.drop(columns_to_drop)

        print("\nAfter loading CSV:")
        clear_memory()
    except Exception as err:
        print(f"Error in data processing: {err}")
        return None
    else:
        return frame

In [5]:
# Column dtypes for the KCC CSV: the date parts are 32-bit integers,
# every other column is read as text.
schema = {
    **{name: pl.Int32 for name in ('Year', 'Month', 'Day')},
    **{
        name: pl.Utf8
        for name in (
            'Crop', 'DistrictName', 'QueryType', 'Season', 'Sector',
            'StateName', 'QueryText', 'KccAns', 'Category', 'BlockName',
        )
    },
}

# Columns discarded right after the read
columns_to_drop = ['BlockName', 'Category']

# Load the dataset (point file_path at your own copy of the CSV).
# load_data returns None if the file is missing or the read fails.
master_df = load_data(
    'dataset/original_dataset/kcc_dataset.csv',
    low_memory=True,
    columns_to_drop=columns_to_drop,
    schema_overrides=schema,
)



Initial memory state:

Initial memory state:
Current Memory Usage: 82.55 MB (0.08 GB)

Current Memory Usage: 82.55 MB (0.08 GB)
Available System Memory: 245.69 GB
Memory Utilization: 2.1%

Loading CSV file...

After loading CSV:

Initial memory state:
Current Memory Usage: 11778.02 MB (11.50 GB)

Current Memory Usage: 11778.02 MB (11.50 GB)
Available System Memory: 234.15 GB
Memory Utilization: 6.7%


In [6]:
# Preview the first rows of the loaded frame to sanity-check columns and dtypes
master_df.head()

Year,Month,Day,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns
i32,i32,i32,str,str,str,str,str,str,str,str
2006,1,17,"""1275""","""SAGAR""","""99""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control flower drop in …","""spray planofix4mlpump"""
2006,1,17,"""964""","""SAGAR""","""Disease Management""","""RABI""","""ANIMAL HUSBANDRY""","""MADHYA PRADESH""","""how tyo control diseases in bu…",
2006,1,17,"""1279""","""SAGAR""","""76""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control fruit borer in …","""should be spray profenophos 35…"
2006,1,17,"""1064""","""SAGAR""","""3""","""RABI""","""AGRICULTURE""","""MADHYA PRADESH""","""how to control of yellow moisa…","""should be spray metasystox 35m…"
2006,1,17,"""1279""","""DAMOH""","""76""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control white fly in br…","""should be spray metasystox 35m…"


In [7]:
# (rows, columns) of the loaded frame, after the dropped columns were removed
master_df.shape

(41987874, 11)

##### Filtering: Null analysis and cleaning

In [8]:
def analyze_and_clean_nulls(
    df: pl.DataFrame,
    columns_to_clean: List[str],
    fill_nulls: Dict[str, str] = None
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """
    Summarise null values per column, then drop and/or fill nulls as requested.

    Args:
        df: Polars DataFrame to inspect; it is not modified
        columns_to_clean: Columns in which a null causes the whole row to be dropped
        fill_nulls: Optional {column_name: fill_value} mapping; nulls in each
            listed column are replaced by the given value. Columns missing
            from the frame only produce a warning.

    Returns:
        tuple: (cleaned_df, null_analysis_df). The analysis frame has one row
        per input column, sorted by null percentage in descending order.
        If any step raises, the error is printed and (df, None) is returned,
        i.e. the original, uncleaned frame.
    """
    try:
        row_count = df.shape[0]
        print(f"Original shape: {df.shape}")

        # Share of nulls per column, as a percentage rounded to two decimals
        null_pct = (
            (pl.col('Null_Count') / pl.col('Total_Values') * 100)
            .round(2)
            .alias('Null_Percentage')
        )
        per_column = pl.DataFrame({
            'Column': df.columns,
            'Total_Values': row_count,
            'Null_Count': df.null_count().row(0),
        })
        null_analysis = per_column.with_columns([null_pct]).sort(
            'Null_Percentage', descending=True
        )

        print("\nNull value analysis:")
        print(null_analysis)

        # Work on a separate frame object so the caller's frame stays distinct
        cleaned_df = df.clone()

        # Row-wise removal: any null in the listed columns drops the row
        if columns_to_clean:
            cleaned_df = cleaned_df.drop_nulls(subset=columns_to_clean)
            print(f"\nShape after removing nulls: {cleaned_df.shape}")

            # Confirm that the listed columns are now free of nulls
            remaining_nulls = cleaned_df.select(columns_to_clean).null_count()
            print("\nNull counts in specified columns after cleaning:")
            print(remaining_nulls)

        # Column-wise replacement of the nulls that are left
        for column, fill_value in (fill_nulls or {}).items():
            if column not in cleaned_df.columns:
                print(f"\nWarning: Column '{column}' not found in DataFrame")
                continue

            cleaned_df = cleaned_df.with_columns(
                pl.col(column).fill_null(value=fill_value)
            )
            # Show the distinct values so the replacement can be checked by eye
            print(f"\nUnique values in {column} after filling nulls:")
            print(cleaned_df[column].unique())

    except Exception as err:
        print(f"Error in analyze_and_clean_nulls: {err}")
        return df, None

    return cleaned_df, null_analysis


In [9]:
# Define columns where nulls should be dropped
columns_to_clean = [
    'KccAns',
    'QueryType',
    'Crop',
    'Sector',
    'QueryText'
]

# Define columns where nulls should be filled with specific values
fill_null_values = {
    'Season': 'Unspecified'
}

# Run the analysis and cleaning
cleaned_df, null_analysis = analyze_and_clean_nulls(
    df=master_df,
    columns_to_clean=columns_to_clean,
    fill_nulls=fill_null_values
)

Original shape: (41987874, 11)

Null value analysis:
shape: (11, 4)
┌──────────────┬──────────────┬────────────┬─────────────────┐
│ Column       ┆ Total_Values ┆ Null_Count ┆ Null_Percentage │
│ ---          ┆ ---          ┆ ---        ┆ ---             │
│ str          ┆ i32          ┆ i64        ┆ f64             │
╞══════════════╪══════════════╪════════════╪═════════════════╡
│ Season       ┆ 41987874     ┆ 26665089   ┆ 63.51           │
│ KccAns       ┆ 41987874     ┆ 4320412    ┆ 10.29           │
│ QueryType    ┆ 41987874     ┆ 1333503    ┆ 3.18            │
│ Crop         ┆ 41987874     ┆ 172930     ┆ 0.41            │
│ Sector       ┆ 41987874     ┆ 85083      ┆ 0.2             │
│ …            ┆ …            ┆ …          ┆ …               │
│ Year         ┆ 41987874     ┆ 0          ┆ 0.0             │
│ Month        ┆ 41987874     ┆ 0          ┆ 0.0             │
│ Day          ┆ 41987874     ┆ 0          ┆ 0.0             │
│ DistrictName ┆ 41987874     ┆ 0          ┆ 0.0  

In [10]:
# (rows, columns) after null handling; compare with master_df.shape to see how many rows were dropped
cleaned_df.shape

(36267631, 11)

In [11]:
# List the variables of type DataFrame currently defined in the kernel namespace
%who DataFrame

cleaned_df	 master_df	 null_analysis	 


In [12]:
# Discard the raw frame and the null-analysis table now that cleaned_df exists;
# clear_memory removes their global names and prints memory usage before and after.
clear_memory(df_to_remove= [master_df, null_analysis])


Initial memory state:
Current Memory Usage: 15596.65 MB (15.23 GB)

Current Memory Usage: 15596.65 MB (15.23 GB)
Available System Memory: 232.14 GB
Memory Utilization: 7.5%
