In [1]:
import gc
gc.collect()

6

In [2]:
import polars as pl # type: ignore
pl.Config.set_tbl_rows(1000)  # or whatever number of rows you want to see
import psutil
import os
from typing import Dict, Optional, Union, List
from pathlib import Path
import numpy as np # type: ignore
import pandas as pd

In [3]:
def clear_memory(df_to_remove: Optional[Union[pl.DataFrame, List[pl.DataFrame]]] = None) -> None:
    """
    Clear memory and print memory usage statistics
    """
    try:
        # Print initial state
        print("\nInitial memory state:")
        process = psutil.Process(os.getpid())
        initial_memory_mb = process.memory_info().rss / 1024 / 1024
        print(f"Current Memory Usage: {initial_memory_mb:.2f} MB ({initial_memory_mb/1024:.2f} GB)")
        
        # Remove specific DataFrame if provided
        if df_to_remove is not None:
            if isinstance(df_to_remove, list):
                for df in df_to_remove:
                    # Get the variable name
                    name = [var_name for var_name, var_val in globals().items() if var_val is df]
                    if name:
                        globals().pop(name[0], None)
                    del df
            else:
                name = [var_name for var_name, var_val in globals().items() if var_val is df_to_remove]
                if name:
                    globals().pop(name[0], None)
                del df_to_remove
                
        # Force garbage collection
        gc.collect()
        
        # Get new memory info
        new_memory_mb = process.memory_info().rss / 1024 / 1024
        memory_freed = initial_memory_mb - new_memory_mb
        
        print(f"\nCurrent Memory Usage: {new_memory_mb:.2f} MB ({new_memory_mb/1024:.2f} GB)")
        print(f"Available System Memory: {psutil.virtual_memory().available / 1024 / 1024 / 1024:.2f} GB")
        print(f"Memory Utilization: {psutil.virtual_memory().percent}%")
        
        if memory_freed > 0:
            print(f"Memory freed: {memory_freed:.2f} MB")
            
    except Exception as e:
        print(f"Memory cleanup failed: {e}")

In [4]:
def load_data(
    file_path: str,
    schema_overrides: Optional[Dict] = None,
    columns_to_drop: Optional[List[str]] = None,
    low_memory: bool = True
) -> Optional[pl.DataFrame]:
    """
    Load and process a CSV file using Polars
    
    Args:
        file_path: Path to the CSV file
        schema_overrides: Dictionary of column names and their data types
        columns_to_drop: List of column names to drop
        low_memory: Whether to use low memory mode
    
    Returns:
        Processed Polars DataFrame or None if error occurs
    """
    try:
        # Ensure file exists
        if not Path(file_path).exists():
            raise FileNotFoundError(f"File not found: {file_path}")
            
        print("Initial memory state:")
        clear_memory()
        
        print("\nLoading CSV file...")
        # Read CSV with provided schema overrides
        df = pl.read_csv(
            file_path,
            schema_overrides=schema_overrides or {},
            low_memory=low_memory
        )
        
        # Drop specified columns if any
        if columns_to_drop:
            df = df.drop(columns_to_drop)
            
        print("\nAfter loading CSV:")
        clear_memory()
        
        return df
        
    except Exception as e:
        print(f"Error in data processing: {e}")
        return None

In [5]:
new_path = Path('/home/manimala/Documents/satyakama/paper-farmer-chatbot/dataset/')
os.chdir(new_path)

In [6]:
# Define schema overrides for your specific CSV
schema = {
    'Year': pl.Int32,
    'Month': pl.Int32,
    'Day': pl.Int32,
    'Crop': pl.Utf8,
    'DistrictName': pl.Utf8,
    'QueryType': pl.Utf8,
    'Season': pl.Utf8,
    'Sector': pl.Utf8,
    'StateName': pl.Utf8,
    'QueryText': pl.Utf8,
    'KccAns': pl.Utf8,
    'Category': pl.Utf8,
    'BlockName': pl.Utf8
}

# Specify columns to drop
columns_to_drop = ['BlockName', 'Category']

# Load your data (replace with your actual file path)
master_df = load_data(
    file_path='original_dataset/kcc_dataset.csv',  
    schema_overrides=schema,
    columns_to_drop=columns_to_drop,
    low_memory=True
)

Initial memory state:

Initial memory state:
Current Memory Usage: 159.51 MB (0.16 GB)

Current Memory Usage: 159.51 MB (0.16 GB)
Available System Memory: 243.03 GB
Memory Utilization: 3.2%

Loading CSV file...

After loading CSV:

Initial memory state:
Current Memory Usage: 11856.12 MB (11.58 GB)

Current Memory Usage: 11856.12 MB (11.58 GB)
Available System Memory: 231.59 GB
Memory Utilization: 7.7%


In [7]:
from typing import List, Dict, Tuple, Optional
from functools import reduce
import operator

def state_df(polars_df: pl.DataFrame, 
             column_name: str, 
             state_name: str,
             columns_to_clean: List[str] = None,
             fill_nulls: Dict[str, str] = None,
             digit_check_columns: List[str] = None,
             remove_digit_only: bool = True) -> Tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame, Dict]:
    """
    Filter DataFrame by state, clean null values, and remove digit-only values.
    
    Args:
        polars_df: Input Polars DataFrame
        column_name: Column name containing state information
        state_name: State name to filter by
        columns_to_clean: List of column names where nulls should be dropped
        fill_nulls: Dictionary of {column_name: fill_value} for replacing nulls
        digit_check_columns: List of columns to check for digit-only values
        remove_digit_only: Whether to remove rows with digit-only values
    
    Returns:
        tuple: (cleaned_df, null_analysis_df, digit_analysis_df, stats_dict)
    """
    try:
        # First filter by state
        df = polars_df.filter(pl.col(column_name) == state_name)
        
        print(f"\nFiltered shape for {state_name}: {df.shape}")
        reduction = ((len(polars_df) - len(df)) / len(polars_df) * 100)
        print(f'Percentage of rows reduced: {reduction:.2f}%')
        
        # Store original shape for comparison
        total_rows = df.shape[0]
        
        # Create null value analysis DataFrame
        null_analysis = (
            pl.DataFrame({
                'Column': df.columns,
                'Total_Values': total_rows,
                'Null_Count': df.null_count().row(0),
            })
            .with_columns([
                (pl.col('Null_Count') / pl.col('Total_Values') * 100)
                .round(2)
                .alias('Null_Percentage')
            ])
            .sort('Null_Percentage', descending=True)
        )
        
        print("\nNull value analysis:")
        print(null_analysis)
        
        # Create a copy of the DataFrame for cleaning
        cleaned_df = df.clone()
        
        # Drop nulls from specified columns
        if columns_to_clean:
            cleaned_df = cleaned_df.drop_nulls(subset=columns_to_clean)
            print(f"\nShape after removing nulls: {cleaned_df.shape}")
            
            null_check = cleaned_df.select(columns_to_clean).null_count()
            print("\nNull counts in specified columns after cleaning:")
            print(null_check)
        
        # Fill nulls in specified columns with provided values
        if fill_nulls:
            for column, fill_value in fill_nulls.items():
                if column in cleaned_df.columns:
                    cleaned_df = cleaned_df.with_columns(
                        pl.col(column).fill_null(value=fill_value)
                    )
                    print(f"\nUnique values in {column} after filling nulls:")
                    print(cleaned_df[column].unique())
                else:
                    print(f"\nWarning: Column '{column}' not found in DataFrame")

        # Analyze and remove digit-only values
        if digit_check_columns:
            initial_shape = cleaned_df.shape
            total_rows = initial_shape[0]
            
            # Analyze digit-only values
            results = []
            for col in digit_check_columns:
                digit_only_count = cleaned_df.filter(
                    pl.col(col).cast(pl.Utf8).str.contains(r'^\d+$')
                ).height
                
                percentage = (digit_only_count / total_rows * 100)
                
                results.append({
                    'Column': col,
                    'Total_Values': total_rows,
                    'Digit_Only_Count': digit_only_count,
                    'Percentage': round(percentage, 2)
                })
            
            digit_analysis = pl.DataFrame(results).sort('Percentage', descending=True)
            
            print("\nPercentage of digit-only values in each column:")
            print(digit_analysis)
            
            if remove_digit_only:
                filter_conditions = [
                    ~pl.col(col).cast(pl.Utf8).str.contains(r'^\d+$')
                    for col in digit_check_columns
                ]
                
                final_condition = reduce(operator.and_, filter_conditions)
                cleaned_df = cleaned_df.filter(final_condition)
                
                final_shape = cleaned_df.shape
                rows_removed = initial_shape[0] - final_shape[0]
                
                stats = {
                    'initial_rows': initial_shape[0],
                    'final_rows': final_shape[0],
                    'rows_removed': rows_removed,
                    'removal_percentage': round((rows_removed / initial_shape[0]) * 100, 2)
                }
                
                print(f"\nRows removed: {rows_removed:,} ({stats['removal_percentage']}%)")
                print(f"Final shape: {final_shape}")
                
                # Verification
                verification = []
                for col in digit_check_columns:
                    digit_only_count = cleaned_df.filter(
                        pl.col(col).cast(pl.Utf8).str.contains(r'^\d+$')
                    ).height
                    verification.append({
                        'Column': col,
                        'Digit_Only_Count': digit_only_count,
                        'Percentage': round((digit_only_count / final_shape[0]) * 100, 2)
                    })
                
                verification_df = pl.DataFrame(verification)
                stats['verification'] = verification_df
            else:
                stats = {'message': 'No digit-only value cleaning performed'}
        else:
            digit_analysis = None
            stats = {'message': 'No digit analysis performed'}
        
        return cleaned_df, null_analysis, digit_analysis, stats
        
    except Exception as e:
        print(f"Error in state_df: {e}")
        return df, None, None, {'error': str(e)}

# Example usage:
columns_to_clean = [
    'KccAns',
    'QueryType',
    'Crop',
    'Sector',
    'QueryText'
]

fill_null_values = {
    'Season': 'Unspecified'
}

digit_check_columns = [
    'QueryType',
    'Crop',
    'Season',
    'Sector',
    'KccAns',
    'QueryText'
]

wb_cleaned, wb_null_analysis, wb_digit_analysis, stats = state_df(
    polars_df=master_df,
    column_name='StateName',
    state_name='WEST BENGAL',
    columns_to_clean=columns_to_clean,
    fill_nulls=fill_null_values,
    digit_check_columns=digit_check_columns,
    remove_digit_only=True
)


Filtered shape for WEST BENGAL: (1209983, 11)
Percentage of rows reduced: 97.12%

Null value analysis:
shape: (11, 4)
┌──────────────┬──────────────┬────────────┬─────────────────┐
│ Column       ┆ Total_Values ┆ Null_Count ┆ Null_Percentage │
│ ---          ┆ ---          ┆ ---        ┆ ---             │
│ str          ┆ i32          ┆ i64        ┆ f64             │
╞══════════════╪══════════════╪════════════╪═════════════════╡
│ Season       ┆ 1209983      ┆ 746004     ┆ 61.65           │
│ KccAns       ┆ 1209983      ┆ 212566     ┆ 17.57           │
│ QueryType    ┆ 1209983      ┆ 15799      ┆ 1.31            │
│ Crop         ┆ 1209983      ┆ 3187       ┆ 0.26            │
│ Sector       ┆ 1209983      ┆ 812        ┆ 0.07            │
│ Year         ┆ 1209983      ┆ 0          ┆ 0.0             │
│ Month        ┆ 1209983      ┆ 0          ┆ 0.0             │
│ Day          ┆ 1209983      ┆ 0          ┆ 0.0             │
│ DistrictName ┆ 1209983      ┆ 0          ┆ 0.0             │

In [8]:
wb_cleaned = wb_cleaned.with_columns([
    pl.concat_str([
        pl.col("QueryText"),
        pl.col("KccAns")
    ], separator=" ").alias("QApairs")
])

In [9]:
wb_cleaned = wb_cleaned.with_columns([
    pl.concat_str([
        pl.col('Year').cast(pl.Utf8),
        pl.col('Month').cast(pl.Utf8).str.pad_start(2, '0'),  # Ensure 2 digits
        pl.col('Day').cast(pl.Utf8).str.pad_start(2, '0')     # Ensure 2 digits
    ], separator='-').alias('date_str')
    .str.strptime(pl.Datetime, format='%Y-%m-%d')
    .alias('timestamp')
])

In [10]:
wb_cleaned.head()

Year,Month,Day,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns,QApairs,timestamp
i32,i32,i32,str,str,str,str,str,str,str,str,str,datetime[μs]
2009,6,5,"""Grape""","""DARJEELING""","""Cultural Practices""","""JAYAD""","""HORTICULTURE""","""WEST BENGAL""","""WANT TO KNOW ABOUT GRAPE CULTI…","""ANSWER GIVEN IN DETAILS""","""WANT TO KNOW ABOUT GRAPE CULTI…",2009-06-05 00:00:00
2009,1,24,"""Guava""","""JALPAIGURI""","""Water Management""","""KHARIF""","""HORTICULTURE""","""WEST BENGAL""","""FRUIT ROT IN GUAVA""","""APPLY STREPTOCYCLINE 1GM10 LI…","""FRUIT ROT IN GUAVA APPLY STREP…",2009-01-24 00:00:00
2009,1,24,"""Guava""","""JALPAIGURI""","""Water Management""","""KHARIF""","""HORTICULTURE""","""WEST BENGAL""","""FRUIT ROT IN GUAVA""","""APPLY STREPTOCYCLINE 1GM10 LI…","""FRUIT ROT IN GUAVA APPLY STREP…",2009-01-24 00:00:00
2009,2,9,"""Banana""","""JALPAIGURI""","""Cultural Practices""","""KHARIF""","""HORTICULTURE""","""WEST BENGAL""","""CULTIVATION OF BANANA""","""ANS GIVEN IN DETAILS""","""CULTIVATION OF BANANA ANS GIVE…",2009-02-09 00:00:00
2009,4,24,"""Guava""","""JALPAIGURI""","""Water Management""","""JAYAD""","""HORTICULTURE""","""WEST BENGAL""","""FRUIT DROPPING IN GUAVA ALONG …","""APPLY BORAX 120 GMPLANT AT RO…","""FRUIT DROPPING IN GUAVA ALONG …",2009-04-24 00:00:00


In [11]:
# Now you can use it with BERTopic
documents = wb_cleaned['QApairs'].to_list()
timestamps = wb_cleaned['timestamp'].to_list()

In [12]:
wb_cleaned.head()

Year,Month,Day,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns,QApairs,timestamp
i32,i32,i32,str,str,str,str,str,str,str,str,str,datetime[μs]
2009,6,5,"""Grape""","""DARJEELING""","""Cultural Practices""","""JAYAD""","""HORTICULTURE""","""WEST BENGAL""","""WANT TO KNOW ABOUT GRAPE CULTI…","""ANSWER GIVEN IN DETAILS""","""WANT TO KNOW ABOUT GRAPE CULTI…",2009-06-05 00:00:00
2009,1,24,"""Guava""","""JALPAIGURI""","""Water Management""","""KHARIF""","""HORTICULTURE""","""WEST BENGAL""","""FRUIT ROT IN GUAVA""","""APPLY STREPTOCYCLINE 1GM10 LI…","""FRUIT ROT IN GUAVA APPLY STREP…",2009-01-24 00:00:00
2009,1,24,"""Guava""","""JALPAIGURI""","""Water Management""","""KHARIF""","""HORTICULTURE""","""WEST BENGAL""","""FRUIT ROT IN GUAVA""","""APPLY STREPTOCYCLINE 1GM10 LI…","""FRUIT ROT IN GUAVA APPLY STREP…",2009-01-24 00:00:00
2009,2,9,"""Banana""","""JALPAIGURI""","""Cultural Practices""","""KHARIF""","""HORTICULTURE""","""WEST BENGAL""","""CULTIVATION OF BANANA""","""ANS GIVEN IN DETAILS""","""CULTIVATION OF BANANA ANS GIVE…",2009-02-09 00:00:00
2009,4,24,"""Guava""","""JALPAIGURI""","""Water Management""","""JAYAD""","""HORTICULTURE""","""WEST BENGAL""","""FRUIT DROPPING IN GUAVA ALONG …","""APPLY BORAX 120 GMPLANT AT RO…","""FRUIT DROPPING IN GUAVA ALONG …",2009-04-24 00:00:00


In [13]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
sentence_model = SentenceTransformer('all-mpnet-base-v2')

In [18]:
limited_df = wb_cleaned.head(1000).with_columns([
    pl.concat_str([
        pl.col('Year').cast(pl.Utf8),
        pl.col('Month').cast(pl.Utf8).str.pad_start(2, '0'),
        pl.col('Day').cast(pl.Utf8).str.pad_start(2, '0')
    ], separator='-')
    .str.strptime(pl.Datetime, format='%Y-%m-%d')
    .alias('timestamp')
])

In [15]:
topic_model = BERTopic(
    embedding_model=sentence_model,
    min_topic_size=20,  # adjust based on your dataset size
    nr_topics='auto',
    n_gram_range=(1, 3),  # consider phrases up to trigrams
    # diversity=0.3  # adjust for more diverse topic representations
)

In [19]:
# Verify the limited dataframe
print(f"Number of rows in limited_df: {limited_df.shape[0]}")
print("\nFirst few timestamps:")
print(limited_df.select('timestamp').head(3))

Number of rows in limited_df: 1000

First few timestamps:
shape: (3, 1)
┌─────────────────────┐
│ timestamp           │
│ ---                 │
│ datetime[μs]        │
╞═════════════════════╡
│ 2009-06-05 00:00:00 │
│ 2009-01-24 00:00:00 │
│ 2009-01-24 00:00:00 │
└─────────────────────┘


In [20]:
# Check the first few documents
print("\nFirst document format:")
print(type(limited_df['QApairs'][0]))
print("\nLength of documents list:", len(documents))


First document format:
<class 'str'>

Length of documents list: 880704


In [24]:
# Let's look at the first few complete examples
print("First 3 QA pairs (full text):")
for i, qa in enumerate(limited_df['QApairs'].head(3)):
    print(f"\n{i+1}. Content:", qa)
    print(f"   Length: {len(str(qa))}")
    print("-" * 80)

# Basic statistics using polars
print("\nBasic length statistics:")
length_stats = limited_df.select([
    (pl.col('QApairs').cast(pl.String).map_elements(lambda x: len(str(x)))).alias('length')
]).describe()
print(length_stats)

# Count very short and very long QAs
qa_lengths = limited_df.with_columns([
    pl.col('QApairs').cast(pl.String).map_elements(lambda x: len(str(x))).alias('qa_length')
])

print("\nLength distribution:")
print(f"QAs shorter than 100 chars: {(qa_lengths['qa_length'] < 100).sum()}")
print(f"QAs longer than 1000 chars: {(qa_lengths['qa_length'] > 1000).sum()}")

First 3 QA pairs (full text):

1. Content: WANT TO KNOW ABOUT GRAPE CULTIVATION ANSWER GIVEN IN DETAILS
   Length: 60
--------------------------------------------------------------------------------

2. Content: FRUIT ROT IN GUAVA APPLY STREPTOCYCLINE  1GM10 LIT OF WATER
   Length: 59
--------------------------------------------------------------------------------

3. Content: FRUIT ROT IN GUAVA APPLY STREPTOCYCLINE  1GM10 LIT OF WATER
   Length: 59
--------------------------------------------------------------------------------

Basic length statistics:
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ length    │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 1000.0    │
│ null_count ┆ 0.0       │
│ mean       ┆ 57.494    │
│ std        ┆ 26.330653 │
│ min        ┆ 23.0      │
│ 25%        ┆ 44.0      │
│ 50%        ┆ 50.0      │
│ 75%        ┆ 63.0      │
│ max        ┆ 446.0     │
└────────────┴───────────┘

Length distribution



In [27]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Using a smaller, faster model since texts are short
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize BERTopic with parameters suited for short texts
topic_model = BERTopic(
    embedding_model=sentence_model,
    min_topic_size=15,  # Smaller size since dataset is small
    n_gram_range=(1, 3),  # Include phrases up to trigrams
    top_n_words=10,     # Show more words per topic
    verbose=True
)

# Simple tqdm usage
documents = list(tqdm(
    limited_df['QApairs'].cast(pl.String).to_list(),
    desc="Processing Documents"
))

# Fit model
topics, probs = topic_model.fit_transform(documents)

# After fitting, examine results
print("\nTop Topics:")
print(topic_model.get_topic_info().head(10))

print("\nWords in Topic 0:")
print(topic_model.get_topic(0))


Processing Documents: 100%|██████████| 1000/1000 [00:00<00:00, 3949438.79it/s]
2025-02-05 10:54:07,755 - BERTopic - Embedding - Transforming documents to embeddings.

[A
[A
Batches: 100%|██████████| 32/32 [00:00<00:00, 101.56it/s]
2025-02-05 10:54:08,079 - BERTopic - Embedding - Completed ✓
2025-02-05 10:54:08,079 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
Exception ignored in: <function tqdm.__del__ at 0x7a215f580e00>
Traceback (most recent call last):
  File "/home/manimala/miniconda3/envs/rag/lib/python3.11/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/home/manimala/miniconda3/envs/rag/lib/python3.11/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x7a215f580e00>
Traceback (most recent call last):
  File "/home/manimala/miniconda3/envs/ra


Top Topics:
   Topic  Count                                               Name  \
0     -1     80  -1_cultivation_cultivation of_of_in details cu...   
1      0     81                         0_soil_soil test_test_dose   
2      1     75  1_report answer given_report answer_weather re...   
3      2     53   2_wheat_of wheat_cultivation of wheat_irrigation   
4      3     53  3_cultural_details cultural_in details cultura...   
5      4     52      4_sunflower_of sunflower_cucumber_of cucumber   
6      5     51        5_weather_information_weather report_report   
7      6     50                    6_gm_coconut apply_coconut_dose   
8      7     45                                 7_to_about_cow_bee   
9      8     35  8_sesamum_of sesamum_cultivation of sesamum_of...   

                                      Representation  \
0  [cultivation, cultivation of, of, in details c...   
1  [soil, soil test, test, dose, fertilizer, fert...   
2  [report answer given, report answer, weather 

In [29]:
print(topic_model.get_topic_info().head(10))

   Topic  Count                                               Name  \
0     -1     80  -1_cultivation_cultivation of_of_in details cu...   
1      0     81                         0_soil_soil test_test_dose   
2      1     75  1_report answer given_report answer_weather re...   
3      2     53   2_wheat_of wheat_cultivation of wheat_irrigation   
4      3     53  3_cultural_details cultural_in details cultura...   
5      4     52      4_sunflower_of sunflower_cucumber_of cucumber   
6      5     51        5_weather_information_weather report_report   
7      6     50                    6_gm_coconut apply_coconut_dose   
8      7     45                                 7_to_about_cow_bee   
9      8     35  8_sesamum_of sesamum_cultivation of sesamum_of...   

                                      Representation  \
0  [cultivation, cultivation of, of, in details c...   
1  [soil, soil test, test, dose, fertilizer, fert...   
2  [report answer given, report answer, weather r...   
3  [w

In [16]:
from tqdm.notebook import tqdm

In [17]:


# Get documents and timestamps with progress bar
documents = tqdm(limited_df['QApairs'].to_list(), 
                desc="Processing Documents",
                leave=True)  # keeps the progress bar after completion
timestamps = limited_df['timestamp'].to_list()

# Fit model
topics, probs = topic_model.fit_transform(documents)

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html