In [1]:
import polars as pl
pl.Config.set_tbl_rows(1000)  # or whatever number of rows you want to see
import pathlib
import os
import psutil
import gc

In [2]:
def clear_memory(df_to_remove=None):
    """Run garbage collection and print before/after memory statistics.

    Parameters
    ----------
    df_to_remove : None, str, object, or list/tuple/set of these, optional
        What to release before collecting.

        * A ``str`` is treated as the *name* of a variable in this
          function's module namespace (the notebook namespace when the
          function is defined in a notebook cell). The name is unbound so
          the object it referred to can be reclaimed.
        * Any other object (e.g. a DataFrame passed directly) can only have
          this function's own reference dropped; the caller's variable keeps
          the object alive, so a hint is printed instead.

    Returns
    -------
    None
        Everything is reported via ``print``. Any exception raised while
        measuring or cleaning is caught and printed as
        ``Memory cleanup failed: ...`` instead of propagating.
    """
    try:
        # Print initial state
        print("\nInitial memory state:")
        process = psutil.Process(os.getpid())
        initial_memory_mb = process.memory_info().rss / 1024 / 1024
        print(f"Current Memory Usage: {initial_memory_mb:.2f} MB ({initial_memory_mb/1024:.2f} GB)")

        if df_to_remove is not None:
            if isinstance(df_to_remove, (list, tuple, set)):
                targets = list(df_to_remove)
            else:
                targets = [df_to_remove]

            namespace = globals()
            passed_by_object = 0
            for target in targets:
                if isinstance(target, str):
                    # Unbinding the name is what actually lets the object be
                    # collected; `del` on a local parameter cannot do that.
                    if target in namespace:
                        del namespace[target]
                        print(f"Removed variable: {target}")
                    else:
                        print(f"Variable not found: {target}")
                else:
                    passed_by_object += 1

            # Drop this function's own references before collecting.
            target = None
            del targets, df_to_remove

            if passed_by_object:
                print(
                    f"Note: {passed_by_object} object(s) were passed directly and cannot be "
                    "freed from here; pass the variable name as a string or `del` it first."
                )

        # Force garbage collection
        gc.collect()

        # Get new memory info
        new_memory_mb = process.memory_info().rss / 1024 / 1024
        memory_freed = initial_memory_mb - new_memory_mb

        print(f"\nCurrent Memory Usage: {new_memory_mb:.2f} MB ({new_memory_mb/1024:.2f} GB)")
        print(f"Available System Memory: {psutil.virtual_memory().available / 1024 / 1024 / 1024:.2f} GB")
        print(f"Memory Utilization: {psutil.virtual_memory().percent}%")

        if memory_freed > 0:
            print(f"Memory freed: {memory_freed:.2f} MB")

    except Exception as e:
        print(f"Memory cleanup failed: {e}")

In [3]:
def main(csv_path='dataset/original_dataset/kcc_dataset.csv'):
    """Load the KCC dataset CSV into a polars DataFrame, reporting memory use.

    Parameters
    ----------
    csv_path : str, optional
        Location of the CSV file. Defaults to the path this notebook has
        always read, so existing ``main()`` calls behave as before.

    Returns
    -------
    polars.DataFrame or None
        The loaded frame without the 'BlockName' and 'Category' columns, or
        ``None`` if anything raised while loading (the error is printed).
    """
    try:
        # clear_memory() prints its own "Initial memory state:" header, so
        # printing one here as well would show it twice.
        clear_memory()

        print("\nLoading CSV file...")
        # Pin the dtypes of the date parts and text columns explicitly
        # instead of relying on schema inference.
        master_df = pl.read_csv(csv_path,
            schema_overrides={
                'Year': pl.Int32,
                'Month': pl.Int32,
                'Day': pl.Int32,
                'Crop': pl.Utf8,
                'DistrictName': pl.Utf8,
                'QueryType': pl.Utf8,
                'Season': pl.Utf8,
                'Sector': pl.Utf8,
                'StateName': pl.Utf8,
                'QueryText': pl.Utf8,
                'KccAns': pl.Utf8,
                'Category': pl.Utf8,
                'BlockName': pl.Utf8
            },
            low_memory=True
        ).drop(['BlockName', 'Category'])

        print("\nAfter loading CSV:")
        clear_memory()

        return master_df

    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error in data processing: {e}")
        return None



In [4]:
# Run a garbage collection and print the memory figures before any data is loaded.
clear_memory()


Initial memory state:
Current Memory Usage: 75.33 MB (0.07 GB)

Current Memory Usage: 75.33 MB (0.07 GB)
Available System Memory: 11.35 GB
Memory Utilization: 26.6%


In [5]:
# Load the CSV through main(); main() returns None (after printing the error)
# if loading fails, so master_df may be None here.
master_df = main()

Initial memory state:

Initial memory state:
Current Memory Usage: 75.33 MB (0.07 GB)

Current Memory Usage: 75.33 MB (0.07 GB)
Available System Memory: 11.42 GB
Memory Utilization: 26.1%

Loading CSV file...

After loading CSV:

Initial memory state:
Current Memory Usage: 9994.67 MB (9.76 GB)

Current Memory Usage: 10037.29 MB (9.80 GB)
Available System Memory: 4.85 GB
Memory Utilization: 68.7%


In [6]:
# Preview the first few rows of the loaded frame.
master_df.head()

Year,Month,Day,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns
i32,i32,i32,str,str,str,str,str,str,str,str
2006,1,17,"""1275""","""SAGAR""","""99""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control flower drop in …","""spray planofix4mlpump"""
2006,1,17,"""964""","""SAGAR""","""Disease Management""","""RABI""","""ANIMAL HUSBANDRY""","""MADHYA PRADESH""","""how tyo control diseases in bu…",
2006,1,17,"""1279""","""SAGAR""","""76""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control fruit borer in …","""should be spray profenophos 35…"
2006,1,17,"""1064""","""SAGAR""","""3""","""RABI""","""AGRICULTURE""","""MADHYA PRADESH""","""how to control of yellow moisa…","""should be spray metasystox 35m…"
2006,1,17,"""1279""","""DAMOH""","""76""","""RABI""","""HORTICULTURE""","""MADHYA PRADESH""","""how to control white fly in br…","""should be spray metasystox 35m…"


In [8]:
# List the distinct values of the StateName column.
master_df['StateName'].unique()

StateName
str
"""MIZORAM"""
"""UTTAR PRADESH"""
"""KARNATAKA"""
"""CHHATTISGARH"""
"""SIKKIM"""
"""ASSAM"""
"""KERALA"""
"""DELHI"""
"""PUNJAB"""
"""WEST BENGAL"""


In [10]:
# Keep only the rows whose StateName is exactly 'MAHARASHTRA'.
is_maharashtra = pl.col('StateName') == 'MAHARASHTRA'
maharashtra_df = master_df.filter(is_maharashtra)

# Report (rows, columns) as a sanity check on the subset.
print(f"Shape of maharashtra_df: {maharashtra_df.shape}")

Shape of maharashtra_df: (4690479, 11)


In [12]:
# NOTE(review): passing the frame itself only lets clear_memory() delete its own
# local reference; the notebook-level `master_df` binding survives this call, and
# the recorded output below shows identical before/after memory usage.
clear_memory(df_to_remove=master_df)


Initial memory state:
Current Memory Usage: 9838.99 MB (9.61 GB)

Current Memory Usage: 9838.99 MB (9.61 GB)
Available System Memory: 3.08 GB
Memory Utilization: 80.1%


In [14]:
# FILTER CONDITION: from the Maharashtra subset, drop every row whose
# QueryType contains a digit, leaving only text-only query types.
query_type_has_digit = pl.col('QueryType').str.contains(r'\d')
maharashtra_df_stringQT = maharashtra_df.filter(query_type_has_digit.not_())

print(maharashtra_df_stringQT.shape)

(4201301, 11)


In [16]:
# Unbind the intermediate frame here: handing it to clear_memory() would only
# drop that function's local reference and release nothing. The frame is not
# used by any later cell.
del maharashtra_df
clear_memory()


Initial memory state:
Current Memory Usage: 9974.99 MB (9.74 GB)

Current Memory Usage: 9974.99 MB (9.74 GB)
Available System Memory: 2.59 GB
Memory Utilization: 83.3%


In [18]:
# Drop every remaining row whose Crop value contains a digit.
crop_has_digit = pl.col('Crop').str.contains(r'\d')
maharashtra_df_stringCrop = maharashtra_df_stringQT.filter(crop_has_digit.not_())

print(maharashtra_df_stringCrop.shape)

(4175123, 11)


In [20]:
# Unbind the intermediate frame here: handing it to clear_memory() would only
# drop that function's local reference and release nothing. The frame is not
# used by any later cell.
del maharashtra_df_stringQT
clear_memory()


Initial memory state:
Current Memory Usage: 9911.99 MB (9.68 GB)

Current Memory Usage: 9911.99 MB (9.68 GB)
Available System Memory: 2.21 GB
Memory Utilization: 85.7%


In [21]:
# Preview the Maharashtra subset after both digit filters (QueryType, then Crop).
maharashtra_df_stringCrop.head()

Year,Month,Day,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns
i32,i32,i32,str,str,str,str,str,str,str,str
2007,1,1,"""Onion""","""AHMADNAGAR""","""Agriculture Mechanization""",,"""HORTICULTURE""","""MAHARASHTRA""","""blight on onion""","""copper oxycloride25ml10lit of …"
2007,1,13,"""Onion""","""AHMADNAGAR""","""Fertilizer Use and Availabilit…","""KHARIF""","""HORTICULTURE""","""MAHARASHTRA""","""ask fertlizers dose of onion ""","""fertlizers dose of onion 15:15…"
2007,1,13,"""Onion""","""AHMADNAGAR""","""Fertilizer Use and Availabilit…",,"""HORTICULTURE""","""MAHARASHTRA""","""fertilizer dose for onion""","""apply 135kg suphala45kg ujjwal…"
2007,1,14,"""Watermelon""","""KOLHAPUR""","""Fertilizer Use and Availabilit…","""RABI""","""HORTICULTURE""","""MAHARASHTRA""","""ask micronutrent on watermelon…","""spraying of microla 25 ml in 2…"
2007,1,1,"""Onion""","""AURANGABAD""","""Agriculture Mechanization""",,"""HORTICULTURE""","""MAHARASHTRA""","""blight on onion""","""copper oxycloride25ml10lit of …"


In [None]:
# Every distinct QueryType value, as a plain Python list
type_of_query = list(master_df['QueryType'].unique())

# Discard the null entry, if any
valid_queries = list(filter(lambda candidate: candidate is not None, type_of_query))

# Keep the labels that mention either spelling, ignoring case
fertilizer_queries = []
for candidate in valid_queries:
    lowered = str(candidate).lower()
    if 'fertilizer' in lowered or 'fertiliser' in lowered:
        fertilizer_queries.append(candidate)

# List what was found, one label per line
print("Queries related to fertilizer/fertiliser:")
for query in fertilizer_queries:
    print(f"- {query}")

In [None]:
# Subset master_df to the rows whose QueryType is one of the fertilizer labels
is_fertilizer_query = pl.col('QueryType').is_in(fertilizer_queries)
fertilizer_df = master_df.filter(is_fertilizer_query)

# Shape check on the subset
print(f"Shape of fertilizer_df: {fertilizer_df.shape}")

# Show which QueryType values actually survived the filter
unique_fertilizer_types = fertilizer_df['QueryType'].unique()
print("\nUnique QueryTypes in fertilizer_df:")
print(unique_fertilizer_types)

In [None]:
# FILTER CONDITION: keep only the master_df rows whose QueryType contains no digit
master_df_filtered_QueryType = master_df.filter(~pl.col('QueryType').str.contains(r'\d'))
# NOTE(review): this passes the frame object, so clear_memory() can only delete its
# own local reference; the notebook-level `master_df` stays bound after the call.
clear_memory(df_to_remove=master_df)  # intended to release the original large DataFrame

print(master_df_filtered_QueryType.shape)

In [None]:
# FILTER CONDITION: from the QueryType-filtered frame, keep only the rows
# whose Crop value contains no digit.
crop_has_digit = pl.col('Crop').str.contains(r'\d')
master_df_filtered_Crop = master_df_filtered_QueryType.filter(crop_has_digit.not_())

print(master_df_filtered_Crop.shape)

In [None]:
# NOTE(review): passing the frame itself only lets clear_memory() delete its own
# local reference; the notebook-level `master_df` binding survives this call.
clear_memory(df_to_remove=master_df)

In [None]:
# Preview the frame that was filtered on QueryType.
master_df_filtered_QueryType.head()

In [None]:
# Distinct Crop values in the QueryType-filtered frame.
master_df_filtered_QueryType['Crop'].unique()

In [None]:
# Per-QueryType row count and its share of the total (percent, 2 decimals),
# most frequent first.
share_of_total = pl.col('count') / pl.col('count').sum() * 100
result = (
    master_df_filtered_QueryType
    .select('QueryType')
    .group_by('QueryType')
    .agg(pl.count('QueryType').alias('count'))
    .with_columns(share_of_total.round(2).alias('percentage'))
    .sort('count', descending=True)
)

In [2]:
def clear_memory():
    """Collect garbage, print memory figures, then collect again.

    Prints this process's resident memory (MB and GB), the system's available
    memory (GB) and its utilisation percentage. After a second collection it
    also prints how many MB the resident size dropped, if it dropped at all.
    Any exception is caught and printed as "Memory cleanup failed: ...".
    Returns None.
    """
    try:
        # First pass, so the figures below describe the post-collection state
        gc.collect()

        this_process = psutil.Process(os.getpid())
        before_mb = this_process.memory_info().rss / 1024 / 1024
        before_gb = before_mb / 1024
        print(f"\nCurrent Memory Usage: {before_mb:.2f} MB ({before_gb:.2f} GB)")

        available_gb = psutil.virtual_memory().available / 1024 / 1024 / 1024
        print(f"Available System Memory: {available_gb:.2f} GB")

        utilization = psutil.virtual_memory().percent
        print(f"Memory Utilization: {utilization}%")

        # Second pass; compare the resident size against the first reading
        gc.collect()
        after_mb = this_process.memory_info().rss / 1024 / 1024
        reclaimed_mb = before_mb - after_mb

        if reclaimed_mb <= 0:
            return
        print(f"Memory freed by garbage collection: {reclaimed_mb:.2f} MB")

    except Exception as err:
        print(f"Memory cleanup failed: {err}")

In [3]:
def main():
    """Read the KCC dataset CSV and return it as a polars DataFrame.

    Memory figures are printed (via clear_memory) before and after the read.
    The 'BlockName' and 'Category' columns are dropped from the result.
    Returns None, after printing the error, if anything raises.
    """
    try:
        print("Initial memory state:")
        clear_memory()

        print("\nLoading CSV file...")
        # Explicit dtypes: the three date parts as Int32, everything else as text
        column_types = {name: pl.Int32 for name in ('Year', 'Month', 'Day')}
        column_types.update({
            name: pl.Utf8
            for name in (
                'Crop', 'DistrictName', 'QueryType', 'Season', 'Sector',
                'StateName', 'QueryText', 'KccAns', 'Category', 'BlockName',
            )
        })
        unused_columns = ['BlockName', 'Category']

        # Chain the drop onto the read so the full frame is never kept around
        master_df = pl.read_csv(
            'dataset/original_dataset/kcc_dataset.csv',
            schema_overrides=column_types,
            low_memory=True,
        ).drop(unused_columns)

        print("\nAfter loading CSV:")
        clear_memory()
    except Exception as e:
        print(f"Error in data processing: {e}")
        return None
    else:
        return master_df

In [None]:
# Load the CSV through main(); master_df is None if loading failed.
master_df = main()

In [None]:
# (rows, columns) of the loaded frame.
master_df.shape

In [None]:
# FILTER CONDITION: drop every master_df row whose QueryType contains a digit,
# so only text-only query types remain.
query_type_has_digit = pl.col('QueryType').str.contains(r'\d')
master_df_filtered_QueryType = master_df.filter(query_type_has_digit.not_())

print(master_df_filtered_QueryType.shape)

In [None]:
# Verify the filter: count rows per remaining QueryType and express each count
# as a percentage of the total (2 decimals), largest group first.
count_col = pl.col('count')
result = (
    master_df_filtered_QueryType
    .select('QueryType')
    .group_by('QueryType')
    .agg(count=pl.count('QueryType'))
    .with_columns(percentage=(count_col / count_col.sum() * 100).round(2))
    .sort('count', descending=True)
)

In [11]:
import plotly.graph_objects as go

# Top 10 query types by count. These come from the aggregated `result` table
# (QueryType / count / percentage, sorted by count), not from the raw filtered
# rows: the first ten raw rows are arbitrary records and carry no 'percentage'
# column for a chart to use.
top_10 = result.head(10).to_pandas()

In [None]:
# Display the top_10 pandas frame.
top_10

In [None]:
# Preview the frame that was filtered on QueryType.
master_df_filtered_QueryType.head()

In [None]:
# Percentage of master_df rows removed by the QueryType digit filter.
((master_df.shape[0] - master_df_filtered_QueryType.shape[0])/(master_df.shape[0]))*100

In [None]:
# Preview the first few rows of master_df.
master_df.head()

In [6]:
# Count of each text-only QueryType across the whole dataset, with its share
# of the total as a percentage (2 decimals), most frequent first.
query_type_has_digit = pl.col('QueryType').str.contains(r'\d')
share_of_total = pl.col('count') / pl.col('count').sum() * 100

all_India_QueryType = (
    master_df
    .select('QueryType')
    # Exclude QueryType values that contain a digit
    .filter(query_type_has_digit.not_())
    .group_by('QueryType')
    .agg(pl.count('QueryType').alias('count'))
    .with_columns(share_of_total.round(2).alias('percentage'))
    .sort('count', descending=True)
)

In [None]:
import plotly.graph_objects as go

# Ten most frequent query types, as a pandas frame for plotting
top_10 = all_India_QueryType.head(10).to_pandas()

# Everything beyond the first ten rows is folded into one "Others" slice
others_percentage = all_India_QueryType.slice(10).select('percentage').sum().item()

# Slice names and sizes, with the "Others" bucket appended last
labels = [*top_10['QueryType'], 'Others']
values = [*top_10['percentage'], others_percentage]

# Donut chart: a Pie trace with a hole, labels drawn outside, no legend
donut = go.Pie(
    labels=labels,
    values=values,
    hole=0.4,
    textinfo='label+percent',
    textposition='outside',
    showlegend=False,
    direction='clockwise',
    sort=False,
)
fig = go.Figure(data=[donut])

# Centered title and a fixed canvas size
fig.update_layout(
    title=dict(
        text='Distribution of Query Types (Top 10)',
        y=0.95,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    width=1200,
    height=800,
    font=dict(size=14),
)

fig.show()

In [None]:
# Inspect the slice sizes passed to the chart.
values

In [None]:
import plotly.graph_objects as go

# Get top 10 rows
top_10 = all_India_QueryType.head(10).to_pandas()

# Calculate the sum of percentages for remaining rows (Others)
others_percentage = all_India_QueryType.slice(10).select('percentage').sum().item()

# Labels and slice sizes. The 'percentage' column is already on a 0-100 scale
# and Pie normalises its values, so scaling them by 100 changes nothing.
labels = list(top_10['QueryType']) + ['Others']
values = list(top_10['percentage']) + [others_percentage]

# Create the donut chart
fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=values,
    hole=0.4,
    textposition='inside',
    # Inside a texttemplate `percent` is a 0-1 fraction, so it needs the d3
    # percent format; '.1f' followed by a literal '%' would print e.g. "0.1%".
    # (texttemplate overrides textinfo, so textinfo is not set.)
    texttemplate='%{label}<br>%{percent:.1%}',
    showlegend=True,
    direction='clockwise',
    sort=False,
    # Pull only the first (largest) slice out of the ring
    pull=[0.1] + [0] * (len(labels) - 1)
)])

# Update layout
fig.update_layout(
    title={
        'text': 'Distribution of Query Types (Top 10)',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    width=900,
    height=700,
    font=dict(size=12),
    # Horizontal legend centered under the chart
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.1,
        xanchor="center",
        x=0.5
    )
)

fig.show()

In [None]:
# Display the per-QueryType count/percentage table.
all_India_QueryType

In [None]:
#!/usr/bin/env python3

# Import all required libraries
import polars as pl
import pathlib
import os
import psutil
import gc

def clear_memory():
    """Collect garbage, print memory figures, then collect again.

    Prints this process's resident memory (MB and GB), the system's available
    memory (GB) and its utilisation percentage. After a second collection it
    also prints how many MB the resident size dropped, if it dropped at all.
    Any exception is caught and printed as "Memory cleanup failed: ...".
    Returns None.
    """
    try:
        # First pass, so the figures below describe the post-collection state
        gc.collect()

        this_process = psutil.Process(os.getpid())
        before_mb = this_process.memory_info().rss / 1024 / 1024
        before_gb = before_mb / 1024
        print(f"\nCurrent Memory Usage: {before_mb:.2f} MB ({before_gb:.2f} GB)")

        available_gb = psutil.virtual_memory().available / 1024 / 1024 / 1024
        print(f"Available System Memory: {available_gb:.2f} GB")

        utilization = psutil.virtual_memory().percent
        print(f"Memory Utilization: {utilization}%")

        # Second pass; compare the resident size against the first reading
        gc.collect()
        after_mb = this_process.memory_info().rss / 1024 / 1024
        reclaimed_mb = before_mb - after_mb

        if reclaimed_mb <= 0:
            return
        print(f"Memory freed by garbage collection: {reclaimed_mb:.2f} MB")

    except Exception as err:
        print(f"Memory cleanup failed: {err}")

def main():
    """Read the KCC dataset CSV and return it as a polars DataFrame.

    Memory figures are printed (via clear_memory) before and after the read.
    The 'BlockName' and 'Category' columns are dropped from the result.
    Returns None, after printing the error, if anything raises.
    """
    try:
        print("Initial memory state:")
        clear_memory()

        print("\nLoading CSV file...")
        # Explicit dtypes: the three date parts as Int32, everything else as text
        column_types = {name: pl.Int32 for name in ('Year', 'Month', 'Day')}
        column_types.update({
            name: pl.Utf8
            for name in (
                'Crop', 'DistrictName', 'QueryType', 'Season', 'Sector',
                'StateName', 'QueryText', 'KccAns', 'Category', 'BlockName',
            )
        })
        unused_columns = ['BlockName', 'Category']

        # Chain the drop onto the read so the full frame is never kept around
        master_df = pl.read_csv(
            'dataset/original_dataset/kcc_dataset.csv',
            schema_overrides=column_types,
            low_memory=True,
        ).drop(unused_columns)

        print("\nAfter loading CSV:")
        clear_memory()
    except Exception as e:
        print(f"Error in data processing: {e}")
        return None
    else:
        return master_df

# Script-style entry point: run the loader only when this code executes as the
# top-level module, keeping the returned frame (or None on failure) in result_df.
if __name__ == "__main__":
    result_df = main()

In [None]:
# Preview the first few rows of master_df.
master_df.head()

In [None]:
# Number of rows per QueryType value (the stray closing bracket made this
# line a syntax error).
master_df['QueryType'].value_counts()

In [None]:
# NOTE(review): master_df_cropInsurance is not assigned in any cell visible here;
# presumably a crop-insurance subset of master_df — confirm where it is built.
master_df_cropInsurance.head()

In [None]:
import polars as pl
import plotly.express as px
import plotly.graph_objects as go

# NOTE(review): master_df_cropInsurance is not assigned in any cell visible
# here; it must already exist (with Year, Month and StateName columns) before
# this cell runs.

# Build a month-level Date column: the first day of each (Year, Month)
master_df_cropInsurance = master_df_cropInsurance.with_columns([
    pl.date(
        year=pl.col('Year'),
        month=pl.col('Month'),
        day=1
    ).alias('Date')
])

# Number of rows per (month, state), in chronological order
monthly_state_counts = (
    master_df_cropInsurance
    .group_by(['Date', 'StateName'])
    .agg(
        # pl.len() is the row count per group; the argument-less pl.count()
        # it replaces is deprecated.
        pl.len().alias('count')
    )
    .sort('Date')
)

# Convert to pandas for easier plotting with plotly
monthly_state_df = monthly_state_counts.to_pandas()

# One line per state: monthly query volume over time
fig = px.line(
    monthly_state_df,
    x='Date',
    y='count',
    color='StateName',
    title='Crop Insurance Queries by State Over Time',
    labels={
        'Date': 'Month-Year',
        'count': 'Number of Queries',
        'StateName': 'State'
    }
)

# Customize layout
fig.update_layout(
    xaxis_title="Month-Year",
    yaxis_title="Number of Queries",
    legend_title="State",
    hovermode='x unified',
    template='plotly_white',
    # Improve readability
    xaxis=dict(
        tickangle=45,
        tickformat='%b %Y'
    ),
    # Add some margins for better display
    margin=dict(t=50, b=100)
)

# Hover text: "<count> queries" followed by the full month and year
fig.update_traces(
    hovertemplate='<b>%{y}</b> queries<br>%{x|%B %Y}<extra></extra>'
)

# Show the plot
fig.show()

# Optional: Save the plot
# fig.write_html("crop_insurance_queries.html")

In [None]:
# Drop 'Weather' queries and QueryType values that consist only of digits.
# NOTE(review): the name suggests a West Bengal subset, but the filter runs on
# the full master_df — confirm which frame is intended.
wb_no_weather = master_df.filter(
    (pl.col('QueryType') != 'Weather') & 
    (~pl.col('QueryType').str.contains(r'^[0-9]+$'))
)

# Verify the results on the filtered frame (aggregating master_df here would
# ignore the filter above): count per QueryType plus its percentage share,
# largest first.
result = (wb_no_weather
    .select(pl.col('QueryType'))
    .group_by('QueryType')
    .agg(pl.count('QueryType').alias('count'))
    .with_columns([
        (pl.col('count') / pl.col('count').sum() * 100).round(2).alias('percentage')
    ])
    .sort('count', descending=True)
)

In [None]:
# Number of rows per QueryType value.
master_df['QueryType'].value_counts()

In [None]:
# Distinct state names as a plain Python list
state_list = [*master_df['StateName'].unique()]

print(state_list)

In [11]:
# Rows whose StateName is exactly 'WEST BENGAL'
wb = master_df.filter(pl.col('StateName') == 'WEST BENGAL')

In [None]:
# (rows, columns) of the West Bengal subset.
wb.shape

In [None]:
# Preview the first few rows of the West Bengal subset.
wb.head()

In [7]:
import polars as pl
pl.Config.set_tbl_rows(100)  # show up to 100 rows when a table is displayed

# Two conditions on QueryType: not the 'Weather' label, and not made up
# entirely of digits.
# NOTE(review): the result is named wb_no_weather but is filtered from the full
# master_df, not from a West Bengal subset — confirm the intended source frame.
query_type = pl.col('QueryType')
not_weather = query_type != 'Weather'
not_numeric_code = ~query_type.str.contains(r'^[0-9]+$')
wb_no_weather = master_df.filter(not_weather & not_numeric_code)

# Verify the results: rows per QueryType and each one's percentage share
# (2 decimals), largest first
share_of_total = pl.col('count') / pl.col('count').sum() * 100
result = (
    wb_no_weather
    .select('QueryType')
    .group_by('QueryType')
    .agg(pl.count('QueryType').alias('count'))
    .with_columns(share_of_total.round(2).alias('percentage'))
    .sort('count', descending=True)
)

In [None]:
# Display the per-QueryType count/percentage table.
result

In [None]:
import polars as pl
pl.Config.set_tbl_rows(100)  # show up to 100 rows when a table is displayed

# Rows per QueryType in wb_no_weather, with each count's percentage of the
# total (2 decimals), sorted from most to least frequent
count_col = pl.col('count')
result = (
    wb_no_weather
    .select('QueryType')
    .group_by('QueryType')
    .agg(count=pl.count('QueryType'))
    .with_columns(percentage=(count_col / count_col.sum() * 100).round(2))
    .sort('count', descending=True)
)

result  # Display the result

In [None]:
# QueryType labels of the result table as a plain Python list, in table order.
list(result['QueryType'])