In [2]:
import pandas as pd

In [16]:
data= pd.read_csv("north_india_crops_augmented.csv")

In [17]:
data.head(5)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,55.451049,34.759023,68.474255,22.055635,77.501063,6.963747,796.866368,almond
1,65.487804,35.36659,51.400261,16.546457,82.125037,6.499078,719.580897,almond
2,60.03409,33.827064,50.564195,18.878279,63.595961,7.150979,730.626227,almond
3,67.03554,38.268171,54.173506,19.991211,76.113196,7.507708,812.872838,almond
4,67.861981,35.337592,73.73832,24.680909,86.299608,6.659872,718.605649,almond


In [18]:
data.describe()


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
count,30530.0,30530.0,30530.0,30530.0,30530.0,30530.0,30530.0
mean,67.318959,49.229495,53.992459,24.624678,73.534671,6.264561,604.73893
std,37.817408,20.130845,40.141726,5.624915,15.386077,0.784655,527.626778
min,0.0,0.0,0.0,0.008489,0.241239,3.504752,0.0
25%,42.146217,39.278988,24.0,20.891188,65.135813,5.62,107.413725
50%,60.453968,45.753459,45.0,25.46,78.95372,6.34424,579.75
75%,89.894356,60.0,71.171207,28.5,83.621437,6.738652,888.613421
max,194.789434,150.0,500.0,46.732778,99.748924,9.983822,3322.06


In [19]:
duplicate_columns = ['N','P','K','temperature','humidity','ph','rainfall','label']

In [20]:
data.duplicated(subset=duplicate_columns, keep='first').sum()

np.int64(0)

In [None]:
df_cleaned = data.drop_duplicates()

In [None]:
df_cleaned.duplicated(subset=duplicate_columns, keep='first').sum()

np.int64(0)

In [None]:
df_cleaned.describe()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
count,31061.0,31061.0,31061.0,31061.0,31061.0,31061.0,31061.0
mean,65.358135,47.960133,54.15058,25.010339,73.661701,6.205709,633.915258
std,38.677499,20.857843,41.028616,5.830112,16.059557,0.955917,601.272698
min,0.0,0.0,0.0,-9.770358,0.241239,0.005469,0.0
25%,35.536146,36.898631,23.0,21.806596,65.092249,5.554824,107.086209
50%,60.0,44.0,44.0,25.833333,79.35014,6.218554,535.932899
75%,88.182468,60.0,70.894401,28.648182,83.951944,6.704871,965.094721
max,194.670114,150.0,500.0,47.861322,104.98097,13.989251,3322.06


In [None]:
output_file = 'cleaned_data_output.csv'

In [None]:
df_cleaned.to_csv(output_file, index=False)
print(f"Successfully saved DataFrame to {output_file}")

Successfully saved DataFrame to cleaned_data_output.csv


In [1]:
import pandas as pd
import numpy as np



In [2]:
# 1. Load the Data
# ---------------------------------------------------------
try:
    df = pd.read_csv('crop_data2.0.csv')
    print(f"Data Loaded. Original Shape: {df.shape}")
except FileNotFoundError:
    print("Error: 'crop_data2.0.csv' not found. Make sure the file is in the same directory.")
    exit()



Data Loaded. Original Shape: (31061, 8)


In [3]:
# 2. Fix Physical Impossibilities
# ---------------------------------------------------------
# Problem: Humidity cannot be > 100%.
# Fix: Clip values to 100.
df['humidity'] = df['humidity'].clip(upper=100)
print("- Fixed humidity values > 100%")



- Fixed humidity values > 100%


In [4]:
# 3. Normalize Labels (The "Duplicate" Fix)
# ---------------------------------------------------------
# Problem: 'Rice' vs 'rice', 'Cotton' vs 'cotton'.
# Fix: Convert everything to lowercase and strip whitespace.
df['label'] = df['label'].str.lower().str.strip()
print("- Normalized labels to lowercase (merged duplicates like Rice/rice)")



- Normalized labels to lowercase (merged duplicates like Rice/rice)


In [5]:
# 4. Resolve Ambiguity & Synonyms
# ---------------------------------------------------------
# Problem: 'bean' is too generic (overlaps with kidneybeans). 'millet' overlaps with 'jowar'.
# Fix: Rename ambiguous classes to their specific likely biological counterparts based on your data profile.
rename_map = {
    # Ambiguous / Generic -> Specific
    'bean': 'french_bean',    # Renamed: Data shows high rainfall/humidity needs, unlike dry beans.
    'millet': 'pearl_millet', # Renamed: Distinguishes it from Jowar (Sorghum).
    'pepper': 'bell_pepper',  # Renamed: avoids confusion with blackpepper/chili.

    # Formatting / Synonyms
    'brinjal': 'eggplant',       # Standardization
    'sweetpotato': 'sweet_potato', # Formatting
    'maize': 'corn',             # Standardization (if present)
    'pigeonpeas': 'pigeon_peas', # Formatting
    'kidneybeans': 'kidney_beans', # Formatting
    'mothbeans': 'moth_beans',   # Formatting
    'blackgram': 'black_gram',   # Formatting
    'horsegram': 'horse_gram',   # Formatting
}

df['label'] = df['label'].replace(rename_map)
print("- Renamed ambiguous classes (e.g., bean -> french_bean, millet -> pearl_millet)")



- Renamed ambiguous classes (e.g., bean -> french_bean, millet -> pearl_millet)


In [6]:
# 5. Handle Severe Class Imbalance (The "Honest" Fix)
# ---------------------------------------------------------
# Problem: 'bittergourd' has 18 samples, 'rice' has 1300+. Your model will ignore the small classes.
# Fix: Upsample minority classes to a minimum threshold so the model actually sees them.
# WARNING: This is a patch. The real fix is to collect more data.

MIN_SAMPLES = 200 # Minimum samples per crop
balanced_dfs = []

print(f"- Balancing data: Upsampling minority classes to {MIN_SAMPLES} samples...")

for label, group in df.groupby('label'):
    if len(group) < MIN_SAMPLES:
        # Randomly sample with replacement to reach MIN_SAMPLES
        upsampled = group.sample(MIN_SAMPLES, replace=True, random_state=42)
        balanced_dfs.append(upsampled)
    else:
        balanced_dfs.append(group)

df_final = pd.concat(balanced_dfs).reset_index(drop=True)



- Balancing data: Upsampling minority classes to 200 samples...


In [7]:
# 6. Save the Cleaned File
# ---------------------------------------------------------
output_filename = 'crop_data_fixed.csv'
df_final.to_csv(output_filename, index=False)

print(f"\nSUCCESS! Fixed data saved to: {output_filename}")
print(f"Final Shape: {df_final.shape}")
print("\nNew Class Distribution (Top & Bottom):")
print(df_final['label'].value_counts().iloc[[0, 1, -2, -1]])


SUCCESS! Fixed data saved to: crop_data_fixed.csv
Final Shape: (38111, 8)

New Class Distribution (Top & Bottom):
label
rice        1365
corn        1243
yam          200
zucchini     200
Name: count, dtype: int64


## new processing top 80 crops


In [8]:
import pandas as pd

# 1. Load the data
# Replace 'crop_data2.0.csv' with your actual file path if different
try:
    df = pd.read_csv('crop_data_fixed.csv')
except FileNotFoundError:
    print("Error: File not found.")
    exit()



In [9]:
# 2. Fix Physical Errors (Humidity > 100%)
df['humidity'] = df['humidity'].clip(upper=100)

# 3. Normalize Labels (Lowercase and strip spaces)
df['label'] = df['label'].str.lower().str.strip()



In [10]:
# 4. Rename Ambiguous & Duplicate Classes
# This standardizes names (e.g., 'bean' -> 'french_bean') and merges duplicates.
rename_map = {
    'bean': 'french_bean',
    'millet': 'pearl_millet',
    'pepper': 'bell_pepper',
    'brinjal': 'eggplant',
    'sweetpotato': 'sweet_potato',
    'maize': 'corn',
    'pigeonpeas': 'pigeon_peas',
    'kidneybeans': 'kidney_beans',
    'mothbeans': 'moth_beans',
    'blackgram': 'black_gram',
    'horsegram': 'horse_gram',
    'groundnuts': 'peanut',
    'groundnut': 'peanut',
    'potatoes': 'potato',
    'soyabeans': 'soyabean',
    'jowar': 'sorghum',
    'ladyfinger': 'okra'
}
df['label'] = df['label'].replace(rename_map)



In [11]:
# 5. Define the Top 80 Crops for North India (Punjab, UP, Uttarakhand)
north_india_crops = [
    # Cereals & Cash Crops
    'rice', 'wheat', 'corn', 'sugarcane', 'cotton', 'sorghum', 'pearl_millet',
    'barley', 'oat', 'ragi', 'tobacco', 'jute',

    # Pulses
    'chickpea', 'lentil', 'pigeon_peas', 'moong', 'black_gram', 'kidney_beans',
    'soyabean', 'horse_gram', 'moth_beans', 'french_bean', 'peas',

    # Oilseeds
    'rapeseed', 'mustard', 'sunflower', 'peanut', 'sesame', 'linseed',
    'safflower', 'castor',

    # Vegetables
    'potato', 'onion', 'tomato', 'cauliflower', 'cabbage', 'eggplant', 'okra',
    'carrot', 'radish', 'spinach', 'cucumber', 'pumpkin', 'bottlegourd',
    'bittergourd', 'sweet_potato', 'turnip', 'beetroot', 'yam', 'taro',
    'bell_pepper', 'chili', 'zucchini', 'asparagus', 'broccoli', 'celery',
    'lettuce', 'squash',

    # Fruits (Plains)
    'mango', 'guava', 'orange', 'papaya', 'watermelon', 'muskmelon', 'lychee',
    'pomegranate', 'grapes', 'banana', 'strawberry',

    # Fruits (Hills)
    'apple', 'apricot', 'walnut', 'peach', 'plum', 'pear', 'cherry',
    'raspberry', 'blueberry', 'blackberry', 'almond'
]



In [12]:
# 6. Filter the Dataset
df_north = df[df['label'].isin(north_india_crops)].copy()



In [13]:
# 7. Save to New CSV
output_file = 'north_india_crops.csv'
df_north.to_csv(output_file, index=False)

print(f"Success. Filtered data saved to {output_file}")
print(f"Original Count: {len(df)}")
print(f"Filtered Count: {len(df_north)}")
print(f"Unique Crops: {df_north['label'].nunique()}")

Success. Filtered data saved to north_india_crops.csv
Original Count: 38111
Filtered Count: 30853
Unique Crops: 80


## 3rd fix of data ph and temp

In [1]:
import pandas as pd

# 1. Load your current filtered file
df = pd.read_csv('north_india_crops.csv')
original_count = len(df)

# 2. The "Physics" Filter
# Remove rows with chemically/biologically impossible values
# pH: Normal soil is 4.0 - 9.0. We allow 3.5 - 10.0 to be safe.
# Temp: Crops don't grow at -9C. We cutoff at 0C.
clean_df = df[
    (df['ph'] >= 3.5) & (df['ph'] <= 10) &
    (df['temperature'] >= 0) & (df['temperature'] <= 55)
]

# 3. Check what we lost
dropped_count = original_count - len(clean_df)
print(f"Dropped {dropped_count} rows of garbage data (Acid/Freezing).")
print(f"Remaining rows: {len(clean_df)}")

# 4. Save the Final Clean Version
clean_df.to_csv('north_india_crops_final.csv', index=False)
print("Saved clean file to: north_india_crops_final.csv")

Dropped 323 rows of garbage data (Acid/Freezing).
Remaining rows: 30530
Saved clean file to: north_india_crops_final.csv


## 4th fix of duplicates

In [15]:
import pandas as pd
import numpy as np

# 1. Load the file
try:
    df = pd.read_csv('north_india_crops_final.csv')
except FileNotFoundError:
    print("Error: 'north_india_crops_final.csv' not found.")
    exit()

def smart_augmentation(group, target_count=200):
    # Separate numeric columns for noise
    numeric_cols = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']

    # STEP 1: Identify REAL unique data (Ignore the old copies)
    real_data = group.drop_duplicates(subset=numeric_cols)
    n_real = len(real_data)

    # STEP 2: Logic Check
    # If we have enough REAL data (e.g. Rice=1320), just use it. Don't truncate.
    if n_real >= target_count:
        return real_data

    # If we don't (e.g. Bittergourd=18), we need to generate synthetic data
    n_needed = target_count - n_real

    # Sample from the REAL unique rows only
    base_samples = real_data.sample(n_needed, replace=True)

    # Generate Noise (+/- 5% variance)
    noise = np.random.normal(0, 0.05, size=(n_needed, len(numeric_cols)))

    synthetic_data = base_samples.copy()
    for i, col in enumerate(numeric_cols):
        scale = abs(real_data[col].mean()) * 0.05  # 5% of the column's mean
        if scale == 0: scale = 0.01
        synthetic_data[col] = synthetic_data[col] + (noise[:, i] * scale)

    # combine real + synthetic
    return pd.concat([real_data, synthetic_data])

# 2. Apply logic
print("Fixing dataset (Removing duplicates, generating synthetic noise)...")
# 'group_keys=False' prevents index duplication issues
df_fixed = df.groupby('label', group_keys=False).apply(smart_augmentation)

# 3. Enforce Physics Constraints (Post-Noise Safety Check)
df_fixed['humidity'] = df_fixed['humidity'].clip(0, 100)
df_fixed['ph'] = df_fixed['ph'].clip(3.5, 10)
df_fixed['temperature'] = df_fixed['temperature'].clip(0, 55)
df_fixed['rainfall'] = df_fixed['rainfall'].clip(0, None)
df_fixed['N'] = df_fixed['N'].clip(0, None)
df_fixed['P'] = df_fixed['P'].clip(0, None)
df_fixed['K'] = df_fixed['K'].clip(0, None)

# 4. Save and Verify
output_file = 'north_india_crops_augmented.csv'
df_fixed.to_csv(output_file, index=False)

print(f"Success! Saved to: {output_file}")
print(f"Original Shape: {df.shape}")
print(f"Final Shape: {df_fixed.shape}")
print(f"Final Duplicates: {df_fixed.duplicated().sum()} (Should be 0)")

Fixing dataset (Removing duplicates, generating synthetic noise)...


  df_fixed = df.groupby('label', group_keys=False).apply(smart_augmentation)


Success! Saved to: north_india_crops_augmented.csv
Original Shape: (30530, 8)
Final Shape: (30530, 8)
Final Duplicates: 0 (Should be 0)
