In [2]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw crop dataset from a path relative to the current working directory.
raw_data = pd.read_csv('../data/Crop_Data.csv')

In [3]:
raw_data.shape

(214471, 25)

In [4]:
# Work on a copy so that raw_data itself is left as loaded.
df = raw_data.copy()

# Data Cleaning

During EDA, we saw that there exist crops in the dataset for which production is always zero. We need to remove them, as they would not convey any information to the model.

In [5]:
# Method to remove crops with zero production
def remove_zero_production_crops(data):
    '''
    Drop all records of crops whose summed production is zero.

    The index of dropped crop names is printed as a side effect.

    Parameters:
    data - Dataframe which should have 'Crop' and 'Production' columns.

    Returns:
    New dataframe without the records of those crops; its index is reset.
    '''
    # Net production per crop, in ascending order.
    production_by_crop = data.groupby('Crop')['Production'].sum().sort_values()

    # Crop names whose net production is exactly zero.
    zero_crops = production_by_crop[production_by_crop == 0].index

    # True for every record that belongs to a crop we keep.
    keep_row = ~data['Crop'].isin(zero_crops)

    print('Following crops had net production as zero in dataset:')
    print(zero_crops)
    return data.loc[keep_row].reset_index(drop=True)

In [6]:
# Rebind df to the filtered frame; the call also prints the crops it dropped.
df = remove_zero_production_crops(df)

Following crops had net production as zero in dataset:
Index(['Apple', 'Litchi', 'Cauliflower', 'Snak Guard', 'Ribed Guard',
       'Cucumber', 'Carrot', 'Lab-Lab', 'Plums', 'Peas  (vegetable)',
       'Bottle Gourd', 'Peach', 'Bitter Gourd', 'Pear', 'Ber', 'Beet Root',
       'Pump Kin', 'Turnip', 'Water Melon', 'Other Dry Fruit', 'Ash Gourd',
       'Yam', 'other fibres', 'Redish', 'Other Citrus Fruit'],
      dtype='object', name='Crop')


In [7]:
df.shape

(213554, 25)

There exist crops with names like 'Other Kharif pulses', 'other oilseeds', 'other misc. pulses', etc.

These are not specific crop names but categories; hence, the records grouped under them may not share similar features for predicting their production.

Removing these kinds of crop names would help the model learn from the Crop feature without any ambiguity.

In [8]:
# Method to remove records for 'other...' crops
def remove_other_crops(data):
    '''
    Method to remove records for 'other...' crops

    A crop is treated as an 'other...' crop when its name contains the
    substring 'other', compared case-insensitively. The list of matched
    names is printed as a side effect.

    Parameters:
    data - Dataframe which should have 'Crop' column.

    Returns:
    New dataframe without the matched records; its index is reset.
    '''
    # Only string names can be matched. Missing or non-string values in
    # 'Crop' (e.g. NaN / None) are skipped here rather than raising
    # AttributeError on .lower(), so their records are kept.
    other_crops = [
        crop for crop in data['Crop'].unique()
        if isinstance(crop, str) and 'other' in crop.lower()
    ]
    data = data.loc[~data['Crop'].isin(other_crops)].reset_index(drop=True)
    print("Removed records with following crop names:")
    print(other_crops)
    return data

In [9]:
# Rebind df to the filtered frame; the call also prints the crop names it removed.
df = remove_other_crops(df)

Removed records with following crop names:
['Other Kharif pulses', 'other oilseeds', 'other misc. pulses', 'Other  Rabi pulses', 'Other Fresh Fruits', 'Other Vegetables', 'Other Cereals & Millets', 'Cond-spcs other']


In [10]:
df.shape

(206233, 25)

In [11]:
# Export of the cleaned frame is currently disabled; uncomment the line below to write it to disk.
# df.to_csv('../data/Crop_Data__cleaned.csv', index=False)