> In this notebook, the wheat data is undersampled so that the wheat:mustard ratio is approximately 70:30. In addition, all values before the sowing period, and all values more than one fortnight (FN) after harvest, are set to zero.

# Import Modules and Data

In [2]:
from glob import glob
import geopandas as gp
import numpy as np
import pandas as pd
from copy import deepcopy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from IPython.display import display
import random

# Show up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100

import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas' copy/assignment warnings -- confirm that is intended.
warnings.filterwarnings("ignore")

import os

# Every data path in this notebook is relative to the project root. Set the
# MNCFC_PROJECT_DIR environment variable to run on another machine; when it is
# not set, the original location is used.
PROJECT_DIR = os.environ.get('MNCFC_PROJECT_DIR',
                             'C:\\Users\\user\\Krishna\\mncfc_crop_classification')
os.chdir(PROJECT_DIR)

In [3]:
# Importing all the required files.
# Paths are relative to the working directory and use forward slashes
# throughout: they work on Windows as well as POSIX systems, whereas
# backslash-separated paths only resolve on Windows.
wheat_train_val = pd.read_csv('data_files/data_share/preprocessed_wheat.csv')
mustard_train_val = pd.read_csv('data_files/data_share/preprocessed_mustard.csv')

wheat_test = pd.read_csv('data_files/data_share/preprocessed_new_wheat.csv')
mustard_test = pd.read_csv('data_files/data_share/preprocessed_new_mustard.csv')

# Dropping the unnecessary features as of now (location and year metadata).
# NOTE(review): the test tables are not touched here -- confirm they do not
# carry these columns.
UNUSED_COLUMNS = ['latitude', 'longitude', 'state_name', 'district',
                  'taluka_name', 'sowing_year', 'harvest_year']
wheat_train_val = wheat_train_val.drop(columns=UNUSED_COLUMNS)
mustard_train_val = mustard_train_val.drop(columns=UNUSED_COLUMNS)

In [15]:
# Names of the fortnight (FN) value columns: every column except the crop
# label, the sowing/harvest period labels and the derived diff columns.
# Selecting by name instead of slicing off the last three columns keeps this
# cell correct when it is re-run after `sowing_diff` / `harvest_diff` have been
# added to the frame.
NON_FN_COLUMNS = {'crop_name', 'sowing_period', 'harvest_period',
                  'sowing_diff', 'harvest_diff'}
fns = [col for col in wheat_train_val.columns if col not in NON_FN_COLUMNS]

def sowing_diff(row):
    """Return the change in value from the sowing fortnight to two columns later.

    Parameters
    ----------
    row : pd.Series
        One record. Its index must contain the fortnight columns (assumed to be
        laid out consecutively in time order -- confirm against the input
        files) and a 'sowing_period' entry naming one of those columns.

    Returns
    -------
    The value two columns after the sowing column minus the value at the
    sowing column.

    Raises
    ------
    KeyError
        If the sowing period is not a label of ``row``.
    """
    # Find the sowing column in the row's own index rather than through the
    # notebook-level `fns` list: the position is then valid for whichever
    # frame the row comes from, and the function has no hidden dependency.
    sowing_idx = row.index.get_loc(row['sowing_period'])
    return row.iloc[sowing_idx + 2] - row.iloc[sowing_idx]

def harvest_diff(row):
    """Return the drop in value from the harvest fortnight to the next column.

    Parameters
    ----------
    row : pd.Series
        One record. Its index must contain the fortnight columns (assumed to be
        laid out consecutively in time order -- confirm against the input
        files) and a 'harvest_period' entry naming one of those columns.

    Returns
    -------
    The value at the harvest column minus the value one column after it, so a
    positive result means the value fell after harvest.

    Raises
    ------
    KeyError
        If the harvest period is not a label of ``row``.
    """
    # Find the harvest column in the row's own index rather than through the
    # notebook-level `fns` list: the position is then valid for whichever
    # frame the row comes from, and the function has no hidden dependency.
    harvest_idx = row.index.get_loc(row['harvest_period'])
    return row.iloc[harvest_idx] - row.iloc[harvest_idx + 1]

In [18]:
# Attach both change features to each crop table (wheat first, then mustard).
# NOTE(review): the columns are added in place, so every frame later derived
# from these tables carries them as well.
for crop_frame in (wheat_train_val, mustard_train_val):
    crop_frame['sowing_diff'] = crop_frame.apply(sowing_diff, axis=1)
    crop_frame['harvest_diff'] = crop_frame.apply(harvest_diff, axis=1)

In [19]:
# Summary statistics of the post-harvest drop: wheat first, then mustard.
wheat_train_val['harvest_diff'].describe(), mustard_train_val['harvest_diff'].describe()

(count    16543.000000
 mean        36.823067
 std         16.422435
 min          2.000000
 25%         27.000000
 50%         35.000000
 75%         45.000000
 max        191.000000
 Name: harvest_diff, dtype: float64,
 count    1285.000000
 mean       31.449805
 std        15.699972
 min         2.000000
 25%        23.000000
 50%        30.000000
 75%        37.000000
 max       175.000000
 Name: harvest_diff, dtype: float64)

In [20]:
# Summary statistics of the post-sowing rise: wheat first, then mustard.
wheat_train_val['sowing_diff'].describe(), mustard_train_val['sowing_diff'].describe()

(count    16543.000000
 mean        48.526446
 std         12.094238
 min          0.000000
 25%         41.000000
 50%         50.000000
 75%         57.000000
 max         88.000000
 Name: sowing_diff, dtype: float64,
 count    1285.000000
 mean       51.325292
 std        10.682694
 min        17.000000
 25%        44.000000
 50%        53.000000
 75%        59.000000
 max        77.000000
 Name: sowing_diff, dtype: float64)

# Undersampling - (70:30)

In [8]:
# Sowing-period x harvest-period counts per crop (wheat first, then mustard).
# Combinations whose count is just 1 are not used further on.
for crop_frame in (wheat_train_val, mustard_train_val):
    display(pd.crosstab(crop_frame['sowing_period'], crop_frame['harvest_period'], margins=True))

harvest_period,feb_1f,feb_2f,jan_2f,mar_1f,mar_2f,All
sowing_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dec_1f,0,0,0,2033,3112,5145
dec_2f,0,0,0,19,166,185
nov_1f,118,1433,1,1954,181,3687
nov_2f,10,1082,0,4505,1761,7358
oct_2f,59,68,1,28,12,168
All,187,2583,2,8539,5232,16543


harvest_period,feb_1f,feb_2f,jan_2f,mar_1f,mar_2f,All
sowing_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
nov_1f,34,296,1,375,18,724
nov_2f,2,4,0,13,1,20
oct_2f,148,319,11,60,3,541
All,184,619,12,448,22,1285


In [9]:
# Each (sowing, harvest) pair listed below occurs exactly once in its table.
# A stratum with a single member can't be used for stratification, so those
# rows are removed.

# Wheat: nov_1f -> jan_2f and oct_2f -> jan_2f
for sowing, harvest in [('nov_1f', 'jan_2f'), ('oct_2f', 'jan_2f')]:
    singleton = (wheat_train_val.sowing_period == sowing) & (wheat_train_val.harvest_period == harvest)
    wheat_train_val = wheat_train_val[~singleton]

# Mustard: nov_1f -> jan_2f and nov_2f -> mar_2f
for sowing, harvest in [('nov_1f', 'jan_2f'), ('nov_2f', 'mar_2f')]:
    singleton = (mustard_train_val.sowing_period == sowing) & (mustard_train_val.harvest_period == harvest)
    mustard_train_val = mustard_train_val[~singleton]


In [10]:
# Number of wheat rows needed for a 70:30 wheat:mustard ratio.
target_wheat = int(mustard_train_val.shape[0] * 0.7/0.3)

# Fraction of the wheat table that has to be kept to reach that count.
split_per = target_wheat/wheat_train_val.shape[0]

# The "test" side of a stratified split is kept as the undersample, so the
# sowing/harvest mix of the full wheat table is preserved.
# NOTE: this rebinds `wheat_train_val`; re-running the cell undersamples the
# already undersampled table.
_, wheat_train_val = train_test_split(wheat_train_val, test_size=split_per, 
                            stratify=wheat_train_val[['sowing_period', 'harvest_period']], random_state=0)

# Achieved ratio next to the 70:30 target.
display(wheat_train_val.shape[0]/mustard_train_val.shape[0], 0.7/0.3)

# Creating train_val and test data sets
train_val = pd.concat([wheat_train_val, mustard_train_val], ignore_index=True)
# `sowing_diff` / `harvest_diff` are exploratory columns that are only ever
# added to the train/validation tables. Drop them if present so that a
# top-to-bottom run produces the same feature columns for train/val as for
# the test set.
train_val = train_val.drop(columns=['sowing_diff', 'harvest_diff'], errors='ignore')
test = pd.concat([wheat_test, mustard_test], ignore_index=True)

2.3328137178487918

2.3333333333333335

# Adding zeroes before Sowing_Period and after harvest

In [11]:
# Adding zeroes before SP and after harvest

def zeroes_bf_sp_af_hr(row):
    """Return a copy of ``row`` with out-of-season values replaced by zero.

    The period is taken from the last six characters of the 'sowing_period'
    and 'harvest_period' entries. Everything from the first entry of the row
    up to (and including) the column just before the sowing column is zeroed,
    unless sowing is 'oct_2f'. Everything from two columns after the harvest
    column through 'apr_1f' is zeroed, unless harvest is 'mar_2f' or 'apr_1f'.
    The column directly after harvest is therefore kept. ``row`` itself is
    not modified.
    """
    masked = row.copy(deep=True)
    labels = masked.index
    sow_fn = masked['sowing_period'][-6:]
    harvest_fn = masked['harvest_period'][-6:]

    # 'oct_2f' is treated as the first fortnight: nothing precedes it.
    if sow_fn != 'oct_2f':
        last_before_sowing = labels[labels.get_loc(sow_fn) - 1]
        masked.loc[:last_before_sowing] = 0

    # For the last two fortnights there is no column two steps after harvest.
    if harvest_fn not in ('mar_2f', 'apr_1f'):
        first_after_harvest = labels[labels.get_loc(harvest_fn) + 2]
        masked.loc[first_after_harvest:'apr_1f'] = 0

    return masked

# Mask out-of-season values row by row in both data sets.
train_val, test = [frame.apply(zeroes_bf_sp_af_hr, axis=1) for frame in (train_val, test)]

# Train_val Split

In [12]:
# Hold out 20% of train_val for validation, stratified on the
# (sowing_period, harvest_period) pair.
strata = train_val[['sowing_period', 'harvest_period']]
train, val = train_test_split(train_val, test_size=0.2, stratify=strata, random_state=0)

# Partition sizes followed by the crop balance inside each partition.
display(
    train.shape,
    val.shape,
    train['crop_name'].value_counts(normalize=True),
    val['crop_name'].value_counts(normalize=True),
)

(3420, 15)

(856, 15)

Wheat      0.697368
Mustard    0.302632
Name: crop_name, dtype: float64

Wheat      0.71028
Mustard    0.28972
Name: crop_name, dtype: float64

# Label Encoding

In [13]:
# Encode the target as 1 for 'Wheat' and 0 for every other value, then remove
# the two period label columns. Each frame is modified in place.
for df in (train, val, test):
    df['crop_name'] = df['crop_name'].eq('Wheat').astype('int64')
    df.drop(columns=['sowing_period', 'harvest_period'], inplace=True)

train.head(3)

Unnamed: 0,oct_2f,nov_1f,nov_2f,dec_1f,dec_2f,jan_1f,jan_2f,feb_1f,feb_2f,mar_1f,mar_2f,apr_1f,crop_name
289,0,139,148,155,163,164.0,165,158,129,0,0,0,1
2788,0,0,0,138,158,167.0,177,181,181,170,136,0,1
46,0,0,126,145,175,180.0,182,181,172,160,148,119,1


# File Export

In [14]:
# Write the three data sets without the index column. Forward slashes keep
# the output paths valid on every OS (backslash-separated paths only resolve
# on Windows) and match the separators used when the test tables are read.
train.to_csv('data_files/data_share/train-1.csv', index=False)
test.to_csv('data_files/data_share/test.csv', index=False)
val.to_csv('data_files/data_share/val-1.csv', index=False)

In [15]:
# Absolute class counts (1 = 'Wheat', 0 = everything else) for train, val and test.
train['crop_name'].value_counts(), val['crop_name'].value_counts(), test['crop_name'].value_counts()

(1    2385
 0    1035
 Name: crop_name, dtype: int64,
 1    608
 0    248
 Name: crop_name, dtype: int64,
 1    5760
 0     700
 Name: crop_name, dtype: int64)