> In this notebook, the wheat data is undersampled so that the wheat-to-mustard ratio is approximately 70:30.

# Import Modules and Data

In [1]:
from glob import glob
import geopandas as gp
import numpy as np
import pandas as pd
from copy import deepcopy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from IPython.display import display
import random
import os

# Show up to 100 rows when a DataFrame is rendered.
pd.options.display.max_rows = 100

import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas/scikit-learn warnings that may point at real problems.
warnings.filterwarnings("ignore")

# Move the working directory up one level exactly once per kernel session.
# `kernel_is_loaded` is the marker: it only exists after this cell has run, so
# re-running the cell does not climb another directory level.
# Presumably this makes the relative `data_files/...` paths used later resolve
# from the project root - confirm against the repository layout.
if 'kernel_is_loaded' not in globals():
    # os.path.dirname handles both '/' and '\\' separators (and the filesystem
    # root), unlike manually splitting the path on '/'.
    os.chdir(os.path.dirname(os.getcwd()))
    kernel_is_loaded = True

In [15]:
# Load the pre-processed crop tables.
# Forward slashes are used for every path: they resolve on both Windows and
# POSIX, whereas backslash-separated paths only resolve on Windows.
wheat_train_val = pd.read_csv('data_files/data_share/preprocessed_wheat.csv')
mustard_train_val = pd.read_csv('data_files/data_share/preprocessed_mustard.csv')

wheat_test = pd.read_csv('data_files/data_share/preprocessed_new_wheat.csv')
mustard_test = pd.read_csv('data_files/data_share/preprocessed_new_mustard.csv')

# Features that are not needed as of now; removed from the train/validation
# frames only.
# NOTE(review): the test frames are left untouched here - confirm whether the
# test files carry these columns too.
unused_columns = ['latitude', 'longitude', 'state_name', 'district',
                  'taluka_name', 'sowing_year', 'harvest_year']
wheat_train_val = wheat_train_val.drop(columns=unused_columns)
mustard_train_val = mustard_train_val.drop(columns=unused_columns)

# Undersampling - (70:30)

In [16]:
# Count rows per (sowing_period, harvest_period) pair for each crop, wheat first
# and mustard second. Pairs that occur only once are not used further on.
for crop_frame in (wheat_train_val, mustard_train_val):
    period_counts = pd.crosstab(crop_frame['sowing_period'],
                                crop_frame['harvest_period'],
                                margins=True)
    display(period_counts)

harvest_period,feb_1f,feb_2f,jan_2f,mar_1f,mar_2f,All
sowing_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dec_1f,0,0,0,2033,3112,5145
dec_2f,0,0,0,19,166,185
nov_1f,118,1433,1,1954,181,3687
nov_2f,10,1082,0,4505,1761,7358
oct_2f,59,68,1,28,12,168
All,187,2583,2,8539,5232,16543


harvest_period,feb_1f,feb_2f,jan_2f,mar_1f,mar_2f,All
sowing_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
nov_1f,34,296,1,375,18,724
nov_2f,2,4,0,13,1,20
oct_2f,148,319,11,60,3,541
All,184,619,12,448,22,1285


In [17]:
# (sowing_period, harvest_period) pairs that occur exactly once in the crosstabs.
# A single row cannot be stratified, so these pairs are removed.
wheat_singletons = [('nov_1f', 'jan_2f'), ('oct_2f', 'jan_2f')]
mustard_singletons = [('nov_1f', 'jan_2f'), ('nov_2f', 'mar_2f')]

for sowing, harvest in wheat_singletons:
    is_singleton = (wheat_train_val['sowing_period'] == sowing) & (wheat_train_val['harvest_period'] == harvest)
    wheat_train_val = wheat_train_val[~is_singleton]

for sowing, harvest in mustard_singletons:
    is_singleton = (mustard_train_val['sowing_period'] == sowing) & (mustard_train_val['harvest_period'] == harvest)
    mustard_train_val = mustard_train_val[~is_singleton]


In [18]:
# Number of wheat rows that gives a 70:30 wheat:mustard ratio against the
# mustard rows; int() truncates, so the achieved ratio can sit just below 70:30.
target_wheat = int(mustard_train_val.shape[0] * 0.7/0.3)

# Fraction of the current wheat rows that has to be kept to hit the target.
split_per = target_wheat/wheat_train_val.shape[0]

# Undersample only while wheat still exceeds the target. Without this guard the
# cell fails when it is re-run (wheat_train_val is already reduced, so the
# fraction becomes 1.0) or when there is too little wheat to begin with, because
# train_test_split rejects a float test_size >= 1.
if split_per < 1:
    # The "test" part of the split is the stratified sample that is kept; the
    # remaining wheat rows are discarded.
    _, wheat_train_val = train_test_split(wheat_train_val, test_size=split_per,
                                stratify=wheat_train_val[['sowing_period', 'harvest_period']], random_state=0)

# Achieved ratio next to the 70:30 target, for a visual check.
display(wheat_train_val.shape[0]/mustard_train_val.shape[0], 0.7/0.3)

# Creating train_val and test data sets
train_val = pd.concat([wheat_train_val, mustard_train_val], ignore_index=True)
test = pd.concat([wheat_test, mustard_test], ignore_index=True)

2.3328137178487918

2.3333333333333335

# Train_val Split

In [20]:
# Hold out 20% of the pooled rows for validation, stratified on the
# (sowing_period, harvest_period) pair with a fixed seed.
period_labels = train_val[['sowing_period', 'harvest_period']]
train, val = train_test_split(train_val, test_size=0.2, stratify=period_labels, random_state=0)

# Sizes of both splits followed by their crop shares.
split_summary = [
    train.shape,
    val.shape,
    train['crop_name'].value_counts(normalize=True),
    val['crop_name'].value_counts(normalize=True),
]
display(*split_summary)

(3420, 15)

(856, 15)

Wheat      0.697368
Mustard    0.302632
Name: crop_name, dtype: float64

Wheat      0.71028
Mustard    0.28972
Name: crop_name, dtype: float64

# Label Encoding

In [21]:
def _encode_crop_labels(frame):
    """Return a copy of `frame` with a binary `crop_name` and no period columns.

    `crop_name` becomes 1 for 'Wheat' and 0 for anything else. The
    `sowing_period` and `harvest_period` columns are removed.

    The function is safe to apply to an already-encoded frame: a numeric
    `crop_name` is left as is and missing period columns are ignored, so
    re-running the cell neither zeroes the labels nor raises a KeyError.
    """
    encoded = frame.copy()  # never write into a slice produced by train_test_split
    if not pd.api.types.is_numeric_dtype(encoded['crop_name']):
        encoded['crop_name'] = (encoded['crop_name'] == 'Wheat').astype('int64')
    return encoded.drop(columns=['sowing_period', 'harvest_period'], errors='ignore')


train, val, test = (_encode_crop_labels(split) for split in (train, val, test))

train.head(3)

Unnamed: 0,oct_2f,nov_1f,nov_2f,dec_1f,dec_2f,jan_1f,jan_2f,feb_1f,feb_2f,mar_1f,mar_2f,apr_1f,crop_name
289,138,139,148,155,163,164.0,165,158,129,124,0,0,1
2788,171,162,153,138,158,167.0,177,181,181,170,136,138,1
46,160,142,126,145,175,180.0,182,181,172,160,148,119,1


# File Export

In [22]:
# Write the three splits without the index column.
# Forward slashes resolve on both Windows and POSIX; with backslashes a POSIX
# system would create a file whose name literally contains the backslashes.
train.to_csv('data_files/data_share/train-3.csv', index=False)
test.to_csv('data_files/data_share/test-3.csv', index=False)
val.to_csv('data_files/data_share/val-3.csv', index=False)

In [23]:
# Encoded class counts for the train, validation and test splits, in that order.
tuple(split['crop_name'].value_counts() for split in (train, val, test))

(1    2385
 0    1035
 Name: crop_name, dtype: int64,
 1    608
 0    248
 Name: crop_name, dtype: int64,
 1    5760
 0     700
 Name: crop_name, dtype: int64)