In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [8]:
raw_train = pd.read_csv("train.csv")
df_train = raw_train.copy()

raw_test = pd.read_csv("test.csv")
df_test = raw_test.copy()

In [9]:
df_train['train_flag'] = 1
df_test['train_flag'] = 0
df_test['Crop_Damage'] = 0
print(df_train.shape, df_test.shape)



(88858, 11) (59310, 11)
(148168, 11)


In [19]:
trn = df_train.head(10)
tst = df_test.head(10)

df_data = pd.concat((trn, tst))
print(df_data.shape)

(20, 11)


In [21]:
df_data.reset_index(drop = True)

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,train_flag
0,F00000001,188,1,0,1,0,0.0,0,1,0,1
1,F00000003,209,1,0,1,0,0.0,0,2,1,1
2,F00000004,257,1,0,1,0,0.0,0,2,1,1
3,F00000005,257,1,1,1,0,0.0,0,2,1,1
4,F00000006,342,1,0,1,0,0.0,0,2,1,1
5,F00000008,448,0,1,1,0,,0,2,1,1
6,F00000009,448,0,1,1,0,,0,2,1,1
7,F00000010,577,1,0,1,0,0.0,0,1,2,1
8,F00000012,731,0,0,1,0,0.0,0,2,0,1
9,F00000020,1132,1,0,1,0,0.0,0,1,2,1


In [22]:
feature_cols = df_train.columns.tolist()
feature_cols.remove('ID')
feature_cols.remove('Crop_Damage')
feature_cols.remove('train_flag')
label_col = 'Crop_Damage'
print(feature_cols)

['Estimated_Insects_Count', 'Crop_Type', 'Soil_Type', 'Pesticide_Use_Category', 'Number_Doses_Week', 'Number_Weeks_Used', 'Number_Weeks_Quit', 'Season']


In [23]:
df_data['ID_value'] = df_data['ID'].apply(lambda x: x.strip('F')).astype('int')

In [24]:
df_data

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,train_flag,ID_value
0,F00000001,188,1,0,1,0,0.0,0,1,0,1,1
1,F00000003,209,1,0,1,0,0.0,0,2,1,1,3
2,F00000004,257,1,0,1,0,0.0,0,2,1,1,4
3,F00000005,257,1,1,1,0,0.0,0,2,1,1,5
4,F00000006,342,1,0,1,0,0.0,0,2,1,1,6
5,F00000008,448,0,1,1,0,,0,2,1,1,8
6,F00000009,448,0,1,1,0,,0,2,1,1,9
7,F00000010,577,1,0,1,0,0.0,0,1,2,1,10
8,F00000012,731,0,0,1,0,0.0,0,2,0,1,12
9,F00000020,1132,1,0,1,0,0.0,0,1,2,1,20


In [27]:
df_data = df_data.sort_values(['ID_value'])
df_data = df_data.reset_index(drop=True)


In [28]:
df_data

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,train_flag,ID_value
0,F00000001,188,1,0,1,0,0.0,0,1,0,1,1
1,F00000002,188,1,1,1,0,,0,2,0,0,2
2,F00000003,209,1,0,1,0,0.0,0,2,1,1,3
3,F00000004,257,1,0,1,0,0.0,0,2,1,1,4
4,F00000005,257,1,1,1,0,0.0,0,2,1,1,5
5,F00000006,342,1,0,1,0,0.0,0,2,1,1,6
6,F00000007,410,1,1,1,0,0.0,0,2,0,0,7
7,F00000008,448,0,1,1,0,,0,2,1,1,8
8,F00000009,448,0,1,1,0,,0,2,1,1,9
9,F00000010,577,1,0,1,0,0.0,0,1,2,1,10


In [29]:
df_data.sort_values(['ID_value']).groupby(['Soil_Type'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values

array([-9.99000000e+02, -9.99000000e+02,  0.00000000e+00,  5.00000000e-01,
        0.00000000e+00,  6.66666667e-01,  5.00000000e-01,  3.33333333e-01,
        5.00000000e-01,  7.50000000e-01,  1.00000000e+00,  1.00000000e+00,
        8.00000000e-01,  6.00000000e-01,  6.00000000e-01,  4.00000000e-01,
        0.00000000e+00,  0.00000000e+00,  4.00000000e-01,  4.00000000e-01])

In [30]:
df_data

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,train_flag,ID_value
0,F00000001,188,1,0,1,0,0.0,0,1,0,1,1
1,F00000002,188,1,1,1,0,,0,2,0,0,2
2,F00000003,209,1,0,1,0,0.0,0,2,1,1,3
3,F00000004,257,1,0,1,0,0.0,0,2,1,1,4
4,F00000005,257,1,1,1,0,0.0,0,2,1,1,5
5,F00000006,342,1,0,1,0,0.0,0,2,1,1,6
6,F00000007,410,1,1,1,0,0.0,0,2,0,0,7
7,F00000008,448,0,1,1,0,,0,2,1,1,8
8,F00000009,448,0,1,1,0,,0,2,1,1,9
9,F00000010,577,1,0,1,0,0.0,0,1,2,1,10


In [46]:
df_data['Soil_Type_Damage'] = df_data.sort_values(['ID_value']).groupby(['Soil_Type'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values

df_data['Estimated_Insects_Count_Damage'] = df_data.sort_values(['ID_value']).groupby(['Estimated_Insects_Count'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values


df_data['Crop_Type_Damage'] = df_data.sort_values(['ID_value']).groupby(['Crop_Type'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values


df_data['Pesticide_Use_Category_Damage'] = df_data.sort_values(['ID_value']).groupby(['Pesticide_Use_Category'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values


df_data['Season_Damage'] = df_data.sort_values(['ID_value']).groupby(['Season'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values


df_data['Soil_Type_Damage_lag2'] = df_data.sort_values(['ID_value']).groupby(['Soil_Type'])['Crop_Damage'].apply(lambda x: x.shift(periods=2).rolling(5, min_periods=1).mean()).fillna(-999).values

df_data['Estimated_Insects_Count_Damage_lag2'] = df_data.sort_values(['ID_value']).groupby(['Estimated_Insects_Count'])['Crop_Damage'].apply(lambda x: x.shift(periods=2).rolling(5, min_periods=1).mean()).fillna(-999).values

df_data['Crop_Type_Damage_lag2'] = df_data.sort_values(['ID_value']).groupby(['Crop_Type'])['Crop_Damage'].apply(lambda x: x.shift(periods=2).rolling(5, min_periods=1).mean()).fillna(-999).values

df_data['Pesticide_Use_Category_Damage_lag2'] = df_data.sort_values(['ID_value']).groupby(['Pesticide_Use_Category'])['Crop_Damage'].apply(lambda x: x.shift(periods=2).rolling(5, min_periods=1).mean()).fillna(-999).values

df_data['Season_Damage_lag2'] = df_data.sort_values(['ID_value']).groupby(['Season'])['Crop_Damage'].apply(lambda x: x.shift(periods=2).rolling(5, min_periods=1).mean()).fillna(-999).values

In [48]:
df_data.loc[df_data['train_flag'] == 0, 'Crop_Damage'] = -999

In [50]:
df_data['Crop_Damage_lag1'] = df_data['Crop_Damage'].shift(fill_value=-999)
df_data['Estimated_Insects_Count_lag1'] = df_data['Estimated_Insects_Count'].shift(fill_value=-999)
df_data['Crop_Type_lag1'] = df_data['Crop_Type'].shift(fill_value=-999)
df_data['Soil_Type_lag1'] = df_data['Soil_Type'].shift(fill_value=-999)
df_data['Pesticide_Use_Category_lag1'] = df_data['Pesticide_Use_Category'].shift(fill_value=-999)
df_data['Number_Doses_Week_lag1'] = df_data['Number_Doses_Week'].shift(fill_value=-999)
df_data['Number_Weeks_Used_lag1'] = df_data['Number_Weeks_Used'].shift(fill_value=-999)
df_data['Number_Weeks_Quit_lag1'] = df_data['Number_Weeks_Quit'].shift(fill_value=-999)
df_data['Season_lag1'] = df_data['Season'].shift(fill_value=-999)

df_data['Crop_Damage_lag2'] = df_data['Crop_Damage'].shift(periods=2,fill_value=-999)
df_data['Estimated_Insects_Count_lag2'] = df_data['Estimated_Insects_Count'].shift(periods=2,fill_value=-999)
df_data['Crop_Type_lag2'] = df_data['Crop_Type'].shift(fill_value=-999)
df_data['Soil_Type_lag2'] = df_data['Soil_Type'].shift(fill_value=-999)
df_data['Pesticide_Use_Category_lag2'] = df_data['Pesticide_Use_Category'].shift(periods=2,fill_value=-999)
df_data['Number_Doses_Week_lag2'] = df_data['Number_Doses_Week'].shift(periods=2,fill_value=-999)
df_data['Number_Weeks_Used_lag2'] = df_data['Number_Weeks_Used'].shift(periods=2,fill_value=-999)
df_data['Number_Weeks_Quit_lag2'] = df_data['Number_Weeks_Quit'].shift(periods=2,fill_value=-999)
df_data['Season_lag2'] = df_data['Season'].shift(periods=2,fill_value=-999)


In [51]:
df_data[['Soil_Type_Damage','Soil_Type', 'Crop_Damage']]

Unnamed: 0,Soil_Type_Damage,Soil_Type,Crop_Damage
0,-999.0,0,0
1,-999.0,1,-999
2,0.0,0,1
3,0.5,0,1
4,0.0,1,1
5,0.666667,0,1
6,0.5,1,-999
7,0.333333,1,1
8,0.5,1,1
9,0.75,0,2
