In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
import seaborn as sns
from collections import Counter
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 10)

In [2]:
df_train = pd.read_csv('train.csv', header=0)
df_test = pd.read_csv('test.csv', header=0)

In [3]:
df_train['train_flag'] = 1
df_test['train_flag'] = 0
df_test['Crop_Damage'] = 0
print(df_train.shape, df_test.shape)

df_data = pd.concat((df_train, df_test))
print(df_data.shape)

(88858, 11) (59310, 11)
(148168, 11)


In [4]:
df_data

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,train_flag
0,F00000001,188,1,0,1,0,0.0,0,1,0,1
1,F00000003,209,1,0,1,0,0.0,0,2,1,1
2,F00000004,257,1,0,1,0,0.0,0,2,1,1
3,F00000005,257,1,1,1,0,0.0,0,2,1,1
4,F00000006,342,1,0,1,0,0.0,0,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...
59305,F00155937,3337,1,0,2,20,34.0,12,1,0,0
59306,F00155940,3516,1,0,2,20,32.0,10,2,0,0
59307,F00155941,3702,1,0,2,10,,48,1,0,0
59308,F00155943,3702,1,0,2,10,28.0,17,2,0,0


In [5]:
feature_cols = df_train.columns.tolist()
feature_cols.remove('ID')
feature_cols.remove('Crop_Damage')
feature_cols.remove('train_flag')
label_col = 'Crop_Damage'
print(feature_cols)

['Estimated_Insects_Count', 'Crop_Type', 'Soil_Type', 'Pesticide_Use_Category', 'Number_Doses_Week', 'Number_Weeks_Used', 'Number_Weeks_Quit', 'Season']


In [6]:
df_data['ID_value'] = df_data['ID'].apply(lambda x: x.strip('F')).astype('int')

In [7]:
df_data = df_data.sort_values(['ID_value'])

In [8]:
df_data = df_data.reset_index(drop=True)

In [9]:
df_data

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,train_flag,ID_value
0,F00000001,188,1,0,1,0,0.0,0,1,0,1,1
1,F00000002,188,1,1,1,0,,0,2,0,0,2
2,F00000003,209,1,0,1,0,0.0,0,2,1,1,3
3,F00000004,257,1,0,1,0,0.0,0,2,1,1,4
4,F00000005,257,1,1,1,0,0.0,0,2,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
148163,F00155941,3702,1,0,2,10,,48,1,0,0,155941
148164,F00155942,3702,1,0,2,10,25.0,18,3,0,1,155942
148165,F00155943,3702,1,0,2,10,28.0,17,2,0,0,155943
148166,F00155944,3895,1,0,2,5,52.0,7,1,0,0,155944


In [10]:
df_data['Soil_Type_Damage'] = df_data.sort_values(['ID_value']).groupby(['Soil_Type'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values

df_data['Estimated_Insects_Count_Damage'] = df_data.sort_values(['ID_value']).groupby(['Estimated_Insects_Count'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values


df_data['Crop_Type_Damage'] = df_data.sort_values(['ID_value']).groupby(['Crop_Type'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values


df_data['Pesticide_Use_Category_Damage'] = df_data.sort_values(['ID_value']).groupby(['Pesticide_Use_Category'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values


df_data['Season_Damage'] = df_data.sort_values(['ID_value']).groupby(['Season'])['Crop_Damage'].apply(lambda x: x.shift().rolling(5, min_periods=1).mean()).fillna(-999).values


df_data['Soil_Type_Damage_lag2'] = df_data.sort_values(['ID_value']).groupby(['Soil_Type'])['Crop_Damage'].apply(lambda x: x.shift(periods=2).rolling(5, min_periods=1).mean()).fillna(-999).values

df_data['Estimated_Insects_Count_Damage_lag2'] = df_data.sort_values(['ID_value']).groupby(['Estimated_Insects_Count'])['Crop_Damage'].apply(lambda x: x.shift(periods=2).rolling(5, min_periods=1).mean()).fillna(-999).values

df_data['Crop_Type_Damage_lag2'] = df_data.sort_values(['ID_value']).groupby(['Crop_Type'])['Crop_Damage'].apply(lambda x: x.shift(periods=2).rolling(5, min_periods=1).mean()).fillna(-999).values

df_data['Pesticide_Use_Category_Damage_lag2'] = df_data.sort_values(['ID_value']).groupby(['Pesticide_Use_Category'])['Crop_Damage'].apply(lambda x: x.shift(periods=2).rolling(5, min_periods=1).mean()).fillna(-999).values

df_data['Season_Damage_lag2'] = df_data.sort_values(['ID_value']).groupby(['Season'])['Crop_Damage'].apply(lambda x: x.shift(periods=2).rolling(5, min_periods=1).mean()).fillna(-999).values

In [11]:
df_data

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,...,Soil_Type_Damage,Estimated_Insects_Count_Damage,Crop_Type_Damage,Pesticide_Use_Category_Damage,Season_Damage,Soil_Type_Damage_lag2,Estimated_Insects_Count_Damage_lag2,Crop_Type_Damage_lag2,Pesticide_Use_Category_Damage_lag2,Season_Damage_lag2
0,F00000001,188,1,0,1,0,0.0,0,1,0,...,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.0,-999.0,-999.000000,-999.000000,-999.0
1,F00000002,188,1,1,1,0,,0,2,0,...,-999.0,0.0,0.000000,0.000000,-999.000000,-999.0,-999.0,-999.000000,-999.000000,-999.0
2,F00000003,209,1,0,1,0,0.0,0,2,1,...,0.0,-999.0,0.000000,0.000000,0.000000,-999.0,-999.0,0.000000,0.000000,-999.0
3,F00000004,257,1,0,1,0,0.0,0,2,1,...,0.5,-999.0,0.333333,0.333333,0.500000,0.0,-999.0,0.000000,0.000000,0.0
4,F00000005,257,1,1,1,0,0.0,0,2,1,...,0.0,1.0,0.500000,0.500000,0.666667,-999.0,-999.0,0.333333,0.333333,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148163,F00155941,3702,1,0,2,10,,48,1,0,...,0.0,0.6,0.000000,0.000000,0.000000,0.0,0.6,0.000000,0.000000,0.0
148164,F00155942,3702,1,0,2,10,25.0,18,3,0,...,0.0,0.6,0.000000,0.000000,0.000000,0.0,0.6,0.000000,0.000000,0.0
148165,F00155943,3702,1,0,2,10,28.0,17,2,0,...,0.0,0.4,0.000000,0.000000,0.000000,0.0,0.6,0.000000,0.000000,0.0
148166,F00155944,3895,1,0,2,5,52.0,7,1,0,...,0.0,0.6,0.000000,0.000000,0.000000,0.0,0.4,0.000000,0.000000,0.0


In [12]:
df_data.loc[df_data['train_flag'] == 0, 'Crop_Damage'] = -999

In [13]:
df_data['Crop_Damage_lag1'] = df_data['Crop_Damage'].shift(fill_value=-999)
df_data['Estimated_Insects_Count_lag1'] = df_data['Estimated_Insects_Count'].shift(fill_value=-999)
df_data['Crop_Type_lag1'] = df_data['Crop_Type'].shift(fill_value=-999)
df_data['Soil_Type_lag1'] = df_data['Soil_Type'].shift(fill_value=-999)
df_data['Pesticide_Use_Category_lag1'] = df_data['Pesticide_Use_Category'].shift(fill_value=-999)
df_data['Number_Doses_Week_lag1'] = df_data['Number_Doses_Week'].shift(fill_value=-999)
df_data['Number_Weeks_Used_lag1'] = df_data['Number_Weeks_Used'].shift(fill_value=-999)
df_data['Number_Weeks_Quit_lag1'] = df_data['Number_Weeks_Quit'].shift(fill_value=-999)
df_data['Season_lag1'] = df_data['Season'].shift(fill_value=-999)

df_data['Crop_Damage_lag2'] = df_data['Crop_Damage'].shift(periods=2,fill_value=-999)
df_data['Estimated_Insects_Count_lag2'] = df_data['Estimated_Insects_Count'].shift(periods=2,fill_value=-999)
df_data['Crop_Type_lag2'] = df_data['Crop_Type'].shift(fill_value=-999)
df_data['Soil_Type_lag2'] = df_data['Soil_Type'].shift(fill_value=-999)
df_data['Pesticide_Use_Category_lag2'] = df_data['Pesticide_Use_Category'].shift(periods=2,fill_value=-999)
df_data['Number_Doses_Week_lag2'] = df_data['Number_Doses_Week'].shift(periods=2,fill_value=-999)
df_data['Number_Weeks_Used_lag2'] = df_data['Number_Weeks_Used'].shift(periods=2,fill_value=-999)
df_data['Number_Weeks_Quit_lag2'] = df_data['Number_Weeks_Quit'].shift(periods=2,fill_value=-999)
df_data['Season_lag2'] = df_data['Season'].shift(periods=2,fill_value=-999)


In [14]:
df_data

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,...,Season_lag1,Crop_Damage_lag2,Estimated_Insects_Count_lag2,Crop_Type_lag2,Soil_Type_lag2,Pesticide_Use_Category_lag2,Number_Doses_Week_lag2,Number_Weeks_Used_lag2,Number_Weeks_Quit_lag2,Season_lag2
0,F00000001,188,1,0,1,0,0.0,0,1,0,...,-999,-999,-999,-999,-999,-999,-999,-999.0,-999,-999
1,F00000002,188,1,1,1,0,,0,2,-999,...,1,-999,-999,1,0,-999,-999,-999.0,-999,-999
2,F00000003,209,1,0,1,0,0.0,0,2,1,...,2,0,188,1,1,1,0,0.0,0,1
3,F00000004,257,1,0,1,0,0.0,0,2,1,...,2,-999,188,1,0,1,0,,0,2
4,F00000005,257,1,1,1,0,0.0,0,2,1,...,2,1,209,1,0,1,0,0.0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148163,F00155941,3702,1,0,2,10,,48,1,-999,...,2,0,3516,1,0,2,15,40.0,8,2
148164,F00155942,3702,1,0,2,10,25.0,18,3,0,...,1,-999,3516,1,0,2,20,32.0,10,2
148165,F00155943,3702,1,0,2,10,28.0,17,2,-999,...,3,-999,3702,1,0,2,10,,48,1
148166,F00155944,3895,1,0,2,5,52.0,7,1,-999,...,2,0,3702,1,0,2,10,25.0,18,3


In [15]:
df_train, df_test = df_data[df_data.train_flag == 1], df_data[df_data.train_flag == 0]

In [16]:
df_train.drop(['train_flag'], inplace=True, axis=1)
df_test.drop(['train_flag'], inplace=True, axis=1)
df_test.drop([label_col], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
print(df_train.shape, df_test.shape)

(88858, 39) (59310, 38)


In [18]:
del df_data

In [19]:
missing_impute = -999

In [20]:
df_train['Number_Weeks_Used'] = df_train['Number_Weeks_Used'].apply(lambda x: missing_impute if pd.isna(x) else x)
df_test['Number_Weeks_Used'] = df_test['Number_Weeks_Used'].apply(lambda x: missing_impute if pd.isna(x) else x)

df_train['Number_Weeks_Used_lag1'] = df_train['Number_Weeks_Used_lag1'].apply(lambda x: missing_impute if pd.isna(x) else x)
df_test['Number_Weeks_Used_lag1'] = df_test['Number_Weeks_Used_lag1'].apply(lambda x: missing_impute if pd.isna(x) else x)

df_train['Number_Weeks_Used_lag2'] = df_train['Number_Weeks_Used_lag2'].apply(lambda x: missing_impute if pd.isna(x) else x)
df_test['Number_Weeks_Used_lag2'] = df_test['Number_Weeks_Used_lag2'].apply(lambda x: missing_impute if pd.isna(x) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [21]:
df_train, df_eval = train_test_split(df_train, test_size=0.40, random_state=42, shuffle=True, stratify=df_train[label_col])

In [22]:
feature_cols = df_train.columns.tolist()
feature_cols.remove('ID')
feature_cols.remove('Crop_Damage')
feature_cols.remove('ID_value')
label_col = 'Crop_Damage'
print(feature_cols)

['Estimated_Insects_Count', 'Crop_Type', 'Soil_Type', 'Pesticide_Use_Category', 'Number_Doses_Week', 'Number_Weeks_Used', 'Number_Weeks_Quit', 'Season', 'Soil_Type_Damage', 'Estimated_Insects_Count_Damage', 'Crop_Type_Damage', 'Pesticide_Use_Category_Damage', 'Season_Damage', 'Soil_Type_Damage_lag2', 'Estimated_Insects_Count_Damage_lag2', 'Crop_Type_Damage_lag2', 'Pesticide_Use_Category_Damage_lag2', 'Season_Damage_lag2', 'Crop_Damage_lag1', 'Estimated_Insects_Count_lag1', 'Crop_Type_lag1', 'Soil_Type_lag1', 'Pesticide_Use_Category_lag1', 'Number_Doses_Week_lag1', 'Number_Weeks_Used_lag1', 'Number_Weeks_Quit_lag1', 'Season_lag1', 'Crop_Damage_lag2', 'Estimated_Insects_Count_lag2', 'Crop_Type_lag2', 'Soil_Type_lag2', 'Pesticide_Use_Category_lag2', 'Number_Doses_Week_lag2', 'Number_Weeks_Used_lag2', 'Number_Weeks_Quit_lag2', 'Season_lag2']


In [23]:
cat_cols = ['Crop_Type', 'Soil_Type', 'Pesticide_Use_Category', 'Season', 'Crop_Type_lag1', 'Soil_Type_lag1', 'Pesticide_Use_Category_lag1', 'Season_lag1']

In [24]:
import time
from sklearn.metrics import accuracy_score,f1_score,log_loss,precision_score,recall_score

REPORT = pd.DataFrame(columns={"Model","accuracy_score","f1_score","precision_score","recall_score"})
TIME_REPORT = pd.DataFrame(columns={"Model","Alive_0","Damage_Due_To_other_Reasons1","Damage_Due_To_pesticide_2"})
def time_report(model,name):
    
    start = time.time()
    model.predict(np.reshape(list(df_eval.loc[8331,feature_cols]), (1,-1)))
    end = time.time()
    A = end - start
    
    start = time.time()
    model.predict(np.reshape(list(df_eval.loc[3450,feature_cols]), (1,-1)))
    end = time.time()
    B = end - start
    
    start = time.time()
    model.predict(np.reshape(list(df_eval.loc[17828,feature_cols]), (1,-1)))
    end = time.time()
    C = end - start
    
    global TIME_REPORT
    TIME_REPORT = TIME_REPORT.append(
        [{
            "Model":name,
            "Alive_0":A,
            "Damage_Due_To_other_Reasons1":B,
            "Damage_Due_To_pesticide_2":C,
            }],
        ignore_index=True)
    
    
def prepare_Report(traind,testd,measure):
    A = accuracy_score(traind,testd)
    F = f1_score(traind,testd,average='macro')
    P = precision_score(traind,testd,average='macro')
    R = recall_score(traind,testd,average='macro')
    global REPORT
    REPORT = REPORT.append(
        [{
            "Model":measure,
            "accuracy_score":A,
            "f1_score":F,
            "precision_score":P,
            "recall_score":R}],
        ignore_index=True)

## LGBM 

In [25]:
params = {}
params['learning_rate'] = 0.04
params['max_depth'] = 18
params['n_estimators'] = 3000
params['objective'] = 'multiclass'
params['boosting_type'] = 'gbdt'
params['subsample'] = 0.7
params['random_state'] = 42
params['colsample_bytree']=0.7
params['min_data_in_leaf'] = 55
params['reg_alpha'] = 1.7
params['reg_lambda'] = 1.11
params['class_weight']: {0: 0.44, 1: 0.4, 2: 0.37}

In [26]:
clf = lgb.LGBMClassifier(**params)
    
clf.fit(df_train[feature_cols], df_train[label_col], early_stopping_rounds=100, eval_set=[ (df_eval[feature_cols], df_eval[label_col])], eval_metric='multi_error', verbose=True, categorical_feature=cat_cols)

eval_score = accuracy_score(df_eval[label_col], clf.predict(df_eval[feature_cols]))

print('Eval ACC: {}'.format(eval_score))

New categorical_feature is ['Crop_Type', 'Crop_Type_lag1', 'Pesticide_Use_Category', 'Pesticide_Use_Category_lag1', 'Season', 'Season_lag1', 'Soil_Type', 'Soil_Type_lag1']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))






[1]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.490077
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.46577
[3]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.445455
[4]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.43065
[5]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.415111
[6]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.40321
[7]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.393554
[8]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.381564
[9]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.371223
[10]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.361934
[11]	valid_0's multi_error: 0.164528	valid_0's multi_logloss: 0.353592
[12]	valid_0's multi_error: 0.159352	valid_0's multi_logloss: 0.347116
[13]	valid_0's multi_error: 0.150377	valid_0's multi_logloss: 0.339334
[14]	valid_0's multi_error:

[115]	valid_0's multi_error: 0.0634706	valid_0's multi_logloss: 0.183271
[116]	valid_0's multi_error: 0.0633018	valid_0's multi_logloss: 0.183005
[117]	valid_0's multi_error: 0.0632456	valid_0's multi_logloss: 0.182711
[118]	valid_0's multi_error: 0.0629923	valid_0's multi_logloss: 0.182377
[119]	valid_0's multi_error: 0.0630205	valid_0's multi_logloss: 0.182111
[120]	valid_0's multi_error: 0.0628798	valid_0's multi_logloss: 0.181735
[121]	valid_0's multi_error: 0.0628517	valid_0's multi_logloss: 0.181515
[122]	valid_0's multi_error: 0.0627673	valid_0's multi_logloss: 0.18128
[123]	valid_0's multi_error: 0.0625703	valid_0's multi_logloss: 0.180972
[124]	valid_0's multi_error: 0.0625703	valid_0's multi_logloss: 0.180634
[125]	valid_0's multi_error: 0.0625703	valid_0's multi_logloss: 0.180447
[126]	valid_0's multi_error: 0.0623171	valid_0's multi_logloss: 0.180115
[127]	valid_0's multi_error: 0.0622609	valid_0's multi_logloss: 0.179963
[128]	valid_0's multi_error: 0.0622609	valid_0's mul

[228]	valid_0's multi_error: 0.0575062	valid_0's multi_logloss: 0.165682
[229]	valid_0's multi_error: 0.0574781	valid_0's multi_logloss: 0.16559
[230]	valid_0's multi_error: 0.0575343	valid_0's multi_logloss: 0.165587
[231]	valid_0's multi_error: 0.0573937	valid_0's multi_logloss: 0.165275
[232]	valid_0's multi_error: 0.0574218	valid_0's multi_logloss: 0.165277
[233]	valid_0's multi_error: 0.0574218	valid_0's multi_logloss: 0.165234
[234]	valid_0's multi_error: 0.0573937	valid_0's multi_logloss: 0.165197
[235]	valid_0's multi_error: 0.0573655	valid_0's multi_logloss: 0.165161
[236]	valid_0's multi_error: 0.0573655	valid_0's multi_logloss: 0.165112
[237]	valid_0's multi_error: 0.0573093	valid_0's multi_logloss: 0.165009
[238]	valid_0's multi_error: 0.0573093	valid_0's multi_logloss: 0.164972
[239]	valid_0's multi_error: 0.0572248	valid_0's multi_logloss: 0.164927
[240]	valid_0's multi_error: 0.0570842	valid_0's multi_logloss: 0.164791
[241]	valid_0's multi_error: 0.0572248	valid_0's mul

[344]	valid_0's multi_error: 0.0555368	valid_0's multi_logloss: 0.158896
[345]	valid_0's multi_error: 0.0554805	valid_0's multi_logloss: 0.158863
[346]	valid_0's multi_error: 0.0554805	valid_0's multi_logloss: 0.158874
[347]	valid_0's multi_error: 0.0555368	valid_0's multi_logloss: 0.158861
[348]	valid_0's multi_error: 0.0554524	valid_0's multi_logloss: 0.158855
[349]	valid_0's multi_error: 0.0553961	valid_0's multi_logloss: 0.15879
[350]	valid_0's multi_error: 0.0553961	valid_0's multi_logloss: 0.158729
[351]	valid_0's multi_error: 0.0553961	valid_0's multi_logloss: 0.158691
[352]	valid_0's multi_error: 0.0553961	valid_0's multi_logloss: 0.158688
[353]	valid_0's multi_error: 0.0554524	valid_0's multi_logloss: 0.158657
[354]	valid_0's multi_error: 0.0554805	valid_0's multi_logloss: 0.158647
[355]	valid_0's multi_error: 0.0554805	valid_0's multi_logloss: 0.158599
[356]	valid_0's multi_error: 0.0554805	valid_0's multi_logloss: 0.158553
[357]	valid_0's multi_error: 0.0554524	valid_0's mul

[460]	valid_0's multi_error: 0.0537643	valid_0's multi_logloss: 0.154048
[461]	valid_0's multi_error: 0.0538488	valid_0's multi_logloss: 0.154037
[462]	valid_0's multi_error: 0.0537081	valid_0's multi_logloss: 0.154012
[463]	valid_0's multi_error: 0.0536518	valid_0's multi_logloss: 0.154011
[464]	valid_0's multi_error: 0.0536518	valid_0's multi_logloss: 0.154018
[465]	valid_0's multi_error: 0.0536237	valid_0's multi_logloss: 0.153952
[466]	valid_0's multi_error: 0.0535393	valid_0's multi_logloss: 0.153918
[467]	valid_0's multi_error: 0.0535955	valid_0's multi_logloss: 0.153884
[468]	valid_0's multi_error: 0.0535393	valid_0's multi_logloss: 0.153622
[469]	valid_0's multi_error: 0.0533986	valid_0's multi_logloss: 0.153601
[470]	valid_0's multi_error: 0.0533705	valid_0's multi_logloss: 0.153567
[471]	valid_0's multi_error: 0.0533705	valid_0's multi_logloss: 0.153574
[472]	valid_0's multi_error: 0.0533423	valid_0's multi_logloss: 0.153576
[473]	valid_0's multi_error: 0.0533705	valid_0's mu

[576]	valid_0's multi_error: 0.0522451	valid_0's multi_logloss: 0.151066
[577]	valid_0's multi_error: 0.0523014	valid_0's multi_logloss: 0.151061
[578]	valid_0's multi_error: 0.0523295	valid_0's multi_logloss: 0.150977
[579]	valid_0's multi_error: 0.0523295	valid_0's multi_logloss: 0.150967
[580]	valid_0's multi_error: 0.0523295	valid_0's multi_logloss: 0.150937
[581]	valid_0's multi_error: 0.0523295	valid_0's multi_logloss: 0.150935
[582]	valid_0's multi_error: 0.0522732	valid_0's multi_logloss: 0.150916
[583]	valid_0's multi_error: 0.0523014	valid_0's multi_logloss: 0.150913
[584]	valid_0's multi_error: 0.0522732	valid_0's multi_logloss: 0.150886
[585]	valid_0's multi_error: 0.0523014	valid_0's multi_logloss: 0.150851
[586]	valid_0's multi_error: 0.0522732	valid_0's multi_logloss: 0.150827
[587]	valid_0's multi_error: 0.0523014	valid_0's multi_logloss: 0.150835
[588]	valid_0's multi_error: 0.052217	valid_0's multi_logloss: 0.150645
[589]	valid_0's multi_error: 0.0519638	valid_0's mul

[690]	valid_0's multi_error: 0.0518231	valid_0's multi_logloss: 0.14926
[691]	valid_0's multi_error: 0.051795	valid_0's multi_logloss: 0.149256
[692]	valid_0's multi_error: 0.0517106	valid_0's multi_logloss: 0.149242
[693]	valid_0's multi_error: 0.0516824	valid_0's multi_logloss: 0.14923
[694]	valid_0's multi_error: 0.0517106	valid_0's multi_logloss: 0.149192
[695]	valid_0's multi_error: 0.0517387	valid_0's multi_logloss: 0.149189
[696]	valid_0's multi_error: 0.0517387	valid_0's multi_logloss: 0.149172
[697]	valid_0's multi_error: 0.0517668	valid_0's multi_logloss: 0.149163
[698]	valid_0's multi_error: 0.0517106	valid_0's multi_logloss: 0.149135
[699]	valid_0's multi_error: 0.0517387	valid_0's multi_logloss: 0.149129
[700]	valid_0's multi_error: 0.051795	valid_0's multi_logloss: 0.149129
[701]	valid_0's multi_error: 0.0518512	valid_0's multi_logloss: 0.149126
[702]	valid_0's multi_error: 0.0519356	valid_0's multi_logloss: 0.149128
[703]	valid_0's multi_error: 0.0519075	valid_0's multi_

[806]	valid_0's multi_error: 0.0506133	valid_0's multi_logloss: 0.147411
[807]	valid_0's multi_error: 0.0505852	valid_0's multi_logloss: 0.147397
[808]	valid_0's multi_error: 0.0505289	valid_0's multi_logloss: 0.147402
[809]	valid_0's multi_error: 0.0505289	valid_0's multi_logloss: 0.147406
[810]	valid_0's multi_error: 0.0505289	valid_0's multi_logloss: 0.147353
[811]	valid_0's multi_error: 0.0505008	valid_0's multi_logloss: 0.147267
[812]	valid_0's multi_error: 0.0505571	valid_0's multi_logloss: 0.147278
[813]	valid_0's multi_error: 0.0504727	valid_0's multi_logloss: 0.147278
[814]	valid_0's multi_error: 0.0504727	valid_0's multi_logloss: 0.147262
[815]	valid_0's multi_error: 0.0504727	valid_0's multi_logloss: 0.147219
[816]	valid_0's multi_error: 0.0504164	valid_0's multi_logloss: 0.147219
[817]	valid_0's multi_error: 0.0504445	valid_0's multi_logloss: 0.147193
[818]	valid_0's multi_error: 0.0503883	valid_0's multi_logloss: 0.147184
[819]	valid_0's multi_error: 0.0504164	valid_0's mu

[920]	valid_0's multi_error: 0.0496005	valid_0's multi_logloss: 0.145713
[921]	valid_0's multi_error: 0.0495724	valid_0's multi_logloss: 0.145718
[922]	valid_0's multi_error: 0.0495442	valid_0's multi_logloss: 0.145714
[923]	valid_0's multi_error: 0.0496286	valid_0's multi_logloss: 0.145714
[924]	valid_0's multi_error: 0.0496849	valid_0's multi_logloss: 0.145718
[925]	valid_0's multi_error: 0.0496568	valid_0's multi_logloss: 0.145715
[926]	valid_0's multi_error: 0.0497412	valid_0's multi_logloss: 0.145714
[927]	valid_0's multi_error: 0.0497412	valid_0's multi_logloss: 0.145723
[928]	valid_0's multi_error: 0.0497412	valid_0's multi_logloss: 0.145733
[929]	valid_0's multi_error: 0.049713	valid_0's multi_logloss: 0.145736
[930]	valid_0's multi_error: 0.0496286	valid_0's multi_logloss: 0.14574
[931]	valid_0's multi_error: 0.0496568	valid_0's multi_logloss: 0.145742
[932]	valid_0's multi_error: 0.0496286	valid_0's multi_logloss: 0.145748
[933]	valid_0's multi_error: 0.0496005	valid_0's mult

[1034]	valid_0's multi_error: 0.0491785	valid_0's multi_logloss: 0.145517
[1035]	valid_0's multi_error: 0.0491222	valid_0's multi_logloss: 0.14552
[1036]	valid_0's multi_error: 0.0491222	valid_0's multi_logloss: 0.14553
[1037]	valid_0's multi_error: 0.0491222	valid_0's multi_logloss: 0.145538
[1038]	valid_0's multi_error: 0.0491503	valid_0's multi_logloss: 0.145545
[1039]	valid_0's multi_error: 0.0491222	valid_0's multi_logloss: 0.145542
[1040]	valid_0's multi_error: 0.0490941	valid_0's multi_logloss: 0.145544
[1041]	valid_0's multi_error: 0.0490941	valid_0's multi_logloss: 0.145547
[1042]	valid_0's multi_error: 0.0491503	valid_0's multi_logloss: 0.145528
[1043]	valid_0's multi_error: 0.0491503	valid_0's multi_logloss: 0.145535
[1044]	valid_0's multi_error: 0.0491503	valid_0's multi_logloss: 0.145544
[1045]	valid_0's multi_error: 0.0491503	valid_0's multi_logloss: 0.145544
[1046]	valid_0's multi_error: 0.0491785	valid_0's multi_logloss: 0.14553
[1047]	valid_0's multi_error: 0.0491785	v

[1145]	valid_0's multi_error: 0.0490378	valid_0's multi_logloss: 0.14498
[1146]	valid_0's multi_error: 0.0490659	valid_0's multi_logloss: 0.144988
[1147]	valid_0's multi_error: 0.0490378	valid_0's multi_logloss: 0.14495
[1148]	valid_0's multi_error: 0.0490378	valid_0's multi_logloss: 0.144943
[1149]	valid_0's multi_error: 0.0490659	valid_0's multi_logloss: 0.144946
[1150]	valid_0's multi_error: 0.0490378	valid_0's multi_logloss: 0.144952
[1151]	valid_0's multi_error: 0.0490659	valid_0's multi_logloss: 0.144953
[1152]	valid_0's multi_error: 0.0490659	valid_0's multi_logloss: 0.144945
[1153]	valid_0's multi_error: 0.0490378	valid_0's multi_logloss: 0.144891
[1154]	valid_0's multi_error: 0.0490378	valid_0's multi_logloss: 0.144887
[1155]	valid_0's multi_error: 0.0490378	valid_0's multi_logloss: 0.144894
[1156]	valid_0's multi_error: 0.0490378	valid_0's multi_logloss: 0.14488
[1157]	valid_0's multi_error: 0.0490659	valid_0's multi_logloss: 0.144874
[1158]	valid_0's multi_error: 0.0489815	v

[1257]	valid_0's multi_error: 0.0493473	valid_0's multi_logloss: 0.144955
[1258]	valid_0's multi_error: 0.049291	valid_0's multi_logloss: 0.144921
[1259]	valid_0's multi_error: 0.0492629	valid_0's multi_logloss: 0.14491
Early stopping, best iteration is:
[1159]	valid_0's multi_error: 0.0489253	valid_0's multi_logloss: 0.144884
Eval ACC: 0.9510747242853927


In [27]:
best_iter = clf.best_iteration_
params['n_estimators'] = best_iter
print(params)

{'learning_rate': 0.04, 'max_depth': 18, 'n_estimators': 1159, 'objective': 'multiclass', 'boosting_type': 'gbdt', 'subsample': 0.7, 'random_state': 42, 'colsample_bytree': 0.7, 'min_data_in_leaf': 55, 'reg_alpha': 1.7, 'reg_lambda': 1.11}


In [28]:
df_train = pd.concat((df_train, df_eval))

In [29]:
clf = lgb.LGBMClassifier(**params)

clf.fit(df_train[feature_cols], df_train[label_col], eval_metric='multi_error', verbose=False, categorical_feature=cat_cols)

# eval_score_auc = roc_auc_score(df_train[label_col], clf.predict(df_train[feature_cols]))
# eval_score = accuracy_score(df_eval[label_col], clf.predict(df_eval[feature_cols]))

prepare_Report(df_eval[label_col],clf.predict(df_eval[feature_cols]),measure ="LBGM")
time_report(clf,"LGBM")

# print('ACC: {}'.format(eval_score_acc))



In [30]:
#0 = 8331   1 = 3450  2 = 17828
df_eval.loc[df_eval[label_col] == 2]

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage,...,Season_lag1,Crop_Damage_lag2,Estimated_Insects_Count_lag2,Crop_Type_lag2,Soil_Type_lag2,Pesticide_Use_Category_lag2,Number_Doses_Week_lag2,Number_Weeks_Used_lag2,Number_Weeks_Quit_lag2,Season_lag2
145141,F00152756,3337,1,0,3,20,48.0,0,2,2,...,2,-999,3336,1,0,3,20,52.0,0,2
89983,F00094722,984,1,1,3,40,45.0,0,1,2,...,3,0,984,1,1,3,45,31.0,0,2
17828,F00018775,916,1,1,3,40,37.0,0,2,2,...,1,1,915,1,1,3,20,26.0,0,1
90836,F00095608,1478,0,0,2,20,-999.0,4,2,2,...,2,-999,1478,0,0,2,40,29.0,13,2
63817,F00067182,677,0,1,3,20,36.0,0,3,2,...,3,1,677,0,1,3,20,-999.0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48690,F00051302,2999,0,0,2,20,50.0,4,1,2,...,1,-999,2999,0,0,2,20,50.0,9,1
131773,F00138699,2267,0,1,3,20,54.0,0,2,2,...,1,1,2267,0,1,3,30,53.0,0,2
110541,F00116351,1296,0,1,3,40,40.0,0,2,2,...,1,-999,1297,0,1,3,20,39.0,0,1
113460,F00119413,1898,0,1,2,40,34.0,11,1,2,...,2,1,1898,0,1,2,20,34.0,16,1


In [31]:
df_eval.loc[17828,label_col]

2

In [32]:
dt = np.reshape(list(df_eval.loc[17828,feature_cols]), (1,-1))
dt

array([[ 9.16e+02,  1.00e+00,  1.00e+00,  3.00e+00,  4.00e+01,  3.70e+01,
         0.00e+00,  2.00e+00,  6.00e-01,  2.00e-01,  4.00e-01,  4.00e-01,
         2.00e-01,  8.00e-01,  2.00e-01,  4.00e-01,  4.00e-01,  0.00e+00,
        -9.99e+02,  9.16e+02,  1.00e+00,  1.00e+00,  3.00e+00,  5.00e+01,
         3.90e+01,  0.00e+00,  1.00e+00,  1.00e+00,  9.15e+02,  1.00e+00,
         1.00e+00,  3.00e+00,  2.00e+01,  2.60e+01,  0.00e+00,  1.00e+00]])

In [33]:
# data =list(df_eval.loc[17828,label_col]).reshape(1,-1)
preds = clf.predict(dt)


In [34]:
Counter(df_train['Crop_Damage'])

Counter({0: 74238, 1: 12307, 2: 2313})

In [35]:
Counter(preds)

Counter({2: 1})

In [36]:
submission = pd.DataFrame({'ID':df_test['ID'], 'Crop_Damage':preds})

ValueError: array length 1 does not match index length 59310

In [None]:
plt.rcParams['figure.figsize'] = (12, 6)
lgb.plot_importance(clf)
plt.show()

## Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier
clfRF = RandomForestClassifier(max_depth=2, random_state=0)
clfRF.fit(df_train[feature_cols], df_train[label_col])


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [38]:
prepare_Report(df_eval[label_col],clfRF.predict(df_eval[feature_cols]),measure ="Random Forest")
time_report(clfRF,"Random Forest")

  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
eval_score_acc = accuracy_score(df_eval[label_col], clfRF.predict(df_eval[feature_cols]))
print('ACC: {}'.format(eval_score_acc))

ACC: 0.8354715282466801


## XGBOOST

In [40]:
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(df_train[feature_cols], df_train[label_col])
# b = 
# eval_score_acc = accuracy_score(df_eval[label_col], b)
# print('ACC: {}'.format(eval_score_acc))





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=2, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [41]:
prepare_Report(df_eval[label_col],model.predict(df_eval[feature_cols]),measure ="xgboost")
# time_report(model,"xgboost")

In [None]:
start = time.time()
# np.reshape(list(df_eval.loc[8331,feature_cols]), (1,-1))
model.predict(df_eval.loc[8331,feature_cols])
end = time.time()
A = end - start

start = time.time()
model.predict(np.reshape(list(df_eval.loc[3450,feature_cols]), (1,-1)))
end = time.time()
B = end - start

start = time.time()
model.predict(np.reshape(list(df_eval.loc[17828,feature_cols]), (1,-1)))
end = time.time()
C = end - start

TIME_REPORT = TIME_REPORT.append(
    [{
        "Model":"xgboost",
        "Alive_0":A,
        "Damage_Due_To_other_Reasons1":B,
        "Damage_Due_To_pesticide_2":C,
        }],
    ignore_index=True)

## SVM

In [42]:
from sklearn import svm

#Create a svm Classifier
clfsvm = svm.SVC() # Linear Kernel

#Train the model using the training sets
clfsvm.fit(df_train[feature_cols], df_train[label_col])

#Predict the response for test dataset
# eval_score_acc = accuracy_score(df_eval[label_col], clf.predict(df_eval[feature_cols]))
# print('ACC: {}'.format(eval_score_acc))

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [43]:
prepare_Report(df_eval[label_col],clfsvm.predict(df_eval[feature_cols]),measure ="SVM")
time_report(clfsvm,"SVM")

  _warn_prf(average, modifier, msg_start, len(result))


## Naive Bayes

In [44]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
#Train the model using the training sets
gnb.fit(df_train[feature_cols], df_train[label_col])

#Predict the response for test dataset
eval_score_acc = accuracy_score(df_eval[label_col], gnb.predict(df_eval[feature_cols]))
print('ACC: {}'.format(eval_score_acc))

ACC: 0.045605446770200314


In [45]:
from sklearn.naive_bayes import BernoulliNB
gnb = BernoulliNB()
#Train the model using the training sets
gnb.fit(df_train[feature_cols], df_train[label_col])

#Predict the response for test dataset
eval_score_acc = accuracy_score(df_eval[label_col], gnb.predict(df_eval[feature_cols]))
print('ACC: {}'.format(eval_score_acc))

ACC: 0.7154794058068873


In [46]:
prepare_Report(df_eval[label_col],gnb.predict(df_eval[feature_cols]),measure ="Naive Bayes")
time_report(gnb,"Naive Bayes")

## knn

In [47]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(df_train[feature_cols], df_train[label_col])
eval_score_acc = accuracy_score(df_eval[label_col], neigh.predict(df_eval[feature_cols]))
print('ACC: {}'.format(eval_score_acc))

ACC: 0.9147535449020932


In [48]:
prepare_Report(df_eval[label_col],neigh.predict(df_eval[feature_cols]),measure ="kNN")
time_report(neigh,"kNN")

In [49]:
TIME_REPORT

Unnamed: 0,Damage_Due_To_pesticide_2,Damage_Due_To_other_Reasons1,Model,Alive_0
0,0.0,0.0,LGBM,0.015625
1,0.015627,0.015624,Random Forest,0.031249
2,0.010242,0.004068,SVM,0.006997
3,0.0,0.0,Naive Bayes,0.015622
4,0.0,0.0,kNN,0.0


In [50]:
REPORT

Unnamed: 0,Model,accuracy_score,precision_score,f1_score,recall_score
0,LBGM,0.977155,0.964732,0.875336,0.82776
1,Random Forest,0.835472,0.278491,0.303454,0.333333
2,xgboost,0.969531,0.941907,0.812949,0.76614
3,SVM,0.835472,0.278491,0.303454,0.333333
4,Naive Bayes,0.715479,0.442842,0.448859,0.555625
5,kNN,0.914754,0.843498,0.658742,0.597566


---

In [None]:
TIME_REPORT.to_csv('tr.csv')
REPORT.to_csv('r.csv')