In [1]:
# Import general-purpose Python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from numpy import mean
from numpy import std

# Import scikit-learn and LightGBM modelling utilities
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

In [2]:
# Load the labelled training data and preview its first five rows.
train = pd.read_csv("train.csv")
train.head()  # head() defaults to 5 rows

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
0,F00000001,188,1,0,1,0,0.0,0,1,0
1,F00000003,209,1,0,1,0,0.0,0,2,1
2,F00000004,257,1,0,1,0,0.0,0,2,1
3,F00000005,257,1,1,1,0,0.0,0,2,1
4,F00000006,342,1,0,1,0,0.0,0,2,1


In [3]:
# Load the unlabelled test data (no Crop_Damage column) and preview its first five rows.
test = pd.read_csv("test.csv")
test.head()  # head() defaults to 5 rows

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
0,F00000002,188,1,1,1,0,,0,2
1,F00000007,410,1,1,1,0,0.0,0,2
2,F00000011,626,1,0,1,0,0.0,0,2
3,F00000013,731,1,0,1,0,0.0,0,2
4,F00000014,789,0,0,1,0,0.0,0,1


In [4]:
# Count the rows in each Crop_Damage class to see how the target is distributed.
train['Crop_Damage'].value_counts()

0    74238
1    12307
2     2313
Name: Crop_Damage, dtype: int64

In [5]:
# The class counts above show that class 2 is very rare: the dataset is clearly imbalanced
# and needs to be rebalanced before modelling.
# Express each class as a percentage of all training rows.
damage_counts = train['Crop_Damage'].value_counts()
damage_share_pct = damage_counts / len(train) * 100
print(damage_share_pct)

0    83.546783
1    13.850188
2     2.603030
Name: Crop_Damage, dtype: float64


In [6]:
# Impute missing Number_Weeks_Used values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the train['...'] selection: that chained in-place
# call operates on an intermediate object and, with pandas Copy-on-Write,
# leaves `train` unchanged (pandas 2.2 emits a FutureWarning for it).
train['Number_Weeks_Used'] = train['Number_Weeks_Used'].fillna(
    train['Number_Weeks_Used'].mean()
)

In [7]:
# Target vector: the Crop_Damage class label.
y = train['Crop_Damage']
# Feature matrix: every remaining column except the ID column and the target.
X = train.drop(columns=['ID', 'Crop_Damage'])
# Hold out part of the labelled rows for evaluation (default split size, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

In [8]:
# Estimate baseline accuracy of a default LGBMClassifier on the training split
# using stratified 10-fold cross-validation repeated 3 times (30 scores in total).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
model = LGBMClassifier()
n_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report the mean accuracy with its standard deviation across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

Accuracy: 0.847 (0.002)


In [9]:
# Fit a fresh LGBMClassifier with default hyperparameters on the training split.
# `model` is rebound here, replacing the unfitted instance used for cross-validation.
model = LGBMClassifier()
# Left as the last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [10]:
# Predict class labels for the held-out split created by train_test_split.
y_pred = model.predict(X_test)

In [11]:
# Accuracy of the baseline model on the held-out split, shown as a percentage.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_percentage = accuracy * 100
accuracy_percentage

84.69052442043665

In [12]:
# Show the shapes of the training features and labels.
X_train.shape,y_train.shape

((66643, 8), (66643,))

In [17]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from imblearn.over_sampling import SMOTE
# Oversample the training split only (X_test / y_test are not resampled), with a fixed seed.
# NOTE(review): sampling_strategy='minority' presumably resamples only the rarest class and
# leaves the other non-majority class as-is — confirm this is the intended balancing.
sm = SMOTE(sampling_strategy='minority',random_state=42)
X_sm,y_sm = sm.fit_resample(X_train,y_train)
# Print the resampled shapes to see how many rows were added.
print(X_sm.shape,y_sm.shape)

(120558, 8) (120558,)


In [18]:
# Refit the existing `model` object on the SMOTE-resampled training data.
# After this cell `model` no longer holds the fit on the original training split.
model.fit(X_sm,y_sm)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [19]:
# Predict on the same held-out split (which was not resampled) with the refit model.
y_pred_new = model.predict(X_test)

In [20]:
# Accuracy of the SMOTE-trained model on the held-out split, shown as a percentage.
# Note: this overwrites the `accuracy` / `accuracy_percentage` values from the baseline model.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred_new)
accuracy_percentage = accuracy * 100
accuracy_percentage

80.36461850101283

In [21]:
# Impute missing Number_Weeks_Used values in the test data with that column's mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the test['...'] selection: that chained in-place
# call operates on an intermediate object and, with pandas Copy-on-Write,
# leaves `test` unchanged (pandas 2.2 emits a FutureWarning for it).
test['Number_Weeks_Used'] = test['Number_Weeks_Used'].fillna(
    test['Number_Weeks_Used'].mean()
)
# Confirm that no column has missing values left.
test.isnull().sum()

ID                         0
Estimated_Insects_Count    0
Crop_Type                  0
Soil_Type                  0
Pesticide_Use_Category     0
Number_Doses_Week          0
Number_Weeks_Used          0
Number_Weeks_Quit          0
Season                     0
dtype: int64

In [22]:
# Score the test data with `model` as last fitted (the SMOTE-resampled fit when
# the cells are run in order) and write the submission file.
# ID is dropped because the model was trained without it.
new_test = test.drop('ID', axis=1)
predictions = model.predict(new_test)

# Build the submission directly from the test IDs and their predictions.
# This replaces reading a sample-submission CSV only to overwrite both of its
# columns, which added a needless file dependency and relied on the sample's
# row index lining up with `test`. The mid-cell `submission.head(5)` produced
# no output and is dropped.
submission = pd.DataFrame({'ID': test['ID'], 'Crop_Damage': predictions})
submission.to_csv('submission_LGBM_SMOTE.csv', index=False)