In [12]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split as tts, GridSearchCV

In [3]:
#Load and view dataset
df = pd.read_csv('train (le).csv')
df.head()

Unnamed: 0,ID,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Net Sales,Commision (in value),Age,Claim
0,2010,7,1,1,10,61,68,12.0,0.0,41,0
1,4245,7,1,1,10,4,53,17.0,0.0,35,0
2,9251,6,1,1,16,26,84,19.8,11.88,47,0
3,4754,7,1,1,1,15,33,27.0,0.0,48,0
4,8840,7,1,1,1,15,53,37.0,0.0,36,0


In [6]:
#Drop ID column
df = df.drop(['ID'], 1)

In [7]:
#Split data into features and target
X = df.drop(['Claim'], 1)
y = df['Claim']

In [8]:
#Split data into training and test data
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.25, random_state = 42)

In [9]:
#Initialize decision tree model
dtc = DecisionTreeClassifier(random_state = 42)

In [10]:
#Fit model on training data
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [11]:
#Make prediction using X_test
y_pred = dtc.predict(X_test)

In [14]:
#Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     10902
           1       0.77      0.76      0.77      2175

    accuracy                           0.92     13077
   macro avg       0.86      0.86      0.86     13077
weighted avg       0.92      0.92      0.92     13077



In [19]:
#View feature importance
feature_importance = pd.DataFrame(dtc.feature_importances_, columns = ['Feature Importance'])
features = pd.DataFrame(list(X), columns = ['Features'])
best_features = features.join(feature_importance)
best_features = best_features.sort_values(by = 'Feature Importance', ascending = False)
print(best_features)

               Features  Feature Importance
6             Net Sales            0.251672
0                Agency            0.202565
4              Duration            0.191518
8                   Age            0.139477
7  Commision (in value)            0.121887
5           Destination            0.057640
3          Product Name            0.034395
2  Distribution Channel            0.000465
1           Agency Type            0.000381


In [20]:
#Drop unimportant features
X_train = X_train.drop(['Distribution Channel', 'Agency Type'], 1)
X_test = X_test.drop(['Distribution Channel', 'Agency Type'], 1)

In [22]:
#Fit model on training data
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [23]:
#Make prediction using X_test
y_pred = dtc.predict(X_test)

In [24]:
#Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     10902
           1       0.77      0.76      0.76      2175

    accuracy                           0.92     13077
   macro avg       0.86      0.86      0.86     13077
weighted avg       0.92      0.92      0.92     13077



In [43]:
#Define params to be used in GridSearchCV
params = {'criterion': ['entropy', 'gini'], 'max_depth': np.arange(2, 20, 1), 'min_samples_split': np.arange(0.01, 0.1, 0.01),
         'class_weight': ['balanced']}

In [26]:
#Initialize GridSearchCV
dtc_cv = GridSearchCV(estimator = dtc, param_grid=params, cv = 10)

In [27]:
#Fit gridsearch on training data
dtc_cv.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                 

In [28]:
#View best estimator and save it as model
print(dtc_cv.best_estimator_)
dtc = dtc_cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=18,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=0.01,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')


In [29]:
#Make prediction using X_test
y_pred = dtc.predict(X_test)

In [30]:
#Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93     10902
           1       0.67      0.54      0.59      2175

    accuracy                           0.88     13077
   macro avg       0.79      0.74      0.76     13077
weighted avg       0.87      0.88      0.87     13077



In [31]:
#Import SMOTE for over_sampling
from imblearn.over_sampling import SMOTE

In [34]:
#Specify class to be oversampled
classes = {1: 16000}

In [35]:
#Initialize smote
smote = SMOTE(classes, random_state = 42)

In [36]:
#Define new feature and target variables
X_res, y_res = smote.fit_sample(X, y)
X_res = pd.DataFrame(X_res, columns = list(X))
y_res = pd.DataFrame(y_res, columns = ['Claim'])

In [37]:
#Split data into training and test data
X_train, X_test, y_train, y_test = tts(X_res, y_res, test_size = 0.25, random_state = 42)

In [38]:
#Drop unimportant features
X_train = X_train.drop(['Distribution Channel', 'Agency Type'], 1)
X_test = X_test.drop(['Distribution Channel', 'Agency Type'], 1)

In [44]:
#Fit model on training data
dtc_cv.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                 

In [45]:
#View best estimator and save it as model
print(dtc_cv.best_estimator_)
dtc = dtc_cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=19,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=0.01,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')


In [46]:
#Make prediction using X_test
y_pred = dtc.predict(X_test)

In [47]:
#Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90     10886
           1       0.73      0.68      0.71      4011

    accuracy                           0.85     14897
   macro avg       0.81      0.79      0.80     14897
weighted avg       0.84      0.85      0.85     14897



In [58]:
#Loadand view validation set
validation = pd.read_csv('test (final).csv')
validation.head()

Unnamed: 0,ID,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Net Sales,Commision (in value),Age
0,17631,7,1,1,10,192,33,18.0,0.0,36
1,15064,7,1,1,0,2,75,20.0,0.0,36
2,14139,2,0,1,9,13,75,13.5,3.38,24
3,19754,7,1,1,1,133,82,41.0,0.0,36
4,16439,2,0,1,17,2,75,30.0,7.5,32


In [59]:
#Drop ID column from validation set and unimportant features
validation_new = validation.drop(['ID', 'Distribution Channel', 'Agency Type'], 1)

In [60]:
#Use model to predict on validation set
validation_pred = dtc.predict(validation_new)

In [67]:
#Combine ID and validation Pred
ID = pd.DataFrame(validation['ID'], columns =['ID'])
claim = pd.DataFrame(validation_pred, columns = ['Claim'])
submission = ID.join(claim)
print(submission)

          ID  Claim
0      17631      0
1      15064      0
2      14139      0
3      19754      0
4      16439      0
5      12394      0
6      12499      0
7      13938      1
8      11747      0
9      14015      0
10     16239      0
11     17897      1
12     13275      0
13     15284      0
14     15264      0
15     17319      0
16     14148      0
17     17123      0
18     17231      0
19     15347      0
20     18325      0
21     14133      0
22     18615      0
23     12932      0
24     14196      0
25     15726      1
26     11001      0
27     18925      0
28     11604      0
29     19111      0
...      ...    ...
22391  12092      0
22392  17399      0
22393  14827      0
22394  16423      1
22395  17265      0
22396  14687      0
22397  13393      0
22398  16251      0
22399  14729      0
22400  12126      0
22401  13485      0
22402  13868      0
22403  11691      0
22404  15257      0
22405  16471      0
22406  17778      0
22407  19322      0
22408  14480      0


In [68]:
#Save submission
submission.to_csv('submission 2.csv', index = False)