In [1]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as tts, GridSearchCV, RandomizedSearchCV, cross_val_score

In [2]:
#Read the label-encoded training CSV and preview its first rows
train_csv = 'train (le).csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,ID,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Net Sales,Commision (in value),Age,Claim
0,2010,7,1,1,10,61,68,12.0,0.0,41,0
1,4245,7,1,1,10,4,53,17.0,0.0,35,0
2,9251,6,1,1,16,26,84,19.8,11.88,47,0
3,4754,7,1,1,1,15,33,27.0,0.0,48,0
4,8840,7,1,1,1,15,53,37.0,0.0,36,0


In [3]:
#Drop the ID column, which is a row identifier rather than a feature
#The axis is named via columns= because pandas 2.0 no longer accepts it as a
#second positional argument (df.drop(['ID'], 1) raises a TypeError there)
df = df.drop(columns=['ID'])

In [4]:
#Split data into features (every column except Claim) and target (Claim)
#columns= is used instead of a positional axis, which pandas 2.0 rejects
X = df.drop(columns=['Claim'])
y = df['Claim']

In [5]:
#Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [15]:
#Configure the random forest classifier (not fitted yet)
rfc = RandomForestClassifier(
    #forest size and tree-growth limits
    n_estimators=1000,
    max_depth=40,
    min_samples_split=0.05,
    #class re-weighting for the imbalanced Claim target
    class_weight='balanced',
    #bootstrap sampling is required for the out-of-bag score
    bootstrap=True,
    oob_score=True,
    #fixed seed so repeated fits give the same forest
    random_state=42,
)

In [16]:
#Fit the random forest on the training split (all feature columns of X_train)
#The fitted estimator is the cell's last expression, so its repr is displayed
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=40, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=0.05, min_weight_fraction_leaf=0.0,
                       n_estimators=1000, n_jobs=None, oob_score=True,
                       random_state=42, verbose=0, warm_start=False)

In [17]:
#Predict the Claim label for every row of the held-out X_test with the fitted rfc
y_pred = rfc.predict(X_test)

In [18]:
#Compare the test predictions with the true labels (per-class precision/recall/F1)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.79      0.86     10902
           1       0.43      0.80      0.55      2175

    accuracy                           0.79     13077
   macro avg       0.69      0.79      0.71     13077
weighted avg       0.86      0.79      0.81     13077



In [10]:
#Rank the features by the importance the fitted forest assigned to them
#Column names of X and rfc.feature_importances_ are paired row by row through the
#default integer index, so X must have the same columns the model was fitted on
features = pd.DataFrame(list(X), columns=['Features'])
feature_importance = pd.DataFrame(
    list(rfc.feature_importances_),
    columns=['feature importance'],
)
best_features = (
    features
    .join(feature_importance)
    .sort_values(by='feature importance', ascending=False)
)
print(best_features)

               Features  feature importance
6             Net Sales            0.223496
4              Duration            0.182329
0                Agency            0.143048
7  Commision (in value)            0.132045
8                   Age            0.118434
5           Destination            0.107067
3          Product Name            0.049326
1           Agency Type            0.041752
2  Distribution Channel            0.002504


In [11]:
#Drop the three lowest-ranked features from the train split, the test split and the full X
#The list is written once so all three frames always lose the same columns, and
#columns= replaces the positional axis argument that pandas 2.0 rejects
#Note: this cell is not re-runnable; a second run raises KeyError because the columns are gone
low_importance = ['Distribution Channel', 'Agency Type', 'Product Name']
X_train = X_train.drop(columns=low_importance)
X_test = X_test.drop(columns=low_importance)
X = X.drop(columns=low_importance)

In [12]:
#Refit the same rfc object on the training split after the low-importance columns were dropped
#NOTE(review): the estimator repr saved under this cell shows n_estimators=100,
#max_depth=None and min_samples_split=2, which does not match the rfc configured
#earlier in this notebook (n_estimators=1000, max_depth=40, min_samples_split=0.05).
#The saved outputs appear to come from a different rfc definition / out-of-order
#execution - re-run the notebook top to bottom to confirm the reported scores.
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=True,
                       random_state=42, verbose=0, warm_start=False)

In [13]:
#Predict on the reduced-feature X_test with the refitted rfc (overwrites the earlier y_pred)
y_pred = rfc.predict(X_test)

In [14]:
#Score the reduced-feature model on the held-out test labels
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96     10902
           1       0.84      0.77      0.80      2175

    accuracy                           0.94     13077
   macro avg       0.90      0.87      0.88     13077
weighted avg       0.94      0.94      0.94     13077



In [15]:
#Search space sampled by RandomizedSearchCV
params = {
    #split-quality criterion
    'criterion': ['entropy', 'gini'],
    #even tree depths 2, 4, ..., 48
    'max_depth': np.arange(2, 50, 2),
    #fractional split thresholds starting at 0.005 in steps of 0.005
    'min_samples_split': np.arange(0.005, 0.1, 0.005),
}

In [16]:
#Run a randomized hyper-parameter search around rfc with 10-fold cross-validation
#n_iter and scoring are not set, so the library defaults apply
rfc_cv = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=params,
    cv=10,
    refit=True,
    random_state=42,
)
rfc_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight='balanced',
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=100,
                                                    n_jobs=

In [17]:
#Predict on X_test through the fitted search object rfc_cv (created with refit=True);
#overwrites the earlier y_pred
y_pred = rfc_cv.predict(X_test)

In [18]:
#Score the tuned model's predictions on the held-out test labels
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.84      0.90     10902
           1       0.52      0.86      0.65      2175

    accuracy                           0.85     13077
   macro avg       0.75      0.85      0.78     13077
weighted avg       0.89      0.85      0.86     13077



In [None]:
#Fit model on the whole labelled dataset (reduced-feature X and y) before predicting the submission
#NOTE(review): this refits the base `rfc`, not the tuned search result `rfc_cv`, so the
#hyper-parameters found by the randomized search are not used here - confirm this is intended
rfc.fit(X, y)

In [None]:
#Load and view the validation set (the rows to predict for the submission)
validation_csv = 'test (final).csv'
validation = pd.read_csv(validation_csv)
validation.head()

In [None]:
#Drop the ID column and the low-importance features from the validation set so its
#columns match the ones the model was fitted on
#columns= replaces the positional axis argument that pandas 2.0 rejects
validation_new = validation.drop(
    columns=['ID', 'Distribution Channel', 'Agency Type', 'Product Name']
)

In [None]:
#Use rfc to predict the Claim label for every row of the reduced validation set
validation_pred = rfc.predict(validation_new)

In [None]:
#Pair each validation ID with its predicted Claim
#The join aligns on the row index, so both frames must share validation's index
ID = validation[['ID']]
claim = pd.DataFrame({'Claim': validation_pred})
submission = ID.join(claim)
print(submission)

In [None]:
#Save submission to CSV; index = False keeps the DataFrame row index out of the file
submission.to_csv('submission 3.csv', index = False)