In [52]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
import warnings
warnings.filterwarnings("ignore")

In [53]:
#View and Load Dataset
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,ID,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Net Sales,Commision (in value),Age,Claim
0,2010,EPX,Travel Agency,Online,Cancellation Plan,61,PHILIPPINES,12.0,0.0,41,0
1,4245,EPX,Travel Agency,Online,Cancellation Plan,4,MALAYSIA,17.0,0.0,35,0
2,9251,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,26,THAILAND,19.8,11.88,47,0
3,4754,EPX,Travel Agency,Online,2 way Comprehensive Plan,15,HONG KONG,27.0,0.0,48,0
4,8840,EPX,Travel Agency,Online,2 way Comprehensive Plan,15,MALAYSIA,37.0,0.0,36,0


In [54]:
#Drop ID
df = df.drop(['ID'], 1)

In [55]:
#Drop duration less than 0
df = df[df['Duration'] >=0]

In [56]:
#Split data into target and features
X = df.drop(['Claim'], 1)
y = df['Claim']

In [57]:
#Split features into numerical and categorical features
num = X.select_dtypes(include = np.number)
cat = X.select_dtypes(exclude = np.number)

In [58]:
#Import and initialize standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [59]:
#Scale numerical features
num = pd.DataFrame(scaler.fit_transform(num), columns = list(num), index = num.index)

In [60]:
#One hot encode categorical features
cat = pd.get_dummies(cat, drop_first= True)

In [61]:
#Combine numerical and categorical features
X = pd.concat([num, cat], axis = 1)

In [62]:
#Split data into training and test data
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.25, random_state = 42)

In [63]:
#Import and initialize logistic regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state = 42)

In [64]:
#Fit model on training data
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
#Make prediction using X_test
y_pred = log_reg.predict(X_test)

In [66]:
#Import accuracy metrics
from sklearn.metrics import classification_report

In [67]:
#View classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.96      0.92     10902
           1       0.62      0.30      0.41      2175

    accuracy                           0.85     13077
   macro avg       0.75      0.63      0.66     13077
weighted avg       0.83      0.85      0.83     13077



In [68]:
#Import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [69]:
#Determine Params to use for GridSearchCV
params = {'class_weight': ['balanced', None], 'C': np.arange(10,100,10)}

In [70]:
#Initialize GridSearchCV
log_reg_cv = GridSearchCV(estimator = log_reg, param_grid=params, cv = 10)

In [71]:
#Fit GridSearch on training data
log_reg_cv.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=42, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([10, 20, 30, 40, 50, 60, 70, 80, 90]),
                         'class_weight': ['balanced', None]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [72]:
#View best estimator and save it as the model
print(log_reg_cv.best_estimator_)
log_reg = log_reg_cv.best_estimator_

LogisticRegression(C=20, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [73]:
#Use model to predict X_test
y_pred = log_reg.predict(X_test)

In [74]:
#View classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92     10902
           1       0.63      0.32      0.42      2175

    accuracy                           0.85     13077
   macro avg       0.75      0.64      0.67     13077
weighted avg       0.83      0.85      0.83     13077



In [75]:
#Use feature Selection to improve Accuracy of predicting imcome '1'
#Use rfe
from sklearn.feature_selection import RFE

In [83]:
#Initialize rfe
rfe = RFE(n_features_to_select=50, estimator = log_reg)

In [84]:
#Fit rfe on Training data
rfe.fit(X_train, y_train)

RFE(estimator=LogisticRegression(C=20, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=100,
                                 multi_class='warn', n_jobs=None, penalty='l2',
                                 random_state=42, solver='warn', tol=0.0001,
                                 verbose=0, warm_start=False),
    n_features_to_select=50, step=1, verbose=0)

In [85]:
#View features selected by rfe
features = list(rfe.support_)
from itertools import compress
imp_features = list(compress(list(X_train), features))
print(imp_features)

['Agency_ART', 'Agency_C2B', 'Agency_CWT', 'Agency_EPX', 'Agency_JWT', 'Agency_KML', 'Agency_LWC', 'Agency_RAB', 'Agency_SSI', 'Distribution Channel_Online', 'Product Name_2 way Comprehensive Plan', 'Product Name_Basic Plan', 'Product Name_Individual Comprehensive Plan', 'Product Name_Rental Vehicle Excess Insurance', 'Destination_AUSTRIA', 'Destination_BAHRAIN', 'Destination_BANGLADESH', 'Destination_BELGIUM', 'Destination_BRAZIL', 'Destination_CAMBODIA', 'Destination_CROATIA', 'Destination_CYPRUS', 'Destination_CZECH REPUBLIC', 'Destination_DENMARK', 'Destination_EGYPT', 'Destination_FIJI', 'Destination_GUAM', 'Destination_HUNGARY', 'Destination_INDIA', 'Destination_IRELAND', 'Destination_ISRAEL', 'Destination_JORDAN', "Destination_LAO PEOPLE'S DEMOCRATIC REPUBLIC", 'Destination_MALDIVES', 'Destination_MALTA', 'Destination_MEXICO', 'Destination_MONGOLIA', 'Destination_MOROCCO', 'Destination_MYANMAR', 'Destination_PAKISTAN', 'Destination_PERU', 'Destination_POLAND', 'Destination_PORTU

In [86]:
#Filter X_train and X_test to have only important features
X_train_2 = X_train[imp_features]
X_test_2 = X_test[imp_features]

In [87]:
#Fit model on important features only
log_reg.fit(X_train_2, y_train)

LogisticRegression(C=20, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [88]:
#Use the model to predict X_test_2
y_pred_2 = log_reg.predict(X_test_2)

In [89]:
#View classification report
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91     10902
           1       0.40      0.00      0.01      2175

    accuracy                           0.83     13077
   macro avg       0.62      0.50      0.46     13077
weighted avg       0.76      0.83      0.76     13077



In [90]:
#Using over sampling to improve precision of output '1'
from imblearn.over_sampling import SMOTE

In [91]:
#Initialize smote
smote = SMOTE(random_state = 42)

In [92]:
#Fit smote on data
smote.fit(X, y)

SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
      out_step='deprecated', random_state=42, ratio=None,
      sampling_strategy='auto', svm_estimator='deprecated')

In [93]:
#Define new feature and target variables
X_res, y_res = smote.fit_sample(X, y)
X_res = pd.DataFrame(X_res, columns = list(X))
y_res = pd.DataFrame(y_res, columns = ['income'])

In [94]:
#Remove unwanted features from over_sampled data
X_res = X_res[imp_features]

In [95]:
#Split oversampled data into training and test data
X_train, X_test, y_train, y_test = tts(X_res, y_res, test_size = 0.25, random_state = 42)

In [96]:
#Fit model on training data
log_reg.fit(X_train, y_train)

LogisticRegression(C=20, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [97]:
#Make prediction using X_test_4
y_pred = log_reg.predict(X_test)

In [99]:
#View classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.86      0.78     10885
           1       0.82      0.64      0.72     10908

    accuracy                           0.75     21793
   macro avg       0.76      0.75      0.75     21793
weighted avg       0.76      0.75      0.75     21793

