# Yacht Insurance Claims Data 
##### NOTEBOOK 3

**Problem Statement:** What is the likelihood that a yacht insurance policy has at least 1 claim within five years?

**Contents:**

___
## Import libraries and read in data

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, f1_score 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
from sklearn.linear_model import LogisticRegression, ElasticNetCV, LogisticRegressionCV
from sklearn.inspection import permutation_importance
from matplotlib import pyplot

In [2]:
# Seed NumPy's global random generator once, up front, so a fresh
# Restart & Run All draws the same random numbers each time.
np.random.seed(42)

In [3]:
# Read the combined, already one-hot-encoded dataset (path is relative to this
# notebook) and preview the first five rows.
DATA_PATH = '../datasets/combined2.csv'
combined = pd.read_csv(DATA_PATH)

combined.head(5)

Unnamed: 0,Years Exp.,Year Built,Length,Hull Limit,# Engines,num_claims,Age,policy_length,New/Renl/Endt/Canc/Flat_endt,New/Renl/Endt/Canc/Flat_endt-canc,...,Mooring County_sarasota,Mooring County_sinaloa,Mooring County_skagit,Mooring County_sonora,Mooring County_south pacific,Mooring County_st. johns,Mooring County_st. lucie,Mooring County_ventura,Mooring County_volusia,Mooring County_whatcom
0,2.0,1997.0,63.0,500000.0,2.0,0.0,73,1758.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22.0,2006.0,61.0,1275000.0,2.0,0.0,69,1771.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30.0,2001.0,48.0,400000.0,2.0,0.0,78,1759.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20.0,1973.0,32.0,35000.0,0.0,0.0,44,1759.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,30.0,1989.0,43.0,200000.0,1.0,0.0,70,1756.0,0,0,...,0,0,0,0,0,0,0,0,0,0


---

# MODELING: Multiclass Classification

**BELOW:** Initially we wanted to see if we could do a multiclass classification model to predict whether a boat might have 0, 1, 2, or 3 claims. Unfortunately, with so few examples for our models to train on for 2 or 3 claims, we decided to move forward with just binary (having 0 or at least 1 claim).

#### Establish a baseline

0 claims = 92%<br>
1 claim = 6.6%<br>
2 claims = 1%<br>
3 claims = 0.2%<br>

In [4]:
# Raw number of policies per claim count (the multiclass target).
combined['num_claims'].value_counts()

0.0    5836
1.0     421
2.0      68
3.0      15
Name: num_claims, dtype: int64

In [5]:
# Same counts as proportions; the largest share is the majority-class baseline.
combined['num_claims'].value_counts(normalize=True)

0.0    0.920505
1.0    0.066404
2.0    0.010726
3.0    0.002366
Name: num_claims, dtype: float64

In [6]:
# Cast the claim counts from float to int and store the result on the frame.
# `astype` returns a new Series, so without the assignment the cast is thrown
# away and the class labels stay 0.0/1.0/2.0/3.0 in everything downstream.
combined['num_claims'] = combined['num_claims'].astype(int)
combined['num_claims']

0       0
1       0
2       0
3       0
4       0
       ..
6335    0
6336    0
6337    0
6338    0
6339    0
Name: num_claims, Length: 6340, dtype: int64

### Train/Test split

In [7]:
# Multiclass target is the raw claim count; every other column is a feature.
y = combined['num_claims']
X = combined.drop(columns='num_claims')

# Hold out 20% for testing; stratify so each claim-count class keeps its share
# in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## 4 MODELS: KNN, Logistic Regression, Random Forest, Extra Trees
---

### StandardScaler

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

### Instantiate and fit 4 models

In [9]:
# Fit each of the four classifiers on the scaled training split and keep its
# predictions for the scaled test split.

# K-nearest neighbours (library defaults)
knn = KNeighborsClassifier().fit(X_train_sc, y_train)
knn_pred = knn.predict(X_test_sc)

# Logistic regression, with a raised iteration cap
lr = LogisticRegression(max_iter=500, random_state=42).fit(X_train_sc, y_train)
lr_pred = lr.predict(X_test_sc)

# Random forest, 100 trees
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_sc, y_train)
rf_pred = rf.predict(X_test_sc)

# Extra trees, 100 trees
et = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(X_train_sc, y_train)
et_pred = et.predict(X_test_sc)

### Get model results

In [10]:
# Per-class precision / recall / F1 for KNN on the test split, to 3 decimals.
print(classification_report(y_test, knn_pred, digits=3))

              precision    recall  f1-score   support

         0.0      0.927     0.987     0.956      1167
         1.0      0.278     0.060     0.098        84
         2.0      0.400     0.143     0.211        14
         3.0      1.000     0.667     0.800         3

    accuracy                          0.916      1268
   macro avg      0.651     0.464     0.516      1268
weighted avg      0.878     0.916     0.891      1268



In [11]:
# Per-class precision / recall / F1 for logistic regression on the test split.
# A 0.000 precision row goes with the undefined-metric warning scikit-learn
# emits when a class receives no predictions.
print(classification_report(y_test, lr_pred, digits=3))

              precision    recall  f1-score   support

         0.0      0.923     1.000     0.960      1167
         1.0      0.000     0.000     0.000        84
         2.0      0.000     0.000     0.000        14
         3.0      1.000     1.000     1.000         3

    accuracy                          0.923      1268
   macro avg      0.481     0.500     0.490      1268
weighted avg      0.852     0.923     0.886      1268



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Per-class precision / recall / F1 for random forest on the test split.
print(classification_report(y_test, rf_pred, digits=3))

              precision    recall  f1-score   support

         0.0      0.939     0.995     0.966      1167
         1.0      0.600     0.107     0.182        84
         2.0      1.000     1.000     1.000        14
         3.0      1.000     1.000     1.000         3

    accuracy                          0.936      1268
   macro avg      0.885     0.776     0.787      1268
weighted avg      0.918     0.936     0.915      1268



In [13]:
# Per-class precision / recall / F1 for extra trees on the test split.
print(classification_report(y_test, et_pred, digits=3))

              precision    recall  f1-score   support

         0.0      0.942     0.985     0.963      1167
         1.0      0.429     0.143     0.214        84
         2.0      0.875     1.000     0.933        14
         3.0      1.000     1.000     1.000         3

    accuracy                          0.930      1268
   macro avg      0.811     0.782     0.778      1268
weighted avg      0.907     0.930     0.913      1268



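**Interpretation:** Looking at these scores we can tell that the model is overfitting to 2 and 3 claims. The models appear very good at identifying these minority classes, but that's most likely because there are so few of them in the test set (14 policies with 2 claims and 3 policies with 3 claims), so these near-perfect scores are not reliable evidence that the models would generalize.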

---
# MODELING: Binary Classification

**Set up for all models**<br>
1. Create binary class column<br>
2. Define X, y<br>
3. Scale X (features only; y is a class label and is not scaled)<br>
4. Train, test, split<br>

**Normal Modeling**<br>
1. Test models (KNN, Random Forest,  ExtraTrees, Logistic Regression, LinearSVM)<br>
2. Get accuracy, recall, and weighted F1 scores for each model and add to a table to compare<br>

**With OverSampling**<br>
1. Instantiate RandomOverSampler<br>
2. Fit training data on oversampler<br>
3. Test same models w/ same parameters<br>
4. Get f1 scores and add to a table to compare<br>

**With OverSampling and Undersampling**<br>
1. Instantiate RandomOverSampler, fit.<br>
2. Instantiate RandomUnderSampler, fit.<br>
3. Test same models w/ same parameters<br>
4. Get f1 scores and add to a table to compare<br>

**With SMOTE**<br>
1. Instantiate SMOTE, fit.<br>
2. Test same models w/ same parameters<br>
3. Get f1 scores and add to a table to compare<br>

**BINARY CLASSIFICATION**

#### Establish a baseline

0 claims = 92%<br>
At least 1 claim = 7.9%<br>

In [14]:
# Empty comparison table; one row per model run is added further down.
score_columns = ['Accuracy', 'Recall', 'Weighted F1 Score']
binary_scores = pd.DataFrame(columns=score_columns)
binary_scores

Unnamed: 0,Accuracy,Recall,Weighted F1 Score


In [15]:
# Binary target: 1 when the policy has at least one claim, otherwise 0.
has_claim = combined['num_claims'] > 0
combined['binary'] = has_claim.astype('int64')

# Class balance of the new target.
combined['binary'].value_counts(normalize=True)

0    0.920505
1    0.079495
Name: binary, dtype: float64

### Train/test split

In [16]:
# Both target columns are removed from the features so neither can leak into X.
target_columns = ['num_claims', 'binary']
y = combined['binary']
X = combined.drop(columns=target_columns)

# 80/20 split, stratified on the binary target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

### Scale the data

In [17]:
# Fit the scaler on the training split only and reuse it for the test split.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

## 4 BASIC MODELS: KNN, Logistic Regression, Random Forest, Extra Trees
---

In [18]:
# Baseline run: fit each classifier on the scaled (not resampled) training
# split and predict the scaled test split.

knn = KNeighborsClassifier().fit(X_train_sc, y_train)
knn_pred = knn.predict(X_test_sc)

lr = LogisticRegression(random_state=42).fit(X_train_sc, y_train)
lr_pred = lr.predict(X_test_sc)

rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_sc, y_train)
rf_pred = rf.predict(X_test_sc)

et = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(X_train_sc, y_train)
et_pred = et.predict(X_test_sc)

In [19]:
# print('KNN Results')
# print(accuracy_score(y_test, knn_pred))
# print(recall_score(y_test, knn_pred))
# print(f1_score(y_test, knn_pred ,average='weighted'))
# print(classification_report(y_test, knn_pred))

In [20]:
# Accuracy, recall for the positive (claim) class, and weighted F1 on the
# test split for each plain (no resampling) model.
regular_knn = pd.Series(data=[accuracy_score(y_test, knn_pred), recall_score(y_test, knn_pred),
                              f1_score(y_test, knn_pred, average='weighted')],
                        index=binary_scores.columns, name='KNN(plain)')

regular_lr = pd.Series(data=[accuracy_score(y_test, lr_pred), recall_score(y_test, lr_pred),
                             f1_score(y_test, lr_pred, average='weighted')],
                       index=binary_scores.columns, name='LR(plain)')

regular_rf = pd.Series(data=[accuracy_score(y_test, rf_pred), recall_score(y_test, rf_pred),
                             f1_score(y_test, rf_pred, average='weighted')],
                       index=binary_scores.columns, name='RF(plain)')

regular_et = pd.Series(data=[accuracy_score(y_test, et_pred), recall_score(y_test, et_pred),
                             f1_score(y_test, et_pred, average='weighted')],
                       index=binary_scores.columns, name='ET(plain)')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so build
# the new rows as a frame (one row per Series, labelled by its name) and
# combine with pd.concat instead.
new_rows = pd.DataFrame([regular_knn, regular_lr, regular_rf, regular_et])

# Drop rows already stored under these labels so re-running the cell replaces
# them instead of stacking duplicates; skip the concat when nothing is left,
# which avoids concatenating with an empty frame.
previous_rows = binary_scores.drop(index=new_rows.index, errors='ignore')
binary_scores = new_rows if previous_rows.empty else pd.concat([previous_rows, new_rows])

binary_scores

Unnamed: 0,Accuracy,Recall,Weighted F1 Score
KNN(plain),0.911672,0.128713,0.892369
LR(plain),0.919558,0.0,0.881779
RF(plain),0.934543,0.227723,0.917014
ET(plain),0.926656,0.237624,0.911725


## WITH OVERSAMPLING

### Check the imbalance of the two classes

In [21]:
# Count training rows per class to see how imbalanced the binary target is.
counter = Counter(y_train)
counter

Counter({0: 4669, 1: 403})

### Instantiate RandomOverSampler

In [22]:
# ref: https://beckernick.github.io/oversampling-modeling/

# Randomly duplicate minority-class rows of the training split only, with
# sampling_strategy=0.2; the test split is left untouched.
over = RandomOverSampler(random_state=42, sampling_strategy=0.2)
X_over, y_over = over.fit_resample(X_train_sc, y_train)

In [23]:
# Class counts after random oversampling.
over_counter = Counter(y_over)
over_counter

Counter({0: 4669, 1: 933})

### Fit the models

In [24]:
# Same four classifiers and settings as the plain run, now trained on the
# randomly oversampled training data; predictions are still made on the
# original scaled test split.

knn = KNeighborsClassifier().fit(X_over, y_over)
knn_pred = knn.predict(X_test_sc)

lr = LogisticRegression(random_state=42).fit(X_over, y_over)
lr_pred = lr.predict(X_test_sc)

rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_over, y_over)
rf_pred = rf.predict(X_test_sc)

et = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(X_over, y_over)
et_pred = et.predict(X_test_sc)

### Add the results

In [25]:
# Accuracy, recall for the positive (claim) class, and weighted F1 on the
# test split for each model trained on the oversampled data.
over_knn = pd.Series(data=[accuracy_score(y_test, knn_pred), recall_score(y_test, knn_pred),
                           f1_score(y_test, knn_pred, average='weighted')],
                     index=binary_scores.columns, name='KNN(oversample)')

over_lr = pd.Series(data=[accuracy_score(y_test, lr_pred), recall_score(y_test, lr_pred),
                          f1_score(y_test, lr_pred, average='weighted')],
                    index=binary_scores.columns, name='LR(oversample)')

over_rf = pd.Series(data=[accuracy_score(y_test, rf_pred), recall_score(y_test, rf_pred),
                          f1_score(y_test, rf_pred, average='weighted')],
                    index=binary_scores.columns, name='RF(oversample)')

over_et = pd.Series(data=[accuracy_score(y_test, et_pred), recall_score(y_test, et_pred),
                          f1_score(y_test, et_pred, average='weighted')],
                    index=binary_scores.columns, name='ET(oversample)')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; build the
# new rows as a frame and combine with pd.concat instead.
new_rows = pd.DataFrame([over_knn, over_lr, over_rf, over_et])

# Replace (rather than duplicate) rows with these labels when the cell is re-run.
previous_rows = binary_scores.drop(index=new_rows.index, errors='ignore')
binary_scores = new_rows if previous_rows.empty else pd.concat([previous_rows, new_rows])

binary_scores

Unnamed: 0,Accuracy,Recall,Weighted F1 Score
KNN(plain),0.911672,0.128713,0.892369
LR(plain),0.919558,0.0,0.881779
RF(plain),0.934543,0.227723,0.917014
ET(plain),0.926656,0.237624,0.911725
KNN(oversample),0.864353,0.346535,0.874381
LR(oversample),0.897476,0.059406,0.877101
RF(oversample),0.932965,0.257426,0.917971
ET(oversample),0.925868,0.247525,0.911837


### With SMOTE Oversampling and Random Under Sampling

### Set up a pipeline with SMOTE and undersampling

In [26]:
# ref: https://pypi.org/project/imbalanced-learn/
# ref: https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/

# Two resampling steps run in order: SMOTE oversampling (sampling_strategy=0.1),
# then random undersampling (sampling_strategy=0.5).
# `Pipeline` is the imblearn one here: it is imported after sklearn's at the top
# of the notebook, so it is the name in scope.
over = SMOTE(random_state=42, sampling_strategy=0.1)
under = RandomUnderSampler(random_state=42, sampling_strategy=0.5)
steps = [
    ('o', over),
    ('u', under),
]
pipeline = Pipeline(steps=steps)

In [27]:
# Resample the scaled training split only; the test split keeps its original
# class balance.
X_sm_und, y_sm_und = pipeline.fit_resample(X_train_sc, y_train)

In [28]:
# Class counts after SMOTE followed by random undersampling.
counter_5 = Counter(y_sm_und)
counter_5

Counter({0: 932, 1: 466})

### Fit the models

In [29]:
# Same four classifiers and settings, trained on the SMOTE + undersampled
# training data and evaluated on the original scaled test split.

knn = KNeighborsClassifier().fit(X_sm_und, y_sm_und)
knn_pred = knn.predict(X_test_sc)

lr = LogisticRegression(random_state=42).fit(X_sm_und, y_sm_und)
lr_pred = lr.predict(X_test_sc)

rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_sm_und, y_sm_und)
rf_pred = rf.predict(X_test_sc)

et = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(X_sm_und, y_sm_und)
et_pred = et.predict(X_test_sc)

### Add the results

In [30]:
# Accuracy, recall for the positive (claim) class, and weighted F1 on the
# test split for each model trained on the SMOTE + undersampled data.
smote_knn = pd.Series(data=[accuracy_score(y_test, knn_pred), recall_score(y_test, knn_pred),
                            f1_score(y_test, knn_pred, average='weighted')],
                      index=binary_scores.columns, name='KNN(SMOTE/Under)')

smote_lr = pd.Series(data=[accuracy_score(y_test, lr_pred), recall_score(y_test, lr_pred),
                           f1_score(y_test, lr_pred, average='weighted')],
                     index=binary_scores.columns, name='LR(SMOTE/Under)')

# Row labels follow the MODEL(SMOTE/Under) pattern of the KNN and LR rows; the
# earlier 'RF(LR(SMOTE/Under))' / 'ET(LR(SMOTE/Under))' labels were copy-paste
# slips that made these rows look like logistic-regression variants.
smote_rf = pd.Series(data=[accuracy_score(y_test, rf_pred), recall_score(y_test, rf_pred),
                           f1_score(y_test, rf_pred, average='weighted')],
                     index=binary_scores.columns, name='RF(SMOTE/Under)')

smote_et = pd.Series(data=[accuracy_score(y_test, et_pred), recall_score(y_test, et_pred),
                           f1_score(y_test, et_pred, average='weighted')],
                     index=binary_scores.columns, name='ET(SMOTE/Under)')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; build the
# new rows as a frame and combine with pd.concat instead.
new_rows = pd.DataFrame([smote_knn, smote_lr, smote_rf, smote_et])

# Replace (rather than duplicate) rows with these labels when the cell is re-run.
previous_rows = binary_scores.drop(index=new_rows.index, errors='ignore')
binary_scores = new_rows if previous_rows.empty else pd.concat([previous_rows, new_rows])

binary_scores

Unnamed: 0,Accuracy,Recall,Weighted F1 Score
KNN(plain),0.911672,0.128713,0.892369
LR(plain),0.919558,0.0,0.881779
RF(plain),0.934543,0.227723,0.917014
ET(plain),0.926656,0.237624,0.911725
KNN(oversample),0.864353,0.346535,0.874381
LR(oversample),0.897476,0.059406,0.877101
RF(oversample),0.932965,0.257426,0.917971
ET(oversample),0.925868,0.247525,0.911837
KNN(SMOTE/Under),0.766562,0.376238,0.810732
LR(SMOTE/Under),0.803628,0.277228,0.832253


**INTERPRETATION:** It looks like my best models use a combination of oversampling using SMOTE and Random Under Sampling.<br>
Specifically, Random Forest using this combination was the best predictor of our minority class (highest test recall, 0.426), ahead of Extra Trees and KNN (both 0.376).<br>
*Reminder Baseline:<br> 
0 claims = 92%<br>
At least 1 claim = 7.9%<br>*

---

## Look at training/testing scores of models

In [31]:
# KNN trained on the SMOTE + undersampled data: accuracy (`score`) then recall,
# first on the resampled training data, then on the untouched test split.
print('KNN(SMOTE/UNDER) Training Results:',
      knn.score(X_sm_und, y_sm_und),
      recall_score(y_sm_und, knn.predict(X_sm_und)),
      sep='\n')

# The header used to read "RF" although every number here comes from `knn`.
print('\nKNN(SMOTE/UNDER) Testing Results:',
      knn.score(X_test_sc, y_test),
      recall_score(y_test, knn.predict(X_test_sc)),
      sep='\n')

KNN(SMOTE/UNDER) Training Results:
0.8054363376251789
0.6738197424892703

RF(SMOTE/UNDER) Testing Results:
0.7665615141955836
0.37623762376237624


In [32]:
# Logistic regression: accuracy (`score`) then recall, on the resampled
# training data and on the test split.
print('LR(SMOTE/UNDER) Training Results:',
      lr.score(X_sm_und, y_sm_und),
      recall_score(y_sm_und, lr.predict(X_sm_und)),
      sep='\n')

print('\nLR(SMOTE/UNDER) Testing Results:',
      lr.score(X_test_sc, y_test),
      recall_score(y_test, lr.predict(X_test_sc)),
      sep='\n')

LR(SMOTE/UNDER) Training Results:
0.7467811158798283
0.48068669527896996

LR(SMOTE/UNDER) Testing Results:
0.8036277602523659
0.27722772277227725


In [33]:
# Random forest: accuracy (`score`) then recall, on the resampled training
# data and on the test split.
print('RF(SMOTE/UNDER) Training Results:',
      rf.score(X_sm_und, y_sm_und),
      recall_score(y_sm_und, rf.predict(X_sm_und)),
      sep='\n')

print('\nRF(SMOTE/UNDER) Testing Results:',
      rf.score(X_test_sc, y_test),
      recall_score(y_test, rf.predict(X_test_sc)),
      sep='\n')

RF(SMOTE/UNDER) Training Results:
1.0
1.0

RF(SMOTE/UNDER) Testing Results:
0.8982649842271293
0.42574257425742573


In [34]:
# Keep the first Extra Trees run's numbers in named variables so they remain
# available after `et` is refit later in the notebook.
et1_training_score = et.score(X_sm_und, y_sm_und)
et1_recall_training_score = recall_score(y_sm_und, et.predict(X_sm_und))
et1_testing_score = et.score(X_test_sc, y_test)
et1_recall_testing_score = recall_score(y_test, et.predict(X_test_sc))

# Accuracy then recall, training first and testing second.
print('ET(SMOTE/UNDER) FIRST Training Results:',
      et1_training_score,
      et1_recall_training_score,
      sep='\n')

print('\nET(SMOTE/UNDER) FIRST Testing Results:',
      et1_testing_score,
      et1_recall_testing_score,
      sep='\n')

ET(SMOTE/UNDER) FIRST Training Results:
1.0
1.0

ET(SMOTE/UNDER) FIRST Testing Results:
0.8659305993690851
0.37623762376237624


In [35]:
# Hard class predictions from Extra Trees for the first 50 test rows.
et.predict(X_test_sc)[:50]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [36]:
# Same 50-row preview of the Extra Trees predictions, repeated.
et.predict(X_test_sc)[:50]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [37]:
# Predicted class probabilities for the same 50 rows: one row per policy, one
# column per class (no claim, at least one claim).
et.predict_proba(X_test_sc)[:50]

array([[0.73, 0.27],
       [0.68, 0.32],
       [0.98, 0.02],
       [1.  , 0.  ],
       [0.93, 0.07],
       [0.72, 0.28],
       [1.  , 0.  ],
       [0.89, 0.11],
       [0.75, 0.25],
       [0.96, 0.04],
       [0.93, 0.07],
       [0.94, 0.06],
       [0.73, 0.27],
       [0.81, 0.19],
       [0.7 , 0.3 ],
       [0.93, 0.07],
       [0.35, 0.65],
       [0.66, 0.34],
       [0.92, 0.08],
       [0.79, 0.21],
       [0.8 , 0.2 ],
       [0.85, 0.15],
       [0.71, 0.29],
       [0.96, 0.04],
       [0.02, 0.98],
       [0.91, 0.09],
       [0.98, 0.02],
       [0.74, 0.26],
       [0.64, 0.36],
       [0.91, 0.09],
       [0.88, 0.12],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.72, 0.28],
       [0.52, 0.48],
       [0.94, 0.06],
       [0.55, 0.45],
       [0.65, 0.35],
       [0.73, 0.27],
       [0.58, 0.42],
       [0.92, 0.08],
       [0.63, 0.37],
       [0.84, 0.16],
       [0.9 , 0.1 ],
       [0.93, 0.07],
       [0.93, 0.07],
       [0.95, 0.05],
       [0.71,

In [38]:
# Show the comparison table collected so far.
binary_scores

Unnamed: 0,Accuracy,Recall,Weighted F1 Score
KNN(plain),0.911672,0.128713,0.892369
LR(plain),0.919558,0.0,0.881779
RF(plain),0.934543,0.227723,0.917014
ET(plain),0.926656,0.237624,0.911725
KNN(oversample),0.864353,0.346535,0.874381
LR(oversample),0.897476,0.059406,0.877101
RF(oversample),0.932965,0.257426,0.917971
ET(oversample),0.925868,0.247525,0.911837
KNN(SMOTE/Under),0.766562,0.376238,0.810732
LR(SMOTE/Under),0.803628,0.277228,0.832253


## Logistic Regression with Regularization and SMOTE/Undersampling

*I hadn't included any regularization in my logistic regression model and wanted to see how it did.*

In [39]:
# L1-regularised logistic regression; the regularisation strength is picked by
# 5-fold cross-validation over 10 candidate values of C.
logreg_cv = LogisticRegressionCV(
    Cs=10, cv=5, penalty="l1", solver="liblinear", random_state=42
).fit(X_sm_und, y_sm_und)
logreg_cv_pred = logreg_cv.predict(X_test_sc)

# Accuracy (`score`) then recall, on the resampled training data and on the
# test split.
print('LRCV(SMOTE/UNDER) Training Results:',
      logreg_cv.score(X_sm_und, y_sm_und),
      recall_score(y_sm_und, logreg_cv.predict(X_sm_und)),
      sep='\n')

print('\nLRCV(SMOTE/UNDER) Testing Results:',
      logreg_cv.score(X_test_sc, y_test),
      recall_score(y_test, logreg_cv_pred),
      sep='\n')



LRCV(SMOTE/UNDER) Training Results:
0.7453505007153076
0.45493562231759654

LRCV(SMOTE/UNDER) Testing Results:
0.8225552050473186
0.26732673267326734


In [40]:
# Add the regularised logistic regression to the results table.
logregcv_et = pd.Series(data=[accuracy_score(y_test, logreg_cv_pred), recall_score(y_test, logreg_cv_pred),
                              f1_score(y_test, logreg_cv_pred, average='weighted')],
                        index=binary_scores.columns, name='LRCV(SMOTE/Under)')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; wrap the
# row in a frame and combine with pd.concat instead.
new_rows = pd.DataFrame([logregcv_et])

# Replace (rather than duplicate) the row when the cell is re-run.
previous_rows = binary_scores.drop(index=new_rows.index, errors='ignore')
binary_scores = new_rows if previous_rows.empty else pd.concat([previous_rows, new_rows])

binary_scores

Unnamed: 0,Accuracy,Recall,Weighted F1 Score
KNN(plain),0.911672,0.128713,0.892369
LR(plain),0.919558,0.0,0.881779
RF(plain),0.934543,0.227723,0.917014
ET(plain),0.926656,0.237624,0.911725
KNN(oversample),0.864353,0.346535,0.874381
LR(oversample),0.897476,0.059406,0.877101
RF(oversample),0.932965,0.257426,0.917971
ET(oversample),0.925868,0.247525,0.911837
KNN(SMOTE/Under),0.766562,0.376238,0.810732
LR(SMOTE/Under),0.803628,0.277228,0.832253


## Look at feature importance from best models

In [41]:
# # Ref: https://towardsdatascience.com/interpreting-random-forest-and-other-black-box-models-like-xgboost-80f9cc4a3c38

# knn_smote_under_feature_imp = pd.DataFrame({'Variable':X.columns,
#               'Importance':et.feature_importances_}).sort_values('Importance', ascending=False)

# knn_smote_under_feature_imp[:30]

In [42]:
# Ref: https://towardsdatascience.com/interpreting-random-forest-and-other-black-box-models-like-xgboost-80f9cc4a3c38

# Pair each feature name with the Extra Trees importance and rank from most to
# least important; show the top 30.
et_smote_under_feature_imp = (
    pd.DataFrame({'Variable': X.columns, 'Importance': et.feature_importances_})
    .sort_values('Importance', ascending=False)
)

et_smote_under_feature_imp.head(30)



Unnamed: 0,Variable,Importance
6,policy_length,0.088564
5,Age,0.052842
2,Length,0.052153
3,Hull Limit,0.051532
1,Year Built,0.050719
0,Years Exp.,0.047541
9,New/Renl/Endt/Canc/Flat_new,0.028803
24,Occupation_other,0.021501
12,Married yes/no_yes,0.01987
4,# Engines,0.019605


In [43]:
# Same ranking for the Random Forest fit on the SMOTE + undersampled data, for
# comparison with the Extra Trees ranking.
rf_smote_under_feature_imp = (
    pd.DataFrame({'Variable': X.columns, 'Importance': rf.feature_importances_})
    .sort_values('Importance', ascending=False)
)

# Keep the 30 highest-ranked rows.
top_30 = rf_smote_under_feature_imp.head(30)

In [44]:
# Names of the 30 highest-ranked Random Forest features, in rank order, for
# use as the feature set of the next round of models.
post_rf_model_features = top_30['Variable'].tolist()

## Redo models again using only top 30 features

In [45]:
# Fresh, empty results table for the models refit on the reduced feature set.
score_columns = ['Accuracy', 'Recall', 'Weighted F1 Score']
binary_scores2 = pd.DataFrame(columns=score_columns)
binary_scores2

Unnamed: 0,Accuracy,Recall,Weighted F1 Score


In [46]:
# Rebuild X from the selected features only and redo the same stratified
# 80/20 split (same seed as before).
y = combined['binary']
X = combined[post_rf_model_features]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [47]:
# Refit the scaler on the new training split and apply it to both splits.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [48]:
# Same resampling recipe as before (SMOTE, then random undersampling), applied
# to the reduced-feature training split.
over = SMOTE(random_state=42, sampling_strategy=0.1)
under = RandomUnderSampler(random_state=42, sampling_strategy=0.5)
steps = [
    ('o', over),
    ('u', under),
]
pipeline = Pipeline(steps=steps)

X_sm_und, y_sm_und = pipeline.fit_resample(X_train_sc, y_train)

In [49]:
# Refit all five models on the resampled, reduced-feature training data and
# predict the scaled test split.

knn = KNeighborsClassifier().fit(X_sm_und, y_sm_und)
knn_pred = knn.predict(X_test_sc)

lr = LogisticRegression(random_state=42).fit(X_sm_und, y_sm_und)
lr_pred = lr.predict(X_test_sc)

rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_sm_und, y_sm_und)
rf_pred = rf.predict(X_test_sc)

et = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(X_sm_und, y_sm_und)
et_pred = et.predict(X_test_sc)

logreg_cv = LogisticRegressionCV(
    Cs=10, cv=5, penalty="l1", solver="liblinear", random_state=42
).fit(X_sm_und, y_sm_und)
logreg_cv_pred = logreg_cv.predict(X_test_sc)

In [50]:
# Accuracy, recall for the positive (claim) class, and weighted F1 on the
# test split for each model refit on the reduced feature set.
smote_knn = pd.Series(data=[accuracy_score(y_test, knn_pred), recall_score(y_test, knn_pred),
                            f1_score(y_test, knn_pred, average='weighted')],
                      index=binary_scores2.columns, name='KNN(SMOTE/Under)')

smote_lr = pd.Series(data=[accuracy_score(y_test, lr_pred), recall_score(y_test, lr_pred),
                           f1_score(y_test, lr_pred, average='weighted')],
                     index=binary_scores2.columns, name='LR(SMOTE/Under)')

# The RF, ET and LOGREG labels previously ended in a stray extra ')'.
smote_rf = pd.Series(data=[accuracy_score(y_test, rf_pred), recall_score(y_test, rf_pred),
                           f1_score(y_test, rf_pred, average='weighted')],
                     index=binary_scores2.columns, name='RF(SMOTE/Under)')

smote_et = pd.Series(data=[accuracy_score(y_test, et_pred), recall_score(y_test, et_pred),
                           f1_score(y_test, et_pred, average='weighted')],
                     index=binary_scores2.columns, name='ET(SMOTE/Under)')

smote_logreg_cv = pd.Series(data=[accuracy_score(y_test, logreg_cv_pred), recall_score(y_test, logreg_cv_pred),
                                  f1_score(y_test, logreg_cv_pred, average='weighted')],
                            index=binary_scores2.columns, name='LOGREG(SMOTE/Under)')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; build the
# new rows as a frame and combine with pd.concat instead.
new_rows = pd.DataFrame([smote_knn, smote_lr, smote_rf, smote_et, smote_logreg_cv])

# Replace (rather than duplicate) rows with these labels when the cell is re-run.
previous_rows = binary_scores2.drop(index=new_rows.index, errors='ignore')
binary_scores2 = new_rows if previous_rows.empty else pd.concat([previous_rows, new_rows])

print('New_Scores')
binary_scores2

New_Scores


Unnamed: 0,Accuracy,Recall,Weighted F1 Score
KNN(SMOTE/Under),0.75,0.445545,0.800925
LR(SMOTE/Under),0.868297,0.247525,0.872423
RF(SMOTE/Under)),0.884858,0.445545,0.892301
ET(SMOTE/Under)),0.854101,0.386139,0.869063
LOGREG(SMOTE/Under)),0.902997,0.118812,0.885967


In [51]:
# Last five rows of the first results table, for side-by-side comparison with
# the new scores.
print('Old_Scores')
binary_scores.tail(5)

Old_Scores


Unnamed: 0,Accuracy,Recall,Weighted F1 Score
KNN(SMOTE/Under),0.766562,0.376238,0.810732
LR(SMOTE/Under),0.803628,0.277228,0.832253
RF(LR(SMOTE/Under)),0.898265,0.425743,0.901056
ET(LR(SMOTE/Under)),0.865931,0.376238,0.876633
LRCV(SMOTE/Under),0.822555,0.267327,0.844014


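**INTERPRETATION:** After keeping only the top 30 most important features from the Random Forest model, the results are mixed. Test recall improved for KNN (0.376 → 0.446), Random Forest (0.426 → 0.446) and Extra Trees (0.376 → 0.386), at the cost of slightly lower accuracy for all three. Recall dropped for both logistic regression models (0.277 → 0.248 and 0.267 → 0.119), although their accuracy went up.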

In [52]:
# KNN refit on the reduced feature set: accuracy (`score`) then recall, on the
# resampled training data and on the test split.
print('KNN(SMOTE/UNDER) SECOND Training Results:',
      knn.score(X_sm_und, y_sm_und),
      recall_score(y_sm_und, knn.predict(X_sm_und)),
      sep='\n')

# The header used to read "RF" although every number here comes from `knn`.
print('\nKNN(SMOTE/UNDER) SECOND Testing Results:',
      knn.score(X_test_sc, y_test),
      recall_score(y_test, knn.predict(X_test_sc)),
      sep='\n')

KNN(SMOTE/UNDER) SECOND Training Results:
0.7982832618025751
0.6781115879828327

RF(SMOTE/UNDER) SECOND Testing Results:
0.75
0.44554455445544555


In [53]:
# Logistic regression refit on the reduced feature set: accuracy then recall,
# training data first, test split second.
print('LR(SMOTE/UNDER) SECOND Training Results:',
      lr.score(X_sm_und, y_sm_und),
      recall_score(y_sm_und, lr.predict(X_sm_und)),
      sep='\n')

print('\nLR(SMOTE/UNDER) SECOND Testing Results:',
      lr.score(X_test_sc, y_test),
      recall_score(y_test, lr.predict(X_test_sc)),
      sep='\n')

LR(SMOTE/UNDER) SECOND Training Results:
0.6859799713876967
0.2532188841201717

LR(SMOTE/UNDER) SECOND Testing Results:
0.8682965299684543
0.24752475247524752


In [54]:
# Random forest refit on the reduced feature set: accuracy then recall,
# training data first, test split second.
print('RF(SMOTE/UNDER) SECOND Training Results:',
      rf.score(X_sm_und, y_sm_und),
      recall_score(y_sm_und, rf.predict(X_sm_und)),
      sep='\n')

print('\nRF(SMOTE/UNDER) SECOND Testing Results:',
      rf.score(X_test_sc, y_test),
      recall_score(y_test, rf.predict(X_test_sc)),
      sep='\n')

RF(SMOTE/UNDER) SECOND Training Results:
1.0
1.0

RF(SMOTE/UNDER) SECOND Testing Results:
0.8848580441640379
0.44554455445544555


In [55]:
# Extra trees refit on the reduced feature set: accuracy then recall,
# training data first, test split second.
print('ET(SMOTE/UNDER) SECOND Training Results:',
      et.score(X_sm_und, y_sm_und),
      recall_score(y_sm_und, et.predict(X_sm_und)),
      sep='\n')

print('\nET(SMOTE/UNDER) SECOND Testing Results:',
      et.score(X_test_sc, y_test),
      recall_score(y_test, et.predict(X_test_sc)),
      sep='\n')

ET(SMOTE/UNDER) SECOND Training Results:
1.0
1.0

ET(SMOTE/UNDER) SECOND Testing Results:
0.8541009463722398
0.38613861386138615


---
## Gridsearch over KNN to fine tune model

In [56]:
# Ref for gridsearching for recall score: https://stackoverflow.com/questions/49035011/get-precison-model-through-gridsearchcv-for-recall-optimization

# Search 2-4 neighbours under two distance metrics (6 candidates), choosing
# the combination with the best 5-fold cross-validated recall.
knn_params = dict(
    n_neighbors=range(2, 5),
    metric=['euclidean', 'manhattan'],
)

knn_gridsearch = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    scoring='recall',
    cv=5,
    verbose=1,
)

In [57]:
# Run the search on the resampled training data; the trailing `;` hides the
# estimator repr in the cell output.
# NOTE(review): the folds are cut from data that was already resampled, so the
# cross-validated recall may be optimistic compared with the test split - confirm.
knn_gridsearch.fit(X_sm_und, y_sm_und);

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.0s finished


In [58]:
# Best mean cross-validated recall, followed by the parameters that achieved it.
print(knn_gridsearch.best_score_, knn_gridsearch.best_params_, sep='\n')

0.5344314802104781
{'metric': 'euclidean', 'n_neighbors': 3}


In [59]:
# First number is accuracy; the second comes from the search's own `score`,
# which uses the `scoring` it was built with (recall).
print('KNN Gridsearch Training Results:',
      accuracy_score(y_sm_und, knn_gridsearch.predict(X_sm_und)),
      knn_gridsearch.score(X_sm_und, y_sm_und),
      sep='\n')

print('\nKNN Gridsearch Testing Results:',
      accuracy_score(y_test, knn_gridsearch.predict(X_test_sc)),
      knn_gridsearch.score(X_test_sc, y_test),
      sep='\n')


KNN Gridsearch Training Results:
0.8426323319027181
0.7682403433476395

KNN Gridsearch Testing Results:
0.748422712933754
0.4752475247524752


In [60]:
# Keep a second name for the fitted KNN search. This binds the same object
# (no copy is made), so refitting `knn_gridsearch` later also changes `knn_model`.
knn_model = knn_gridsearch

## Gridsearch over RF to fine tune model

In [61]:
# Grid over forest size and tree depth (18 candidates), scored by 5-fold
# cross-validated recall on the resampled training data.
rf_params = {
    'n_estimators': [50, 75, 100],
    'max_depth': [None, 1, 2, 3, 4, 5],
}

# Give the forest a fixed random_state, as every other model in this notebook
# has: without it each candidate draws fresh randomness, so the selected
# parameters can change when the cell is re-run.
rf_gs = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=rf_params, cv=5,
                     verbose=1, scoring='recall')
rf_gs.fit(X_sm_und, y_sm_und)

# Best mean cross-validated recall, then the parameters that achieved it.
print(rf_gs.best_score_)
rf_gs.best_params_

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   16.9s finished


0.4379775795012583


{'max_depth': None, 'n_estimators': 50}

In [62]:
# First number is accuracy; the second comes from the search's own `score`,
# which uses the `scoring` it was built with (recall).
print('RF Gridsearch Training Results:',
      accuracy_score(y_sm_und, rf_gs.predict(X_sm_und)),
      rf_gs.score(X_sm_und, y_sm_und),
      sep='\n')

print('\nRF Gridsearch Testing Results:',
      accuracy_score(y_test, rf_gs.predict(X_test_sc)),
      rf_gs.score(X_test_sc, y_test),
      sep='\n')


RF Gridsearch Training Results:
1.0
1.0

RF Gridsearch Testing Results:
0.8777602523659306
0.36633663366336633


## Gridsearch over ET to fine tune model

In [None]:
# Grid over ensemble size and tree depth (18 candidates), scored by 5-fold
# cross-validated recall on the resampled training data.
et_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5],
}

# Give the estimator a fixed random_state, as every other model in this
# notebook has: without it each candidate draws fresh randomness, so the
# selected parameters can change when the cell is re-run.
et_gs = GridSearchCV(ExtraTreesClassifier(random_state=42), param_grid=et_params, cv=5,
                     verbose=1, scoring='recall')
et_gs.fit(X_sm_und, y_sm_und)

# Best mean cross-validated recall, then the parameters that achieved it.
print(et_gs.best_score_)
et_gs.best_params_

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# First number is accuracy; the second comes from the search's own `score`,
# which uses the `scoring` it was built with (recall).
print('ET Gridsearch Training Results:',
      accuracy_score(y_sm_und, et_gs.predict(X_sm_und)),
      et_gs.score(X_sm_und, y_sm_und),
      sep='\n')

print('\nET Gridsearch Testing Results:',
      accuracy_score(y_test, et_gs.predict(X_test_sc)),
      et_gs.score(X_test_sc, y_test),
      sep='\n')




---

In [None]:
# # https://machinelearningmastery.com/calculate-feature-importance-with-python/

# results = permutation_importance(knn_gridsearch, X_sm_und, y_sm_und, scoring='recall')
# # get importance
# importance = results.importances_mean
# # summarize feature importance
# for i,v in enumerate(importance):
#     print('Feature: %0d, Score: %.5f' % (i,v))
# # # plot feature importance
# # pyplot.bar([x for x in range(len(importance))], importance)
# # pyplot.show()

In [None]:
# # Reminder of best params for RF
# rf_gs.best_params_

# # Instantiate model w/ best params in order to pull the feature importances
# new_rf = RandomForestClassifier(max_depth=None, n_estimators=75,random_state=42)
# new_rf.fit(X_sm_und, y_sm_und)
# new_rf_pred = new_rf.predict(X_test_sc)

# new_rf_feature_imp = pd.DataFrame({'Variable':X.columns,
#               'Importance':new_rf.feature_importances_}).sort_values('Importance', ascending=False)

# new_rf_feature_imp

## Interpreting best model: Random Forest

In [None]:
# Reminder of the parameters the Random Forest grid search selected.
rf_gs.best_params_

In [None]:
# Rebuild the winning Random Forest as a standalone model so its feature
# importances can be inspected. The parameters are read from the grid search
# rather than typed in by hand, so this model cannot drift from what the
# search actually selected.
best_model_rf = RandomForestClassifier(random_state=42, **rf_gs.best_params_)
best_model_rf.fit(X_sm_und, y_sm_und)
bm_rf_pred = best_model_rf.predict(X_test_sc)

In [None]:
# Best model (Random Forest) scores: accuracy then recall, on the resampled
# training data and then on the test split.
print('Best Model, RF: Training Results:')
print(best_model_rf.score(X_sm_und, y_sm_und))
print(recall_score(y_sm_und, best_model_rf.predict(X_sm_und)))

print('\nBest Model, RF: Testing Results:')
print(best_model_rf.score(X_test_sc, y_test))
print(recall_score(y_test, best_model_rf.predict(X_test_sc)))

**INTERPRETATION:** The model is overfit, so I will try adjusting the feature selection below.

In [None]:
# Predicted class probabilities for every test row: one row per policy, one
# column per class.
bm_probs = best_model_rf.predict_proba(X_test_sc)
bm_probs

In [None]:
# Feature importances of the best Random Forest, ranked from most to least
# important.
best_rf_feature_imp = (
    pd.DataFrame({'Variable': X.columns, 'Importance': best_model_rf.feature_importances_})
    .sort_values('Importance', ascending=False)
)

best_rf_feature_imp

----
## Test best model with 'Policy Length' and 'New/Renl/Endt/Canc/Flat' removed

In [None]:
# Columns to leave out of X: every one-hot column derived from
# 'New/Renl/Endt/Canc/Flat', the policy length, and both target columns.
removals = [
    col for col in combined.columns
    if col.startswith('New/Renl/Endt/Canc/Flat')
]
removals += ['policy_length', 'num_claims', 'binary']
removals

In [None]:
# Rebuild the whole workflow without the removed columns:
# split -> scale -> resample -> fit.
y = combined['binary']
X = combined.drop(columns=removals)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scaler statistics come from the training split only.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

# SMOTE oversampling followed by random undersampling, training data only.
over = SMOTE(random_state=42, sampling_strategy=0.1)
under = RandomUnderSampler(random_state=42, sampling_strategy=0.5)
steps = [
    ('o', over),
    ('u', under),
]
pipeline = Pipeline(steps=steps)

X_sm_und, y_sm_und = pipeline.fit_resample(X_train_sc, y_train)

# Random Forest with 50 trees and unlimited depth.
rf = RandomForestClassifier(max_depth=None, n_estimators=50, random_state=42).fit(X_sm_und, y_sm_und)
rf_pred = rf.predict(X_test_sc)

In [None]:
# New RF scores (policy length and New/Renl/Endt/Canc/Flat columns removed):
# accuracy then recall, on the resampled training data and on the test split.
print('New RF: Training Results:',
      rf.score(X_sm_und, y_sm_und),
      recall_score(y_sm_und, rf.predict(X_sm_und)),
      sep='\n')

# The header used to read "Best Model, RF", which belongs to the earlier model,
# not to the reduced-feature forest scored here.
print('\nNew RF: Testing Results:',
      rf.score(X_test_sc, y_test),
      recall_score(y_test, rf.predict(X_test_sc)),
      sep='\n')

In [None]:
# Feature importances of the reduced-feature Random Forest, ranked from most
# to least important.
rf_feature_imp = (
    pd.DataFrame({'Variable': X.columns, 'Importance': rf.feature_importances_})
    .sort_values('Importance', ascending=False)
)

rf_feature_imp

### Try it after keeping only the top 30 features from above

In [None]:
# Take the 30 highest-ranked features from the importance table computed just
# above (`rf_feature_imp`), i.e. from the model trained WITHOUT policy length
# and the New/Renl/Endt/Canc/Flat columns. Previously this read the older
# `rf_smote_under_feature_imp` table, which still ranks those removed columns,
# so the "final" model silently put them back in.
new_top_30 = rf_feature_imp.head(30)

# Column names of the top 30, in rank order.
new_top_30_features = new_top_30['Variable'].tolist()

new_top_30_features

In [None]:
# # Create a list of columns we don't want in the final list of features
# drops = ['policy_length','New/Renl/Endt/Canc/Flat_new','New/Renl/Endt/Canc/Flat_renl','New/Renl/Endt/Canc/Flat_endt']

# # Create a new list of features we will use for X
# final_features = [x for x in new_top_30 if x not in drops]

# final_features

In [None]:
# Final workflow on the selected features: split -> scale -> resample -> fit.
y = combined['binary']
X = combined[new_top_30_features]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scaler statistics come from the training split only.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

# SMOTE oversampling followed by random undersampling, training data only.
over = SMOTE(random_state=42, sampling_strategy=0.1)
under = RandomUnderSampler(random_state=42, sampling_strategy=0.5)
steps = [
    ('o', over),
    ('u', under),
]
pipeline = Pipeline(steps=steps)

X_sm_und, y_sm_und = pipeline.fit_resample(X_train_sc, y_train)

# Random Forest with 50 trees and unlimited depth.
rf = RandomForestClassifier(max_depth=None, n_estimators=50, random_state=42).fit(X_sm_und, y_sm_und)
rf_pred = rf.predict(X_test_sc)

In [None]:
# Final Random Forest scores: accuracy then recall, on the resampled training
# data and on the test split.
print('Final RF: Training Results:',
      rf.score(X_sm_und, y_sm_und),
      recall_score(y_sm_und, rf.predict(X_sm_und)),
      sep='\n')

print('\nFinal RF: Testing Results:',
      rf.score(X_test_sc, y_test),
      recall_score(y_test, rf.predict(X_test_sc)),
      sep='\n')

In [None]:
# Feature importances of the final Random Forest, ranked from most to least
# important.
final_rf_feature_imp = (
    pd.DataFrame({'Variable': X.columns, 'Importance': rf.feature_importances_})
    .sort_values('Importance', ascending=False)
)

final_rf_feature_imp

**INTERPRETATION:** 

---
## Conclusion:

In this notebook we...