In [55]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm

In [43]:
data = pd.read_csv("data/caravan-insurance-challenge-df_train.csv")
data.head()

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,CARAVAN
0,33,1,3,2,8,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,0
1,37,1,2,2,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,0
2,37,1,2,2,8,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,0
3,9,1,3,3,3,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,0
4,40,1,4,2,10,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,0


In [44]:
#print ratio of 0 and 1
print("Ratio of 0 and 1 in the target variable:")
print(data["CARAVAN"].value_counts()/len(data))

Ratio of 0 and 1 in the target variable:
CARAVAN
0    0.940227
1    0.059773
Name: count, dtype: float64


In [45]:
# In order to correct multicollinearity effect, we drop the probability features
columns_to_drop = [name for name in data.columns if "P" == name[0]]
data = data.drop(columns_to_drop, axis=1)

In [46]:
data.shape

(5822, 65)

In [50]:
# define the matrices of the data
X_train = data.drop(["CARAVAN"], axis=1)
y_train = data["CARAVAN"]


In [53]:
logit_model=sm.Logit(y_train,X_train)
#print the summary of the model
result=logit_model.fit_regularized(alpha = 0.005)
print(result.summary2())

Iteration limit reached    (Exit mode 9)
            Current function value: 0.19866137779008206
            Iterations: 1000
            Function evaluations: 1008
            Gradient evaluations: 1000
                         Results: Logit
Model:              Logit            Method:           MLE       
Dependent Variable: CARAVAN          Pseudo R-squared: 0.122     
Date:               2024-01-31 14:30 AIC:              2440.9670 
No. Observations:   5822             BIC:              2867.8086 
Df Model:           63               Log-Likelihood:   -1156.5   
Df Residuals:       5758             LL-Null:          -1317.8   
Converged:          0.0000           LLR p-value:      1.6163e-36
No. Iterations:     1000.0000        Scale:            1.0000    
------------------------------------------------------------------
              Coef.   Std.Err.     z     P>|z|    [0.025    0.975]
------------------------------------------------------------------
MOSTYPE       0.0540    0.0

Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers


In [49]:
y_pred = logistic_model.predict(X_train)

ValueError: X has 64 features, but LogisticRegression is expecting 85 features as input.

In [None]:
#print accuracy, precision, recall and f1-score on the train set
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97      5474
           1       0.42      0.01      0.03       348

    accuracy                           0.94      5822
   macro avg       0.68      0.51      0.50      5822
weighted avg       0.91      0.94      0.91      5822



In [63]:
# test the model on the test set
data_test = pd.read_csv("data/caravan-insurance-challenge-df_test.csv")
data_test.drop(columns_to_drop, axis=1, inplace=True)
X_test = data_test.drop(["CARAVAN"], axis=1)
y_test = data_test["CARAVAN"]


In [64]:
random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
random_forest_model.fit(X_train, y_train)

In [65]:
random_forest_model.score(X_test, y_test)

0.94

In [66]:
y_pred = random_forest_model.predict(X_test)
# print accuracy, precision, recall and f1-score on the test set
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97      3762
           1       0.00      0.00      0.00       238

    accuracy                           0.94      4000
   macro avg       0.47      0.50      0.48      4000
weighted avg       0.88      0.94      0.91      4000



In [67]:
sum(y_pred)

2

## Tackle umbalacedness

In [86]:
# undersample the majority class
data_0 = data[data["CARAVAN"] == 0]
data_1 = data[data["CARAVAN"] == 1]
data_0_under = data_0.sample(len(data_1))
data_under = pd.concat([data_0_under, data_1], axis=0)
data_under["CARAVAN"].value_counts()


CARAVAN
0    348
1    348
Name: count, dtype: int64

In [87]:
# retrain the model on the undersampled data
X_train_under = data_under.drop(["CARAVAN"], axis=1)
y_train_under = data_under["CARAVAN"]

random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=0)
random_forest_model.fit(X_train_under, y_train_under)
print(classification_report(y_train, random_forest_model.predict(X_train)))
print(classification_report(y_test, y_pred))

confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      0.66      0.80      5474
           1       0.16      0.98      0.27       348

    accuracy                           0.68      5822
   macro avg       0.58      0.82      0.53      5822
weighted avg       0.95      0.68      0.76      5822

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      3762
           1       0.00      0.00      0.00       238

    accuracy                           0.94      4000
   macro avg       0.47      0.50      0.48      4000
weighted avg       0.88      0.94      0.91      4000



array([[3760,    2],
       [ 238,    0]])

In [83]:
# oversample the minority class
data_1_over = data_1.sample(len(data_0), replace=True)
data_over = pd.concat([data_0, data_1_over], axis=0)
data_over["CARAVAN"].value_counts()


CARAVAN
0    5474
1    5474
Name: count, dtype: int64

In [85]:
# retrain the model on the oversampled data
X_train_over = data_over.drop(["CARAVAN"], axis=1)
y_train_over = data_over["CARAVAN"]

random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=0)
random_forest_model.fit(X_train_over, y_train_over)
print(classification_report(y_train, random_forest_model.predict(X_train)))
print(classification_report(y_test, y_pred))

confusion_matrix(y_test, y_pred)


              precision    recall  f1-score   support

           0       1.00      0.97      0.99      5474
           1       0.69      1.00      0.82       348

    accuracy                           0.97      5822
   macro avg       0.85      0.99      0.90      5822
weighted avg       0.98      0.97      0.98      5822

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      3762
           1       0.00      0.00      0.00       238

    accuracy                           0.94      4000
   macro avg       0.47      0.50      0.48      4000
weighted avg       0.88      0.94      0.91      4000



array([[3760,    2],
       [ 238,    0]])