In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import matplotlib.pyplot as plt

In [2]:
target_col = "TARGET_FLAG"

feature_cols = ['KIDSDRIV',
 'AGE',
 'HOMEKIDS',
 'YOJ',
 'INCOME',
 'PARENT1',
 'HOME_VAL',
 'MSTATUS',
 'SEX',
 'EDUCATION',
 'JOB',
 'TRAVTIME',
 'CAR_USE',
 'BLUEBOOK',
 'TIF',
 'CAR_TYPE',
 'RED_CAR',
 'OLDCLAIM',
 'CLM_FREQ',
 'REVOKED',
 'MVR_PTS',
 'CAR_AGE',
 'URBANICITY']

## Approach

We will test one linear model, i.e. logistic regression, and one non-linear model, i.e. XGBoost, to offer some contrast.

From EDA notebook, we know that accuracy might not be a good mertic due to imbalanced data and low separability of the target using most of the features. Due to lack of domain knowlegde it is not sure if false positive or false negative costs more to the business. Hence we will look at 4 metrics:

1) Accuracy

2) ROC-AUC: focus on positive class 

3) F1: focus on positive class 

4) Matthew's correlation coefficient: symmetric and robust to class imbalance 


As a benchmark, blind guess based on majority class (0) will achieve an accuracy of 0.73.

# I. Logistic Regression training

## Load Data

In [3]:

import pandas as pd

train_df = pd.read_csv('./data/auto-insurance-fall-2017/train_auto.csv')
test_df = pd.read_csv('./data/auto-insurance-fall-2017/test_auto.csv')

train_df = train_df[feature_cols + [target_col]]
test_df = test_df[feature_cols + [target_col]]

## Cleaning

In [4]:
from re import sub
import numpy as np

def convert_monetary_str_to_float(s):
    if s != s:
        return 0.0
    return np.float(sub(r'[^\d.]', '', s))

cols_monetary = ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM']


for col in cols_monetary:
    train_df[col] = train_df[col].apply(lambda x : convert_monetary_str_to_float(x))
    test_df[col] = test_df[col].apply(lambda x : convert_monetary_str_to_float(x))


    

In [5]:
def convert_yes_no_str_to_boolean(s):
    if s != s:
        return s
    if 'yes' in s.lower():
        return True
    return False

cols_boolean = ['PARENT1', 'MSTATUS', 'RED_CAR', 'REVOKED']
for col in cols_boolean:
    train_df[col] = train_df[col].apply(lambda x : convert_yes_no_str_to_boolean(x))
    test_df[col] = test_df[col].apply(lambda x : convert_yes_no_str_to_boolean(x))


## Preprocessors

In [6]:
transformers = []

### Boolean columns
For each column, impute missing values and then convert into ones and zeros.

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder


bool_pipeline = Pipeline(steps=[
    ("cast_type", FunctionTransformer(lambda df: df.astype(object))),
    ("imputer", SimpleImputer(missing_values=None, strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

transformers.append(("boolean", bool_pipeline, cols_boolean))

### Categorical columns

#### Low-cardinality categoricals
Convert each low-cardinality categorical column into multiple binary columns through one-hot encoding.
For each input categorical column (string or numeric), the number of output columns is equal to the number of unique values in the input column.

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

cols_cat = ['SEX', 'EDUCATION', 'JOB', 'CAR_USE', 'CAR_TYPE', 'URBANICITY', 'PARENT1', 'MSTATUS', 'RED_CAR', 'REVOKED']

one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

transformers.append(("onehot", one_hot_encoder,  cols_cat))

In [9]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers, remainder="passthrough", sparse_threshold=0)

### Feature standardization
Scale all feature columns to be centered around zero with unit variance.

In [10]:

from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

cols_numeric = ['AGE',
 'BLUEBOOK',
 'CAR_AGE',
 'CLM_FREQ',
 'HOMEKIDS',
 'HOME_VAL',
 'INCOME',
 'KIDSDRIV',
 'MVR_PTS',
 'OLDCLAIM',
 'TIF',
 'TRAVTIME',
 'YOJ']

scaler = MinMaxScaler()

numeric_pipeline = Pipeline(steps=[
    ("cast_type", FunctionTransformer(lambda df: df.astype(np.number))),
    ("imputer", KNNImputer(n_neighbors=5)),
])

transformers.append(("numeric", numeric_pipeline, cols_numeric))




## Train - Validation - Test Split


In [11]:
from sklearn.model_selection import train_test_split

# X_train, X_val, y_train, y_val = train_test_split(train_df.drop(columns=[target_col]), train_df[[target_col]], train_size=0.8, random_state=176283341, stratify=train_df[[target_col]])


X_train = train_df.drop(columns=[target_col])
y_train = train_df[[target_col]]

X_test = test_df.drop(columns=[target_col])
y_test = test_df[[target_col]]

## Train classification model


In [12]:
from sklearn.linear_model import LogisticRegression


In [13]:

import sklearn
from sklearn import set_config
from sklearn.pipeline import Pipeline

set_config(display="diagram")

sklr_classifier = LogisticRegression()

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", sklr_classifier),
])

model

In [14]:

from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, matthews_corrcoef, make_scorer

accuracy_scorer = make_scorer(accuracy_score)
roc_auc_scorer = make_scorer(roc_auc_score)
f1_scorer = make_scorer(f1_score)
matthews_corrcoef = make_scorer(matthews_corrcoef)

scores = cross_validate(model, X_train, y_train, cv=3,
                        scoring=({'accu': accuracy_scorer, 
                                  'roc_auc': roc_auc_scorer,
                                 'f1': f1_scorer,
                                 'mcc':matthews_corrcoef  }),
                        return_train_score=True)


scores

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'fit_time': array([0.44074202, 0.23641014, 0.25869012]),
 'score_time': array([0.10006905, 0.11837387, 0.11442494]),
 'test_accu': array([0.73465638, 0.72830882, 0.73455882]),
 'train_accu': array([0.73455882, 0.73644551, 0.73221834]),
 'test_roc_auc': array([0.51597812, 0.50749225, 0.51373974]),
 'train_roc_auc': array([0.51162093, 0.51834067, 0.51469642]),
 'test_f1': array([0.0952381 , 0.0727729 , 0.08375635]),
 'train_f1': array([0.07317073, 0.10150376, 0.0967142 ]),
 'test_mcc': array([0.08337373, 0.03907466, 0.07649462]),
 'train_mcc': array([0.06889887, 0.09569861, 0.07281089])}

**Comment:**

ROC-AUC ~= 0.5, MCC ~= 0, F1 ~= 0

Not good given that blind guess based on majority class (0) will achieve an accuracy of 0.73. Classifier performs badly on rare class (1).

# II. XGBoost training

In [15]:
from xgboost import XGBClassifier


set_config(display="diagram")

xgbc_classifier = XGBClassifier(
#   colsample_bytree=0.786543024582424,
#   learning_rate=0.0010953705178519775,
#   max_depth=11,
#   min_child_weight=1,
#   n_estimators=3,
#   n_jobs=100,
#   subsample=0.2039605310017424,
#   verbosity=0,
#   random_state=176283341,
)

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", xgbc_classifier),
])

model

In [16]:
scores = cross_validate(model, X_train, y_train, cv=3,
                        scoring=({'accu': accuracy_scorer, 
                                  'roc_auc': roc_auc_scorer,
                                 'f1': f1_scorer,
                                 'mcc':matthews_corrcoef  }),
                        return_train_score=True)

scores

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




{'fit_time': array([0.67555618, 0.83871698, 0.80696774]),
 'score_time': array([0.11564803, 0.14323306, 0.14122486]),
 'test_accu': array([0.78684307, 0.78198529, 0.78272059]),
 'train_accu': array([0.98731618, 0.99007535, 0.98823746]),
 'test_roc_auc': array([0.69259898, 0.67915038, 0.6773192 ]),
 'train_roc_auc': array([0.97774689, 0.98209115, 0.97993654]),
 'test_f1': array([0.54968944, 0.52749004, 0.52453741]),
 'train_f1': array([0.97550586, 0.98089172, 0.97735315]),
 'test_mcc': array([0.41718535, 0.39630475, 0.39608017]),
 'train_mcc': array([0.96726402, 0.97442657, 0.96962579])}

**Comment:**

Significant improvement over logistic regression in terms of MCC and F1, suggesting the classifier performs better on rare class (1). 

The relationshiop between features and target is most likely non-linear. Linear models may have improved performance after applying technqiues such as polynomial feature engineering.

Update the default 0.5 threshold may help to achieve better performance. But due to time constraint this step will be skipped. 


In [17]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




In [18]:
res = pd.DataFrame()
res['p_target'] = y_pred
res.to_csv('predictions.csv')