In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

Our aim is to train machine learning models on our data in order to predict TARGET_FLAG for the test-auto data

## Data preprocessing

In [2]:
df_train = pd.read_csv("preprocessed_train.csv")
df_train.head()

Unnamed: 0.1,Unnamed: 0,TARGET_FLAG,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,...,Student,z_Blue Collar,Commercial,Minivan,Panel Truck,Pickup,Sports Car,Van,z_SUV,Highly Urban/ Urban
0,0,0,0,60.0,0,11.0,11.117643,0,1.0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,0,0,43.0,0,11.0,11.423537,0,12.457811,0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,0,0,35.0,1,10.0,9.682779,0,11.729576,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,3,0,0,51.0,0,14.0,,0,12.63216,1,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,0,0,50.0,0,,11.652566,0,12.404616,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [3]:
df_test = pd.read_csv("preprocessed_test.csv")
df_test.head()

Unnamed: 0.1,Unnamed: 0,TARGET_FLAG,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,...,Student,z_Blue Collar,Commercial,Minivan,Panel Truck,Pickup,Sports Car,Van,z_SUV,Highly Urban/ Urban
0,0,,0,48.0,0,11.0,10.875799,0,1.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1,,1,40.0,1,11.0,10.835947,1,1.0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,,0,44.0,2,12.0,10.680194,1,1.0,0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3,,0,35.0,2,,9.961945,1,1.0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,,0,59.0,0,12.0,11.378937,0,1.0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
TARGET = 'TARGET_FLAG'

y1 = df_train[TARGET].values
X1 = df_train.drop(columns=TARGET)
y1

array([0, 0, 0, ..., 0, 0, 0])

In [5]:
X2 = df_test.drop(columns=TARGET)


In [6]:
numeric_features = ['AGE',
                    'YOJ',
                    'INCOME',
                    'HOME_VAL',
                    'BLUEBOOK',
                    'OLDCLAIM',
                    'CAR_AGE']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(missing_values = np.nan, strategy='constant', fill_value=0))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),],
    remainder='passthrough')

## Machine learning Model:
- logistic regression

In [7]:
pipe = Pipeline(steps=[('preprocessor', preprocessor),                # In order to perform preprocessing
                      ('clf', LogisticRegression(random_state=1,      # In order to instanciate a model
                                                solver='liblinear',
                                                max_iter=300))])

## Data split

In [8]:
random_state = 4
n_splits = 4

# We keep one fold for validation
X_for_gridsearch, X_future_validation, y_for_gridsearch, y_future_validation \
= train_test_split(X1, y1, test_size=0.1, random_state=random_state, stratify=y1)

# We use k-fold on the remaining data to search over hyper-parameters
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

## Training the model

In [9]:
param_grid = dict(clf__penalty = ['l1', 'l2'],
                  clf__C       = np.logspace(-2, 3, 6))

grid = GridSearchCV(pipe, 
                    param_grid=param_grid,
                    cv=kf, 
                    n_jobs=1, 
                    verbose=1,
                    scoring='f1',
                    return_train_score=True)
grid.fit(X_for_gridsearch, y_for_gridsearch)
print(grid.best_score_)

Fitting 4 folds for each of 12 candidates, totalling 48 fits
0.5068147316458314


In [10]:
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['param_clf__C',
            'param_clf__penalty', 
            'split0_test_score',
            'split1_test_score',
            'split2_test_score',
            'mean_test_score',
            'std_test_score',
            'rank_test_score']].head(4)

Unnamed: 0,param_clf__C,param_clf__penalty,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
8,100.0,l1,0.5,0.545676,0.499369,0.506815,0.023544,1
10,1000.0,l1,0.5,0.545676,0.499366,0.506814,0.023544,2
6,10.0,l1,0.5,0.545676,0.498741,0.506657,0.023595,3
4,1.0,l1,0.494297,0.54678,0.497462,0.506167,0.02381,4


## accuracy of the model

In [11]:
y_pred = grid.predict(X_future_validation)
f1_score(y_future_validation, y_pred)

0.5111111111111111

In [12]:
y_pred_test = grid.predict(X2)
y_pred_test


array([0, 0, 0, ..., 0, 0, 0])

## Model2:

In [13]:

numeric_features = ['AGE',
                    'INCOME',
                    'HOME_VAL',
                    'BLUEBOOK',
                    'OLDCLAIM',
                    'CAR_AGE']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values = np.nan, strategy='constant', fill_value=0))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),],
    remainder='passthrough')



model = XGBClassifier(objective='binary:logistic',
                              use_label_encoder=False,
                              eval_metric='logloss',
                              random_state='42')

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('clf', model)])

## Splitting the data

In [14]:
from sklearn.model_selection import train_test_split, StratifiedKFold

random_state = 4
n_splits = 4

# We keep one fold for validation
X_for_gridsearch, X_future_validation, y_for_gridsearch, y_future_validation \
= train_test_split(X1, y1, test_size=0.1, random_state=random_state, stratify=y1)

# We use k-fold on the remaining data to search over hyper-parameters
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

## Training the model

In [15]:
from sklearn.model_selection import RandomizedSearchCV

param_grid  = dict(clf__n_estimators  = np.linspace(100, 300, 6).astype(int),
                           clf__max_depth     = [5, 8, 10, 15, 20, 30, 50, 75, 100],
                           clf__alpha         = [0.01, 0.05, 0.1, 0.3, 0.5, 1, 10],
                           clf__learning_rate = [0.1, 0.08, 0.05, 0.02, 0.01],
                 )

grid = RandomizedSearchCV(pipe,
                          param_distributions=param_grid,
                          cv=kf,
                          verbose=1,
                          scoring='f1',
                          random_state=42,
                          n_iter=20,
                         )
grid.fit(X_for_gridsearch, y_for_gridsearch)
print(grid.best_score_)

Fitting 4 folds for each of 20 candidates, totalling 80 fits
0.5334962839792531


In [16]:
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['split0_test_score',
            'split1_test_score',
            'split2_test_score',
            'mean_test_score',
            'std_test_score',
            'rank_test_score']].head(4)

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.529769,0.55648,0.506732,0.533496,0.01813,1
16,0.514354,0.538647,0.535452,0.529015,0.009369,2
4,0.524104,0.539299,0.511278,0.52692,0.010522,3
6,0.520245,0.54303,0.517968,0.524503,0.010769,4


## Accuracy of the model with the prediction on the test-auto:

In [17]:
from sklearn.metrics import f1_score

y_pred = grid.predict(X_future_validation)
f1_score(y_future_validation, y_pred)

0.5937499999999999

In [18]:
y_pred_test2 = grid.predict(X2)
y_pred_test2 

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
y_preds=[y_pred_test,y_pred_test2]
y_preds = np.mean(y_preds,axis=0)
y_preds = (y_preds > 0.5).astype(int)

In [22]:
# Generate and save submissions
df_sub = pd.DataFrame()
df_sub['TARGET_FLAG'] = y_preds

In [31]:
df_sub.to_csv('submission.csv')