# Challenge Avazu Kaggle

Auteur: Alphonse Doutriaux
Date: février 2018

## 0. Préliminaires

In [59]:
%matplotlib inline

import warnings
import numpy as np
import pandas as pd
import random
import gzip
import io
import multiprocessing
import matplotlib
import datetime
import joblib
import pickle as pkl

warnings.filterwarnings('ignore')

In [60]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, roc_auc_score, log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.datasets import make_classification

from joblib import Parallel, delayed
from matplotlib import pyplot as plt
from datetime import date, datetime
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from xgboost import XGBClassifier

## 1. Préparation des données

### 1.1 Extraction d'un sample aléatoire de 1M de lignes

In [136]:
df = pd.read_csv("train_sample.csv")

### 1.2 Extraction de lignes de manière aléatoire

In [None]:
line_count = 40428968

In [None]:
n = 100000
skip = sorted(random.sample(range(1,line_count + 1), line_count - n))

In [None]:
df = pd.read_csv("train.gz", skiprows=skip, index_col=0)

In [None]:
# en cas d'extraction à garder
df.to_csv(path_or_buf= './train_sample.csv', index = False, sep = ',')

### 1.3 Preprocessing

#### Préparation des colonnes "weekday", "hour" et "surface"

In [137]:
# Weekday
df["weekday"] = df["hour"].apply(lambda x: datetime.strptime(str(x), '%y%m%d%H').weekday())

# Hour
df["hour"] = df["hour"].apply(lambda x: int(str(x)[-2:]))

# Surface
df["area"] = df["C15"] * df["C16"]

#### Extraction d'un user_id et création de deux colonnes : fréquence d'apparition et nombre moyen de clics 

Création d'une colonne `User_freq`

In [140]:
df['User'] = df['device_id'] + df['device_ip'] + df['device_model'] #on crée une feature user 
values = df['User'].value_counts() # on remplace la valeur de user par sa fréquence d'apparition
df['User_freq'] = df['User'].apply(lambda row: values[row])

Création d'une colonne `User_clicks`

In [160]:
i=0
for user in df['User']:
    df['User_clicks'] = df[df['User'] == user]['click'].mean()
    i=i+1
    if i%1e3==0:
        print(i)

1000


KeyboardInterrupt: 

#### Séparation X et y

In [66]:
y = df[['click']]
X = df[['C1', 'hour', 'banner_pos', 'site_category', 'app_category', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 
           'C19', 'C20','C21', 'area', 'weekday', 'User_freq']]

#### Préparation de la colonne C20

In [68]:
X['C20'] = X['C20'].replace(-1, np.nan)
X['C20'] = X['C20'].replace(np.nan, X['C20'].median())

#### One hot encoding avec get_dummies

In [69]:
columns_to_encode = ['device_type', 'device_conn_type', 'site_category', 'app_category', 'banner_pos', 'C18']

In [70]:
df_full_columns = pd.read_csv("train.gz", usecols=columns_to_encode)

In [71]:
for col in columns_to_encode:
    X[col] = X[col].astype('category', categories = df_full_columns[col].unique().tolist())

In [72]:
X = pd.get_dummies(X, columns=columns_to_encode, prefix = columns_to_encode)

### 1.4. Préparation des folds

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, shuffle=False)

## 2. LogisticRegression

In [None]:
penalty = ['l1', 'l2']
C = [0.05, 0.1, 0.5]

param_grid = dict(penalty=penalty, 
                  C=C)

lr_gs = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1, scoring = 'neg_log_loss', verbose = 2)
grid_result_lr = lr_gs.fit(X_train,y_train)

In [None]:
print(lr_gs.best_score_)
print(lr_gs.best_params_)

In [74]:
lr = LogisticRegression(penalty='l1',
                        solver='liblinear',
                        C=0.1,
                        verbose=2)

In [89]:
lr.fit(X,y)

[LibLinear]

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=2, warm_start=False)

In [75]:
lr_scores = cross_val_score(lr,
                            X_train, 
                            y_train, 
                            cv=5, 
                            scoring="neg_log_loss", 
                            n_jobs=-1, 
                            verbose=2)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[LibLinear][CV] ................................................. , total=   4.9s
[LibLinear][CV] ................................................. , total=   5.7s


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.9s remaining:    8.8s


[LibLinear][CV] ................................................. , total=   6.1s
[LibLinear][CV] ................................................. , total=   6.3s
[LibLinear][CV] ................................................. , total=   6.5s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.7s finished


In [76]:
print("Average log_loss: {:.2%}".format(-lr_scores.mean()))
print("Interval: [", round(-lr_scores.mean()-3*lr_scores.std(),4), ";", round(-lr_scores.mean()+3*lr_scores.std(),4),"]")

Average log_loss: 43.30%
Interval: [ 0.4114 ; 0.4546 ]


In [None]:
_ = joblib.dump(lr, './models/lr.pkl', compress=9)

## 3. XGBoost

### 3.1. GridSearch

In [None]:
# Grid search
#learning_rate = [0.1, 0.15, 0.2]
#n_estimators = [10]
#colsample_bytree = [0.5, 0.7, 0.9]
max_depth = [3, 5, 7]
#reg_alpha = [0.1, 0.5, 1]
#reg_lambda = [0.1, 0.5, 1]

param_grid = dict(#learning_rate=learning_rate, 
                  #n_estimators=n_estimators, 
                  #colsample_bytree=colsample_bytree, 
                  max_depth=max_depth)
                  #reg_alpha=reg_alpha,
                  #reg_lambda=reg_lambda)

xgb_gs = GridSearchCV(XGBClassifier(), param_grid, n_jobs=-1, scoring = 'neg_log_loss', verbose = 2)
grid_result_xgb = xgb_gs.fit(X_train,y_train)

In [None]:
print(xgb_gs.best_score_)
print(xgb_gs.best_params_)

### 3.2. Lancement du modèle

In [85]:
xgb = XGBClassifier(verbose=2,
                    n_jobs=-1,
                    max_depth=4, 
                    learning_rate=0.2,
                    colsample_bytree=0.9,
                    n_estimators=600,
                    reg_alpha=1,
                    reg_lambda=1, 
                    objective='binary:logistic',
                    booster='gbtree')

In [88]:
xgb.fit(X,y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=600,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1, verbose=2)

In [86]:
xgb_scores = cross_val_score(xgb,
                             X_train,
                             y_train,
                             cv=5,
                             scoring="neg_log_loss",
                             n_jobs=-1)

In [87]:
print("Average log_loss: {:.2%}".format(-xgb_scores.mean()))
print("Interval: [", round(-xgb_scores.mean()-3*xgb_scores.std(),4), ";", round(-xgb_scores.mean()+3*xgb_scores.std(),4),"]")

Average log_loss: 80.78%
Interval: [ -0.6594 ; 2.275 ]


In [None]:
_ = joblib.dump(xgb, './models/xgb.pkl', compress=9)

## 4. RandomForests

In [80]:
rf = RandomForestClassifier(n_jobs=-1)

In [81]:
rf_scores = cross_val_score(rf,
                            X_train,
                            y_train,
                            cv=3,
                            scoring="neg_log_loss",
                            n_jobs=-1,
                            verbose=2)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.4s
[CV] ................................................. , total=   0.5s
[CV] ................................................. , total=   0.5s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s finished


In [90]:
rf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [82]:
print("Average log_loss: {:.2%}".format(-rf_scores.mean()))
print("Interval: [", round(-rf_scores.mean()-3*rf_scores.std(),4), ";", round(-rf_scores.mean()+3*rf_scores.std(),4),"]")

Average log_loss: 166.53%
Interval: [ 1.5398 ; 1.7908 ]


In [None]:
_ = joblib.dump(rf, './models/rf.pkl', compress=9)

## 5. Mélange

In [None]:
lr = joblib.load("./models/lr.pkl")
xgb = joblib.load("./models/xgb.pkl")
rf = joblib.load("./models/rf.pkl")

In [91]:
lr_preds = lr.predict_proba(X_test)
xgb_preds = xgb.predict_proba(X_test)
rf_preds = rf.predict_proba(X_test)

In [94]:
preds_table = pd.DataFrame({"LR":lr_preds[:,1], "XGBoost":xgb_preds[:,1], "RandomForests":rf_preds[:,1]}, index=X_test.index)

In [95]:
# grid search
penalty = ['l1', 'l2']
C = [0.05, 0.1, 0.5]

param_grid = dict(penalty=penalty, 
                  C=C)

lr_stacking = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1, scoring = 'neg_log_loss', verbose = 2)
grid_search_results_blending = lr_stacking.fit(preds_table,y_test)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.05, penalty=l1 ..............................................
[CV] C=0.05, penalty=l1 ..............................................
[CV] C=0.05, penalty=l1 ..............................................
[CV] C=0.05, penalty=l2 ..............................................
[CV] ............................... C=0.05, penalty=l1, total=   0.0s
[CV] C=0.05, penalty=l2 ..............................................
[CV] C=0.05, penalty=l2 ..............................................
[CV] ............................... C=0.05, penalty=l1, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] C=0.1, penalty=l1 ...............................................
[CV] C=0.1, penalty=l1 ...............................................
[CV] C=0.1, penalty=l2 ...............................................
[CV] ............................... C=0.05, penalty=l1, total=   0.1s
[CV] ............

[Parallel(n_jobs=-1)]: Done   3 out of  18 | elapsed:    0.1s remaining:    0.7s


[CV] ................................ C=0.5, penalty=l1, total=   0.2s
[CV] ............................... C=0.05, penalty=l2, total=   0.3s
[CV] ................................ C=0.1, penalty=l2, total=   0.3s
[CV] ............................... C=0.05, penalty=l2, total=   0.3s
[CV] C=0.5, penalty=l2 ...............................................
[CV] ................................ C=0.5, penalty=l1, total=   0.2s
[CV] C=0.5, penalty=l2 ...............................................
[CV] ................................ C=0.1, penalty=l2, total=   0.2s
[CV] C=0.5, penalty=l1 ...............................................
[CV] ................................ C=0.5, penalty=l2, total=   0.1s
[CV] C=0.5, penalty=l2 ...............................................
[CV] ................................ C=0.5, penalty=l2, total=   0.1s
[CV] ................................ C=0.5, penalty=l2, total=   0.1s
[CV] ................................ C=0.5, penalty=l1, total=   0.1s


[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed:    0.5s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.7s finished


In [96]:
print(grid_search_results_blending.best_score_)
print(grid_search_results_blending.best_params_)

-0.20247563088362186
{'C': 0.5, 'penalty': 'l1'}


In [97]:
lr_blending = LogisticRegression(penalty='l1',
                                 solver='liblinear',
                                 C=0.5,
                                 verbose=2)

In [99]:
lr_blending.fit(preds_table, y_test)

[LibLinear]

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=2, warm_start=False)

## 5. Prédictions sur les données de test en csv

### 5.1. Feature engineering sur le fichier test.gz

In [120]:
test = pd.read_csv("test.gz")

In [121]:
test["weekday"] = test["hour"].apply(lambda x: datetime.strptime(str(x), '%y%m%d%H').weekday())
test["hour"] = test["hour"].apply(lambda x: int(str(x)[-2:]))
test["area"] = test["C15"] * test["C16"]

test['User_freq'] = test['device_id'] + test['device_ip'] + test['device_model'] #on crée une feature user 
values = test['User_freq'].value_counts() # on remplace la valeur de user par sa fréquence d'apparition
test['User_freq'] = test['User_freq'].apply(lambda row: values[row])

In [122]:
test = test[['C1', 'hour', 'banner_pos', 'site_category', 'app_category', 'device_type', 'device_conn_type', 'C14','C15', 'C16', 'C17', 'C18', 
           'C19', 'C20','C21', 'area', 'weekday', 'User_freq']]

In [123]:
test['C20'] = test['C20'].replace(-1, np.nan)
test['C20'] = test['C20'].replace(np.nan, X['C20'].median())

In [124]:
for col in columns_to_encode:
    test[col] = test[col].astype('category',categories = df_full_columns[col].unique().tolist())

In [125]:
test = pd.get_dummies(test, columns=columns_to_encode, prefix = columns_to_encode)

In [126]:
test.shape

(4577464, 94)

In [127]:
X_train.shape

(80000, 94)

In [128]:
lr_preds_test = lr.predict_proba(test)
xgb_preds_test = xgb.predict_proba(test)
rf_preds_test = rf.predict_proba(test)

In [129]:
preds_table_final = pd.DataFrame({"LR":lr_preds_test[:,1], "XGBoost":xgb_preds_test[:,1], "RandomForests":rf_preds_test[:,1]}, index=test.index)

### 5.2. Export pour test Kaggle

In [130]:
preds = lr_blending.predict_proba(preds_table_final)[:,1]

In [131]:
#preds = pd.DataFrame(preds)

In [132]:
submission_file = pd.read_csv("sampleSubmission.gz")

In [133]:
submission_file['click'] = preds

Dans le cas d'un export en .gz :

In [None]:
submission_file.to_csv(path_or_buf= './submission_files/preds_' + datetime.now().strftime("%d%m%Y-%H%M%S") + '.gz', index = False, sep = ',', compression='gzip')