# ALTEGRAD Challenge - Classification

*Abderrahim AIT-AZZI, Sébastien OHLEYER, Mickael SUTTON*

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Performing grid search
import matplotlib.pyplot as plt
import os
import csv
from datetime import datetime
from log import _check_log_directory,_initialise_model_log



In [2]:
# Locations used by the rest of the notebook: the input data folder and the
# root folder under which each run gets its own log directory.
data_dir, log_dir = "./data/", './log'

In [3]:
# Set up a per-run log location named after the current timestamp.
log_name = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
run_log_dir = os.path.join(log_dir, log_name)

# Per the recorded output, this attempts to create the run directory.
_check_log_directory(run_log_dir)

# The model log for this run lives inside the run directory.
log_filepath = os.path.join(run_log_dir, 'lighgb.csv')
_initialise_model_log(log_filepath)

Attempting to make log directory at ./log/01-02-2018_19-31-39


### Load data & features

In [4]:
from load_features import load_features

In [5]:
# load_features returns the train and test feature frames plus the training
# data; the latter carries the is_duplicate label that is used as target below.
features_train, features_test, data_train = load_features(data_dir)

### Create train and test matrices

In [6]:
# Build the model inputs: X holds features only, Y the is_duplicate target.
# Infinite entries are first mapped to NaN so one fillna(0) zeroes both kinds.
non_finite = [np.inf, -np.inf]

X_train = (
    features_train
    .drop(['is_duplicate'], axis=1)
    .replace(non_finite, np.nan)
    .fillna(value=0)
)
X_test = features_test.replace(non_finite, np.nan).fillna(value=0)

Y_train = data_train["is_duplicate"].values

In [7]:
# Display the feature names of the training matrix.
X_train.columns

Index(['len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
       'len_word_q1', 'len_word_q2', 'common_words', 'fuzz_qratio',
       'fuzz_WRatio',
       ...
       'num_o_q1', 'num_o_q2', 'num_h_q1', 'num_h_q2', 'num_s_q1', 'num_s_q2',
       'num_f_q1', 'num_f_q2', 'num_l_q1', 'num_l_q2'],
      dtype='object', length=158)

# Classifier: LightGBM

In [8]:
from lgb_train import lgb_train

In [9]:
# Sanity check: the train and test matrices should report the same feature count.
n_train_features = len(X_train.columns)
n_test_features = len(X_test.columns)

print('Number of features on train matrix: ', n_train_features)
print('Number of features on test matrix: ', n_test_features)

Number of features on train matrix:  158
Number of features on test matrix:  158


## A. Grid search

In [None]:
# Exhaustive search over (num_leaves, lambda_l2); all other settings are fixed.
# Every combination is printed and passed to lgb_train with the run's log path.
RANDOM_SEED = 2017

# Same visiting order as two nested loops: num_leaves outer, lambda_l2 inner.
param_grid = [
    (leaves, l2)
    for leaves in [120,130,140]
    for l2 in [1.8,1.9,2]
]

for num_leaves, lambda_l2 in param_grid:
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        'feature_fraction': 0.486,
        # the two values under search
        'num_leaves': num_leaves,
        'lambda_l2': lambda_l2,
        'learning_rate': 0.01,
        'num_boost_round': 5000,
        'early_stopping_rounds': 50,
        'max_depth': 25,
        'min_data_in_leaf': 15,
        'subsample': 1,
        'colsample_bytree': 1,
        'verbose': 1,
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }
    print(lgb_params)
    lgb_train(X_train, X_test, Y_train, lgb_params, log_filepath)

## B. Prediction

In [10]:
# Abderrahim best parameters
# Same fixed settings as the grid search, with num_leaves=140 and lambda_l2=2.
# NOTE(review): LightGBM documents colsample_bytree as an alias of
# feature_fraction; both appear here with different values (1 vs 0.486) —
# confirm which one is actually honoured.
RANDOM_SEED = 2017

lgb_params = {
    # task and backend
    'objective': 'binary', 'metric': 'binary_logloss',
    'boosting': 'gbdt', 'device': 'cpu',
    # feature sampling plus the two values picked from the grid search
    'feature_fraction': 0.486, 'num_leaves': 140, 'lambda_l2': 2,
    # boosting schedule
    'learning_rate': 0.01, 'num_boost_round': 5000, 'early_stopping_rounds': 50,
    # tree limits
    'max_depth': 25, 'min_data_in_leaf': 15,
    # sampling ratios
    'subsample': 1, 'colsample_bytree': 1,
    'verbose': 1,
    # seeds
    'bagging_fraction_seed': RANDOM_SEED, 'feature_fraction_seed': RANDOM_SEED,
}

In [11]:
#lgb_params = {
#    'learning_rate': 0.005, 'colsample_bytree': 1, 'boosting': 'gbdt', 'feature_fraction': 0.486, 
#    'metric': 'binary_logloss', 'min_data_in_leaf': 15, 'verbose': 1, 'subsample': 1, 'bagging_fraction_seed': 2017, 
#    'objective': 'binary', 'num_leaves': 130, 'max_depth': 25, 'early_stopping_rounds': 50, 'lambda_l2': 1.5, 
#    'feature_fraction_seed': 2017, 'device': 'cpu', 'num_boost_round': 5000}

In [11]:
# Run lgb_train with the selected parameters, 5 folds and test_prediction=True.
# The recorded output below reports per-fold losses and a submission file being
# written; the returned value is kept as feat_imp and queried in a later cell.
feat_imp = lgb_train(X_train, X_test, Y_train, lgb_params, log_filepath, test_prediction=True, num_folds=5)

Fitting fold {fold_num + 1} of {kfold.n_splits}




Fold 1: 1446 rounds, training loss 0.035507, validation loss 0.116387
Fitting fold {fold_num + 1} of {kfold.n_splits}
Fold 2: 1427 rounds, training loss 0.035771, validation loss 0.121752
Fitting fold {fold_num + 1} of {kfold.n_splits}
Fold 3: 1297 rounds, training loss 0.039282, validation loss 0.123209
Fitting fold {fold_num + 1} of {kfold.n_splits}
Fold 4: 1422 rounds, training loss 0.035729, validation loss 0.121296
Fitting fold {fold_num + 1} of {kfold.n_splits}
Fold 5: 1096 rounds, training loss 0.045651, validation loss 0.127562
Final CV val score: [0.11638674224041103, 0.1217516816404173, 0.12320874240424384, 0.12129626367404682, 0.1275618977165923]
Final mean CV val score: 0.12204106553514227

Make submission file...
Submission file written !


In [13]:
# Show the importance row for the shortest_path_weighted feature.
feat_imp[feat_imp['column']=='shortest_path_weighted']

Unnamed: 0,column,importance
74,shortest_path_weighted,1662


## C. Manual CV

In [9]:
# Single hold-out split for the manual check: the first N_MANUAL_TRAIN_ROWS
# rows are used for fitting and every remaining row for validation.
# The boundary is defined once so the feature and label slices cannot drift apart.
# NOTE(review): the split is purely positional (no shuffling) — confirm the row
# order of the training data carries no structure.
N_MANUAL_TRAIN_ROWS = 60000

X_train_feat = X_train  # second name for the full feature frame (not a copy)

# .iloc makes the positional row slicing explicit.
X_train_values = X_train.iloc[:N_MANUAL_TRAIN_ROWS].values
y_train_values = Y_train[:N_MANUAL_TRAIN_ROWS]
X_fold_val = X_train.iloc[N_MANUAL_TRAIN_ROWS:].values
y_fold_val = Y_train[N_MANUAL_TRAIN_ROWS:]

In [11]:
# Number of rows in the fitting part of the manual split.
len(X_train_values)

60000

In [14]:
import lightgbm as lgb

# Manual train/validation run with the currently selected lgb_params.
# lgb.train receives its own copy of the parameters, so the shared lgb_params
# dict is never handed to the library and this cell can be re-run as-is.
manual_params = lgb_params.copy()

lgb_data_train = lgb.Dataset(X_train_values, label=y_train_values)
lgb_data_val = lgb.Dataset(X_fold_val, label=y_fold_val)
evals_result = {}  # filled by lgb.train with the per-round metric values

model = lgb.train(
    manual_params,
    lgb_data_train,
    valid_sets=[lgb_data_train, lgb_data_val],
    evals_result=evals_result,
    num_boost_round=lgb_params['num_boost_round'],
    early_stopping_rounds=lgb_params['early_stopping_rounds'],
    verbose_eval=False,
)

# Metric histories for the two valid_sets entries, read back under the keys
# 'training' (the training set itself) and 'valid_1' (the hold-out set).
metric_name = lgb_params['metric']
fold_train_scores = evals_result['training'][metric_name]
fold_val_scores = evals_result['valid_1'][metric_name]



In [15]:
# Print the metric at the last recorded round for the training and hold-out sets.
# NOTE(review): with early stopping the last recorded round is not necessarily
# the best one — confirm this is the number you want to report.
final_train_score = fold_train_scores[-1]
final_val_score = fold_val_scores[-1]
print(final_train_score)
print(final_val_score)

# One row per feature column with the importance reported by the fitted model,
# ordered by ascending importance.
importance_table = pd.DataFrame({
    'column': list(X_train.columns),
    'importance': model.feature_importance(),
})
feat_imp = importance_table.sort_values(by='importance')

0.038801477712497756
0.12940948953510495


In [16]:
# Show the importance row for shortest_path_weighted from the manual run.
feat_imp[feat_imp['column']=='shortest_path_weighted']

Unnamed: 0,column,importance
74,shortest_path_weighted,2071
