<a href="https://colab.research.google.com/github/ZeyadSabbah/TrivagoRecommenderSystem/blob/master/FeatureSelection%26Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Selection & Modeling
## Mounting to Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/My Drive/Trivago/Project/TrivagoRecommenderSystem

/content/drive/My Drive/Trivago/Project/TrivagoRecommenderSystem


## Loading Libraries & Datasets

In [0]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import math
import matplotlib.pyplot as plt
from datetime import datetime
import re
import random
import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [0]:
TrainDataFilepath = 'Datasets/clean_data/Sets/train.csv'
TrainData = pd.read_csv(TrainDataFilepath)

## Using SelectKBest

TrainData has 20 features. Starting with just 15 important features to see the importance between the different features.

In [0]:
#declaring features and label
X_train = TrainData[['price', 'item_rank', 'price_rank', 'session_duration', 'item_duration', 'item_session_duration', 'item_interactions', 'maximum_step', 'top_list', 'NumberOfProperties',
       'NumberInImpressions', 'NumberInReferences', 'NumberAsClickout', 'NumberAsFinalClickout', 'FClickoutToImpressions', 'FClickoutToReferences', 'FClickoutToClickout', 'MeanPrice',
       'AveragePriceRank']]
y_train = TrainData[['clickout']]

In [0]:
bestfeatures = SelectKBest(score_func=chi2, k=15)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(15,'Score'))  #print 10 best features

                     Specs         Score
4            item_duration  8.882146e+08
6        item_interactions  2.450741e+07
11      NumberInReferences  1.650767e+07
12        NumberAsClickout  4.326980e+06
13   NumberAsFinalClickout  3.261157e+06
1                item_rank  2.560008e+06
5    item_session_duration  2.545062e+06
0                    price  8.438415e+05
8                 top_list  6.153390e+05
17               MeanPrice  2.273722e+05
10     NumberInImpressions  1.314937e+05
14  FClickoutToImpressions  1.062536e+05
9       NumberOfProperties  7.866433e+04
15   FClickoutToReferences  6.643049e+04
16     FClickoutToClickout  6.460986e+04


Scores are high which means the features are relevant to the output, so all of the features will be taken in modeling at first, and then by removing the least important features, the performance of the model will be captured.

TrainData is ready for processing and modeling, while validation and test sets still need to be engineered. There is a ready function that will transform sets into the same form of TrainData.
## Scaling Features

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('std_scaler', StandardScaler()),
])

from sklearn.compose import ColumnTransformer
full_pipeline = ColumnTransformer([
("num", num_pipeline, list(X_train))
])

X_train_scaled = full_pipeline.fit_transform(X_train)

## Models

###Without Resampling

In [0]:
def print_results(results):
  print('Best Prams: {}\n'.format(results.best_params_))

  means = results.cv_results_['mean_test_score']
  stds = results.cv_results_['std_test_score']
  for mean, std, params in zip(means, stds, results.cv_results_['params']):
    print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

#### Logistic Regression

In [0]:
t1  = datetime.now()
lr = LogisticRegression()
parameters = {
    'C':[0.01, 0.1, 1, 10]
}

cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(X_train_scaled, y_train.values.ravel())

print_results(cv)
joblib.dump(cv.best_estimator_, 'LR_model.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

Best Prams: {'C': 10}

0.965 (+/-0.0) for {'C': 0.01}
0.965 (+/-0.0) for {'C': 0.1}
0.965 (+/-0.0) for {'C': 1}
0.965 (+/-0.0) for {'C': 10}
Time taken :  0:25:42.234537


#### SVM

Training SVM model can take long time with this huge dataset, so this step will be skipped.

In [0]:
t1  = datetime.now()
svc = SVC()
parameters = {
    'kernel': ['linear', 'rbf'],
    'C':[0.1, 1, 10]
}
cv = GridSearchCV(svc, parameters, cv = 5)
cv.fit(X_train_scaled, y_train.values.ravel())

print_results(cv)
joblib.dump(cv.best_estimator_, 'SVC_model.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

In [0]:
svc = SVC(C=10, kernel='rbf')
svc.fit(X_train_scaled, y_train.values.ravel())
print('done')

#### Random Forest

In [0]:
t1 = datetime.now()
rf = RandomForestClassifier()
parameters = {
    'n_estimators':[5, 50, 250],
    'max_depth':[2, 4, 8, 16, 32, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train_scaled, y_train.values.ravel())

print_results(cv)
joblib.dump(cv.best_estimator_, 'RF_model.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

#### XGBoost

In [0]:
t1 = datetime.now()
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators':[5, 50, 250, 500],
    'max_depth':[1, 3, 5, 7, 9],
    'learning_rate':[0.01, 0.1, 1, 10, 100]
}

cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(X_train_scaled, y_train.values.ravel())

print_results(cv)
joblib.dump(cv.best_estimator_, 'XGB_model.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

### With SMOTE

In [0]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state = 0, ratio = 1)
X_SM, y_SM = sm.fit_sample(X_train_scaled, y_train)

Only the same length of the original training set will be randomly selected, so it would not take long time training

In [0]:
NumberOfExamples = len(X_train_scaled)
SM_sample_indeces = np.random.choice(X_SM.index, NumberOfExamples, replace=False)

X_SM_sample = []
y_SM_sample = []
for Index in SM_sample_indeces:
  X_SM_sample.append(X_SM[Index])
  y_SM_sample.append(y_SM[Index])

#### Logistic Regression

In [0]:
t1  = datetime.now()
lr = LogisticRegression()
parameters = {
    'C':[0.01, 0.1, 1, 10]
}

cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(X_SM_sample, y_SM_sample)

print_results(cv)
joblib.dump(cv.best_estimator_, 'LR_modelSMOTE.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

#### SVM
It will take long time to be trained, so it will be skipped here also, SVM will be used in undersampling method as it has less number of examples.

#### Random Forest

In [0]:
t1 = datetime.now()
rf = RandomForestClassifier()
parameters = {
    'n_estimators':[5, 50, 250],
    'max_depth':[2, 4, 8, 16, 32, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_SM_sample, y_SM_sample)

print_results(cv)
joblib.dump(cv.best_estimator_, 'RF_modelSMOTE.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

#### XGBoost

In [0]:
t1 = datetime.now()
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators':[5, 50, 250, 500],
    'max_depth':[1, 3, 5, 7, 9],
    'learning_rate':[0.01, 0.1, 1, 10, 100]
}

cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(X_SM_sample, y_SM_sample)

print_results(cv)
joblib.dump(cv.best_estimator_, 'XGB_modelSMOTE.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

### With Undersampling

In [0]:
ClickoutLen = len(y_train[y_train.clickout==1])
NotClickoutIndices = y_train[y_train.clickout==0].index
ClickoutIndices = y_train[y_train.clickout==1].index
NotClickoutRandomIndices = np.random.choice(NotClickoutIndices, ClickoutLen, replace=False)
UnderSampleIndices = np.concatenate([ClickoutIndices, NotClickoutRandomIndices])

XUnderSample = []
yUnderSample = []
for Index in UnderSampleIndices.tolist():
  XUnderSample.append(X_train_scaled[Index].tolist())
  yUnderSample.append([y_train.clickout.values[Index]])
XUndderSample = np.array(XUnderSample)
yUnderSample = np.array(yUnderSample)

In [0]:
XUnderSample = np.load('UnderSampleXy/XUnderSample.npy')
yUnderSample = np.load('UnderSampleXy/yUnderSample.npy')

#### Logistic Regression

In [0]:
t1  = datetime.now()
lr = LogisticRegression()
parameters = {
    'C':[0.01, 0.1, 1, 10]
}

cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(XUnderSample, yUnderSample)

print_results(cv)
joblib.dump(cv.best_estimator_, 'LR_modelUndersampling.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

#### SVM

In [0]:
t1  = datetime.now()
svc = SVC()
parameters = {
    'kernel': ['linear', 'rbf'],
    'C':[0.1, 1, 10]
}
cv = GridSearchCV(svc, parameters, cv = 5)
cv.fit(XUnderSample, yUnderSample)

print_results(cv)
joblib.dump(cv.best_estimator_, 'SVC_modelUndersampling.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

  y = column_or_1d(y, warn=True)


#### Random Forest

In [0]:
t1 = datetime.now()
rf = RandomForestClassifier()
parameters = {
    'n_estimators':[5, 50, 250],
    'max_depth':[2, 4, 8, 16, 32, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(XUnderSample, yUnderSample)

print_results(cv)
joblib.dump(cv.best_estimator_, 'RF_modelUnderSampling.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

#### XGBoost

In [0]:
t1 = datetime.now()
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators':[5, 50, 250, 500],
    'max_depth':[1, 3, 5, 7, 9],
    'learning_rate':[0.01, 0.1, 1, 10, 100]
}

cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(XUnderSample, yUnderSample.values.ravel())

print_results(cv)
joblib.dump(cv.best_estimator_, 'XGB_model.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))