# Feature Selection & Modeling
## Mounting to Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
%cd /content/drive/My Drive/Trivago/Project/TrivagoRecommenderSystem

/content/drive/My Drive/Trivago/Project/TrivagoRecommenderSystem


## Loading Libraries & Datasets

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import math
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import random
import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

  import pandas.util.testing as tm


In [0]:
TrainDataFilepath = './Datasets/clean_data/Sets/train.csv'
TrainData = pd.read_csv(TrainDataFilepath)
TrainData.dropna(inplace=True)

#declaring features and label
features = TrainData.drop(columns=['session_id', 'item_id', 'clickout']).columns.tolist()
label = ['clickout']

FeaturesToDrop = ['NumberInImpressions', 'NumberInReferences', 'MeanPrice', 'MinPrice']
for feature in FeaturesToDrop:
  features.remove(feature)

X_train = TrainData[features]
y_train = TrainData[label]

TrainData_sessions_item = TrainData[['session_id', 'item_id', 'clickout']]

TrainData is ready for processing and modeling, while validation and test sets still need to be engineered. There is a ready function that will transform sets into the same form of TrainData.
## Scaling Features

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('std_scaler', StandardScaler()),
])

from sklearn.compose import ColumnTransformer
full_pipeline = ColumnTransformer([
("num", num_pipeline, list(X_train))
])

X_train_scaled = full_pipeline.fit_transform(X_train)

## Properties

In [0]:
def ConcatProperties(data, ClustersOrPCA, X_scaled):
  '''
  Desc: function that concatenates either clusters or PCA of properties

  Input: data: Pandas DataFrame with the data being used to concatenate properties to
         ClustersOrPCA: String with which type to be used of properties
         X_scaled: numpy array of the features of the dataframe used
         
  Output: X_scaled_prop: Pandas DataFrame with the properties of different items being concatenated to the features dataframe
  '''
  if ClustersOrPCA == 'Clusters':
    PropertiesFilePath = './Datasets/clean_data/ItemsPropertiesSimilarities/PropertiesClusters.csv'
    DF = pd.read_csv(PropertiesFilePath)
    DF = DF.rename(columns={'item':'item_id'})
  elif ClustersOrPCA == 'PCA':
    PropertiesFilePath = './Datasets/clean_data/ItemsPropertiesSimilarities/PCA_Properties.csv'
    DF = pd.read_csv(PropertiesFilePath)
    DF.drop(columns='Unnamed: 0', inplace=True)
  X_scaled = pd.DataFrame(X_scaled, columns=features)
  X_scaled['item_id'] = data['item_id'].values.tolist()
  X_scaled_prop = X_scaled.merge(DF, on='item_id', how='left')
  if ClustersOrPCA == 'Clusters':
    X_scaled_prop.fillna(0, inplace=True)
  elif ClustersOrPCA == 'PCA':
    X_scaled_prop.fillna(X_scaled_prop.mean(), inplace=True)
  X_scaled_prop.drop(columns='item_id', inplace=True)
  return X_scaled_prop

 #### Clusters

In [0]:
X_train_Prop_clust = ConcatProperties(TrainData,'Clusters', X_train_scaled)
X_train_Prop_clust.head(2)

### PCA

In [13]:
X_train_Prop_PCA = ConcatProperties(TrainData, 'PCA', X_train_scaled)
X_train_Prop_PCA.head(2)

Unnamed: 0,price,item_rank,price_rank,session_duration,item_duration,item_session_duration,item_interactions,maximum_step,top_list,NumberOfProperties,NumberAsClickout,NumberAsFinalClickout,FClickoutToImpressions,FClickoutToReferences,FClickoutToClickout,MeanRank,MaxPrice,AveragePriceRank,PC1,PC2,PC3,PC4,PC5,PC6,PC7
0,0.252393,-1.600522,-0.482262,0.292265,-0.053845,-0.10928,-0.121224,-0.033207,1.892239,0.906953,0.285334,0.18286,-0.287569,-0.452165,0.050693,-0.046878,1.710171,0.175182,4.256183,-1.010829,0.194052,-0.343445,-0.483854,0.329065,1.082989
1,-0.547001,-1.46074,-1.46074,0.292265,-0.053845,-0.10928,-0.121224,-0.033207,1.892239,-0.566315,-0.197583,-0.216862,0.043631,-0.288805,0.064658,-0.37021,-0.134138,-0.583375,0.936076,0.037035,0.952187,0.495297,-0.157769,-0.341891,-0.289848


## XGBoost Modeling

Since XGBoost without Resampling reached the highest score, then this algorigthm will be the one functioning on both Properties clusters and PCA.

In [0]:
def random_search(clf, parameters, X, y):
  '''
  Desc: function that searches for the best hyperparameters for an algorithm by fitting randomly training example randomly and select the best

  Input: clf: classifier algorithm
         parameters: list of different parameters
  
  Output: best_clf: classifier with the best hyperparameters that suit this kind of data examples
  '''
  search_obj = RandomizedSearchCV(clf, parameters)
  fit_obj = search_obj.fit(X, y)
  best_clf = fit_obj.best_estimator_
  return best_clf

#### Clusters

In [0]:
X_train_Prop_clust_sample = X_train_Prop_clust.sample(frac=0.02, random_state=0)
y_train_sample = y_train.sample(frac=0.02, random_state=0)

t1 = datetime.now()
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators':[5, 50, 250, 500],
    'max_depth':[1, 3, 5, 7, 9],
    'learning_rate':[0.01, 0.1, 1, 10, 100]
}

best_clf_grid = random_search(gb, parameters, X_train_Prop_clust_sample, y_train_sample.values.ravel())
print(best_clf_grid)

gb = best_clf_grid.fit(X_train_Prop_clust, y_train.values.ravel())
joblib.dump(gb, './modelsProperties/XGBoost_clusters.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=1,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=250,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
Time taken :  4:25:56.523560


#### PCA

In [15]:
X_train_Prop_PCA_sample = X_train_Prop_PCA.sample(frac=0.02, random_state=0)
y_train_sample = y_train.sample(frac=0.02, random_state=0)

t1 = datetime.now()
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators':[5, 50, 250, 500],
    'max_depth':[1, 3, 5, 7, 9],
    'learning_rate':[0.01, 0.1, 1, 10, 100]
}

best_clf_grid = random_search(gb, parameters, X_train_Prop_PCA_sample, y_train_sample.values.ravel())
print(best_clf_grid)

gb = best_clf_grid.fit(X_train_Prop_PCA, y_train.values.ravel())
joblib.dump(gb, './modelsProperties/XGBoost_PCA.pkl')
t2 = datetime.now()
print('Time taken : ', (t2 - t1))

Time taken :  2:00:46.645657


## Model Evaluation

In [0]:
#function is from this repo https://gist.github.com/bwhite/3726239
def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item
    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75
    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])

In [0]:
def get_probabilities(model_path, X, session_item_dataset):
  '''
  Desc: function that gets the probability of each item being selected by the user, rerank the items in the session based on the probabilites

  Input: model_path: String with the name of the stored model
         X: array of scaled features of the dataset
         session_item_dataset: Pandas Dataframe with the sessions, items, and clickout
        
  Output: clickout_rank: List of lists that carries which item was selected in which rank
          RecommendationsDF: Pandas Dataframe to be transformed and merged to the Clickout Dataframe
  '''
  model = joblib.load(model_path)
  BothProbabilities = model.predict_proba(X)
  Probabilities = [Probability[1] for Probability in BothProbabilities]
  session_item_dataset['probability'] = Probabilities
  RecommendationsDF = session_item_dataset.groupby(['session_id'], sort=False).apply(lambda x: (x.sort_values('probability', ascending=False)))
  clickout_rank = RecommendationsDF.clickout
  clickout_rank = clickout_rank.reset_index().groupby('session_id').clickout.apply(list).values.tolist()
  return clickout_rank, RecommendationsDF

In [0]:
def ClassifReport(model_path, X, y):
  global y_pred
  model = joblib.load(model_path)
  y_pred = model.predict(X)
  return classification_report(y, y_pred)

In [0]:
def PrintMetrics(model_path, X, y, session_item_dataset):
  clickout_rank, RecommendationsDF = get_probabilities(model_path, X, session_item_dataset)
  MeanReciprocalRank = mean_reciprocal_rank(clickout_rank)
  print('Mean Reciprocal Rank : ', MeanReciprocalRank)
  print('=================================================')
  ClassificationReport = ClassifReport(model_path, X, y)
  print('Classification Report')
  print('=================================================')
  print(ClassificationReport)
  ConfMatrix = confusion_matrix(y, y_pred)
  print('Confusion Matrix')
  print('================================================')
  print(ConfMatrix)
  return

### Loading and Preparing Validation & Test Sets

#### Validation

In [0]:
valFilepath = './Datasets/clean_data/Sets/val.csv'
valData = pd.read_csv(valFilepath)

#declaring features and label
features = valData.drop(columns=['session_id', 'item_id', 'clickout']).columns.tolist()
label = ['clickout']

FeaturesToDrop = ['NumberInImpressions', 'NumberInReferences', 'MeanPrice', 'MinPrice']
for feature in FeaturesToDrop:
  features.remove(feature)

valData_sessions_item = valData[['session_id', 'item_id', 'clickout']]
X_val = valData[features]
y_val = valData[label]

# validation set scaling
X_val_scaled = full_pipeline.fit_transform(X_val)

X_val_Prop_clust = ConcatProperties(valData, 'Clusters', X_val_scaled)
X_val_Prop_PCA = ConcatProperties(valData, 'PCA', X_val_scaled)

#### Test

In [0]:
testFilepath = './Datasets/clean_data/Sets/test.csv'
testData = pd.read_csv(testFilepath)

FeaturesToDrop = ['NumberInImpressions', 'NumberInReferences', 'MeanPrice', 'MinPrice']
for feature in FeaturesToDrop:
  features.remove(feature)

testData_sessions_item = testData[['session_id', 'item_id', 'clickout']]
X_test = testData[features]
y_test  = testData[label]

# test set scaling
X_test_scaled = full_pipeline.fit_transform(X_test)

X_test_Prop_clust = ConcatProperties(testData, 'Clusters', X_test_scaled)
X_test_Prop_PCA = ConcatProperties(testData, 'PCA', X_test_scaled)

### Clusters Evaluation

In [0]:
PrintMetrics('./modelsProperties/XGBoost_clusters.pkl', X_val_Prop_clust, y_val, valData_sessions_item)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Mean Reciprocal Rank :  0.5960285069151403
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98   3249590
           1       0.63      0.14      0.23    149676

    accuracy                           0.96   3399266
   macro avg       0.80      0.57      0.61   3399266
weighted avg       0.95      0.96      0.95   3399266

Confusion Matrix
[[3237239   12351]
 [ 128260   21416]]


###PCA Evaluation

In [27]:
PrintMetrics('./modelsProperties/XGBoost_PCA.pkl', X_val_Prop_PCA, y_val, valData_sessions_item)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Mean Reciprocal Rank :  0.5960702061638072
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98   3249590
           1       0.63      0.14      0.23    149676

    accuracy                           0.96   3399266
   macro avg       0.80      0.57      0.61   3399266
weighted avg       0.95      0.96      0.95   3399266

Confusion Matrix
[[3237239   12351]
 [ 128260   21416]]
