<a href="https://colab.research.google.com/github/ZeyadSabbah/TrivagoRecommenderSystem/blob/master/EvalParVal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluating Baseline Models
This notebook evaluates the previously trained baseline models on the validation set in order to select the best hyper-parameters to proceed with.

## Mounting Drive

In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Switch the working directory to the project folder; the relative ./Datasets and ./modelsNew paths used below are resolved against it.
%cd /content/drive/My Drive/Trivago/Project/TrivagoRecommenderSystem

/content/drive/My Drive/Trivago/Project/TrivagoRecommenderSystem


## Loading Libraries & Datasets

In [0]:
import pandas as pd
import numpy as np
import joblib
from collections import Counter
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import warnings
# Silence FutureWarning / DeprecationWarning messages for the rest of the notebook.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Disable pandas' chained-assignment check globally.
pd.set_option('mode.chained_assignment', None)

In [0]:
# Locations of the train / validation / test splits (relative to the project folder).
TrainDataFilepath = './Datasets/clean_data/Sets/train.csv'
valFilepath = './Datasets/clean_data/Sets/val.csv'
testFilepath = './Datasets/clean_data/Sets/test.csv'

# Read the three splits in train / val / test order.
TrainData, valData, testData = (
    pd.read_csv(path) for path in (TrainDataFilepath, valFilepath, testFilepath)
)

## Validation & Test sets' Transformation & Scaling

### Preparation

In [0]:
#declaring features and label
# Every column except the two identifiers and the target is a candidate feature.
features = TrainData.drop(columns=['session_id', 'item_id', 'clickout']).columns.tolist()
# Kept as a one-element list so that frame[label] yields a DataFrame with a 'clickout' column.
label = ['clickout']

#dropping highly correlated features
# list.remove raises ValueError if one of these names is not among the columns.
FeaturesToDrop = ['NumberInImpressions', 'NumberInReferences', 'MeanPrice', 'MinPrice']
for feature in FeaturesToDrop:
  features.remove(feature)

# Training split: feature matrix and target.
X_train = TrainData[features]
y_train = TrainData[label]

# Validation split; the session/item/clickout columns are kept aside separately.
valData_sessions_item = valData[['session_id', 'item_id', 'clickout']]
X_val = valData[features]
y_val = valData[label]

# Test split, prepared the same way as the validation split.
testData_sessions_item = testData[['session_id', 'item_id', 'clickout']]
X_test = testData[features]
y_test  = testData[label]

### Scaling

In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Columns that are imputed and scaled (this replaces the feature list built in the previous cell).
features = [
    'item_duration', 'item_interactions', 'NumberAsClickout', 'NumberAsFinalClickout',
    'item_rank', 'price', 'item_session_duration', 'top_list',
]

# Median-impute missing values, then standardise each column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, list(X_train[features])),
])

# The pipeline is fitted on the training set only ...
X_train_scaled = full_pipeline.fit_transform(X_train[features])

# ... and the fitted pipeline is then applied to the validation and test sets.
X_val_scaled = full_pipeline.transform(X_val[features])
X_test_scaled = full_pipeline.transform(X_test[features])

## Helper Functions

In [0]:
#function is from this repo https://gist.github.com/bwhite/3726239
def mean_reciprocal_rank(rs):
  """Average, over all rankings, of 1 / (position of the first relevant item).

  Positions are 1-based and relevance is binary: any nonzero entry counts as
  relevant. A ranking without any relevant item contributes 0.

  Examples (see http://en.wikipedia.org/wiki/Mean_reciprocal_rank):
      [[0, 0, 1], [0, 1, 0], [1, 0, 0]]     -> (1/3 + 1/2 + 1) / 3 ~= 0.6111
      [[0, 0, 0], [0, 1, 0], [1, 0, 0]]     -> (0 + 1/2 + 1) / 3 = 0.5
      [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]  -> (1/4 + 1 + 1) / 3 = 0.75

  Args:
      rs: Iterable of relevance scores (list or numpy), each in rank order
          (first element is the first item)
  Returns:
      Mean reciprocal rank
  """
  reciprocal_ranks = []
  for ranking in rs:
    relevant_positions = np.asarray(ranking).nonzero()[0]
    if relevant_positions.size:
      # Only the first relevant position matters; +1 turns the index into a rank.
      reciprocal_ranks.append(1. / (relevant_positions[0] + 1))
    else:
      reciprocal_ranks.append(0.)
  return np.mean(reciprocal_ranks)

def get_probabilities(model_path, X, session_item_dataset):
  '''
  Desc: function that gets the probability of each item being selected by the user and
        reranks the items of every session by decreasing probability

  Input: model_path: String with the path of the stored model
         X: array of scaled features of the dataset
         session_item_dataset: Pandas Dataframe with the sessions, items, and clickout.
                               A 'probability' column is written into it in place.

  Output: clickout_rank: List of lists (one per session) with the clickout flags in reranked order
          RecommendationsDF: Pandas Dataframe to be transformed and merged to the Clickout Dataframe

  Both outputs are also published as module-level globals of the same names.
  '''
  # The docstring has to come before this statement, otherwise Python treats it
  # as a plain string expression and the function ends up with no __doc__.
  global clickout_rank, RecommendationsDF
  model = joblib.load(model_path)
  # Second predict_proba column, taken with the same [:, 1] slice that MRR1 uses
  # (for a 0/1 label this is the probability of clickout == 1).
  session_item_dataset['probability'] = model.predict_proba(X)[:, 1]
  RecommendationsDF = session_item_dataset.groupby(['session_id'], sort=False).apply(lambda x: (x.sort_values('probability', ascending=False)))
  clickout_rank = RecommendationsDF.clickout
  # reset_index turns the session_id index level back into a column so the flags can be regrouped per session.
  clickout_rank = clickout_rank.reset_index().groupby('session_id').clickout.apply(list).values.tolist()
  return clickout_rank, RecommendationsDF
  
def ClassifReport(model_path, X, y):
  """Load the stored model, predict on X and return classification_report(y, predictions).

  The predictions are also published as the module-level ``y_pred``, which
  PrintMetrics reads afterwards to build its confusion matrix.
  """
  global y_pred
  y_pred = joblib.load(model_path).predict(X)
  report = classification_report(y, y_pred)
  return report

def PrintMetrics(model_path, X, y, session_item_dataset):
  """Print the mean reciprocal rank, classification report and confusion matrix of a stored model.

  The confusion matrix is built from the module-level ``y_pred`` that
  ClassifReport sets, so ClassifReport has to run before it. Returns None.
  """
  ranked_clicks, _ = get_probabilities(model_path, X, session_item_dataset)
  print('Mean Reciprocal Rank : ', mean_reciprocal_rank(ranked_clicks))
  print('=================================================')
  report = ClassifReport(model_path, X, y)
  print('Classification Report')
  print('=================================================')
  print(report)
  # labels=[1, 0] puts the positive (clickout) class in the first row/column.
  matrix = confusion_matrix(y, y_pred, labels=[1, 0])
  print('Confusion Matrix')
  print('================================================')
  print(matrix)

def reshape(data, y):
  """Pair every clickout label with an integer id of the session its row belongs to.

  Args:
      data: DataFrame with a ``session_id`` column, one row per (session, item).
      y: DataFrame with a ``clickout`` column, row-aligned with ``data``.
  Returns:
      2-D array with one row per input row: column 0 is the clickout label,
      column 1 is the session's group id. Ids are 0, 1, 2, ... in order of
      each session's first appearance in ``data``.
  """
  # factorize labels each row individually, so every row keeps the id of its
  # own session even when a session's rows are not stored contiguously
  # (counting rows per session and emitting the ids in blocks would misalign them).
  group_ids, _ = pd.factorize(data.session_id.values)

  y_reshaped = y.clickout.values.reshape((len(y), 1))
  gp_reshaped = group_ids.reshape((len(group_ids), 1))
  return np.hstack((y_reshaped, gp_reshaped))

def MRR1(model_path, X, y):
    '''
    Mean reciprocal rank of a stored model. Works only for hyper-parameter
    tuning using validation set.

    model_path: path of the stored model
    X: scaled feature matrix
    y: two-column array (clickout, session id), as produced by reshape
    '''

    def _by_probability(group):
        # Most probable item first within one session.
        return group.sort_values('probability', ascending=False)

    model = joblib.load(model_path)
    scores = model.predict_proba(X)[:, 1]

    frame = pd.DataFrame(y, columns=['clickout', 'session_id'])
    frame['probability'] = scores

    ranked = frame.groupby('session_id', sort=False).apply(_by_probability)
    clicks = ranked.clickout.reset_index()
    per_session = clicks.groupby('session_id', sort=False).clickout.apply(list)
    return mean_reciprocal_rank(per_session.values.tolist())

## Reshaping Validation and Test Sets

In [0]:
# Build the (clickout, session group id) arrays that MRR1 takes as its y argument.
y_val_reshaped = reshape(valData, y_val)
y_test_reshaped = reshape(testData, y_test)

## Evaluation

### Without Resampling

#### Logistic Regression

In [0]:
# C values to compare; each one is substituted into the stored model's file name.
C_list = [0.001, 0.01, 0.1, 1, 10]
# Validation MRR of every candidate, in the same order as C_list.
MRRlist = []

for C in C_list:
  print('C equals ', C)
  # Score the stored logistic-regression model for this C on the scaled validation set.
  MeanRecipRank = MRR1('./modelsNew/LR_modelC{}.pkl'.format(C), X_val_scaled, y_val_reshaped)
  MRRlist.append(MeanRecipRank)
  print('MRR equals ', MeanRecipRank)
  print('=============================================================================================================')

# Highest MRR wins; list.index returns the first candidate on ties.
MaxIndex = MRRlist.index(max(MRRlist))
print(C_list[MaxIndex], 'is the best parameter.')

C equals  0.001
MRR equals  0.5318410105953008
C equals  0.01
MRR equals  0.5306751365918466
C equals  0.1
MRR equals  0.5304636523505768
C equals  1
MRR equals  0.5304455279884333
C equals  10
MRR equals  0.5304451940584785
0.001 is the best parameter.


#### Random Forest

In [0]:
# Grid of (max_depth, n_estimators) combinations; each pair is substituted into a stored model's file name.
n_estimators = [50, 100, 150, 200]
max_depth = [1, 3, 5]
# parameters[i] is the [depth, n_estimator] pair whose validation MRR is MRRlist[i].
parameters = []
MRRlist = []

for depth in max_depth:
  for n_estimator in n_estimators:
    params = [depth, n_estimator]
    parameters.append(params)
    print('max_depth equals {} and n_estimators equals {}'.format(depth, n_estimator))
    # Score the stored random-forest model for this combination on the scaled validation set.
    MeanRecipRank = MRR1('./modelsNew/RF_modelDepth{}N{}.pkl'.format(depth, n_estimator), X_val_scaled, y_val_reshaped)
    MRRlist.append(MeanRecipRank)
    print('MRR equals ', MeanRecipRank)
    print('=============================================================================================================')

# Highest MRR wins; list.index returns the first combination on ties.
MaxIndex = MRRlist.index(max(MRRlist))
print(parameters[MaxIndex], 'are the best parameters.')

max_depth equals 1 and n_estimators equals 50
MRR equals  0.611747973269104
max_depth equals 1 and n_estimators equals 100
MRR equals  0.6117671328141076
max_depth equals 1 and n_estimators equals 150
MRR equals  0.6122438852852569
max_depth equals 1 and n_estimators equals 200
MRR equals  0.6118195583587548
max_depth equals 3 and n_estimators equals 50
MRR equals  0.5927865421547103
max_depth equals 3 and n_estimators equals 100
MRR equals  0.5991117102435418
max_depth equals 3 and n_estimators equals 150
MRR equals  0.5975274725887917
max_depth equals 3 and n_estimators equals 200
MRR equals  0.5936963104398567
max_depth equals 5 and n_estimators equals 50
MRR equals  0.5671153015802382
max_depth equals 5 and n_estimators equals 100
MRR equals  0.5694231303643901
max_depth equals 5 and n_estimators equals 150
MRR equals  0.5760390538438077
max_depth equals 5 and n_estimators equals 200
MRR equals  0.5694129771450702
[1, 150] are the best parameters.


#### XGBoost

In [0]:
# Grid of (max_depth, learning_rate, n_estimators) combinations; each one is substituted into a stored model's file name.
n_estimators = [50, 100, 150, 200]
max_depth = [1, 3, 5]
learning_rate = [0.01, 0.1, 1]
# parameters[i] is the [depth, n_estimator, l_rate] triple whose validation MRR is MRRlist[i].
parameters = []
MRRlist = []

for depth in max_depth:
  for l_rate in learning_rate:
    for n_estimator in n_estimators:
      params = [depth, n_estimator, l_rate]
      parameters.append(params)
      print('max_depth equals {}, n_estimators equals {}, and learning_rate equals {}'.format(depth, n_estimator, l_rate))
      # Score the stored XGBoost model for this combination on the scaled validation set.
      MeanRecipRank = MRR1('./modelsNew/XGB_modelDepth{}N{}Lrate{}.pkl'.format(depth, n_estimator, l_rate),  X_val_scaled, y_val_reshaped)
      MRRlist.append(MeanRecipRank)
      print('MRR equals ', MeanRecipRank)
      print('=============================================================================================================')


# Highest MRR wins; list.index returns the first combination on ties.
MaxIndex = MRRlist.index(max(MRRlist))
print(parameters[MaxIndex], 'are the best parameters.')

max_depth equals 1, n_estimators equals 50, and learning_rate equals 0.01
MRR equals  0.6110881854054147
max_depth equals 1, n_estimators equals 100, and learning_rate equals 0.01
MRR equals  0.5862393040404837
max_depth equals 1, n_estimators equals 150, and learning_rate equals 0.01
MRR equals  0.5833116363622134
max_depth equals 1, n_estimators equals 200, and learning_rate equals 0.01
MRR equals  0.5980052611133977
max_depth equals 1, n_estimators equals 50, and learning_rate equals 0.1
MRR equals  0.5888656732646322
max_depth equals 1, n_estimators equals 100, and learning_rate equals 0.1
MRR equals  0.5739418775305741
max_depth equals 1, n_estimators equals 150, and learning_rate equals 0.1
MRR equals  0.5754199781650972
max_depth equals 1, n_estimators equals 200, and learning_rate equals 0.1
MRR equals  0.5703137581674497
max_depth equals 1, n_estimators equals 50, and learning_rate equals 1
MRR equals  0.3119403803738658
max_depth equals 1, n_estimators equals 100, and learnin

### With SMOTE

#### Logistic Regression

In [0]:
# C values to compare; each one is substituted into the stored model's file name.
C_list = [0.001, 0.01, 0.1, 1, 10]
# Validation MRR of every candidate, in the same order as C_list.
MRRlist = []

for C in C_list:
  print('C equals ', C)
  # NOTE(review): the file name contains a space after 'LR_'; it has to match the stored file exactly - confirm.
  MeanRecipRank = MRR1('./modelsNew/LR_ SMOTE_C{}.pkl'.format(C), X_val_scaled, y_val_reshaped)
  MRRlist.append(MeanRecipRank)
  print('MRR equals ', MeanRecipRank)
  print('=============================================================================================================')

# Highest MRR wins; list.index returns the first candidate on ties.
MaxIndex = MRRlist.index(max(MRRlist))
print(C_list[MaxIndex], 'is the best parameter.')

C equals  0.001
MRR equals  0.5328501467903882
C equals  0.01
MRR equals  0.5324270788816763
C equals  0.1
MRR equals  0.5323699178810473
C equals  1
MRR equals  0.532366528708843
C equals  10
MRR equals  0.5323661947788882
0.001 is the best parameter.


#### Random Forest

In [0]:
# Grid of (max_depth, n_estimators) combinations; each pair is substituted into a stored model's file name.
n_estimators = [50, 100, 150, 200]
max_depth = [1, 3, 5]
# parameters[i] is the [depth, n_estimator] pair whose validation MRR is MRRlist[i].
parameters = []
MRRlist = []

for depth in max_depth:
  for n_estimator in n_estimators:
    params = [depth, n_estimator]
    parameters.append(params)
    print('max_depth equals {} and n_estimators equals {}'.format(depth, n_estimator))
    # Score the stored SMOTE random-forest model for this combination on the scaled validation set.
    MeanRecipRank = MRR1('./modelsNew/RF_SMOTEDepth{}N{}.pkl'.format(depth, n_estimator), X_val_scaled, y_val_reshaped)
    MRRlist.append(MeanRecipRank)
    print('MRR equals ', MeanRecipRank)
    print('=============================================================================================================')

# Highest MRR wins; list.index returns the first combination on ties.
MaxIndex = MRRlist.index(max(MRRlist))
print(parameters[MaxIndex], 'are the best parameters.')

max_depth equals 1 and n_estimators equals 50
MRR equals  0.5856364029778319
max_depth equals 1 and n_estimators equals 100
MRR equals  0.6076981288711684
max_depth equals 1 and n_estimators equals 150
MRR equals  0.587638927961783
max_depth equals 1 and n_estimators equals 200
MRR equals  0.6039257767788645
max_depth equals 3 and n_estimators equals 50
MRR equals  0.5623632325865033
max_depth equals 3 and n_estimators equals 100
MRR equals  0.5700822787258082
max_depth equals 3 and n_estimators equals 150
MRR equals  0.5636586769680847
max_depth equals 3 and n_estimators equals 200
MRR equals  0.5652971824128564
max_depth equals 5 and n_estimators equals 50
MRR equals  0.5472226918247024
max_depth equals 5 and n_estimators equals 100
MRR equals  0.5484396099283035
max_depth equals 5 and n_estimators equals 150
MRR equals  0.5454754620080117
max_depth equals 5 and n_estimators equals 200
MRR equals  0.5521173636990838
[1, 100] are the best parameters.


#### XGBoost

In [0]:
# Grid of (max_depth, learning_rate, n_estimators) combinations; each one is substituted into a stored model's file name.
n_estimators = [50, 100, 150, 200]
max_depth = [1, 3, 5]
# NOTE(review): the other XGBoost grids in this notebook also try learning_rate 1; it is not evaluated here - confirm this is intentional.
learning_rate = [0.01, 0.1]
# parameters[i] is the [depth, n_estimator, l_rate] triple whose validation MRR is MRRlist[i].
parameters = []
MRRlist = []

for depth in max_depth:
  for l_rate in learning_rate:
    for n_estimator in n_estimators:
      params = [depth, n_estimator, l_rate]
      parameters.append(params)
      print('max_depth equals {}, n_estimators equals {}, and learning_rate equals {}'.format(depth, n_estimator, l_rate))
      # Score the stored SMOTE XGBoost model for this combination on the scaled validation set.
      MeanRecipRank = MRR1('./modelsNew/XGB_SMOTEDepth{}N{}Lrate{}.pkl'.format(depth, n_estimator, l_rate),  X_val_scaled, y_val_reshaped)
      MRRlist.append(MeanRecipRank)
      print('MRR equals ', MeanRecipRank)
      print('=============================================================================================================')

# Highest MRR wins; list.index returns the first combination on ties.
MaxIndex = MRRlist.index(max(MRRlist))
print(parameters[MaxIndex], 'are the best parameters.')

max_depth equals 1, n_estimators equals 50, and learning_rate equals 0.01
MRR equals  0.5804530125214199
max_depth equals 1, n_estimators equals 100, and learning_rate equals 0.01
MRR equals  0.5839781772171048
max_depth equals 1, n_estimators equals 150, and learning_rate equals 0.01
MRR equals  0.5840628285541019
max_depth equals 1, n_estimators equals 200, and learning_rate equals 0.01
MRR equals  0.5993176217470989
max_depth equals 1, n_estimators equals 50, and learning_rate equals 0.1
MRR equals  0.5897393712829533
max_depth equals 1, n_estimators equals 100, and learning_rate equals 0.1
MRR equals  0.5747752643548094
max_depth equals 1, n_estimators equals 150, and learning_rate equals 0.1
MRR equals  0.5758605340206633
max_depth equals 1, n_estimators equals 200, and learning_rate equals 0.1
MRR equals  0.5699296847629534
max_depth equals 3, n_estimators equals 50, and learning_rate equals 0.01
MRR equals  0.5898522451692539
max_depth equals 3, n_estimators equals 100, and lear

### With Undersampling

#### Logistic Regression

In [0]:
# C values to compare; each one is substituted into the stored model's file name.
C_list = [0.001, 0.01, 0.1, 1, 10]
# Validation MRR of every candidate, in the same order as C_list.
MRRlist = []

for C in C_list:
  print('C equals ', C)
  # Score the stored undersampling logistic-regression model for this C on the scaled validation set.
  MeanRecipRank = MRR1('./modelsNew/LR_Usample_C{}.pkl'.format(C), X_val_scaled, y_val_reshaped)
  MRRlist.append(MeanRecipRank)
  print('MRR equals ', MeanRecipRank)
  print('=============================================================================================================')

# Highest MRR wins; list.index returns the first candidate on ties.
MaxIndex = MRRlist.index(max(MRRlist))
print(C_list[MaxIndex], 'is the best parameter.')

C equals  0.001
MRR equals  0.46363420524700205
C equals  0.01
MRR equals  0.47960528443635314
C equals  0.1
MRR equals  0.4847673909822967
C equals  1
MRR equals  0.4867548034671563
C equals  10
MRR equals  0.4874620608041104
10 is the best parameter.


#### Random Forest

In [0]:
# Grid of (max_depth, n_estimators) combinations; each pair is substituted into a stored model's file name.
n_estimators = [50, 100, 150, 200]
max_depth = [1, 3, 5]
# parameters[i] is the [depth, n_estimator] pair whose validation MRR is MRRlist[i].
parameters = []
MRRlist = []

for depth in max_depth:
  for n_estimator in n_estimators:
    params = [depth, n_estimator]
    parameters.append(params)
    print('max_depth equals {} and n_estimators equals {}'.format(depth, n_estimator))
    # Score the stored undersampling random-forest model for this combination on the scaled validation set.
    MeanRecipRank = MRR1('./modelsNew/RF_UsampleDepth{}N{}.pkl'.format(depth, n_estimator), X_val_scaled, y_val_reshaped)
    MRRlist.append(MeanRecipRank)
    print('MRR equals ', MeanRecipRank)
    print('=============================================================================================================')

# Highest MRR wins; list.index returns the first combination on ties.
MaxIndex = MRRlist.index(max(MRRlist))
print(parameters[MaxIndex], 'are the best parameters.')

max_depth equals 1 and n_estimators equals 50
MRR equals  0.4943476224502757
max_depth equals 1 and n_estimators equals 100
MRR equals  0.4890958044937044
max_depth equals 1 and n_estimators equals 150
MRR equals  0.49756807636010514
max_depth equals 1 and n_estimators equals 200
MRR equals  0.49346429872679504
max_depth equals 3 and n_estimators equals 50
MRR equals  0.4725754459420281
max_depth equals 3 and n_estimators equals 100
MRR equals  0.45667041156421473
max_depth equals 3 and n_estimators equals 150
MRR equals  0.4673428149298242
max_depth equals 3 and n_estimators equals 200
MRR equals  0.491201029921567
max_depth equals 5 and n_estimators equals 50
MRR equals  0.4307510975578249
max_depth equals 5 and n_estimators equals 100
MRR equals  0.4465253361037403
max_depth equals 5 and n_estimators equals 150
MRR equals  0.43807166119628893
max_depth equals 5 and n_estimators equals 200
MRR equals  0.4377866955721171
[1, 150] are the best parameters.


#### XGBoost

In [0]:
# Grid of (max_depth, learning_rate, n_estimators) combinations; each one is substituted into a stored model's file name.
n_estimators = [50, 100, 150, 200]
max_depth = [1, 3, 5]
learning_rate = [0.01, 0.1, 1]
# parameters[i] is the [depth, n_estimator, l_rate] triple whose validation MRR is MRRlist[i].
parameters = []
MRRlist = []

for depth in max_depth:
  for l_rate in learning_rate:
    for n_estimator in n_estimators:
      params = [depth, n_estimator, l_rate]
      parameters.append(params)
      print('max_depth equals {}, n_estimators equals {}, and learning_rate equals {}'.format(depth, n_estimator, l_rate))
      # Score the stored undersampling XGBoost model for this combination on the scaled validation set.
      MeanRecipRank = MRR1('./modelsNew/XGB_UsampleDepth{}N{}Lrate{}.pkl'.format(depth, n_estimator, l_rate),  X_val_scaled, y_val_reshaped)
      MRRlist.append(MeanRecipRank)
      print('MRR equals ', MeanRecipRank)
      print('=============================================================================================================')

# Highest MRR wins; list.index returns the first combination on ties.
MaxIndex = MRRlist.index(max(MRRlist))
print(parameters[MaxIndex], 'are the best parameters.')

max_depth equals 1, n_estimators equals 50, and learning_rate equals 0.01
MRR equals  0.4762779282479144
max_depth equals 1, n_estimators equals 100, and learning_rate equals 0.01
MRR equals  0.5005629261813699
max_depth equals 1, n_estimators equals 150, and learning_rate equals 0.01
MRR equals  0.502264284227414
max_depth equals 1, n_estimators equals 200, and learning_rate equals 0.01
MRR equals  0.5067744841930498
max_depth equals 1, n_estimators equals 50, and learning_rate equals 0.1
MRR equals  0.5036524803434653
max_depth equals 1, n_estimators equals 100, and learning_rate equals 0.1
MRR equals  0.5074465807309867
max_depth equals 1, n_estimators equals 150, and learning_rate equals 0.1
MRR equals  0.5036193956658606
max_depth equals 1, n_estimators equals 200, and learning_rate equals 0.1
MRR equals  0.5005487520105201
max_depth equals 1, n_estimators equals 50, and learning_rate equals 1
MRR equals  0.49640169288155656
max_depth equals 1, n_estimators equals 100, and learnin

## Conclusion

Random Forest has the best performance on the validation set, with a mean reciprocal rank of 0.61224, beating XGBoost by a slight margin. The next step would be to work more on the data provided.