<a href="https://colab.research.google.com/github/ZeyadSabbah/TrivagoRecommenderSystem/blob/master/EvaluatingModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluating Models
## Mounting Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
%cd /content/drive/My Drive/Trivago/Project/TrivagoRecommenderSystem

/content/drive/My Drive/Trivago/Project/TrivagoRecommenderSystem


## Loading Libraries & Datasets

In [0]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import math
import matplotlib.pyplot as plt
from datetime import datetime
import re
import random
import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [0]:
TrainDataFilepath = '/content/drive/My Drive/Trivago/Datasets/clean_data/TrainData.csv'
valFilepath = '/content/drive/My Drive/Trivago/Datasets/clean_data/val.csv'
testFilepath = '/content/drive/My Drive/Trivago/Datasets/clean_data/test.csv'

TrainData = pd.read_csv(TrainDataFilepath)
valData = pd.read_csv(valFilepath)
testData = pd.read_csv(testFilepath)

GlobalPath = '/content/drive/My Drive/Trivago/Datasets/clean_data/item_global.csv'
GlobalData = pd.read_csv(GlobalPath)
GlobalData.drop(columns=['Unnamed: 0', 'properties'], inplace=True)

## Validation & Test sets Scaling

In [0]:
#declaring features and label
features = ['price', 'item_rank', 'price_rank', 'session_duration', 'item_duration', 'item_session_duration', 'item_interactions', 'maximum_step', 'top_list',
            'NumberOfProperties', 'NumberInImpressions', 'NumberInReferences', 'NumberAsClickout', 'NumberAsFinalClickout', 'FClickoutToImpressions',
            'FClickoutToReferences', 'FClickoutToClickout', 'MeanPrice', 'AveragePriceRank']
label = ['clickout']
X_train = TrainData[features]
y_train = TrainData[label]

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('std_scaler', StandardScaler()),
])

from sklearn.compose import ColumnTransformer
full_pipeline = ColumnTransformer([
("num", num_pipeline, list(X_train))
])

X_train_scaled = full_pipeline.fit_transform(X_train)

In [0]:
def get_data_clickout(data):
  data_clickout = data[data['action_type']=='clickout item'].groupby('session_id').tail(1)
  return data_clickout

def get_item_id(data_clickout):
  item_id = data_clickout[['session_id', 'impressions']]
  item_id['impressions'] = item_id['impressions'].apply(lambda x: x.split('|'))
  item_id = item_id.explode('impressions')
  item_id = item_id.rename(columns={'impressions':'item_id'})
  item_id = item_id.reset_index(drop=True)
  return item_id

def get_price(data_clickout):
  price = data_clickout[['session_id', 'prices']]
  price['prices'] = price['prices'].apply(lambda x: x.split('|'))
  price = price.explode('prices')
  price['prices'] = price['prices'].apply(lambda x: int(x))
  price = price.rename(columns={'prices':'price'})
  price = price.reset_index(drop=True)
  return price

def get_item_rank(data_clickout):
  item_rank = data_clickout[['session_id', 'impressions']]
  item_rank['impressions'] = item_rank['impressions'].apply(lambda x: x.split('|'))
  item_rank['impressions'] = item_rank['impressions'].apply(lambda x: list(range(1, len(x) + 1)))
  item_rank = item_rank.explode('impressions')
  item_rank = item_rank.rename(columns={'impressions':'item_rank'})
  item_rank = item_rank.reset_index(drop=True)
  return item_rank

def get_price_rank(data):
  price_rank = data.groupby('session_id', sort=False).price.apply(lambda x: x.values).to_frame().reset_index().rename(columns={'price':'price_list'})
  price_rank.price_list = price_rank.price_list.apply(lambda x: np.argsort(x))
  price_rank = price_rank.rename(columns={'price_list':'price_rank'})
  price_rank = price_rank.explode('price_rank')
  price_rank = price_rank.reset_index(drop=True)
  return price_rank

def get_clickout(data_clickout, item_id):
  clickout = data_clickout[['session_id','reference']]
  clickout = item_id.merge(clickout, on='session_id', how='left')
  clickout['clickout'] = clickout.apply(lambda x: 1 if x['item_id'] == x['reference'] else 0, axis=1)
  clickout.drop(columns='reference', inplace=True)
  clickout = clickout.reset_index(drop=True)
  return clickout

def get_session_duration(data, item_id):
  session_duration = data.groupby('session_id', sort=False).timestamp.max() - data.groupby('session_id', sort=False).timestamp.min()
  session_duration = session_duration.to_frame().rename(columns={'timestamp':'session_duration'})
  session_duration = item_id.merge(session_duration, on='session_id', how='left')
  session_duration.drop(columns='item_id', inplace=True)
  session_duration = session_duration.reset_index(drop=True)
  return session_duration

def get_item_duration(data, item_id):
  item_duration = data.groupby(['session_id', 'reference'], sort=False).timestamp.max() - data.groupby(['session_id', 'reference'], sort=False).timestamp.min()
  item_duration = item_duration.reset_index().rename(columns={'reference':'item_id', 'timestamp':'item_duration'})
  item_duration = item_id.merge(item_duration, left_on=['session_id', 'item_id'], right_on=['session_id', 'item_id'], how='left')
  item_duration = item_duration.fillna(0)
  item_duration = item_duration.reset_index(drop=True)
  return item_duration

def get_item_session_duration(item_duration, session_duration):
  item_duration['item_session_duration'] = item_duration.item_duration/session_duration.session_duration
  item_session_duration = item_duration[['session_id', 'item_id', 'item_session_duration']]
  item_duration = item_duration[['session_id', 'item_id', 'item_duration']]
  item_session_duration = item_session_duration.fillna(0)
  item_session_duration = item_session_duration.reset_index(drop=True)
  return item_session_duration

def get_item_interactions(data, item_id):
  item_interactions = data.groupby(['session_id', 'reference']).step.count().to_frame().reset_index()
  item_interactions = item_interactions.rename(columns={'reference':'item_id', 'step':'item_interactions'})
  item_interactions = item_id.merge(item_interactions, left_on=['session_id', 'item_id'], right_on=['session_id', 'item_id'], how='left')
  item_interactions = item_interactions.fillna(0)
  item_interactions = item_interactions.reset_index(drop=True)
  return item_interactions

def get_maximum_step(data, item_id):
  maximum_step = data.groupby('session_id', sort=False).step.max().to_frame().reset_index()
  maximum_step = maximum_step.rename(columns={'step':'maximum_step'})
  maximum_step = item_id.merge(maximum_step, on='session_id', how='left')
  maximum_step = maximum_step.reset_index(drop=True)
  return maximum_step

def get_top_list(item_rank):
  top_list = item_rank[['session_id', 'item_rank']]
  top_list['top_list'] = top_list.apply(lambda x: 1 if x['item_rank'] < 6 else 0, axis=1)
  top_list = top_list.reset_index(drop=True)
  return top_list

In [0]:
def transform_data(data):
  FinalClickoutDF = get_data_clickout(data)
  item_id = get_item_id(FinalClickoutDF)
  price = get_price(FinalClickoutDF)
  item_rank = get_item_rank(FinalClickoutDF)
  price_rank = get_price_rank(price)
  clickout = get_clickout(FinalClickoutDF, item_id)
  session_duration = get_session_duration(data, item_id)
  item_duration = get_item_duration(data, item_id)
  item_session_duration = get_item_session_duration(item_duration, session_duration)
  item_interactions = get_item_interactions(data, item_id)
  maximum_step = get_maximum_step(data, item_id)
  top_list = get_top_list(item_rank)
  
  local_data = item_id.copy()
  local_data['price'] = price.price
  local_data['item_rank'] = item_rank.item_rank
  local_data['price_rank'] = price_rank.price_rank
  local_data['clickout'] = clickout.clickout
  local_data['session_duration'] = session_duration.session_duration
  local_data['item_duration'] = item_duration.item_duration
  local_data['item_session_duration'] = item_session_duration.item_session_duration
  local_data['item_interactions'] = item_interactions.item_interactions
  local_data['maximum_step'] = maximum_step.maximum_step
  local_data['top_list'] = top_list.top_list
  GlobalPath = '/content/drive/My Drive/Trivago/Datasets/clean_data/item_global.csv'
  GlobalData = pd.read_csv(GlobalPath)
  GlobalData.item_id = GlobalData.item_id.apply(lambda x: str(x))
  data = local_data.merge(GlobalData, on='item_id', how='left')

  return data

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('std_scaler', StandardScaler()),
])

from sklearn.compose import ColumnTransformer
full_pipeline = ColumnTransformer([
("num", num_pipeline, list(X_train))
]);

In [141]:
#validation set transformation and scaling
valData = transform_data(valData)
valData_sessions_item = valData[['session_id', 'item_id', 'clickout']]
X_val = valData[features]
y_val = valData[label]

X_val_scaled = full_pipeline.fit_transform(X_val)
X_val_scaled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation

array([[-0.1969163 , -1.60015524, -1.1807393 , ..., -0.06462864,
        -0.13854316,  0.65180304],
       [-0.18527795, -1.46034993,  0.07750851, ...,  0.14439283,
        -0.00522708,  0.16525878],
       [ 0.01839324, -1.32054461, -1.04093399, ..., -0.50589619,
         0.20896163,  0.67677923],
       ...,
       [ 0.4024589 ,  1.47556164,  0.77653508, ..., -0.27258231,
         1.38657493,  0.22429786],
       [ 3.12583359,  1.61536696, -0.48171274, ..., -0.30977728,
         1.39781544,  0.6446948 ],
       [-0.41804501,  1.75517227,  1.61536696, ..., -0.34059597,
         0.32688773, -0.39738175]])

In [132]:
#test set transformation and scaling
testData = transform_data(testData)
testData_sessions_item = testData[['session_id', 'item_id', 'clickout']]
X_test = testData[features]
y_test  = testData[label]

X_test_scaled = full_pipeline.fit_transform(X_test)
X_test_scaled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation

array([[ 0.61662666, -1.59967016, -1.45989388, ..., -0.48171317,
        -0.44047679,  0.56633078],
       [-0.55109476, -1.45989388, -1.18034133, ..., -1.40708634,
        -0.68811761, -1.06414473],
       [-0.34876679, -1.3201176 , -1.3201176 , ...,  0.21231671,
        -0.67841932, -2.9568702 ],
       ...,
       [ 0.39695629,  1.47540792,  1.75496048, ..., -0.36408099,
         1.44579983,  0.33335827],
       [ 0.28134031,  1.6151842 , -0.7610125 , ..., -1.40708634,
         1.40074567,  0.24833711],
       [ 1.07330979,  1.75496048,  0.07764516, ..., -0.90880848,
         2.0271444 ,  0.54135574]])

## Mean Reciprocal Rank
Mean Reciprocal Rank is a measure to evaluate systems that return a ranked list of answers to queries.

In [0]:
#function is from this page https://gist.github.com/bwhite/3726239
def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item
    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75
    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])

## Evaluating Models

In [0]:
testData_sessions_item

In [0]:
def get_probabilities(model_name, X, session_item_dataset):
  '''
  Desc: function that gets the probability of each item being selected by the user, rerank the items in the session based on the probabilites

  Input: model_path: String with the name of the stored model
         X: array of scaled features of the dataset
         session_item_dataset: Pandas Dataframe with the sessions, items, and clickout
        
  Output: clickout_rank: List of lists that carries which item was selected in which rank
  '''
  model = joblib.load(model_name)
  BothProbabilities = model.predict_proba(X)
  Probabilities = [Probability[1] for Probability in BothProbabilities]
  session_item_dataset['probability'] = Probabilities
  clickout_rank = session_item_dataset.groupby(['session_id'], sort=False).apply(lambda x: (x.sort_values('probability', ascending=False))).clickout
  clickout_rank = clickout_rank.reset_index().groupby('session_id').clickout.apply(list).values.tolist()
  return clickout_rank

### Without Resampling

#### Logistic Regression

Just for clarification of what the get_probabilities function does, output of each step will be displayed, but for the next models, the function will be used.

In [0]:
LR_model = joblib.load('LR_model.pkl')
Predictions = LR_model.predict(X_val_scaled)
BothProbabilities = LR_model.predict_proba(X_val_scaled)
Probabilities = [Probability[1] for Probability in BothProbabilities]
Probabilities

In [39]:
valData_sessions_item['probability'] = Probabilities
valData_sessions_item.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,session_id,item_id,clickout,probability
0,06e7c29170946,10091602,0,0.818978
1,06e7c29170946,6625240,0,0.06236
2,06e7c29170946,9386776,0,0.038675
3,06e7c29170946,3954788,0,0.046309
4,06e7c29170946,9776792,0,0.083385


In [49]:
valData_sessions_item.groupby(['session_id'], sort=False).apply(lambda x: (x.sort_values('probability', ascending=False)))

Unnamed: 0_level_0,Unnamed: 1_level_0,session_id,item_id,clickout,probability
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0000be39860d7,1732864,0000be39860d7,79237,1,0.901286
0000be39860d7,1732860,0000be39860d7,445081,0,0.234337
0000be39860d7,1732845,0000be39860d7,1221442,0,0.116164
0000be39860d7,1732850,0000be39860d7,3153188,0,0.073220
0000be39860d7,1732854,0000be39860d7,4920008,0,0.070863
...,...,...,...,...,...
s8u671odd7ckk,1266589,s8u671odd7ckk,9786364,0,0.002400
s8u671odd7ckk,1266590,s8u671odd7ckk,10259580,0,0.002365
s8u671odd7ckk,1266591,s8u671odd7ckk,6620598,0,0.002022
s8u671odd7ckk,1266592,s8u671odd7ckk,2325618,0,0.001707


In [83]:
clickout_rank = valData_sessions_item.groupby(['session_id'], sort=False).apply(lambda x: (x.sort_values('probability', ascending=False))).clickout
clickout_rank = clickout_rank.reset_index().groupby('session_id').clickout.apply(list).values.tolist()
clickout_rank[0:5]

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]