<a href="https://colab.research.google.com/github/ZeyadSabbah/TrivagoRecommenderSystem/blob/master/FinalEvaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/My Drive/Trivago/Project/TrivagoRecommenderSystem

/content/drive/My Drive/Trivago/Project/TrivagoRecommenderSystem


## Loading Libraries & Datasets

In [0]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import math
import matplotlib.pyplot as plt
from datetime import datetime
import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [0]:
TrainMoreDataFilepath = './Datasets/clean_data/Sets/trainMore.csv'
valFilepath = './Datasets/clean_data/Sets/val.csv'
testFilepath = './Datasets/clean_data/Sets/test.csv'

TrainMoreData = pd.read_csv(TrainMoreDataFilepath)
valData = pd.read_csv(valFilepath)
testData = pd.read_csv(testFilepath)

## Validation & Test sets' Transformation & Scaling

### Preparation

In [0]:
#declaring features and label
features = TrainMoreData.drop(columns=['session_id', 'item_id', 'clickout']).columns.tolist()
label = ['clickout']

#dropping highly correlated features
FeaturesToDrop = ['NumberInImpressions', 'NumberInReferences', 'MeanPrice', 'MinPrice']
for feature in FeaturesToDrop:
  features.remove(feature)

X_train = TrainMoreData[features]
y_train = TrainMoreData[label]

valData_sessions_item = valData[['session_id', 'item_id', 'clickout']]
X_val = valData[features]
y_val = valData[label]

testData_sessions_item = testData[['session_id', 'item_id', 'clickout']]
X_test = testData[features]
y_test  = testData[label]

### Using SelectKBest

In [6]:
bestfeatures = SelectKBest(score_func=chi2, k=8)
fit = bestfeatures.fit(X_train, y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score'] 
print(featureScores.nlargest(8,'Score'))

                    Specs         Score
4           item_duration  6.554425e+08
6       item_interactions  1.329537e+07
10       NumberAsClickout  3.149896e+06
11  NumberAsFinalClickout  2.388846e+06
1               item_rank  2.196057e+06
0                   price  7.123832e+05
5   item_session_duration  6.273780e+05
8                top_list  5.268643e+05


### Scaling

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

#features by SelectKBest
features = ['item_duration','item_interactions','NumberAsClickout','NumberAsFinalClickout','item_rank','price','item_session_duration','top_list']
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('std_scaler', StandardScaler()),
])

from sklearn.compose import ColumnTransformer
full_pipeline = ColumnTransformer([
("num", num_pipeline, list(X_train[features]))
])

# training set scaling
X_train_scaled = full_pipeline.fit_transform(X_train[features])

# validation set scaling
X_val_scaled = full_pipeline.fit_transform(X_val[features])

# test set scaling
X_test_scaled = full_pipeline.fit_transform(X_test[features])

## Evaluation

In [0]:
#function is from this repo https://gist.github.com/bwhite/3726239
def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item
    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75
    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])

def get_probabilities(model_path, X, session_item_dataset):
  '''
  Desc: function that gets the probability of each item being selected by the user, rerank the items in the session based on the probabilites

  Input: model_path: String with the name of the stored model
         X: array of scaled features of the dataset
         session_item_dataset: Pandas Dataframe with the sessions, items, and clickout
        
  Output: clickout_rank: List of lists that carries which item was selected in which rank
          RecommendationsDF: Pandas Dataframe to be transformed and merged to the Clickout Dataframe
  '''
  model = joblib.load(model_path)
  BothProbabilities = model.predict_proba(X)
  Probabilities = [Probability[1] for Probability in BothProbabilities]
  session_item_dataset['probability'] = Probabilities
  RecommendationsDF = session_item_dataset.groupby(['session_id'], sort=False).apply(lambda x: (x.sort_values('probability', ascending=False)))
  clickout_rank = RecommendationsDF.clickout
  clickout_rank = clickout_rank.reset_index().groupby('session_id').clickout.apply(list).values.tolist()
  return clickout_rank, RecommendationsDF

def ClassifReport(model_path, X, y):
  global y_pred
  model = joblib.load(model_path)
  y_pred = model.predict(X)
  return classification_report(y, y_pred)

def PrintMetrics(model_path, X, y, session_item_dataset):
  clickout_rank, RecommendationsDF = get_probabilities(model_path, X, session_item_dataset)
  MeanReciprocalRank = mean_reciprocal_rank(clickout_rank)
  print('Mean Reciprocal Rank : ', MeanReciprocalRank)
  print('=================================================')
  ClassificationReport = ClassifReport(model_path, X, y)
  print('Classification Report')
  print('=================================================')
  print(ClassificationReport)
  ConfMatrix = confusion_matrix(y, y_pred, labels=[1, 0])
  print('Confusion Matrix')
  print('================================================')
  print(ConfMatrix)
  return

In [0]:
def ConcatProperties(data, ClustersOrPCA, X_scaled):
  '''
  Desc: function that concatenates either clusters or PCA of properties

  Input: data: Pandas DataFrame with the data being used to concatenate properties to
         ClustersOrPCA: String with which type to be used of properties
         X_scaled: numpy array of the features of the dataframe used
         
  Output: X_scaled_prop: Pandas DataFrame with the properties of different items being concatenated to the features dataframe
  '''
  if ClustersOrPCA == 'Clusters':
    PropertiesFilePath = './Datasets/clean_data/ItemsPropertiesSimilarities/PropertiesClusters.csv'
    DF = pd.read_csv(PropertiesFilePath)
    DF = DF.rename(columns={'item':'item_id'})
  elif ClustersOrPCA == 'PCA':
    PropertiesFilePath = './Datasets/clean_data/ItemsPropertiesSimilarities/PCA_Properties.csv'
    DF = pd.read_csv(PropertiesFilePath)
    DF.drop(columns='Unnamed: 0', inplace=True)
  X_scaled = pd.DataFrame(X_scaled, columns=features)
  X_scaled['item_id'] = data['item_id'].values.tolist()
  X_scaled_prop = X_scaled.merge(DF, on='item_id', how='left')
  if ClustersOrPCA == 'Clusters':
    X_scaled_prop.fillna(0, inplace=True)
  elif ClustersOrPCA == 'PCA':
    X_scaled_prop.fillna(X_scaled_prop.mean(), inplace=True)
  X_scaled_prop.drop(columns='item_id', inplace=True)
  return X_scaled_prop

In [0]:
X_val_PCA = ConcatProperties(valData, 'PCA', X_val_scaled)
X_test_PCA = ConcatProperties(testData, 'PCA', X_test_scaled)

X_val_clust = ConcatProperties(valData, 'Clusters', X_val_scaled)
X_test_clust = ConcatProperties(testData, 'Clusters', X_test_scaled)

### PCA

In [0]:
PrintMetrics('./modelsProperties/RF_PCA.pkl', X_val_PCA, y_val, valData_sessions_item)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Mean Reciprocal Rank :  0.6051184239562165
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98   3249590
           1       0.75      0.05      0.09    149676

    accuracy                           0.96   3399266
   macro avg       0.85      0.52      0.54   3399266
weighted avg       0.95      0.96      0.94   3399266

Confusion Matrix
[[   7329  142347]
 [   2506 3247084]]


### Clusters

In [0]:
PrintMetrics('./modelsProperties/RF_Clusters.pkl', X_val_clust, y_val, valData_sessions_item)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Mean Reciprocal Rank :  0.5933660964944267
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98   3249590
           1       0.67      0.07      0.13    149676

    accuracy                           0.96   3399266
   macro avg       0.81      0.54      0.55   3399266
weighted avg       0.95      0.96      0.94   3399266

Confusion Matrix
[[  10754  138922]
 [   5309 3244281]]


## Final Score

In [12]:
PrintMetrics('./modelsProperties/RF_PCA.pkl', X_test_PCA, y_test, testData_sessions_item)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Mean Reciprocal Rank :  0.609054711589517
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98   2663846
           1       0.75      0.05      0.09    122815

    accuracy                           0.96   2786661
   macro avg       0.85      0.52      0.53   2786661
weighted avg       0.95      0.96      0.94   2786661

Confusion Matrix
[[   5946  116869]
 [   1997 2661849]]


## Submission

In the future, the challenge might be a public competition, in that case transformation of the output is needed.

In [13]:
test_set_filepath = './Datasets/raw_data/test.csv'
test_set = pd.read_csv(test_set_filepath)
test_set.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,004A07DM0IDW,1d688ec168932,1541555614,1,interaction item image,2059240,CO,"Santa Marta, Colombia",mobile,,,
1,004A07DM0IDW,1d688ec168932,1541555614,2,interaction item image,2059240,CO,"Santa Marta, Colombia",mobile,,,
2,004A07DM0IDW,1d688ec168932,1541555696,3,clickout item,1050068,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
3,004A07DM0IDW,1d688ec168932,1541555707,4,clickout item,1050068,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
4,004A07DM0IDW,1d688ec168932,1541555717,5,clickout item,1050068,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...


In [0]:
def transform_Recommendations(clickout_dataframe, RecommendationsDF):
  ListOfItems = RecommendationsDF.reset_index(drop=True)[['session_id', 'item_id']].groupby('session_id', sort=False).item_id.apply(pd.Series.tolist).tolist()
  SessionsListOfItems = pd.DataFrame({'session_id':RecommendationsDF.session_id.unique().tolist(),
                                      'item_recommendations':ListOfItems})
  SessionsListOfItems.item_recommendations = SessionsListOfItems.item_recommendations.apply(lambda x: ' '.join(x))
  data = clickout_dataframe.merge(SessionsListOfItems, on='session_id', how='left')
  return data
  
def get_probabilities_submission(model_path, X, session_item_dataset):
  '''
  Desc: function that gets the probability of each item being selected by the user, rerank the items in the session based on the probabilites

  Input: model_path: String with the name of the stored model
         X: array of scaled features of the dataset
         session_item_dataset: Pandas Dataframe with the sessions, items, and clickout
        
  Output: clickout_rank: List of lists that carries which item was selected in which rank
          RecommendationsDF: Pandas Dataframe to be transformed and merged to the Clickout Dataframe
  '''
  model = joblib.load(model_path)
  BothProbabilities = model.predict_proba(X)
  Probabilities = [Probability[1] for Probability in BothProbabilities]
  session_item_dataset['probability'] = Probabilities
  RecommendationsDF = session_item_dataset.groupby(['session_id'], sort=False).apply(lambda x: (x.sort_values('probability', ascending=False)))
  return RecommendationsDF

In [0]:
ListOfItems = Output.reset_index(drop=True)[['session_id', 'item_id']].groupby('session_id', sort=False).item_id.apply(pd.Series.tolist).tolist()
SessionsListOfItems = pd.DataFrame({'session_id':Output.session_id.unique().tolist(),
                                    'item_recommendations':ListOfItems})
SessionsListOfItems.item_recommendations = SessionsListOfItems.item_recommendations.apply(lambda x: ' '.join(x))
SessionsListOfItems.head()

In [0]:
from data_transformation import data_transformation

test_clickout = test_set[test_set.action_type=='clickout item'].groupby('session_id').tail(1)
test_clickout = test_clickout[['user_id', 'session_id', 'timestamp', 'step']]
test_set_transformed = data_transformation.transform_data(test_set)
test_session_item = test_set_transformed[['session_id', 'item_id']]
X_test_submission = test_set_transformed[features]
X_test_submission_scaled = full_pipeline.fit_transform(X_test_submission)
RecommendationsDF = get_probabilities_submission('./modelsProperties/RF_PCA.pkl', X_test_submission_scaled, test_session_item)
SubmissionDF = transform_Recommendations(test_clickout, RecommendationsDF)
SubmissionDF.head()

In [38]:
ConcatProperties(X_test_submission_scaled)

array([[-0.04855025,  0.63374487, -0.37165291, ..., -0.26967466,
        -0.09979741,  1.89355082],
       [-0.04855025, -0.11493457,  0.87157867, ..., -0.40391416,
        -0.09979741,  1.89355082],
       [-0.04855025, -0.11493457,  0.44706057, ..., -0.39272753,
        -0.09979741,  1.89355082],
       ...,
       [-0.04855025, -0.11493457, -0.12907114, ..., -0.17458835,
        -0.09979741, -0.52810835],
       [-0.04855025, -0.11493457, -0.12907114, ...,  0.20575688,
        -0.09979741, -0.52810835],
       [-0.04855025, -0.11493457, -0.0684257 , ...,  0.32880975,
        -0.09979741, -0.52810835]])

In [0]:
SubmissionDF.to_csv('./Datasets/clean_data/RecommendationsSubmission.csv', index=False)