## Appending CSV files

In [72]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import roc_auc_score

In [105]:
# Input folder whose files are read and concatenated below, and the single CSV
# the concatenated result is written to.
orig_folder = '../../data/CF_User/CFU_val_temp_preds_user/'
final_file = '../../data/CF_User/CFU_val_preds_user.csv'

In [106]:
# os.listdir returns names in arbitrary order and also yields sub-directories and
# hidden entries (e.g. a .ipynb_checkpoints folder). Sort the names and keep only
# visible regular files so the row order of the result is reproducible and
# read_csv is only ever handed files.
file_paths = [
    os.path.join(orig_folder, fn)
    for fn in sorted(os.listdir(orig_folder))
    if not fn.startswith('.') and os.path.isfile(os.path.join(orig_folder, fn))
]

In [107]:
# Load every listed file into its own DataFrame, keeping the listing order.
parts = list(map(pd.read_csv, file_paths))

In [108]:
# Stack the parts row-wise. ignore_index=True gives the result a fresh 0..n-1
# index; without it each part keeps its own index and the labels repeat, which
# makes label-based lookups on final_df ambiguous.
final_df = pd.concat(parts, ignore_index=True)

In [109]:
# Persist the concatenated frame; index=False keeps the pandas index out of the file.
final_df.to_csv(final_file, index=False)

In [110]:
# Sanity check: total number of rows after concatenation.
len(final_df)

639494

## Joining CSV files

In [111]:
import pandas as pd
import numpy as np
import os

In [132]:
# Input folder whose files are read and merged column-wise below, and the CSV
# the merged result is written to.
# NOTE: these names reuse (and overwrite) the ones from the appending section.
orig_folder = '../../data/test_stuff/'
final_file = '../../data/merged.csv'

In [113]:
# os.listdir returns names in arbitrary order and also yields sub-directories and
# hidden entries (e.g. a .ipynb_checkpoints folder). Sort the names and keep only
# visible regular files so the column order of the merged result is reproducible
# and read_csv is only ever handed files.
file_paths = [
    os.path.join(orig_folder, fn)
    for fn in sorted(os.listdir(orig_folder))
    if not fn.startswith('.') and os.path.isfile(os.path.join(orig_folder, fn))
]

In [126]:
# Load every listed file into its own DataFrame, keeping the listing order.
parts = list(map(pd.read_csv, file_paths))

In [127]:
# Seed the merge with the first part; the remaining parts are joined onto it next.
merged_data = parts[0]

In [129]:
# Re-seed from the first part inside this cell so that re-running it always
# produces the same frame; merging into whatever merged_data currently holds
# would join the same parts a second time on a re-run.
merged_data = parts[0]
for part in parts[1:]:
    # Default inner join on the shared key columns: rows missing from any part
    # are dropped. validate='one_to_one' makes pandas raise if a key combination
    # is duplicated on either side.
    merged_data = pd.merge(left=merged_data, right=part,
                           on=['movie_id', 'user_id', 'y_true'],
                           validate='one_to_one')

In [130]:
# Sanity check: number of rows that survived the merge.
len(merged_data)

625399

In [133]:
# Persist the merged frame; index=False keeps the pandas index out of the file.
merged_data.to_csv(final_file, index=False)

## DF to PredictionHandler

In [139]:
class PredictionHandler(object):
    """Holds a ground-truth sequence plus equally long predictions per model.

    Everything is kept in one dict keyed by name; the ground truth lives under
    the key ``'ground_truth'`` and its length is the length every prediction
    added later must have.
    """

    def __init__(self, ground_truth):
        """Store the ground truth and remember how many entries it has.

        Args:
            ground_truth: Sized sequence (anything supporting ``len``) of true values.
        """
        self._predictions = {'ground_truth': ground_truth}
        self._num_preds = len(ground_truth)

    def add_prediction(self, model_name, predictions):
        """Register the predictions of one model, replacing any earlier entry.

        Args:
            model_name: Key under which the predictions are stored.
            predictions: Sized sequence with as many entries as the ground truth.

        Raises:
            ValueError: If ``predictions`` has a different length than the
                ground truth.
        """
        if len(predictions) != self._num_preds:
            # Raising a plain string is itself a TypeError in Python 3, which
            # hid the real message; raise a proper exception instead.
            raise ValueError("Number of predictions different from the ground truth.")
        self._predictions[model_name] = predictions

    def get_models_list(self):
        """Return a new list of all stored names, ``'ground_truth'`` included."""
        return list(self._predictions.keys())

    def get_predictions(self, model_name=None):
        """Return the values stored for ``model_name``.

        When ``model_name`` is falsy or not a known name, the whole internal
        name -> values dict is returned instead.
        """
        if model_name and model_name in self._predictions:
            return self._predictions[model_name]
        else:
            return self._predictions


def df_to_prediction_handler(df):
    """Build a PredictionHandler from a frame of keys, truth and model columns.

    The ``'y_true'`` column becomes the ground truth. Every column other than
    ``'user_id'``, ``'movie_id'`` and ``'y_true'`` is registered as one model's
    predictions under the column's own name.

    Args:
        df: DataFrame containing ``'user_id'``, ``'movie_id'``, ``'y_true'`` and
            one column per model.

    Returns:
        PredictionHandler holding array copies of the relevant columns.
    """
    ground_truth = np.array(df['y_true'].values)
    non_model_columns = ['user_id', 'movie_id', 'y_true']
    model_columns = df.drop(columns=non_model_columns).columns

    handler = PredictionHandler(ground_truth=ground_truth)
    for name in model_columns:
        model_values = np.array(df[name].values)
        handler.add_prediction(name, model_values)

    return handler

In [140]:
import pandas as pd
import numpy as np
import os

In [145]:
# Wrap the merged frame: 'y_true' becomes the ground truth, each remaining
# non-key column becomes one model's predictions.
merged_handler = df_to_prediction_handler(merged_data)

3.22 ms ± 24.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [144]:
# Length of every stored sequence (ground truth included); all should match.
[len(merged_handler.get_predictions(name))
 for name in merged_handler.get_models_list()]

[625399, 625399, 625399, 625399]

In [148]:
# Names held by the handler; the list includes the 'ground_truth' entry.
merged_handler.get_models_list()

['ground_truth', 'CF_User', 'CF_Item', 'LFM']

## Performance Analyzer

In [147]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, r2_score
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
class PerformanceAnalyzer(object):
    """Scores every model in a prediction handler against the ground truth.

    The handler must offer ``get_models_list()`` and ``get_predictions(name)``
    and store the ground truth under the name ``'ground_truth'``.
    """

    def __init__(self, prediction_handler: 'PredictionHandler', roc_thresholds=None,
                 rmse_thresholds=None, r2_thresholds=None, metric_name='roc_auc',
                 **kwargs):
        """Remember the handler, the model names and the scoring configuration.

        Args:
            prediction_handler: Source of ground truth and model predictions.
            roc_thresholds: Defaults to ``[3]``. Stored only; the scoring
                methods of this class do not read it.
            rmse_thresholds: Defaults to ``[0]``. Stored only.
            r2_thresholds: Defaults to ``[0]``. Stored only.
            metric_name: ``'euclidean'`` selects the Euclidean score in
                ``get_scores``; any other value selects ROC AUC.
            **kwargs: Extra options; ``threshold`` overrides the rating cut-off
                (default 3) used to binarise the ground truth for ROC AUC.

        Raises:
            ValueError: If the handler does not list ``'ground_truth'``.
        """
        self._prediction_handler = prediction_handler
        self._ground_truth_label = 'ground_truth'
        # Copy before removing so a handler-owned list is never modified.
        self._model_names = list(self._prediction_handler.get_models_list())
        self._model_names.remove(self._ground_truth_label)
        # None sentinels instead of list defaults: a list default is created
        # once and would be shared by every instance.
        self._roc_thresholds = [3] if roc_thresholds is None else roc_thresholds
        self._rmse_thresholds = [0] if rmse_thresholds is None else rmse_thresholds
        self._r2_thresholds = [0] if r2_thresholds is None else r2_thresholds
        # get_scores and _roc_auc_score read these two attributes; they were
        # never assigned before, so scoring always failed with AttributeError.
        self._metric_name = metric_name
        self._kwargs = kwargs

    def _euclidean_score(self, y_true, y_pred):
        """Return the Euclidean distance between predictions and ground truth."""
        diff = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
        return np.sqrt(np.sum(np.square(diff)))

    def _roc_auc_score(self, y_true, y_pred):
        """Return the area under the ROC curve for thresholded ground truth.

        Ground-truth values at or above the threshold count as the positive
        class; the predictions are used as the ranking score.
        """
        threshold = self._kwargs.get('threshold', 3)

        # Threshold ground truth ratings
        y_true_thresh = np.where(np.asarray(y_true) >= threshold, 1, 0)

        # Scale the predictions to bring them to 0-1 range from 0-5 range
        y_pred_scaled = np.asarray(y_pred) / 5

        # Calculate the area under the ROC curve
        return roc_auc_score(y_true_thresh, y_pred_scaled)

    def get_scores(self):
        """Return ``{model_name: score}`` for every model except the ground truth."""
        if self._metric_name == 'euclidean':
            metric = self._euclidean_score
        else:
            metric = self._roc_auc_score

        y_true = self._prediction_handler.get_predictions(self._ground_truth_label)

        scores = {}
        for model_name in self._model_names:
            y_pred = self._prediction_handler.get_predictions(model_name)
            scores[model_name] = metric(y_true, y_pred)
        return scores

    def get_models_list(self):
        """Return a new list with the names of the models being scored."""
        return list(self._model_names)
    