# Model - Baseline

In [2]:
import sys
sys.path.append("../")

In [3]:
import numpy as np
import pandas as pd
from IPython.display import SVG
import matplotlib.pyplot as plt
import seaborn as sns

from reco.preprocess import encode_user_item, random_split, user_split

In [4]:
%matplotlib inline

## Prepare Data

In [5]:
df_ratings = pd.read_csv("data/ratings.csv")

In [6]:
df_ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [7]:
DATA = encode_user_item(df_ratings, "user_id", "movie_id", "rating", "unix_timestamp")

Number of users:  943
Number of items:  1682


In [8]:
DATA.head()

Unnamed: 0,user_id,movie_id,RATING,TIMESTAMP,USER,ITEM
0,196,242,3,881250949,195,241
1,186,302,3,891717742,185,301
2,22,377,1,878887116,21,376
3,244,51,2,880606923,243,50
4,166,346,1,886397596,165,345


In [40]:
# Data Splitting
#train, test = user_split(DATA, "TIMESTAMP", "USER", [0.75, 0.25])
train, test = random_split(DATA, [0.75, 0.25])

In [41]:
train.shape, test.shape

((75000, 7), (25000, 7))

## Build & Train Model 

- Baseline (Rating): We will use each user's **mean rating** as the baseline prediction.
- Baseline (Ranking): We will use the **top-k popular items** as the baseline ranking


In [42]:
def average_rating_model(train):
    users_ratings = train.groupby(["USER"])["RATING"].mean()
    users_ratings = users_ratings.reset_index()
    users_ratings.rename(columns = {'RATING': 'RATING_PRED'}, inplace = True)

    return users_ratings

In [43]:
def popular_item_model(train):
    item_counts = (train.groupby("ITEM")
                   .count()
                   .reset_index()
                   .sort_values(ascending = False, by = "USER"))
    item_counts = item_counts[["ITEM", "USER"]]
    item_counts.columns = ['ITEM', 'RATING_PRED']
        
    return item_counts

### Generate Prediction: Rating & Ranking

In [44]:
predictions_ratings = average_rating_model(train)
predictions_ratings.head()

Unnamed: 0,USER,RATING_PRED
0,0,3.586066
1,1,3.672727
2,2,2.78
3,3,4.380952
4,4,2.847134


In [45]:
predictions_ranking = popular_item_model(train)
predictions_ranking.head()

Unnamed: 0,ITEM,RATING_PRED
49,49,437
180,180,386
257,257,385
287,287,373
99,99,372


## Evaluate Rating & Ranking 

In [46]:
rating_evaluate_df = pd.merge(test, predictions_ratings, on=['USER'], how='inner')
rating_evaluate_df.head()

Unnamed: 0,user_id,movie_id,RATING,TIMESTAMP,USER,ITEM,split_index,RATING_PRED
0,498,317,3,881957625,497,316,1,3.322314
1,498,179,4,881961133,497,178,1,3.322314
2,498,174,3,881956953,497,173,1,3.322314
3,498,423,3,881957267,497,422,1,3.322314
4,498,181,2,881955014,497,180,1,3.322314


In [47]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [48]:
mean_absolute_error(rating_evaluate_df.RATING, rating_evaluate_df.RATING_PRED)

0.8365870134614605

### For Ranking

In [49]:
from reco.evaluate import user_item_crossjoin, filter_by, get_top_k_items
from reco.evaluate import precision_at_k, recall_at_k, ndcg_at_k

In [50]:
def recommend_topk(data, train, k=5):
    
    """
    Params:
        data (pandas.DataFrame): DataFrame of entire rating data
        train (pandas.DataFrame): DataFrame of train rating data
        k (int): number of items for each user

    Returns:
        pd.DataFrame: DataFrame of top k items for each user, sorted by `col_user` and `rank`
    
    """
    
    # Create the crossjoin for user-item
    user_item = user_item_crossjoin(data)
    
    # Get ranking prediction for each user-item,
    all_predictions = pd.merge(user_item, predictions_ranking, on="ITEM", how="left")
    
    # Handle Missing Values
    all_predictions.fillna(0, inplace=True)
    
    # Filter already seen items
    all_predictions_unseen = filter_by(all_predictions, train, ["USER", "ITEM"])
    
    
    recommend_topk_df = get_top_k_items(all_predictions_unseen, "USER", "RATING_PRED", k=5)
    
    return recommend_topk_df


In [51]:
ranking_topk = recommend_topk(DATA, train, 10)

In [52]:
ranking_topk.head()

Unnamed: 0,USER,ITEM,RATING_PRED,rank
0,0,287,373.0,1
1,0,293,355.0,2
2,0,285,352.0,3
3,0,299,306.0,4
4,0,312,272.0,5


In [53]:
eval_precision = precision_at_k(test, ranking_topk, k=10)
eval_recall = recall_at_k(test, ranking_topk, k=10)
eval_ndcg = ndcg_at_k(test, ranking_topk, k=10)

print("NDCG@K:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')


NDCG@K:	0.170889
Precision@K:	0.117024
Recall@K:	0.069829
