In [34]:
from sklearn.linear_model import RidgeCV

from collections import defaultdict

import copy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

%matplotlib inline

In [35]:
# Inputs:
val_preds_file = "../data/inferenced_outpus/val_merged.csv"
test_preds_file = "../data/inferenced_outpus/test_merged.csv"

In [36]:
ordered_columns = ["user_id", "movie_id", "y_true", "CF_Item", "CF_User", "LFM", "CB_Plot", "CB_Poster"]

# Load validation preds df:
val_preds_df = pd.read_csv(val_preds_file)[ordered_columns]
test_preds_df = pd.read_csv(test_preds_file)[ordered_columns]

# These are the features
model_col_names = ["CF_Item", "CF_User", "LFM", "CB_Plot", "CB_Poster"]

# 'y_true' is the ground-truth.
# index (key) = (user_id, movie_id)

# Pre-analysis: Building Aspect based getter functions.

In [38]:
raintgs_mat_train_file = "../data/rating_mats/ratings_mat_train.pickle"

with open(raintgs_mat_train_file, "rb") as fp:
    ratings_mat_train = pickle.load(fp)

inv_ratings_mat_train = defaultdict(dict)
for user_id, item_ratings in ratings_mat_train.items():
    for item_id, rating in item_ratings.items():
        inv_ratings_mat_train[item_id][user_id] = rating


##### Aspect: User

In [39]:
def get_user_aspect_func():
    user_ids = []
    num_movies = []
    for user_id, item_ratings in ratings_mat_train.items():
        user_ids.append(user_id)
        num_movies.append(len(item_ratings))

    user_aspect_counts = pd.DataFrame({"user_id": user_ids, "num_movies": num_movies})
    print(user_aspect_counts.num_movies.describe())

    user_aspect_counts['qcut_label'] = pd.qcut(user_aspect_counts.num_movies.rank(method='first'), 100, labels=False)
    user_aspect_counts = user_aspect_counts.set_index("user_id")

    def get_user_aspect_qcut(user_id):
        return user_aspect_counts.loc[user_id]['qcut_label']

    print("\nUse func: get_user_aspect_qcut(user_id).")
    return get_user_aspect_qcut


get_user_aspect_qcut = get_user_aspect_func()

count    270646.000000
mean         91.348385
std         195.506129
min           1.000000
25%          14.000000
50%          28.000000
75%          88.000000
max       17384.000000
Name: num_movies, dtype: float64

Use func: get_user_aspect_qcut(user_id).


##### Aspect: Movie

In [40]:
def get_movie_aspect_func():
    movie_ids = []
    num_users = []

    for movie_id, user_ratings in inv_ratings_mat_train.items():
        movie_ids.append(movie_id)
        num_users.append(len(user_ratings))

    movie_aspect_counts = pd.DataFrame({"movie_id": movie_ids, "num_users": num_users})
    print(movie_aspect_counts.num_users.describe())

    movie_aspect_counts['qcut_label'] = pd.qcut(movie_aspect_counts.num_users.rank(method='first'), 100, labels=False)
    movie_aspect_counts = movie_aspect_counts.set_index("movie_id")

    def get_movie_aspect_qcut(movie_id):
        return movie_aspect_counts.loc[movie_id]['qcut_label']

    print("\nUse func: get_movie_aspect_qcut(movie_id).")
    return get_movie_aspect_qcut


get_movie_aspect_qcut = get_movie_aspect_func()

count    44759.000000
mean       552.359861
std       2896.279995
min          1.000000
25%          2.000000
50%          8.000000
75%         67.000000
max      87416.000000
Name: num_users, dtype: float64

Use func: get_movie_aspect_qcut(movie_id).


##### TODO: Aspect: Genre (categorical)

##### TODO: Aspect: Top Actors

# Analysis 1: Building ensemble and getting overall results table

In [46]:
# Trains on val_df using RidgeCV and predicts and appends to test_df.
def train_and_add_ensemble(val_preds_df, test_preds_df, model_col_names, ensemble_model_name):
    y_train = val_preds_df['y_true'].values
    X_train = val_preds_df[model_col_names].values
    
    ensemble_model = RidgeCV(normalize=True, alphas=[1e-3, 1e-2, 1e-1, 1], cv=5).fit(X_train, y_train)
    print("Trained Ridge ensemble model with optimal alpha found: %s" % ensemble_model.alpha_)

    X_test = test_preds_df[model_col_names].values
    y_test = test_preds_df['y_true'].values

    y_test_ensemble_preds = ensemble_model.predict(X_test)
    new_test_preds = copy.copy(test_preds_df)
    new_test_preds[ensemble_model_name] = y_test_ensemble_preds
    return new_test_preds


In [47]:
# Add ensemble model to the test_preds.
ensemble_model_name = "Ensemble"
test_preds_df = train_and_add_ensemble(val_preds_df, test_preds_df, model_col_names, ensemble_model_name)

Trained Ridge ensemble model with optimal alpha found: 0.001


In [48]:
test_preds_df.head()

Unnamed: 0,user_id,movie_id,y_true,CF_Item,CF_User,LFM,CB_Plot,CB_Poster,ensemble,Ensemble
0,232116,5344,3.0,2.747471,2.538211,2.769688,2.369373,2.307174,2.754996,2.754996
1,232116,5044,3.0,3.047979,3.271448,3.465246,2.425986,2.852801,3.485054,3.485054
2,232116,2146,3.5,3.272716,3.000591,3.230099,1.946241,2.471931,3.238311,3.238311
3,232116,2513,2.5,3.679375,3.356273,3.48644,2.086815,3.180218,3.495867,3.495867
4,232116,1825,2.5,3.393934,2.902007,3.444232,2.191972,3.565871,3.479163,3.479163
