In [4]:
from collections import defaultdict

import pickle

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [6]:
raintgs_mat_train_file = "../data/rating_mats/ratings_mat_train.pickle"

# Load the pickled training ratings. The loops below treat the object as a
# mapping user_id -> {item_id: rating}.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open(raintgs_mat_train_file, "rb") as fp:
    ratings_mat_train = pickle.load(fp)

# Build the transposed view, item_id -> {user_id: rating}, so the raters of an
# item can be read without scanning every user.
inv_ratings_mat_train = defaultdict(dict)
for user_id, item_ratings in ratings_mat_train.items():
    for item_id, rating in item_ratings.items():
        raters_of_item = inv_ratings_mat_train[item_id]
        raters_of_item[user_id] = rating


# Aspect: User

In [55]:
def get_user_aspect_func(ratings_mat=None, num_bins=100):
    """Build a lookup from user id to a rating-count quantile bucket.

    Every user is scored by the number of items they rated; users are then
    split into ``num_bins`` quantile buckets by the rank of that count.
    A ``describe()`` summary of the counts is printed as a side effect.

    Args:
        ratings_mat: Mapping ``user_id -> {item_id: rating}``. When omitted,
            the notebook-level ``ratings_mat_train`` is used.
        num_bins: Number of quantile buckets handed to ``pd.qcut``.

    Returns:
        A function ``get_user_aspect_qcut(user_id)`` that returns the integer
        bucket label produced by ``pd.qcut(..., labels=False)`` for that user.
        Looking up an id that is not in ``ratings_mat`` raises ``KeyError``.
    """
    if ratings_mat is None:
        # Backward-compatible default: the matrix loaded earlier in the notebook.
        ratings_mat = ratings_mat_train

    user_aspect_counts = pd.DataFrame({
        "user_id": list(ratings_mat.keys()),
        "num_movies": [len(item_ratings) for item_ratings in ratings_mat.values()],
    })
    print(user_aspect_counts.num_movies.describe())

    # rank(method='first') gives tied counts distinct ranks (ties broken by row
    # order), so qcut is fed unique values instead of heavily repeated counts.
    user_aspect_counts['qcut_label'] = pd.qcut(
        user_aspect_counts.num_movies.rank(method='first'), num_bins, labels=False
    )
    user_aspect_counts = user_aspect_counts.set_index("user_id")

    def get_user_aspect_qcut(user_id):
        """Return the quantile bucket label of ``user_id``."""
        # Single .loc[row, col] lookup: avoids materialising the whole row
        # as an intermediate Series (chained indexing) on every call.
        return user_aspect_counts.loc[user_id, 'qcut_label']

    print("\nUse func: get_user_aspect_qcut(user_id).")
    return get_user_aspect_qcut

# Build the user lookup once; this also prints the describe() summary of the per-user counts.
get_user_aspect_qcut = get_user_aspect_func()

count    270646.000000
mean         91.348385
std         195.506129
min           1.000000
25%          14.000000
50%          28.000000
75%          88.000000
max       17384.000000
Name: num_movies, dtype: float64
Use func: get_user_aspect_qcut(user_id).


# Aspect: Movie

In [56]:
def get_movie_aspect_func(inv_ratings_mat=None, num_bins=100):
    """Build a lookup from movie id to a rater-count quantile bucket.

    Every movie is scored by the number of users who rated it; movies are then
    split into ``num_bins`` quantile buckets by the rank of that count.
    A ``describe()`` summary of the counts is printed as a side effect.

    Args:
        inv_ratings_mat: Mapping ``movie_id -> {user_id: rating}``. When
            omitted, the notebook-level ``inv_ratings_mat_train`` is used.
        num_bins: Number of quantile buckets handed to ``pd.qcut``.

    Returns:
        A function ``get_movie_aspect_qcut(movie_id)`` that returns the integer
        bucket label produced by ``pd.qcut(..., labels=False)`` for that movie.
        Looking up an id that is not in ``inv_ratings_mat`` raises ``KeyError``.
    """
    if inv_ratings_mat is None:
        # Backward-compatible default: the inverted matrix built earlier in the notebook.
        inv_ratings_mat = inv_ratings_mat_train

    movie_aspect_counts = pd.DataFrame({
        "movie_id": list(inv_ratings_mat.keys()),
        "num_users": [len(user_ratings) for user_ratings in inv_ratings_mat.values()],
    })
    print(movie_aspect_counts.num_users.describe())

    # rank(method='first') gives tied counts distinct ranks (ties broken by row
    # order), so qcut is fed unique values instead of heavily repeated counts.
    movie_aspect_counts['qcut_label'] = pd.qcut(
        movie_aspect_counts.num_users.rank(method='first'), num_bins, labels=False
    )
    movie_aspect_counts = movie_aspect_counts.set_index("movie_id")

    def get_movie_aspect_qcut(movie_id):
        """Return the quantile bucket label of ``movie_id``."""
        # Single .loc[row, col] lookup: avoids materialising the whole row
        # as an intermediate Series (chained indexing) on every call.
        return movie_aspect_counts.loc[movie_id, 'qcut_label']

    print("\nUse func: get_movie_aspect_qcut(movie_id).")
    return get_movie_aspect_qcut

# Build the movie lookup once; this also prints the describe() summary of the per-movie counts.
get_movie_aspect_qcut = get_movie_aspect_func()

count    44759.000000
mean       552.359861
std       2896.279995
min          1.000000
25%          2.000000
50%          8.000000
75%         67.000000
max      87416.000000
Name: num_users, dtype: float64

Use func: get_movie_aspect_qcut(movie_id).


# Aspect: Genre (categorical)

# Aspect: Top Actors

In [70]:
actors_one_hot_file = "../data/actor_one_hot.pickle"
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open(actors_one_hot_file, "rb") as fp:
    actors_one_hot_embeddigns = pickle.load(fp)

with open("../data/rating_mats/orig_to_new_relabel_map.pickle", "rb") as fp:
    orig_to_new_labebl_map = pickle.load(fp)

# Re-key the embeddings through the relabel map. Keys are converted with int()
# before the lookup; keys without a mapping are collected in skipped_keys and
# left out of the re-keyed dict.
skipped_keys = []
updated_actors_one_hot_embeddigns = {}
for key, val in actors_one_hot_embeddigns.items():
    orig_id = int(key)
    if orig_id not in orig_to_new_labebl_map:
        skipped_keys.append(orig_id)
        continue
    new_id = orig_to_new_labebl_map[orig_id]
    updated_actors_one_hot_embeddigns[new_id] = val

# Persist the re-keyed embeddings next to the original file.
with open("../data/updated_actor_one_hot.pickle", "wb") as fp:
    pickle.dump(updated_actors_one_hot_embeddigns, fp)


In [71]:
# Number of embedding keys that had no entry in the relabel map.
len(skipped_keys)

722

In [72]:
# Largest relabeled key. Iterating a dict yields its keys, so the built-in
# max() needs no intermediate list or NumPy array.
max(updated_actors_one_hot_embeddigns)

45114

In [75]:
# Spot-check a single re-keyed entry (key 121) by flattening its stored array.
updated_actors_one_hot_embeddigns[121].ravel()

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.