In [None]:
# Initialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# data science imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
# import unicodedata

In [None]:
# Set Locations
# `system_path` is the prefix under which the project folder lives; it is
# empty here, so `data_path` begins directly with the project directory.
system_path = ''
reviews_subdir = '/Movie-Recommendation-System/Data/Reviews-1M/'
data_path = system_path + reviews_subdir

In [None]:
# Load the three input tables. Ids are stored as prefixed strings
# ('itemId_<n>' / 'userId_<n>') so they can be used as row/column labels later.

# Import one-hot encoded movies metadata
movies_df = pd.read_csv(data_path + 'movies_metadata_ohe.csv')
movies_df['itemId'] = movies_df.itemId.apply(lambda x: 'itemId_' + str(x))
# Index by itemId while keeping the column, so later selections and dot
# products can match on the movie id instead of on row position.
movies_df = movies_df.set_index('itemId', drop=False)

# Import ratings data.
# '::' is a multi-character separator, which only the python parsing engine
# supports; request it explicitly instead of relying on the implicit fallback
# (which emits a ParserWarning).
ratings_df = pd.read_csv(data_path + 'ratings.dat',
                         sep='::', header=None, engine='python',
                         names=['userId', 'itemId', 'label', 'timestamp'])
ratings_df = ratings_df.drop(columns=['timestamp'])  # timestamp is not used downstream
ratings_df['userId'] = ratings_df.userId.apply(lambda x: 'userId_' + str(x))
ratings_df['itemId'] = ratings_df.itemId.apply(lambda x: 'itemId_' + str(x))

# Import users data (same '::' separated layout as the ratings file)
users_df = pd.read_csv(data_path + 'users.dat',
                       sep='::', header=None, engine='python',
                       names=['userId', 'gender', 'age_group', 'occupation', 'zipcode'])
users_df = users_df.drop(columns=['zipcode'])  # zipcode is not used downstream
users_df['userId'] = users_df.userId.apply(lambda x: 'userId_' + str(x))

In [None]:
# Preview the first five rows of the movies metadata
movies_df.head()

In [None]:
# Preview the first five rows of the users table
users_df.head()

In [None]:
# Preview the first five rows of the ratings table
ratings_df.head()

Find the unique set of movies that have ratings in ratings_df. Remove all movies without ratings from movies_df.
There are 3883 movies total, and only 3706 movies with ratings.

In [None]:
# Distinct movie ids that appear in at least one rating.
# 3706 < 3883, so some movies do not have any ratings; these movies can be
# removed for genre rating averaging.
distinct_rated_movies = ratings_df.itemId.unique()
print(len(distinct_rated_movies))
# Keep only movies that have at least one user rating. `isin` performs one
# vectorized membership test rather than scanning the id array once per movie.
movies_df_rated = movies_df.loc[movies_df.itemId.isin(distinct_rated_movies)]
print(movies_df_rated.shape)

Transform the ratings dataframe (userId, itemId, label) to a userId x itemId dataframe with the labels (ratings) as the values. Fill all missing values (items that a user has not rated) with 0s.

Then create the binarized version of this matrix, and again fill any missing values with 0s. This dataframe will have values of 1 where a rating exists, and 0 otherwise.

In [None]:
# Transform ratings dataframe to a userId x itemId dataframe with the label
# (rating) as the values; user/item pairs without a rating are filled with 0.
ratings_spread = ratings_df.pivot(index='userId', columns='itemId', values='label').fillna(0)
# Create the binary valued dataframe from the transformed ratings dataframe:
# 1.0 where a rating is present (non-zero) and 0.0 otherwise. An explicit mask
# is used instead of dividing the frame by itself and patching the 0/0 NaNs.
ratings_spread_binary = ratings_spread.ne(0).astype(float)

print('User Movie Ratings')
ratings_spread.head()

## Movie Genre Rating Summary

In [None]:
# Restrict the rated-movies frame to its genre indicator columns, i.e. those
# whose name starts with 'ml_genre' (Movie Lens) or 'imdb_genre' (IMDb).
genre_cols = [
    col for col in movies_df_rated.columns
    if col.startswith('ml_genre') or col.startswith('imdb_genre')
]
movie_genres = movies_df_rated.loc[:, genre_cols]
# (users x movies ratings) . (movies x genres)
#   -> dataframe with the sum of ratings by genre for each user
user_genre_total = ratings_spread.dot(movie_genres)
# (users x movies rated-flags) . (movies x genres)
#   -> dataframe with the count of rated movies by genre for each user
user_genre_count = ratings_spread_binary.dot(movie_genres)

In [None]:
# Preview the binarized user x movie matrix
ratings_spread_binary.head(5)

In [None]:
# Preview the movie x genre indicator matrix
movie_genres.head(5)

In [None]:
# Per-user average rating per genre: element wise (sum of ratings) / (count of
# ratings). The columns get an '_avg_rating' suffix and userId is moved from
# the index back into a regular column so it can be used as the merge key.
user_genre_avg = (
    user_genre_total
    .div(user_genre_count)
    .add_suffix('_avg_rating')
    .reset_index()
)

# Attach the genre averages to the user attributes (default inner join on userId)
users_metadata = users_df.merge(user_genre_avg, on='userId')

print('User Metadata with Genre Averages')
users_metadata.head()

In [None]:
# Drop the genre intermediates; `movie_genres` is not referenced again in the
# cells that follow, and the results already live in `users_metadata`.
del movie_genres
del user_genre_total, user_genre_count
del user_genre_avg

## Actor and Director Ratings Summary

In [None]:
# Collect the actor and director indicator columns of the rated movies,
# identified by their 'actor_' / 'director_' name prefix.
actor_cols = [col for col in movies_df_rated.columns if col.startswith('actor_')]
director_cols = [col for col in movies_df_rated.columns if col.startswith('director_')]
people_cols = actor_cols + director_cols
movie_people = movies_df_rated.loc[:, people_cols]
# Cross the binarized ratings dataframe with the people dataframe
#   - this is now a dataframe that contains counts of rated movies
#     by actor and director for each user
user_movie_people_count = ratings_spread_binary.dot(movie_people)

In [None]:
# Sanity-check fixture: item ids of all rated movies flagged as directed by
# Alfred Hitchcock (the same check works for any other director column).
hitchcock_movies = movies_df_rated.loc[
    movies_df_rated.director_Alfred_Hitchcock == 1, 'itemId'
].tolist()
print(hitchcock_movies)

In [None]:
# Determine how many ratings were made for Hitchcock movies.
# `isin` replaces a Python-level list membership test executed once per rating
# row; the selected rows are the same.
ratings_df_temp = ratings_df.loc[ratings_df.itemId.isin(hitchcock_movies)]
print('number of ratings:', len(ratings_df_temp))

In [None]:
# Check the pivot: summing the rated-flags over the Hitchcock movie columns
# (first per column, then across columns) should give the same number of
# ratings as counted directly from the ratings table.
print('number of ratings:', ratings_spread_binary[hitchcock_movies].sum().sum())

In [None]:
# Check the dot product: the per-user Hitchcock counts should add up to the
# same number of ratings again.
print('number of ratings:', user_movie_people_count['director_Alfred_Hitchcock'].sum())

In [None]:
# Total views per actor: column-wise sum of the per-user counts.
actor_views = user_movie_people_count[actor_cols].sum(axis=0)
# This total will be much greater than the total number of ratings because an
# individual movie has multiple actors.
print('total actor views: ', actor_views.sum())
# Total views per director, computed the same way.
director_views = user_movie_people_count[director_cols].sum(axis=0)
# This total may be greater than the total number of ratings if one movie had
# multiple directors.
print('total director views: ', director_views.sum())

In [None]:
# Drop the full actor/director intermediates and the Hitchcock check
# fixtures; `movie_people` and `user_movie_people_count` are rebuilt below
# for the top actors and directors only.
del movie_people, user_movie_people_count
del hitchcock_movies, ratings_df_temp

In [None]:
# Filter actors to top 200, of ~7800, based on number of views:
# order by views (descending) and keep the first 200 entries.
top_200_actors = actor_views.sort_values(ascending=False).iloc[:200]
# The index holds the actor column names
top_200_actors_names = top_200_actors.index
top_200_actors

In [None]:
# Filter directors to top 50, of ~2200, based on number of views:
# order by views (descending) and keep the first 50 entries.
top_50_directors = director_views.sort_values(ascending=False).iloc[:50]
# The index holds the director column names
top_50_directors_names = top_50_directors.index
top_50_directors

In [None]:
# Keep only the indicator columns of the top 200 actors and top 50 directors
# (actors first, then directors); everyone else is filtered out.
movie_people = movies_df_rated.loc[
    :, top_200_actors_names.tolist() + top_50_directors_names.tolist()
]
# (users x movies ratings) . (movies x people)
#   -> dataframe with the sum of ratings by actor and director for each user
user_movie_people_total = ratings_spread.dot(movie_people)
# (users x movies rated-flags) . (movies x people)
#   -> dataframe with counts of rated movies by actor and director for each user
user_movie_people_count = ratings_spread_binary.dot(movie_people)

In [None]:
# Per-user average rating per top actor/director: element wise
# (sum of ratings) / (count of ratings), with an '_avg_rating' suffix on the
# columns and userId restored as a regular column.
user_movie_people_avg_ratings = (
    user_movie_people_total
    .div(user_movie_people_count)
    .add_suffix('_avg_rating')
    .reset_index()
)

In [None]:
# Add the actor/director averages to the user metadata (joined on userId)
users_metadata = users_metadata.merge(user_movie_people_avg_ratings, on='userId')
# One-hot encode gender and age group
gender_OHE = pd.get_dummies(users_metadata['gender'], prefix='gender')
age_group_OHE = pd.get_dummies(users_metadata['age_group'], prefix='age_group')
users_metadata = pd.concat(
    [users_metadata, gender_OHE, age_group_OHE],
    axis=1,
    sort=False,
)
# Remove the raw categorical columns; note that occupation is dropped without
# being encoded.
users_metadata = users_metadata.drop(columns=['gender', 'age_group', 'occupation'])
users_metadata.head()

In [None]:
# Drop everything that is not needed for the export cells below. The
# `*_names` indexes, `genre_cols`, `movies_df` and `users_metadata` are kept.
del movie_people, user_movie_people_total, user_movie_people_count
del user_movie_people_avg_ratings
del movies_df_rated, ratings_spread, ratings_spread_binary
del top_200_actors, top_50_directors
del ratings_df, users_df
del actor_views, director_views

## Convert to Spark Dataframe and Save

In [None]:
# Turn the prefixed string ids back into integers by cutting off the
# 'userId_' / 'itemId_' prefix (7 characters each) before parsing.
users_metadata['userId'] = users_metadata['userId'].apply(
    lambda user_id: int(user_id[len('userId_'):])
)
movies_df['itemId'] = movies_df['itemId'].apply(
    lambda item_id: int(item_id[len('itemId_'):])
)

In [None]:
# Write the user metadata with its columns in sorted name order
sorted_columns = users_metadata.columns.sort_values().tolist()
users_metadata = users_metadata.loc[:, sorted_columns]
users_metadata.to_csv(data_path + 'users_metadata.csv', index=False)

In [None]:
# Export the movie subset: descriptive columns, all genre indicators, and the
# indicators of the top 50 directors and top 200 actors only.
movie_cols_to_keep = (
    ['itemId', 'title', 'imdb_id', 'imdb_rating',
     'imdb_votes', 'metascore', 'runtime', 'year']
    + genre_cols
    + top_50_directors_names.tolist()
    + top_200_actors_names.tolist()
)
movie_metadata_ohe_subset = movies_df.loc[:, movie_cols_to_keep]
movie_metadata_ohe_subset.to_csv(data_path + 'movies_metadata_ohe_subset.csv', index=False)

In [None]:
# Export a narrower movie subset holding only the id plus the genre,
# top-director and top-actor indicator columns.
genre_and_people_cols = (
    ['itemId']
    + genre_cols
    + top_50_directors_names.tolist()
    + top_200_actors_names.tolist()
)
movie_genre_and_people_metadata_ohe_subset = movies_df.loc[:, genre_and_people_cols]
movie_genre_and_people_metadata_ohe_subset.to_csv(
    data_path + 'movies_genre_and_people_metadata_ohe_subset.csv',
    index=False,
)