In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
from pathlib import Path
import os
from google.colab import drive
import re
import re, os, math, sklearn, datetime, pickle

In [2]:
from tensorflow import keras
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression

In [13]:
# Mount Google Drive inside the Colab runtime, then make the project folder the
# working directory so the relative 'data/...' and 'models/...' paths used by the
# functions in this notebook resolve against it.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/filmaholic')

Mounted at /content/drive


In [21]:
# Example input: titles the user liked, in the "Title (Year)" form that is
# matched against the 'title' column of the movie catalogue.
liked_movies = [
    'Toy Story (1995)',
    'WALL·E (2008)',
    'Ratatouille (2007)',
]

In [22]:
# Example input: titles the user disliked, in the "Title (Year)" form that is
# matched against the 'title' column of the movie catalogue.
disliked_movies = [
    'Dark Knight Rises, The (2012)',
    'Dark Knight, The (2008)',
    'Avengers, The (2012)',
]

In [16]:
def title_to_id(liked_movies, disliked_movies):
    """Attach catalogue information to two lists of movie titles.

    The catalogue is read from 'data/movies_mod.csv' (relative to the current
    working directory) and left-joined onto each list by the 'title' column.

    Args:
        liked_movies: list of titles the user liked.
        disliked_movies: list of titles the user disliked.

    Returns:
        Tuple ``(liked_movies_df, disliked_movies_df)``. Each frame starts with
        a 'title' column followed by the remaining catalogue columns. Because
        the join is a left join, a title that is not in the catalogue is kept
        as a row whose catalogue columns are NaN.
    """
    catalogue = pd.read_csv('data/movies_mod.csv')

    def with_catalogue_columns(titles):
        # Left join: one output row per requested title, matched or not.
        return pd.DataFrame({'title': titles}).merge(catalogue, on='title', how='left')

    return with_catalogue_columns(liked_movies), with_catalogue_columns(disliked_movies)

In [6]:
def preprocess_genres(liked_movies, disliked_movies):
    """Build normalised genre profiles from the user's liked and disliked titles.

    Titles are resolved with ``title_to_id``. For each list, the genre indicator
    columns of the matched movies are summed per genre and divided by the total
    number of genre flags, so a non-empty profile sums to 1.

    Titles that are not found in the catalogue are reported and ignored.
    Previously such a title produced an all-NaN row, which turned every value
    of the profile into NaN.

    Args:
        liked_movies: list of titles the user liked.
        disliked_movies: list of titles the user disliked.

    Returns:
        Tuple ``(total_user_like, total_user_dislike)`` of one-row DataFrames
        (index 0) with one column per genre. A profile is all zeros when its
        list is empty or none of its titles match the catalogue.
    """
    # Genre columns start at this position in the frames returned by title_to_id.
    # NOTE(review): presumably title, movieId, genres and YEAR come first —
    # confirm against the column order of data/movies_mod.csv.
    first_genre_column = 4

    # Fixed key order so both profiles always expose the same columns.
    genre_names = ('War', 'Animation', 'Horror', 'Sci-Fi', 'Fantasy', 'Thriller', 'Crime', 'Mystery',
                   'Documentary', 'Children', 'Action', 'Adventure', 'Musical', 'Film-Noir', 'Drama',
                   'Romance', 'Comedy', 'Western', 'None')

    liked_movies_df, disliked_movies_df = title_to_id(liked_movies, disliked_movies)

    def build_profile(movies_df):
        # Rows without a movieId are titles the left join could not match.
        unmatched = movies_df['movieId'].isna()
        if unmatched.any():
            print('Titles not found in catalogue: {}'.format(list(movies_df.loc[unmatched, 'title'])))

        genre_flags = movies_df.loc[~unmatched].iloc[:, first_genre_column:]
        per_genre_counts = genre_flags.sum()
        total_count = per_genre_counts.sum()

        profile = dict.fromkeys(genre_names, 0)
        if total_count != 0:
            for genre, count in per_genre_counts.items():
                profile[genre] = count / total_count
        return pd.DataFrame(profile, index=[0])

    return build_profile(liked_movies_df), build_profile(disliked_movies_df)


In [25]:
def _read_tags_for_movies(movies_df):
    """Return all tags of the matched movies in ``movies_df`` as one flat list."""
    tag_frames = []
    # dropna: titles that were not found in the catalogue have no movieId.
    for movie_id in movies_df['movieId'].dropna():
        tag_frames.append(pd.read_csv('data/movies_tags/{}.csv'.format(int(movie_id))))
    if not tag_frames:
        return []
    return list(pd.concat(tag_frames, ignore_index=True)['tag'])


def _most_frequent_tag_vectors(tags, vectorized_dict, limit):
    """Return the vectors of the ``limit`` most frequent tags, most frequent first."""
    from collections import Counter

    counts = Counter(tags)
    # sorted() is stable, so equally frequent tags keep first-seen order.
    ranked_tags = sorted(counts, key=lambda tag: -counts[tag])

    vectors = []
    for tag in ranked_tags[:limit]:
        try:
            vectors.append(vectorized_dict[tag])
        except KeyError:
            # No vector is known for this tag; skip it.
            continue
    return vectors


def preprocess_tags(liked_movies, disliked_movies):
    """Build the user's tag profile from liked and disliked titles.

    For each list, the tags of every matched movie are read from
    'data/movies_tags/<movieId>.csv', ranked by frequency, and the 50 most
    frequent ones are mapped to vectors through 'data/final/vectorized_dict.pkl'.
    The first 20 vectors of each side become the columns LIKE_0..LIKE_19 and
    DISLIKE_0..DISLIKE_19.

    Args:
        liked_movies: list of titles the user liked.
        disliked_movies: list of titles the user disliked.

    Returns:
        A one-row int64 DataFrame (index 0) with the 40 LIKE_/DISLIKE_ columns,
        or None (after printing 'Vector not long enough') when either side has
        fewer than 20 usable tag vectors.
    """
    top_tags = 50      # how many of the most frequent tags are looked up per side
    vector_len = 20    # how many vectors per side end up in the profile

    liked_movies_df, disliked_movies_df = title_to_id(liked_movies, disliked_movies)

    # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
    with open('data/final/vectorized_dict.pkl', 'rb') as reader:
        vectorized_dict = pickle.load(reader)

    like_tags_vectorized = _most_frequent_tag_vectors(
        _read_tags_for_movies(liked_movies_df), vectorized_dict, top_tags)
    dislike_tags_vectorized = _most_frequent_tag_vectors(
        _read_tags_for_movies(disliked_movies_df), vectorized_dict, top_tags)

    # Indices 0..vector_len-1 are used below, so exactly vector_len vectors suffice.
    if len(like_tags_vectorized) < vector_len or len(dislike_tags_vectorized) < vector_len:
        print('Vector not long enough')
        return None

    like_dislike_dict = {}
    for x in range(vector_len):
        like_dislike_dict['LIKE_' + str(x)] = like_tags_vectorized[x]
        like_dislike_dict['DISLIKE_' + str(x)] = dislike_tags_vectorized[x]

    return pd.DataFrame(like_dislike_dict, index=[0]).astype('int64')

In [26]:
# Genre profiles (one-row frames) for the example liked / disliked titles.
l_genres, d_genres = preprocess_genres(liked_movies, disliked_movies)

In [27]:
# Tag profile for the same example titles.
l_d_tags = preprocess_tags(liked_movies, disliked_movies)

In [28]:
def top_10_recommendations(liked_movies: list, disliked_movies: list, like_genres, dislike_genres,
                           like_dislike_tags, top_n: int = 20):
    """Score every not-yet-watched movie for one user and return the extremes.

    Each candidate movie is combined with the user's genre and tag profiles and
    scored by three saved models: a keras genres model, a pickled tags model,
    and a pickled model that combines the two predictions.

    The input frames are not modified, so the function can be called repeatedly
    with the same arguments.

    Args:
        liked_movies: titles the user liked.
        disliked_movies: titles the user disliked. Both lists are excluded from
            the candidates. Non-string entries are treated as movieIds.
        like_genres: one-row frame of liked-genre fractions, one column per genre.
        dislike_genres: one-row frame of disliked-genre fractions.
        like_dislike_tags: one-row frame with LIKE_0..19 / DISLIKE_0..19 columns.
        top_n: number of best and of worst movies to return. Defaults to 20,
            the cutoff this function has always used despite its name.

    Returns:
        Tuple ``(predictions_df, best_movies_df, worst_movies_df)``.
        ``predictions_df`` has the columns 'movieId' and 'prediction' for every
        candidate. The other two hold the ``top_n`` highest / lowest predictions
        joined with the catalogue columns.

    Raises:
        TypeError: if ``like_dislike_tags`` is None.
    """
    if like_dislike_tags is None:
        raise TypeError('like_dislike_tags is None: a tag profile is required to score movies')

    # Constant key used to attach the single-row user profiles to every movie row.
    fake_user_id = -999

    movies_mod = pd.read_csv('data/movies_mod.csv')

    # identifying not watched movies: the watched lists hold titles, so they must
    # be matched on 'title' (comparing them with movieIds never excluded anything)
    watched = liked_movies + disliked_movies
    watched_ids = [movie for movie in watched if not isinstance(movie, str)]
    is_watched = movies_mod['title'].isin(watched) | movies_mod['movieId'].isin(watched_ids)

    # genres
    # prefixed copies distinguish liked genres, disliked genres and movie genres
    like_columns = list(like_genres.columns)
    user_like = like_genres.add_prefix('user_like_')
    user_dislike = dislike_genres.add_prefix('user_dislike_')
    like_columns_modified = list(user_like.columns)
    dislike_columns_modified = list(user_dislike.columns)

    # adding the 'fake' userId to copies of the 3 user frames
    user_like = user_like.assign(userId=fake_user_id)
    user_dislike = user_dislike.assign(userId=fake_user_id)
    user_tags = like_dislike_tags.assign(userId=fake_user_id)

    # tags
    movie_tags_df = pd.read_csv('data/final/movie_tags_df.csv')

    # one row per not-watched movie with its genre columns and tag profile
    template_df = movies_mod.loc[~is_watched].dropna()
    template_df = template_df.merge(movie_tags_df, how='left', on='movieId').dropna()
    del movie_tags_df  # release the tag table before the models are loaded

    # attaching the user information (genres and tags profiles) to every movie row
    template_df = template_df.assign(userId=fake_user_id)
    for user_frame in (user_like, user_dislike, user_tags):
        template_df = template_df.merge(user_frame, how='left', on='userId').dropna()

    # generating the columns for the random forest input (order must stay as is)
    rf_columns = []
    for x in range(20):
        rf_columns.append('LIKE_' + str(x))
        rf_columns.append('DISLIKE_' + str(x))
    for x in range(5):
        rf_columns.append('TAG_' + str(x))

    # separating the 3 inputs for the neural network
    genres_like_input = template_df.loc[:, like_columns_modified]
    genres_dislike_input = template_df.loc[:, dislike_columns_modified]
    genres_movie_input = template_df.loc[:, like_columns]

    # separating the input for the random forest
    tags_input = template_df.loc[:, rf_columns]

    # saving a list with the not-watched movieIds
    movieId_list = list(template_df.movieId)
    del template_df

    # loading models
    # NOTE: pickle.load can execute arbitrary code; only load model files you trust.
    genres_model = keras.models.load_model('models/genres_model.h5', compile=True)
    with open('models/tags_model.sav', 'rb') as model_file:
        tags_model = pickle.load(model_file)
    with open('models/combine_model.sav', 'rb') as model_file:
        combine_model = pickle.load(model_file)

    # predicting with the genres and tags models
    genres_model_predictions = (genres_model.predict(x=[genres_like_input, genres_dislike_input, genres_movie_input])) * 5
    tags_model_predictions = tags_model.predict(tags_input)

    # the neural network returns one row per movie; keep its first value
    genres_model_predictions_list = [prediction[0] for prediction in genres_model_predictions]

    # using both predictions to predict with the combined model
    combine_input = pd.DataFrame({'genres_model': genres_model_predictions_list,
                                  'tag_model': tags_model_predictions})
    combine_model_predictions = combine_model.predict(combine_input)

    # clamping predictions that end up outside the 0.5 - 5 range
    combine_model_predictions_clamped = np.clip(np.ravel(combine_model_predictions), 0.5, 5)

    predictions_df = pd.DataFrame({'movieId': movieId_list,
                                   'prediction': combine_model_predictions_clamped})

    # getting top and bottom predictions
    best_movies_df = predictions_df.sort_values(by=['prediction'], ascending=False).iloc[:top_n, :]
    worst_movies_df = predictions_df.sort_values(by=['prediction'], ascending=True).iloc[:top_n, :]

    # adding rest of information about the movie
    best_movies_df = best_movies_df.merge(movies_mod, how='left', on='movieId').dropna()
    worst_movies_df = worst_movies_df.merge(movies_mod, how='left', on='movieId').dropna()

    return predictions_df, best_movies_df, worst_movies_df

In [29]:
# Score all candidate movies for the example user; returns the full prediction
# table plus the highest- and lowest-scored movies.
predictions_df, best_movies_df, worst_movies_df = top_10_recommendations(liked_movies, disliked_movies, l_genres, d_genres, l_d_tags)



[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished


In [31]:
# Show the movies with the lowest predicted score for the example user.
worst_movies_df

Unnamed: 0,movieId,prediction,title,genres,YEAR,Fantasy,War,Musical,Western,Children,...,Animation,Horror,Comedy,Thriller,Drama,Mystery,Documentary,Action,Film-Noir,None
0,2985,0.73914,RoboCop (1987),Action|Crime|Drama|Sci-Fi|Thriller,1987,0,0,0,0,0,...,0,0,0,1,1,0,0,1,0,0
1,2288,0.74395,"Thing, The (1982)",Action|Horror|Sci-Fi|Thriller,1982,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
2,3535,0.745857,American Psycho (2000),Crime|Horror|Mystery|Thriller,2000,0,0,0,0,0,...,0,1,0,1,0,1,0,0,0,0
3,1199,0.764954,Brazil (1985),Fantasy|Sci-Fi,1985,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,56174,0.795924,I Am Legend (2007),Action|Horror|Sci-Fi|Thriller|IMAX,2007,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
5,32587,0.809581,Sin City (2005),Action|Crime|Film-Noir|Mystery|Thriller,2005,0,0,0,0,0,...,0,0,0,1,0,1,0,1,1,0
6,353,0.829255,"Crow, The (1994)",Action|Crime|Fantasy|Thriller,1994,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
7,70286,0.830061,District 9 (2009),Mystery|Sci-Fi|Thriller,2009,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
8,1092,0.860261,Basic Instinct (1992),Crime|Mystery|Thriller,1992,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
9,185,0.862623,"Net, The (1995)",Action|Crime|Thriller,1995,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
