In [1]:
import re, os, math, datetime, pickle, joblib

import pandas as pd
import numpy as np

from pathlib import Path
from google.cloud import bigquery

from tensorflow import keras
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression

2023-09-15 12:09:21.213878: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from filmaholic.ml_logic.data import upload_to_bigquery, upload_to_cloud_platform, get_data_bigquery
from filmaholic.params import *

In [3]:
from multiprocessing import Pool

In [4]:
GCP_PROJECT

'filmaholic-398017'

# Loading the data

In [5]:
PROJECT = "filmaholic-398017"
DATASET = "filmaholic"

tables_to_query = ['movies', 'tags', 'ratings']
dfs = {}

for table in tables_to_query:
        query = f"""
        SELECT * FROM `{PROJECT}.{DATASET}.raw-{table}`
        """

        client = bigquery.Client(project=PROJECT)
        query_job = client.query(query)
        result = query_job.result()
        dfs[table] = result.to_dataframe()

        print(f"Table {table} loaded from bigquery")

Table movies loaded from bigquery
Table tags loaded from bigquery
Table ratings loaded from bigquery


# Subsetting the data

### Merging ratings & movies dfs

In [7]:
merged = dfs['ratings'].merge(dfs['movies'], on='movieId', how='left')

### Popular titles & active users (not being used)

In [9]:
# avg_ratings_per_movie = pd.DataFrame(merged.groupby('title')['rating'].mean())
# avg_ratings_per_movie['total_ratings'] = pd.DataFrame(merged.groupby('title')['rating'].count())
# avg_ratings_per_movie = avg_ratings_per_movie.sort_values('total_ratings',ascending=False)

In [10]:
# popular_titles = avg_ratings_per_movie[avg_ratings_per_movie.total_ratings>10000].index
# popular_titles = list(popular_titles)

In [11]:
# len(popular_titles)

In [12]:
# merged_reduced_per_movies = merged[merged.title.isin(popular_titles)]

In [13]:
# avg_ratings_per_user = pd.DataFrame(merged.groupby('userId')['rating'].mean())
# avg_ratings_per_user['total_ratings'] = pd.DataFrame(merged.groupby('userId')['rating'].count())
# avg_ratings_per_user = avg_ratings_per_user.sort_values('total_ratings',ascending=False)
# avg_ratings_per_user

In [14]:
# active_users = avg_ratings_per_user[avg_ratings_per_user.total_ratings>700].index
# active_users = list(active_users)

In [15]:
# len(active_users)

In [16]:
# merged_reduced_per_active_users = merged[merged.userId.isin(active_users)]

### Random user sample (not being used)

In [17]:
# unique_users = merged.userId.drop_duplicates().reset_index()
# unique_users.drop('index', axis=1, inplace=True)
# random_users = list(unique_users.sample(frac=0.05, random_state=0).userId)

In [18]:
# merged.userId = merged.userId.astype('string')
# random_users_movies = list(merged[merged.userId.isin(random_users)].movieId)

### Baseline

In [6]:
# avg_ratings_per_user = pd.DataFrame(merged.groupby('userId')['rating'].count())
# avg_ratings_per_user = avg_ratings_per_user.sort_values('rating',ascending=False)
# subset_users = avg_ratings_per_user[:5000]
# subset_users = list(subset_users.index)

# subset_movies = merged[merged.userId.isin(subset_users)]

# avg_ratings_per_movie = pd.DataFrame(subset_movies.groupby('movieId')['rating'].count())
# avg_ratings_per_movie = avg_ratings_per_movie.sort_values('rating',ascending=False)

# subset_movies_popular = avg_ratings_per_movie[:500]

# user_list = list(set(subset_movies.userId))
# movies_list = list(set(subset_movies_popular.index))

### Final

In [8]:
avg_ratings_per_user = pd.DataFrame(merged.groupby('userId')['rating'].count())
avg_ratings_per_user = avg_ratings_per_user.sort_values('rating',ascending=False)
subset_users = avg_ratings_per_user[:20000]
subset_users = list(subset_users.index)

subset_movies = merged[merged.userId.isin(subset_users)]

avg_ratings_per_movie = pd.DataFrame(subset_movies.groupby('movieId')['rating'].count())
avg_ratings_per_movie = avg_ratings_per_movie.sort_values('rating',ascending=False)

subset_movies_popular = avg_ratings_per_movie[:20000]

user_list = list(set(subset_movies.userId))
movies_list = list(set(subset_movies_popular.index))

In [9]:
len(user_list)

20000

In [10]:
len(movies_list)

20000

### Saving dataframe subsets

In [11]:
tags = dfs['tags'].copy()
tags = tags[tags.userId.isin(user_list)]
tags = tags[tags.movieId.isin(movies_list)]
tags = tags.drop('timestamp', axis=1)

In [12]:
movies = dfs['movies'].copy()
movies = movies[movies.movieId.isin(movies_list)]

In [13]:
ratings = dfs['ratings'].copy()
ratings.rating = ratings.rating.astype('float')
ratings = ratings[ratings.userId.isin(user_list)]
ratings = ratings[ratings.movieId.isin(movies_list)]

In [13]:
tags.to_csv('data_/tags_subset.csv', index=False)
movies.to_csv('data_/movies_subset.csv', index=False)
ratings.to_csv('data_/ratings_subset.csv', index=False)

In [10]:
# upload_to_bigquery(GCP_PROJECT, BQ_DATASET, 'subset-tags', tags)
# upload_to_bigquery(GCP_PROJECT, BQ_DATASET, 'subset-movies', movies)
# upload_to_bigquery(GCP_PROJECT, BQ_DATASET, 'subset-ratings', ratings)

# Correlation (not going to be used)

In [29]:
# n_chunks = 25
# arange = np.arange(n_chunks+1)
# chunk_size = 162541//n_chunks
# arange = arange*(chunk_size)

# pivoted = pd.DataFrame()

# for i in arange:
#   chunk = merged[(merged.userId>i)|(merged.userId<=i+chunk_size)]
#   chunk = chunk.pivot_table(index='userId', columns='movieId', values='rating')

#   print("Iteration")

#   pivoted.append(chunk, axis=1)

In [30]:
# pivoted = merged_reduced.pivot_table(index='userId', columns='title', values='rating')

In [31]:
# toy_story = pivoted.corrwith(pivoted['Toy Story (1995)'])
# toy_story.head()

In [32]:
# toy_story_recomm = pd.DataFrame(toy_story,columns=['Correlation'])
# toy_story_recomm.dropna(inplace=True)
# toy_story_recomm = toy_story_recomm.sort_values('Correlation',ascending=False)

In [33]:
# toy_story_recomm = toy_story_recomm.join(avg_ratings['total_ratings'])
# toy_story_recomm = toy_story_recomm[toy_story_recomm['total_ratings']>30]

In [34]:
# toy_story_recomm.movieId = toy_story_recomm.movieId.astype('int32')
# toy_story_recomm.join(merged, on='movieId', how='left')

# Baseline model

In [13]:
def f(args):
    return args[0]*args[1]

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(f, [[1, 2], [4, 5]]))

[2, 20]


In [31]:
cached_movies = []
for movie in movies_list:
    if os.path.isfile(f'data_/movies_tags/{movie}.csv'):
        cached_movies.append(movie)

In [16]:
# here, we are counting the most used tags PER MOVIE
# a new csv file is created for each movieId, with each tag and its count

def func_1(args):
    
    if os.path.isfile(f'data_/movies_tags/{args[1]}.csv'):
        return
    
    else:
        args[0].movieId = args[0].movieId.astype('string')
        tags_movie = args[0][args[0].movieId == args[1]].copy().drop('movieId', axis=1)
        tags_movie['COUNT'] = 1
        tags_movies_counts = tags_movie.groupby(['tag']).count()
        tags_movies_counts = tags_movies_counts.sort_values(by=['COUNT'], ascending= False).reset_index()

        # upload_to_bigquery(GCP_PROJECT, BQ_DATASET_MOVIES_TAGS, f'{args[1]}', tags_movies_counts)

        tags_movies_counts.to_csv(f'data_/movies_tags/{args[1]}.csv', index = False)
        print(f'{args[2]}')

In [15]:
args_list = []

for row in range(len(movies_list)):
    args_list.append([tags, movies_list[row], row])

In [None]:
print('over')

In [None]:
with Pool(10) as p:
    p.map(func_1, args_list)

In [42]:
# (ENDED UP NOT BEING NECESSARY)
# here, we are counting the most used tags PER USER
# a new csv file is created for each userId, with each tag and its count

# for i, userId in enumerate(user_list):
#     tags.userId = tags.userId.astype('string')
#     tags_user = tags[tags.userId == userId].copy().drop('userId', axis= 1)
#     tags_user['COUNT'] = 1
#     tags_user_counts = tags_user.groupby(['tag']).count()
#     tags_user_counts = tags_user_counts.sort_values(by=['COUNT'], ascending= False).reset_index()
#     tags_user_counts.to_csv('data_/users_tags/' + str(userId) + '.csv', index = False)
#     print(f'csv {i} created')

In [39]:
# 11
# here, we are preprocessing the movies dataframe:
# 1) moving years to a separated column, and removing movies without year information
# 2) separating genres in separated columns, one for each genre
# 3) addin avg user rating and std (NOT DOING THIS STEP, TOO TIME CONSUMING)

movies = pd.read_csv('data_/movies_subset.csv')


movies['YEAR'] = 0
# movies['UPPER_STD'] = 0
# movies['LOWER_STD'] = 0
# movies['AVG_RATING'] = 0
# movies['VIEW_COUNT'] = 0

genres_list = []
for index, row in movies.iterrows():
    try:
        genres = row.genres.split('|')
        genres_list.extend(genres)
    except:
        genres_list.append(row.genres)

genres_list = list(set(genres_list))
genres_list.remove('IMAX')

# Replace '(no genres listed)' with 'None'
try:
  genres_list.remove('(no genres listed)')
  genres_list.append('None')
except:
  genres_list.append('None')

for genre in genres_list:
    
    if genre == 'Sci-Fi':
        genre = 'SciFi'
    
    if genre == 'Film-Noir':
        genre = 'FilmNoir'
    
    movies[genre] = 0

exceptions = 0
for index, row in movies.iterrows():
    movieId = row.movieId
    title = row.title

    try:
        genres = row.genres.split('|')
    except Exception:
        genres = list(row.genres)

    try:
        matcher = re.compile('\(\d{4}\)')
        parenthesis_year = matcher.search(title).group(0)
        matcher = re.compile('\d{4}')
        year = matcher.search(parenthesis_year).group(0)

        movies.loc[index, 'YEAR'] = int(year)

    except Exception:
        exceptions += 1
        print(exceptions)
        pass

    # try:
        # movie_ratings = ratings[ratings.movieId == movieId]
        # std = np.std(movie_ratings.rating)
        # average_rating = np.mean(movie_ratings.rating)

        # upper_std = average_rating + std

        # if upper_std > 5:
        #     upper_std = 5

        # lower_std = average_rating - std

        # if lower_std < 0.5:
        #     lower_std = 0.5

        # view_count = len(movie_ratings)

        # movies.loc[index, 'UPPER_STD'] = upper_std
        # movies.loc[index, 'LOWER_STD'] = lower_std
        # movies.loc[index, 'AVG_RATING'] = average_rating
        # movies.loc[index, 'VIEW_COUNT'] = view_count

    # except Exception:
    #     pass

    if 'IMAX' in genres:
        genres.remove('IMAX')

    if '(no genres listed)' in genres:
        genres.remove('(no genres listed)')
        genres.append('None')

    for genre in genres:
        if genre == 'Sci-Fi':
            genre = 'SciFi'
    
        if genre == 'Film-Noir':
            genre = 'FilmNoir'
            
        movies.loc[index, genre] = 1

movies_mod = movies[movies.YEAR != 0]

movies_mod.to_csv('data_/movies_mod.csv', index = False)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [111]:
movies_mod = pd.read_csv('data_/movies_mod.csv')
ratings = pd.read_csv('data_/ratings_subset.csv')

In [112]:
ratings_movies = ratings.merge(movies_mod, how= 'left', on= 'movieId').dropna()
avg_ratings_per_movie = pd.DataFrame(ratings_movies.groupby('movieId')['rating'].count().reset_index())
avg_ratings_per_movie.columns = ['movieId', 'count']
ratings_movies = ratings_movies.merge(avg_ratings_per_movie, how='left', on='movieId').dropna()
ratings_movies = ratings_movies.drop_duplicates('title')

In [113]:
b_1990 = ratings_movies[ratings_movies.YEAR < 1990]
b_1990 = b_1990.sort_values('count',ascending=False)

a_1990_b_2000 = ratings_movies[(ratings_movies.YEAR > 1990)&(ratings_movies.YEAR <= 2000)]
a_1990_b_2000 = a_1990_b_2000.sort_values('count',ascending=False)

a_2000_b_2010 = ratings_movies[(ratings_movies.YEAR > 2000)&(ratings_movies.YEAR <= 2010)]
a_2000_b_2010 = a_2000_b_2010.sort_values('count',ascending=False)

a_2010_b_2015 = ratings_movies[(ratings_movies.YEAR > 2010)&(ratings_movies.YEAR <= 2015)]
a_2010_b_2015 = a_2010_b_2015.sort_values('count',ascending=False)

a_2015 = ratings_movies[ratings_movies.YEAR > 2015]
a_2015 = a_2015.sort_values('count',ascending=False)

In [133]:
chosen_movies_list = list(b_1990.movieId)[:50] + list(a_1990_b_2000.movieId)[:70] + list(a_2000_b_2010.movieId)[:100] + list(a_2010_b_2015.movieId)[:150] + list(a_2015.movieId)[:150]

In [134]:
len(chosen_movies_list)

520

In [43]:
# 12
# here, we are creating two new dataframes: one with each user's most liked genres, and one with each user's most disliked genres
# the dataframe will have userId as rows, and genres as columns, and values will represent a percentage (liked count for that genre / liked count for all genre)

movies_mod = pd.read_csv('data_/movies_mod.csv')
ratings = pd.read_csv('data_/ratings_subset.csv')
ratings_movies = ratings.merge(movies_mod, how= 'left', on= 'movieId').dropna()

userId_list = list(set(ratings_movies.userId))

total_user_like = pd.DataFrame()
total_user_dislike = pd.DataFrame()

counter_1 = 0
counter_2 = .02

for userId in userId_list:
    temp = ratings_movies[ratings_movies.userId == userId]
    temp_like = temp[temp.rating >= 4].iloc[:, 7:]
    temp_dislike = temp[temp.rating < 4].iloc[:, 7:]

    liked_total_counts = 0
    liked_dict = {'userId': userId,'War': 0, 'Animation': 0, 'Horror': 0, 'SciFi': 0, 'Fantasy': 0, 'Thriller': 0, 'Crime': 0, 'Mystery': 0,
                  'Documentary': 0, 'Children': 0, 'Action': 0, 'Adventure': 0, 'Musical': 0,'FilmNoir': 0, 'Drama': 0,
                  'Romance': 0, 'Comedy': 0, 'Western': 0, 'None': 0}

    disliked_total_counts = 0
    disliked_dict = {'userId': userId,'War': 0, 'Animation': 0, 'Horror': 0, 'SciFi': 0, 'Fantasy': 0, 'Thriller': 0, 'Crime': 0, 'Mystery': 0,
                  'Documentary': 0, 'Children': 0, 'Action': 0, 'Adventure': 0, 'Musical': 0,'FilmNoir': 0, 'Drama': 0,
                  'Romance': 0, 'Comedy': 0, 'Western': 0, 'None': 0}

    counter_1 += 1

    if counter_1 / len(userId_list) >= counter_2:
        print(counter_1 / len(userId_list) * 100, '%')
        counter_2 += .02

    for genre in list(temp_like.columns):
        if len(temp_like) == 0:
            pass
        else:
            liked_total_counts += sum(temp_like[genre])

        if len(temp_dislike) == 0:
            pass
        else:
            disliked_total_counts += sum(temp_dislike[genre])

    for genre in list(temp_like.columns):
        if liked_total_counts == 0:
            pass
        else:
            liked_genre_total_counts = sum(temp_like[genre])
            liked_dict[genre] = liked_genre_total_counts/liked_total_counts

        if disliked_total_counts == 0:
            pass
        else:
            disliked_genre_total_counts = sum(temp_dislike[genre])
            disliked_dict[genre] = disliked_genre_total_counts/disliked_total_counts

    user_like_df = pd.DataFrame(liked_dict, index=[0])
    user_dislike_df = pd.DataFrame(disliked_dict, index=[0])

    if len(total_user_like) == 0:
        total_user_like = user_like_df
    else:
        total_user_like = pd.concat([total_user_like, user_like_df], ignore_index= True)

    if len(total_user_dislike) == 0:
        total_user_dislike = user_dislike_df
    else:
        total_user_dislike = pd.concat([total_user_dislike, user_dislike_df], ignore_index= True)

total_user_like.to_csv('data_/final/like_genres.csv', index = False)
total_user_dislike.to_csv('data_/final/dislike_genres.csv', index = False)

2.0 %
4.0 %
6.0 %
8.0 %
10.0 %
12.005 %
14.000000000000002 %
16.0 %
18.0 %
20.0 %
22.0 %
24.0 %
26.0 %
28.000000000000004 %
30.0 %
32.0 %
34.0 %
36.004999999999995 %
38.005 %
40.005 %
42.004999999999995 %
44.005 %
46.005 %
48.004999999999995 %
50.005 %
52.005 %
54.005 %
56.005 %
58.004999999999995 %
60.004999999999995 %
62.004999999999995 %
64.005 %
66.005 %
68.00500000000001 %
70.005 %
72.005 %
74.005 %
76.005 %
78.005 %
80.00500000000001 %
82.005 %
84.005 %
86.005 %
88.005 %
90.005 %
92.00500000000001 %
94.00500000000001 %
96.005 %
98.005 %


In [36]:
# here, we select a subset of the tag most common tags that we want used in our model

common_tags_df = tags.groupby(['tag']).count().sort_values('userId', ascending= False).copy().drop('movieId', axis= 1).reset_index()
common_tags_df = common_tags_df[common_tags_df.userId >= 35]

tags_to_remove = ['BD-R', 'DVD-Video']
common_tags_df = common_tags_df[~common_tags_df.tag.isin(tags_to_remove)]
                                    
common_tags_df.to_csv('data_/common_tags.csv', index = False)

In [37]:
common_tags = pd.read_csv('data_/common_tags.csv', index_col= False)

common_tags_list = list(set(common_tags.tag))
vector_counter = 0
vectorized_dict = {}

for tag in common_tags_list:
    vectorized_dict[tag] = vector_counter
    vector_counter += 1

with open('data_/final/vectorized_dict.pkl', 'wb') as writer:
    joblib.dump(vectorized_dict, writer)

In [13]:
tags_unique_users = list(tags.drop_duplicates('userId').userId)

with open('data_/final/vectorized_dict.pkl', 'rb') as reader:
    vectorized_dict = joblib.load(reader)

In [50]:
# args_list = []

# for row in range(len(tags_unique_users)):
#     args_list.append([ratings, tags_unique_users[row]])

In [61]:
# #13
# # here, we create the a "tag profile" for each user -> 20 tags most often related to movies he likes, and 20 tags most often related to movies he dislikes

# like_dislike_tags = pd.DataFrame()
# index_counter = 0

# def func_2(args):

#     temp_ratings_df = args[0][args[0].userId == args[1]]
#     like_tags_df = pd.DataFrame()
#     dislike_tags_df = pd.DataFrame()
    
#     print(f's:{args[1]}')

#     for index, row in temp_ratings_df.iterrows():
#         try:
#             if row.rating >= 4:
#                 # temp_movie_df = get_data_bigquery(GCP_PROJECT, BQ_DATASET_MOVIES_TAGS, f'{str(int(row.movieId))}')
#                 temp_movie_df = pd.read_csv('data_/movies_tags/{}.csv'.format(str(int(row.movieId))))

#                 if len(like_tags_df) == 0:
#                     like_tags_df = temp_movie_df

#                 else:
#                     like_tags_df = pd.concat([like_tags_df, temp_movie_df], ignore_index= True)

#             else:
#                 # temp_movie_df = get_data_bigquery(GCP_PROJECT, BQ_DATASET_MOVIES_TAGS, f'{str(int(row.movieId))}')
#                 temp_movie_df = pd.read_csv('data_/movies_tags/{}.csv'.format(str(int(row.movieId))))

#                 if len(dislike_tags_df) == 0:
#                     dislike_tags_df = temp_movie_df

#                 else:
#                     dislike_tags_df = pd.concat([dislike_tags_df, temp_movie_df], ignore_index= True)
#         except Exception:
#             print('exception 1')
#             pass

#     try:
#         like_tags_list = list(like_tags_df.tag)
#         dislike_tags_list = list(dislike_tags_df.tag)
#     except Exception:
#         print('exception 2')
#         return

#     like_dict = {}
#     dislike_dict = {}

#     for tag in like_tags_list:
#         like_dict[tag] = like_tags_list.count(tag) * -1

#     for tag in dislike_tags_list:
#         dislike_dict[tag] = dislike_tags_list.count(tag) * -1

#     like_tags_counted = sorted(like_dict, key= lambda tag: like_dict[tag])
#     dislike_tags_counted = sorted(dislike_dict, key= lambda tag: dislike_dict[tag])

#     like_tags_vectorized = []
#     dislike_tags_vectorized = []

#     if len(like_tags_counted) < 50:
#         num_like_tags = len(like_tags_counted)
#     else:
#         num_like_tags = 50

#     if len(dislike_tags_counted) < 50:
#         num_dislike_tags = len(like_tags_counted)
#     else:
#         num_dislike_tags = 50

#     for tag in like_tags_counted[:num_like_tags]:
#         try:
#             tag_vector = vectorized_dict[tag]
#             like_tags_vectorized.append(tag_vector)
#         except Exception:
#             print('exception3')
#             pass

#     for tag in dislike_tags_counted[:num_dislike_tags]:
#         try:
#             tag_vector = vectorized_dict[tag]
#             dislike_tags_vectorized.append(tag_vector)
#         except Exception:
#             print('exception4')
#             pass

#     if len(like_tags_vectorized) < 20 or len(dislike_tags_vectorized) < 20:
#         return

#     like_dislike_dict = {}

#     like_dislike_dict['userId'] = user

#     for x in range(20):
#         like_dislike_dict['LIKE_' + str(x)] = like_tags_vectorized[x]
#         like_dislike_dict['DISLIKE_' + str(x)] = dislike_tags_vectorized[x]

#     concat_df = pd.DataFrame(like_dislike_dict, index=[0])

#     if len(like_dislike_tags) == 0:
#         like_dislike_tags = concat_df

#     like_dislike_tags = pd.concat([like_dislike_tags, concat_df], ignore_index= True)
#     print(f'f:{args[1]}')
        
# like_dislike_tags_int = like_dislike_tags.astype('int64')
# like_dislike_tags_int.to_csv('data_/final/like_dislike_tags.csv', index = False)

In [16]:
len(tags_unique_users)

boundaries = [0, 450, 900, 1350, 1800, 2250, 2700, 3150, 3600, 4050, 4593]

args_list = []

for row in range(10):
    args_list.append([ratings, tags_unique_users[boundaries[row]:boundaries[row+1]]])

In [18]:
def func_3(args):
    
    print('start')
    
    ratings_tags = args[0]
    tags_unique_users = args[1]
    
    like_dislike_tags = pd.DataFrame()
    index_counter = 0

    progress_counter = 0

    with open('data_/final/vectorized_dict.pkl', 'rb') as reader:
        vectorized_dict = joblib.load(reader)
    
    for user in tags_unique_users:
        progress_counter += 1
        print(progress_counter)

        temp_ratings_df = ratings_tags[ratings_tags.userId == user]
        like_tags_df = pd.DataFrame()
        dislike_tags_df = pd.DataFrame()

        for index, row in temp_ratings_df.iterrows():
            try:
                if row.rating >= 4:
                    temp_movie_df = pd.read_csv('data_/movies_tags/{}.csv'.format(str(int(row.movieId))))

                    if len(like_tags_df) == 0:
                        like_tags_df = temp_movie_df

                    else:
                        like_tags_df = pd.concat([like_tags_df, temp_movie_df], ignore_index= True)

                else:
                    temp_movie_df = pd.read_csv('data_/movies_tags/{}.csv'.format(str(int(row.movieId))))

                    if len(dislike_tags_df) == 0:
                        dislike_tags_df = temp_movie_df

                    else:
                        dislike_tags_df = pd.concat([dislike_tags_df, temp_movie_df], ignore_index= True)
            except Exception:
                print('exception 1')
                pass

        try:
            like_tags_list = list(like_tags_df.tag)
            dislike_tags_list = list(dislike_tags_df.tag)
        except Exception:
            print('exception 2')
            continue

        like_dict = {}
        dislike_dict = {}

        for tag in like_tags_list:
            like_dict[tag] = like_tags_list.count(tag) * -1

        for tag in dislike_tags_list:
            dislike_dict[tag] = dislike_tags_list.count(tag) * -1

        like_tags_counted = sorted(like_dict, key= lambda tag: like_dict[tag])
        dislike_tags_counted = sorted(dislike_dict, key= lambda tag: dislike_dict[tag])

        like_tags_vectorized = []
        dislike_tags_vectorized = []

        if len(like_tags_counted) < 50:
            num_like_tags = len(like_tags_counted)
        else:
            num_like_tags = 50

        if len(dislike_tags_counted) < 50:
            num_dislike_tags = len(like_tags_counted)
        else:
            num_dislike_tags = 50

        for tag in like_tags_counted[:num_like_tags]:
            try:
                tag_vector = vectorized_dict[tag]
                like_tags_vectorized.append(tag_vector)
            except Exception:
                pass

        for tag in dislike_tags_counted[:num_dislike_tags]:
            try:
                tag_vector = vectorized_dict[tag]
                dislike_tags_vectorized.append(tag_vector)
            except Exception:
                pass

        if len(like_tags_vectorized) < 20 or len(dislike_tags_vectorized) < 20:
            continue

        like_dislike_dict = {}

        like_dislike_dict['userId'] = user

        for x in range(20):
            like_dislike_dict['LIKE_' + str(x)] = like_tags_vectorized[x]
            like_dislike_dict['DISLIKE_' + str(x)] = dislike_tags_vectorized[x]

        concat_df = pd.DataFrame(like_dislike_dict, index=[0])

        if len(like_dislike_tags) == 0:
            like_dislike_tags = concat_df

        else:
            print('concat')
            like_dislike_tags = pd.concat([like_dislike_tags, concat_df], ignore_index= True)

    like_dislike_tags_int = like_dislike_tags.astype('int64')
    like_dislike_tags_int.to_csv(f'data_/final/like_dislike_tags_{tags_unique_users[0]}.csv', index = False)

In [None]:
if __name__ == '__main__':
    with Pool(10) as p:
        p.map(func_3, args_list)

In [21]:
like_dislike_tags = pd.DataFrame()
for argu in args_list:
    concat_df = pd.read_csv(f'data_/final/like_dislike_tags_{argu[1][0]}.csv')
    if len(like_dislike_tags) == 0:
        like_dislike_tags = concat_df
    else:
        like_dislike_tags = pd.concat([like_dislike_tags, concat_df], ignore_index= True)
        
like_dislike_tags_int = like_dislike_tags.astype('int64')
like_dislike_tags_int.to_csv('data_/final/like_dislike_tags.csv', index = False)

In [24]:
movies_mod = pd.read_csv('data_/movies_mod.csv')
movieId_list = list(movies_mod.movieId)
del movies_mod
len(movieId_list)

19961

In [None]:
# 14
# here, we create the a "tag profile" for each movie -> 5 tags most often related that movie

movie_tags_df = pd.DataFrame()
index_counter = 0

progress_counter = 0

with open('data_/final/vectorized_dict.pkl', 'rb') as reader:
    vectorized_dict = joblib.load(reader)

for movie in movieId_list:
    progress_counter += 1

    try:
      temp_df = pd.read_csv('data_/movies_tags/{}.csv'.format(movie))

      if len(temp_df) < 5:
        continue

      vectorized_tag = []
      movie_tags = list(temp_df.tag)

      for tag in movie_tags:
          try:
              tag_vector = vectorized_dict[tag]
              vectorized_tag.append(tag_vector)
          except Exception:
            print('exception 1')
            pass

      if len(vectorized_tag) < 5:
          continue

      movie_tags_df.loc[index_counter, 'movieId'] = movie

      for x in range(5):
          movie_tags_df.loc[index_counter, 'TAG_' + str(x)] = vectorized_tag[x]

      index_counter += 1

    except Exception:
      print('exception 2')
      pass

movie_tags_df_int = movie_tags_df.astype('int64')
movie_tags_df_int.to_csv('data_/final/movie_tags_df.csv', index = False)

In [86]:
# 15

def stats(predictions, actual, flex_range=0.5):

    predictions_list = []
    round_list = np.arange(0.5, 5.5, 0.5)

    for value in predictions:
        value_original = value
        compare_diff = 99999
        value_round = 0

        for rating in round_list:
            compare_value = abs(value_original - rating)

            if compare_value < compare_diff:
                compare_diff = compare_value
                value_round = rating

        predictions_list.append(value_round)

    prediction_dict = {'PREDICTION': predictions_list, 'TRUE': list(actual)}

    prediction_compare_df = pd.DataFrame(prediction_dict)

    rating_accuracy = 0
    like_dislike_tp = 0
    like_dislike_tn = 0
    like_dislike_fp = 0
    like_dislike_fn = 0
    prediction_length = len(prediction_compare_df)

    rating_accuracy_flex = 0
    like_dislike_tp_flex = 0
    like_dislike_tn_flex = 0
    like_dislike_fp_flex = 0
    like_dislike_fn_flex = 0

    for index, row in prediction_compare_df.iterrows():
        predict_like = 0
        true_like = 0

        if row.PREDICTION >= 4:
            predict_like = 1

        if row.TRUE >= 4:
            true_like = 1

        if row.PREDICTION == row.TRUE:
            rating_accuracy += 1

        if predict_like == true_like:
            if predict_like == 1:
                like_dislike_tp += 1

            else:
                like_dislike_tn += 1

        else:
            if predict_like == 1:
                like_dislike_fp += 1

            else:
                like_dislike_fn += 1


        predict_like_flex = 0
        true_like_flex = 0

        if row.PREDICTION >= 3.5:
            predict_like_flex = 1

        if row.TRUE >= 3.5:
            true_like_flex = 1

        if row.PREDICTION >= (row.TRUE - flex_range) and row.PREDICTION <= (row.TRUE + flex_range):
            rating_accuracy_flex += 1

        if predict_like_flex == true_like_flex:
            if predict_like_flex == 1:
                like_dislike_tp_flex += 1

            else:
                like_dislike_tn_flex += 1

        else:
            if predict_like_flex == 1:
                like_dislike_fp_flex += 1

            else:
                like_dislike_fn_flex += 1

    rating_accuracy = rating_accuracy / prediction_length
    like_dislike_accuracy = (like_dislike_tp + like_dislike_tn) / prediction_length

    rating_accuracy_flex = rating_accuracy_flex / prediction_length
    like_dislike_accuracy_flex = (like_dislike_tp_flex + like_dislike_tn_flex) / prediction_length

    print('True Positive: {}, True Negative: {}, False Positive {}, False Negative {}'.format(like_dislike_tp, like_dislike_tn, like_dislike_fp, like_dislike_fn))
    print('Rating Accuracy: {}, Binary Accuracy (Like/Dislike) {}'.format(rating_accuracy, like_dislike_accuracy))
    print('FLEX: True Positive: {}, True Negative: {}, False Positive {}, False Negative {}'.format(like_dislike_tp_flex, like_dislike_tn_flex, like_dislike_fp_flex, like_dislike_fn_flex))
    print('FLEX: Rating Accuracy: {}, Binary Accuracy (Like/Dislike) {}'.format(rating_accuracy_flex, like_dislike_accuracy_flex))
    return

In [87]:
def merge_shuffle_split(split=1.0):

    movies_mod = pd.read_csv('data_/movies_mod.csv')
    ratings = pd.read_csv('data_/ratings_subset.csv')

    ratings = shuffle(ratings)

    selection_range = int(len(ratings) * (split))
    ratings = ratings.iloc[: selection_range, :]

    ratings_movies = ratings.merge(movies_mod, how= 'left', on= 'movieId').dropna()
    del movies_mod

    like_genres = pd.read_csv('data_/final/like_genres.csv')

    like_columns = list(like_genres.columns)
    like_columns_modified = []

    for column in like_columns:
        if column == 'userId':
            like_columns_modified.append('userId')
        else:
            modify_column = 'user_like_' + column
            like_columns_modified.append(modify_column)

    like_genres.columns = like_columns_modified

    ratings_movies = ratings_movies.merge(like_genres, how= 'left', on= 'userId').dropna()
    del like_genres


    dislike_genres = pd.read_csv('data_/final/dislike_genres.csv')

    dislike_columns = list(dislike_genres.columns)
    dislike_columns_modified = []

    for column in dislike_columns:
        if column == 'userId':
            dislike_columns_modified.append('userId')
        else:
            modify_column = 'user_dislike_' + column
            dislike_columns_modified.append(modify_column)

    dislike_genres.columns = dislike_columns_modified

    ratings_movies = ratings_movies.merge(dislike_genres, how= 'left', on= 'userId').dropna()

    del dislike_genres

    movie_tags_df = pd.read_csv('data_/final/movie_tags_df.csv')
    ratings_movies = ratings_movies.merge(movie_tags_df, how= 'left', on= 'movieId').dropna()
    del movie_tags_df

    like_dislike_tags = (pd.read_csv('data_/final/like_dislike_tags.csv')).astype('int64')
    ratings_movies = ratings_movies.merge(like_dislike_tags, how= 'left', on= 'userId').dropna()
    del like_dislike_tags

    like_columns_modified.remove('userId')
    dislike_columns_modified.remove('userId')
    like_columns.remove('userId')

    genres_like = ratings_movies.loc[:, like_columns_modified]
    genres_dislike = ratings_movies.loc[:, dislike_columns_modified]
    genres_movie = ratings_movies.loc[:, like_columns]

    rf_columns = []
    for x in range(20):
        rf_columns.append('LIKE_' + str(x))
        rf_columns.append('DISLIKE_' + str(x))
    for x in range(5):
        rf_columns.append('TAG_' + str(x))

    rf_input = ratings_movies.loc[:, rf_columns]

    ratings = list(ratings_movies.rating)

    del ratings_movies

    return genres_like, genres_dislike, genres_movie, rf_input, ratings

In [82]:
# creating neural network model (genres)

user_liked_genres = keras.Input(shape= (19,))
user_disliked_genres = keras.Input(shape= (19,))
movie_genres = keras.Input(shape= (19,))

# liked genres input
liked_input = keras.layers.Dense(19, activation= 'relu')(user_liked_genres)
liked_hidden_1 = keras.layers.Dense(50, activation= 'relu')(liked_input)
liked_hidden_2 = keras.layers.Dense(20, activation= 'relu')(liked_hidden_1)

# disliked genres input
disliked_input = keras.layers.Dense(19, activation= 'relu')(user_disliked_genres)
disliked_hidden_1 = keras.layers.Dense(50, activation= 'relu')(disliked_input)
disliked_hidden_2 = keras.layers.Dense(20, activation= 'relu')(disliked_hidden_1)

# movie genres input
movie_input = keras.layers.Dense(19, activation= 'relu')(movie_genres)
movie_hidden_1 = keras.layers.Dense(50, activation= 'relu')(movie_input)
movie_hidden_2 = keras.layers.Dense(20, activation= 'relu')(movie_hidden_1)

# merging
merged_model = keras.layers.concatenate([liked_hidden_2, disliked_hidden_2, movie_hidden_2])
merged_model_hidden_1 = keras.layers.Dense(150, activation= 'relu')(merged_model)
merged_model_hidden_2 = keras.layers.Dense(75, activation= 'relu')(merged_model_hidden_1)
merged_model_hidden_3 = keras.layers.Dense(50, activation= 'relu')(merged_model_hidden_2)

# output layer
output_rating = keras.layers.Dense(1, activation= 'sigmoid')(merged_model_hidden_3)

# creating model
genres_model = keras.Model(inputs= [user_liked_genres, user_disliked_genres, movie_genres], outputs= output_rating)
genres_model.compile(optimizer= keras.optimizers.Adam(learning_rate=0.001), loss= 'mean_squared_error')

In [83]:
# creating random forest model (tags)
random_forest = RandomForestRegressor(n_estimators=100, max_features='sqrt', verbose=2, random_state=True, n_jobs= -1, max_depth=15)

In [88]:
# creating input datasets (calling merge_shuffle_split)
genres_like, genres_dislike, genres_movie, rf_input, ratings = merge_shuffle_split()

In [89]:
# train test splitting
train_split = 0.95
split_index = int(len(ratings) * train_split)
# neural network
genres_like_train = genres_like.iloc[: split_index, :]
genres_like_test = genres_like.iloc[split_index :, :]
del genres_like
genres_dislike_train = genres_dislike.iloc[: split_index, :]
genres_dislike_test = genres_dislike.iloc[split_index :, :]
del genres_dislike
genres_movie_train = genres_movie.iloc[: split_index, :]
genres_movie_test = genres_movie.iloc[split_index :, :]
del genres_movie
ratings_scaled = np.array(ratings)/5
ratings_scaled_train = ratings_scaled[: split_index]
ratings_scaled_test = ratings_scaled[split_index :]
# random forest
rf_input_train = rf_input.iloc[: split_index, :]
rf_input_test = rf_input.iloc[split_index :, :]
ratings_train = ratings[: split_index]
ratings_test = ratings[split_index :]

In [90]:
# fitting neural network
batch_size = 500
epochs = 10

# def scheduler(epoch):
#     if epoch < 5:
#         return 0.001
#     else:
#         return 0.001 * math.exp(0.1 * (5 - epoch))
# Learning_Rate_Callback = keras.callbacks.LearningRateScheduler(scheduler)

# class Save_Progress_Callback(keras.callbacks.Callback):
#     def on_epoch_end(self, epoch, logs=None): ## Saving and printing after each epoch
#         lr = float(keras.backend.get_value(self.model.optimizer.learning_rate))
#         print("Epoch {}; Loss {:7.3f}; Val loss {:7.3f}; Learning rate {}.".format(epoch, logs["loss"], logs["val_loss"], lr))

# callbacks=[Learning_Rate_Callback, Save_Progress_Callback()]

genres_model.fit(x=[genres_like_train, genres_dislike_train, genres_movie_train],
                 y=ratings_scaled_train,
                 epochs=epochs,
                 verbose=0,
                 batch_size=batch_size,
                 validation_split=0.2,
                 shuffle= True)

genres_model.save('models_/genres_model.h5', overwrite=True, include_optimizer=True)

  saving_api.save_model(


In [91]:
# fitting random_forest
random_forest.fit(rf_input_train, ratings_train)
print(random_forest.score(rf_input_test, ratings_test))

joblib.dump(random_forest, open('models_/tags_model.sav', 'wb'), compress=7)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   24.1s


building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.7min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.3s finished


0.14123288429358993


In [92]:
genres_model_predictions = (genres_model.predict(x= [genres_like_test, genres_dislike_test, genres_movie_test])) * 5 # Rescale back to original values
random_forest_predict = random_forest.predict(rf_input_test)



[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.3s finished


In [93]:
# creating input for the combined model (linear regression)
genres_model_predictions_list = []
for prediction in genres_model_predictions:
    genres_model_predictions_list.append(prediction[0])

merged_predictions = pd.DataFrame({'genres_model': genres_model_predictions_list,
                                   'tag_model': list(random_forest_predict),
                                   'genres_true': list(np.array(list(ratings_scaled_test)) * 5),
                                   'tag_true': ratings_test},
                                  index= list(range(len(ratings_test))))

X = merged_predictions.loc[:, ['genres_model', 'tag_model']]
y = np.array(merged_predictions.loc[:, 'genres_true'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# fitting combined model
line_reg = LinearRegression(n_jobs=-1).fit(X_train, y_train)

line_reg_predictions = line_reg.predict(X_test)

joblib.dump(line_reg, open('models_/combine_model.sav', 'wb'), compress=7)

# rounding predictions that are out of bounds
line_reg_predictions_rounded = []

for prediction in line_reg_predictions:
    rounded = prediction
    if rounded > 5:
        rounded = 5
    elif rounded < 0.5:
        rounded = 0.5

    line_reg_predictions_rounded.append(rounded)

In [94]:
print('Genres model stats:')
stats(genres_model_predictions, ratings_scaled_test*5)
print('')

print('Tags model stats:')
stats(random_forest_predict, ratings_test)
print('')

print('Combined model stats:')
print('Linear Regression R2:', line_reg.score(X_test, y_test))
print('Linear Regression coefficients - ', f'{line_reg.feature_names_in_[0]}: {line_reg.coef_[0]}', f'{line_reg.feature_names_in_[1]}: {line_reg.coef_[1]}')
stats(line_reg_predictions_rounded, y_test)

Genres model stats:
True Positive: 23252, True Negative: 89943, False Positive 9090, False Negative 45886
Rating Accuracy: 0.2464872064743624, Binary Accuracy (Like/Dislike) 0.6730946477097716
FLEX: True Positive: 76416, True Negative: 39485, False Positive 26967, False Negative 25303
FLEX: Rating Accuracy: 0.6444036129891598, Binary Accuracy (Like/Dislike) 0.6891854124670722

Tags model stats:
True Positive: 4541, True Negative: 97929, False Positive 1104, False Negative 64597
Rating Accuracy: 0.21187957495644313, Binary Accuracy (Like/Dislike) 0.6093202751960802
FLEX: True Positive: 91206, True Negative: 21049, False Positive 45403, False Negative 10513
FLEX: Rating Accuracy: 0.626130545694561, Binary Accuracy (Like/Dislike) 0.6675050989766369

Combined model stats:
Linear Regression R2: 0.25059299520547607
Linear Regression coefficients -  genres_model: 0.8004878000499859 tag_model: 0.5893900881623934
True Positive: 6743, True Negative: 21997, False Positive 2636, False Negative 106

In [125]:
def top_10_recommendations(userId, movie_list):
    movies_mod = pd.read_csv('data_/movies_mod.csv')
    ratings = pd.read_csv('data_/ratings_subset.csv')

    # identifying not watched movies
    not_watched = movie_list.copy()
    ratings = ratings[ratings.userId == userId]

    if len(ratings) ==  0:
        return print('User {} does not have enough information. 1'.format(userId))

    ratings_movies = ratings.merge(movies_mod, how= 'left', on= 'movieId').dropna()

    watched = list(ratings_movies.movieId)
    del ratings_movies

    for movie in watched:
        if movie in not_watched:
            not_watched.remove(movie)

    # genres
    like_genres = pd.read_csv('data_/final/like_genres.csv')
    dislike_genres = pd.read_csv('data_/final/dislike_genres.csv')

    like_genres = like_genres[like_genres.userId == userId]
    if len(like_genres) ==  0:
        return print('User {} does not have enough information. 3'.format(userId))

    dislike_genres = dislike_genres[dislike_genres.userId == userId]
    if len(dislike_genres) ==  0:
        return print('User {} does not have enough information. 4'.format(userId))

    # changing column names to differenciate liked genres and disliked genres, and movie genres
    like_columns = list(like_genres.columns)
    like_columns_modified = []
    for column in like_columns:
        if column == 'userId':
            like_columns_modified.append('userId')
        else:
            modify_column = 'user_like_' + column
            like_columns_modified.append(modify_column)
    like_genres.columns = like_columns_modified

    dislike_columns = list(dislike_genres.columns)
    dislike_columns_modified = []
    for column in dislike_columns:
        if column == 'userId':
            dislike_columns_modified.append('userId')
        else:
            modify_column = 'user_dislike_' + column
            dislike_columns_modified.append(modify_column)

    dislike_genres.columns = dislike_columns_modified

    # tags
    movie_tags_df = pd.read_csv('data_/final/movie_tags_df.csv')
    like_dislike_tags = (pd.read_csv('data_/final/like_dislike_tags.csv')).astype('int64')

    # adding a column with all not watched movies, then merging movie information (genres and tags profiles of movies)
    template_df = pd.DataFrame({'movieId': not_watched}, index= list(range(len(not_watched))))
    template_df = template_df.merge(movies_mod, how= 'left', on= 'movieId').dropna()
    template_df = template_df.merge(movie_tags_df, how= 'left', on= 'movieId').dropna()
    del movie_tags_df

    # selecting the users tags
    like_dislike_tags = like_dislike_tags[like_dislike_tags.userId == userId]
    if len(like_dislike_tags) ==  0:
        return print('User {} does not have enough information. 5'.format(userId))

    # adding a column with the userId on all rows, then merging user information (genres and tags profiles of users)
    template_df['userId'] = userId
    template_df = template_df.merge(like_genres, how= 'left', on= 'userId').dropna()
    del like_genres
    template_df = template_df.merge(dislike_genres, how= 'left', on= 'userId').dropna()
    del dislike_genres
    template_df = template_df.merge(like_dislike_tags, how= 'left', on= 'userId').dropna()
    del like_dislike_tags

    like_columns_modified.remove('userId')
    dislike_columns_modified.remove('userId')
    like_columns.remove('userId')

    # generating the columns for the random forest input
    rf_columns = []
    for x in range(20):
        rf_columns.append('LIKE_' + str(x))
        rf_columns.append('DISLIKE_' + str(x))
    for x in range(5):
        rf_columns.append('TAG_' + str(x))

    # separating the 3 inputs for the neural network
    genres_like_input = template_df.loc[:, like_columns_modified]
    genres_dislike_input = template_df.loc[:, dislike_columns_modified]
    genres_movie_input = template_df.loc[:, like_columns]

    # separating the input for the random forest
    tags_input = template_df.loc[:, rf_columns]

    # saving a list with the not-watched movieIds
    movieId_list = list(template_df.movieId)

    del template_df

    # loading models
    genres_model = keras.models.load_model('models_/genres_model.h5', compile=True)
    tags_model = joblib.load(open('models_/tags_model.sav', 'rb'))
    combine_model = joblib.load(open('models_/combine_model.sav', 'rb'))

    # predicting with the genres and tags models
    genres_model_predictions = (genres_model.predict(x= [genres_like_input, genres_dislike_input, genres_movie_input])) * 5 ## Rescaling up; predicts a scaled and bound (sigmoid, 0-1) values
    tags_model_predictions = tags_model.predict(tags_input)

    # transforming the neural network prediction into a list
    genres_model_predictions_list = []
    for prediction in genres_model_predictions:
        genres_model_predictions_list.append(prediction[0])

    # using both predictions to predict with the combined model
    combine_input = pd.DataFrame({'genres_model': genres_model_predictions_list,
                                  'tag_model': tags_model_predictions},
                                 index= list(range(len(genres_model_predictions))))
    combine_model_predictions = combine_model.predict(combine_input)

    # rounding predictions that end up out of bounds
    combine_model_predictions_rounded = []
    for prediction in combine_model_predictions:
        rounded = prediction
        if rounded > 5:
            rounded = 5
        elif rounded < 0.5:
            rounded = 0.5
        combine_model_predictions_rounded.append(rounded)

    # creating dataframe with predictions
    # predictions_df = pd.DataFrame({'movieId': movieId_list,
    #                                'genres_predictions': genres_model_predictions_list,
    #                               'tags_predictions': tags_model_predictions,
    #                               'combine_predictions': combine_model_predictions_rounded},
    #                              index= list(range(len(movieId_list))))
    predictions_df = pd.DataFrame({'movieId': movieId_list,
                              'prediction': combine_model_predictions_rounded},
                              index= list(range(len(movieId_list))))

    # getting top and bottom predictions
    best_movies_df = predictions_df.sort_values(by=['prediction'], ascending=False).iloc[:20, :]
    worst_movies_df = predictions_df.sort_values(by=['prediction'], ascending=True).iloc[:20, :]

    # adding rest of information about the movie
    best_movies_df = best_movies_df.merge(movies_mod, how= 'left', on= 'movieId').dropna()
    worst_movies_df = worst_movies_df.merge(movies_mod, how= 'left', on= 'movieId').dropna()
    del movies_mod

    return predictions_df, best_movies_df, worst_movies_df

In [154]:
print(chosen_movies_list)

[260, 1270, 1196, 1198, 1210, 1240, 1291, 1036, 858, 2716, 1214, 1097, 1136, 541, 1258, 1200, 592, 1197, 2115, 1193, 1206, 924, 2011, 1961, 2918, 1387, 1222, 1221, 919, 1208, 1246, 1968, 2174, 2987, 111, 2797, 1073, 1101, 2000, 1259, 750, 2791, 1080, 1219, 1220, 1307, 912, 1954, 1079, 3527, 2571, 356, 296, 593, 318, 480, 1580, 2762, 1, 2858, 2959, 589, 1265, 3578, 780, 110, 47, 2028, 50, 1682, 608, 1721, 527, 1527, 1704, 32, 2628, 4226, 3793, 377, 364, 1089, 457, 648, 150, 1732, 367, 588, 2997, 3147, 3996, 1923, 380, 293, 4022, 500, 1517, 733, 1617, 2683, 344, 3114, 4027, 2706, 778, 1917, 2329, 1573, 1653, 231, 1584, 1784, 3948, 4011, 595, 2617, 1393, 736, 165, 2, 4993, 4306, 5952, 7153, 6539, 5349, 4963, 5445, 6377, 6874, 4886, 4995, 5418, 5989, 4896, 33794, 7438, 8961, 6365, 7361, 5378, 4973, 4878, 58559, 8636, 5816, 6333, 44191, 32587, 6711, 8360, 8368, 33493, 6934, 79132, 8665, 5218, 59315, 8874, 48516, 8644, 5459, 60069, 49272, 7147, 5502, 6373, 48780, 72998, 4370, 4979, 6502, 686

In [148]:
user_list[15]

'56618'

In [149]:
predictions_df, best_movies_df, worst_movies_df = top_10_recommendations(56618, chosen_movies_list)



[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


In [150]:
len(predictions_df)

349

In [151]:
best_movies_df

Unnamed: 0,movieId,prediction,title,genres,YEAR,Thriller,Horror,Action,Romance,Comedy,...,Adventure,FilmNoir,Crime,Western,Animation,Drama,War,Musical,Documentary,None
0,60069,4.122113,WALL·E (2008),Adventure|Animation|Children|Romance|Sci-Fi,2008,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
1,134853,4.066248,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy,2015,0,0,0,0,1,...,1,0,0,0,1,1,0,0,0,0
2,163134,4.010514,Your Name. (2016),Animation|Drama|Fantasy|Romance,2016,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0
3,68954,3.974511,Up (2009),Adventure|Animation|Children|Drama,2009,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,0
4,87222,3.96047,Kung Fu Panda 2 (2011),Action|Adventure|Animation|Children|Comedy|IMAX,2011,0,0,1,0,1,...,1,0,0,0,1,0,0,0,0,0
5,152081,3.949249,Zootopia (2016),Action|Adventure|Animation|Children|Comedy,2016,0,0,1,0,1,...,1,0,0,0,1,0,0,0,0,0
6,108932,3.941809,The Lego Movie (2014),Action|Adventure|Animation|Children|Comedy|Fan...,2014,0,0,1,0,1,...,1,0,0,0,1,0,0,0,0,0
7,166461,3.928918,Moana (2016),Adventure|Animation|Children|Comedy|Fantasy,2016,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
8,99114,3.89388,Django Unchained (2012),Action|Drama|Western,2012,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
9,163645,3.857891,Hacksaw Ridge (2016),Drama|War,2016,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
