# Content Based Movie Recommendation Project Using Neural Networks

In [1]:
#!pip install pandas

In [2]:
#!pip install sklearn

In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### upload the movies dataset

In [2]:
movie = pd.read_csv("C:/Users/Cagan Deliktas/Desktop/Machine-Learning-Spec/C3/W2/content_based_filtering_lab/Files/home/jovyan/work/data/movies.csv")
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
movie.shape

(9742, 3)

### create a new column named "year".

In [5]:
def splitt_year(value):
    value_list = value.split(" ")
    year = value_list[-1]
    year = year[1:-1]
    
    try:
        year = int(year)
    except:
        year = 0
    
    return year

In [6]:
movie["year"] = movie["title"].apply(splitt_year)
movie.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [7]:
movie = movie.loc[movie["year"] != 0, :]

In [8]:
movie.shape

(9718, 4)

In [9]:
movie.isnull().sum()

movieId    0
title      0
genres     0
year       0
dtype: int64

In [10]:
movie.nunique()

movieId    9718
title      9713
genres      951
year        106
dtype: int64

In [11]:
rating = pd.read_csv("C:/Users/Cagan Deliktas/Desktop/Machine-Learning-Spec/C3/W2/content_based_filtering_lab/Files/home/jovyan/work/data/ratings.csv")
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [12]:
rating.nunique()

userId         610
movieId       9724
rating          10
timestamp    85043
dtype: int64

In [13]:
rating.shape

(100836, 4)

In [14]:
rating.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

### combining 2 datasets

In [15]:
df = movie.merge(rating, how="inner", on="movieId")
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,17,4.5,1305696483


In [16]:
df.sort_values(by=["movieId","userId"], ascending = True, inplace = True)

In [17]:
df["movieId"].nunique()

9700

In [18]:
df["userId"].nunique()

610

In [19]:
df.isnull().sum()

movieId      0
title        0
genres       0
year         0
userId       0
rating       0
timestamp    0
dtype: int64

### select movies that are released after the year 2000.

In [20]:
df = df.loc[df["year"] >= 2000, :]

In [21]:
df.nunique()

movieId       4771
title         4768
genres         684
year            19
userId         443
rating          10
timestamp    38496
dtype: int64

In [22]:
df.shape

(39244, 7)

### select the movies that are rated by more than 20 users.

In [23]:
false_true = (df.groupby(["movieId"]).agg({"userId":"count"}) > 20).reset_index()

In [25]:
movie_ids_greater = false_true.loc[false_true["userId"] == True, ["movieId"]]

In [26]:
movie_ids_greater = movie_ids_greater.values

In [27]:
movie_ids_greater = movie_ids_greater.flatten()

In [28]:
movie_ids_greater.shape

(482,)

In [29]:
df = df.loc[df["movieId"].isin(movie_ids_greater),:]

In [30]:
df.nunique()

movieId        482
title          482
genres         215
year            18
userId         442
rating          10
timestamp    22489
dtype: int64

In [31]:
df.sort_values(by=["movieId","userId"], ascending = True, inplace = True)

## genre list

In [33]:
df.head()

Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp
53190,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,1,5.0,964983536
53191,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,19,2.0,965703084
53192,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,45,4.0,1007995150
53193,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,64,2.0,1161620569
53194,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,68,2.0,1158535390


In [34]:
movie.reset_index(inplace = True, drop=True)

In [35]:
genre_list = []

for i in range(movie.shape[0]):
    row_list = movie.loc[i, "genres"].split("|")
    
    for x in row_list:
        if x not in genre_list:
            genre_list.append(x)

In [36]:
print(genre_list)

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX', 'Western', 'Film-Noir', '(no genres listed)']


In [37]:
genre_list.pop(len(genre_list)-1)

'(no genres listed)'

In [38]:
print(genre_list)

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX', 'Western', 'Film-Noir']


In [39]:
len(genre_list)

19

In [40]:
#"Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Horror","Mystery","Romance","Sci-Fi","Thriller"

genre_list = [x for x in genre_list if x in ["Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Horror","Mystery","Romance","Sci-Fi","Thriller"]]
len(genre_list)

14

### average ratings column

In [41]:
average_ratings = df.groupby(["movieId"]).agg({"rating": np.mean})
average_ratings.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
3273,2.672414
3275,4.22093
3285,3.0625
3298,3.714286
3300,3.564103


In [42]:
np.array(average_ratings.rating).shape

(482,)

### year list

In [43]:
year_list = df.groupby(["movieId"]).agg({"year":np.mean}).values.flatten()

In [44]:
#df.loc[df["movieId"] == 193609,:]

### 1-0 genre matrix

In [45]:
df.iloc[0,2].split("|")

['Comedy', 'Horror', 'Mystery', 'Thriller']

In [46]:
df.head()

Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp
53190,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,1,5.0,964983536
53191,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,19,2.0,965703084
53192,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,45,4.0,1007995150
53193,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,64,2.0,1161620569
53194,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,68,2.0,1158535390


In [47]:
movie_genre_dict = {}
movie_id_listt = []

for i in range(df.shape[0]):
    
    
    if df.iloc[i,0] not in movie_id_listt:
        
        movie_id_listt.append(df.iloc[i,0])
        
        row_list = df.iloc[i, 2].split("|")
        binary_genre = []
        
        for x in genre_list: 
            if x in row_list:
                binary_genre.append(1)
                
            else: 
                binary_genre.append(0)
        
        movie_genre_dict[df.iloc[i,0]] = binary_genre

In [48]:
len(movie_genre_dict)

482

### converting movie_genre_dict into 2d list

In [49]:
two_d_bin_list = []
for key in movie_genre_dict.keys():
    two_d_bin_list.append(movie_genre_dict[key])

two_d_bin_list = np.array(two_d_bin_list)
# two_d_bin_list[0] = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [50]:
two_d_bin_list.shape

(482, 14)

### item train df

In [51]:
item_train = pd.DataFrame({"movieId": df.movieId.unique(),
              "year": year_list,
              "ave_rating": np.array(average_ratings.rating)})

In [52]:
item_train['ave_rating'] = item_train['ave_rating'].apply(lambda x: float("{:.1f}".format(x)))

In [53]:
for i, col in enumerate(genre_list):
    item_train[col] = two_d_bin_list[:,i]

In [54]:
item_train_columns = item_train.columns

In [55]:
item_train.head()

Unnamed: 0,movieId,year,ave_rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,3273,2000.0,2.7,0,0,0,1,0,0,0,0,0,1,1,1,0,0
1,3275,2000.0,4.2,0,0,0,0,0,0,1,1,1,1,0,0,0,0
2,3285,2000.0,3.1,1,0,0,0,0,0,1,0,0,0,0,0,0,0
3,3298,2000.0,3.7,0,0,0,0,0,0,1,0,1,1,0,0,0,0
4,3300,2000.0,3.6,0,0,0,0,0,0,0,0,0,1,1,0,1,0


In [56]:
item_train.shape

(482, 17)

##### example movie

In [57]:
item_train.loc[item_train["movieId"] == 46970, :]

Unnamed: 0,movieId,year,ave_rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
288,46970,2006.0,3.2,0,0,0,1,0,0,0,1,0,0,0,0,0,0


In [58]:
item_train.nunique()

movieId        482
year            18
ave_rating      21
Adventure        2
Animation        2
Children         2
Comedy           2
Fantasy          2
Romance          2
Drama            2
Action           2
Crime            2
Thriller         2
Horror           2
Mystery          2
Sci-Fi           2
Documentary      2
dtype: int64

In [59]:
item_train.sort_values(by = ["movieId"], ascending = True, inplace = True)

In [60]:
item_train.head()

Unnamed: 0,movieId,year,ave_rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,3273,2000.0,2.7,0,0,0,1,0,0,0,0,0,1,1,1,0,0
1,3275,2000.0,4.2,0,0,0,0,0,0,1,1,1,1,0,0,0,0
2,3285,2000.0,3.1,1,0,0,0,0,0,1,0,0,0,0,0,0,0
3,3298,2000.0,3.7,0,0,0,0,0,0,1,0,1,1,0,0,0,0
4,3300,2000.0,3.6,0,0,0,0,0,0,0,0,0,1,1,0,1,0


### rating count column for users

In [61]:
df.head()

Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp
53190,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,1,5.0,964983536
53191,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,19,2.0,965703084
53192,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,45,4.0,1007995150
53193,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,64,2.0,1161620569
53194,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,68,2.0,1158535390


In [62]:
rating_count = df.groupby(["userId"]).agg({"movieId": "count"})
rating_count.reset_index(inplace=True)
rating_count.columns = ["userId","movie_count"]
rating_count.shape

(442, 2)

### rating average for users

In [63]:
rating_av = df.groupby(["userId"]).agg({"rating": "mean"})
rating_av.reset_index(inplace=True, drop=True)
rating_av.columns = ["rating_ave"]
rating_av.shape

(442, 1)

### average per genre for each user

In [65]:
def splitt(value):
    return value.split("|")

In [66]:
df_modified = df.copy()

In [67]:
df_modified["genres"] = df["genres"].apply(splitt)

In [69]:
df_modified.head()

Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp
53190,3273,Scream 3 (2000),"[Comedy, Horror, Mystery, Thriller]",2000,1,5.0,964983536
53191,3273,Scream 3 (2000),"[Comedy, Horror, Mystery, Thriller]",2000,19,2.0,965703084
53192,3273,Scream 3 (2000),"[Comedy, Horror, Mystery, Thriller]",2000,45,4.0,1007995150
53193,3273,Scream 3 (2000),"[Comedy, Horror, Mystery, Thriller]",2000,64,2.0,1161620569
53194,3273,Scream 3 (2000),"[Comedy, Horror, Mystery, Thriller]",2000,68,2.0,1158535390


In [70]:
print(min(df_modified.userId))
print(max(df_modified.userId))

1
610


### outer for loop looks at every genre. in the first inner loop, we took the indexes of the rows which have that genre. using these indexes we create a data frame df_temp. this data frame consists of the rows that have corresponding genre from the outer loop. since this data frame does not have every userId, we add missing user id's as dummy rows to this dataframe. Lastly, using genre_dict_user, we store the average ratings of the genres per user. genre_dict_user's keys are the genres. values are lists. each key has a corresponding value list. these value lists consist of average rating of the users for corresponding genre. 

In [72]:
pd.DataFrame(df_modified).head()

Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp
53190,3273,Scream 3 (2000),"[Comedy, Horror, Mystery, Thriller]",2000,1,5.0,964983536
53191,3273,Scream 3 (2000),"[Comedy, Horror, Mystery, Thriller]",2000,19,2.0,965703084
53192,3273,Scream 3 (2000),"[Comedy, Horror, Mystery, Thriller]",2000,45,4.0,1007995150
53193,3273,Scream 3 (2000),"[Comedy, Horror, Mystery, Thriller]",2000,64,2.0,1161620569
53194,3273,Scream 3 (2000),"[Comedy, Horror, Mystery, Thriller]",2000,68,2.0,1158535390


In [73]:
genre_dict_user = {}
step = 0

for genre in genre_list:
    genre_index_list = []
    
    for i in range(df_modified.shape[0]):
        if genre in df_modified.iloc[i, 2]:
            genre_index_list.append(i)
    
    df_temp = df_modified.iloc[(genre_index_list), :]
    
    for id in df_modified.userId.values:
        if id not in df_temp.userId.values:
            #print("id not in temp df")
            new_row = {"movieId":0, "title": "null", "genres":"null", "userId":id, "rating":0,"timestamp":0}
            new_df = pd.DataFrame(new_row, index = [0])
            df_temp = pd.concat([df_temp, new_df], ignore_index = True)
            #df_temp.reset_index(inplace=True)
    
    genre_dict_user[genre] = df_temp.groupby(["userId"]).agg({"rating": "mean"}).values.flatten()
    step += 1
    print(step)

1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [74]:
pd.DataFrame(genre_dict_user).head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,5.0,0.0,0.0,4.5,0.0,0.0,5.0,5.0,0.0,5.0,5.0,5.0,5.0,0.0
1,4.166667,0.0,0.0,4.25,0.0,0.0,3.961538,3.954545,4.125,3.888889,3.0,4.0,3.875,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.5,0.0,0.0,0.5,0.0
3,3.5,0.0,4.0,2.833333,4.0,2.4,3.25,4.0,4.333333,3.5,0.0,2.0,0.0,0.0
4,3.209677,4.071429,3.625,3.5,4.045455,2.722222,3.08,2.96875,3.5,3.238095,2.75,3.142857,2.738095,0.0


In [75]:
user_train = pd.concat([rating_count, rating_av, pd.DataFrame(genre_dict_user)], axis = 1)

In [76]:
user_train_columns = user_train.columns

In [77]:
user_train.loc[:, (user_train.columns != "userId") & (user_train.columns != "movie_count")] = user_train.loc[:,(user_train.columns != "userId") & (user_train.columns != "movie_count")].applymap(lambda x: float("{:.1f}".format(x)))

In [78]:
user_train.head()

Unnamed: 0,userId,movie_count,rating_ave,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,4,4.8,5.0,0.0,0.0,4.5,0.0,0.0,5.0,5.0,0.0,5.0,5.0,5.0,5.0,0.0
1,2,20,4.0,4.2,0.0,0.0,4.2,0.0,0.0,4.0,4.0,4.1,3.9,3.0,4.0,3.9,0.0
2,3,2,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.5,0.0,0.0,0.5,0.0
3,4,16,3.2,3.5,0.0,4.0,2.8,4.0,2.4,3.2,4.0,4.3,3.5,0.0,2.0,0.0,0.0
4,7,69,3.2,3.2,4.1,3.6,3.5,4.0,2.7,3.1,3.0,3.5,3.2,2.8,3.1,2.7,0.0


In [79]:
item_train.head()

Unnamed: 0,movieId,year,ave_rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,3273,2000.0,2.7,0,0,0,1,0,0,0,0,0,1,1,1,0,0
1,3275,2000.0,4.2,0,0,0,0,0,0,1,1,1,1,0,0,0,0
2,3285,2000.0,3.1,1,0,0,0,0,0,1,0,0,0,0,0,0,0
3,3298,2000.0,3.7,0,0,0,0,0,0,1,0,1,1,0,0,0,0
4,3300,2000.0,3.6,0,0,0,0,0,0,0,0,0,1,1,0,1,0


In [80]:
print(user_train.shape)
print(item_train.shape)

(442, 17)
(482, 17)


In [81]:
print(user_train.shape[1])
print(item_train.shape[1])

17
17


In [82]:
print(df.shape)
print(user_train.shape)
print(item_train.shape)

(22863, 7)
(442, 17)
(482, 17)


In [83]:
movie_ids_sorted = df["movieId"].values
user_ids_sorted = df["userId"].values

### duplicating item df

In [85]:
count = 0
duplicated_item_train = pd.DataFrame()
for i in movie_ids_sorted:
    count += 1
    duplicated_item_train = pd.concat([duplicated_item_train, item_train.loc[item_train["movieId"] == i, :]], axis = 0)

In [86]:
duplicated_item_train.shape

(22863, 17)

### duplicating user df

In [87]:
count = 0
duplicated_user_train = pd.DataFrame()
for i in user_ids_sorted:
    count += 1
    duplicated_user_train = pd.concat([duplicated_user_train, user_train.loc[user_train["userId"] == i, :]], axis = 0)

In [88]:
duplicated_user_train.head()

Unnamed: 0,userId,movie_count,rating_ave,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,1,4,4.8,5.0,0.0,0.0,4.5,0.0,0.0,5.0,5.0,0.0,5.0,5.0,5.0,5.0,0.0
13,19,21,2.4,3.2,3.5,3.5,2.3,0.0,2.3,2.4,3.0,2.0,2.1,1.8,2.0,3.0,0.0
29,45,70,4.5,4.5,4.7,4.6,4.4,4.4,4.5,4.3,4.6,4.7,4.5,4.7,4.6,4.5,0.0
43,64,143,3.7,3.8,4.1,3.9,3.6,4.0,3.6,3.7,3.7,3.6,3.7,3.6,3.7,3.4,3.8
47,68,382,3.5,3.7,3.6,3.8,3.5,3.9,3.6,3.5,3.6,3.4,3.4,3.3,3.2,3.6,3.7


In [89]:
duplicated_item_train.head()

Unnamed: 0,movieId,year,ave_rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,3273,2000.0,2.7,0,0,0,1,0,0,0,0,0,1,1,1,0,0
0,3273,2000.0,2.7,0,0,0,1,0,0,0,0,0,1,1,1,0,0
0,3273,2000.0,2.7,0,0,0,1,0,0,0,0,0,1,1,1,0,0
0,3273,2000.0,2.7,0,0,0,1,0,0,0,0,0,1,1,1,0,0
0,3273,2000.0,2.7,0,0,0,1,0,0,0,0,0,1,1,1,0,0


In [90]:
user_train = duplicated_user_train.copy()
item_train = duplicated_item_train.copy()

In [91]:
print(user_train.shape)
print(item_train.shape)

(22863, 17)
(22863, 17)


In [92]:
user_train.columns

Index(['userId', 'movie_count', 'rating_ave', 'Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'Documentary'], dtype='object')

In [93]:
item_train = item_train[['movieId', 'year', 'ave_rating', "Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Horror","Mystery","Romance","Sci-Fi","Thriller"]]

In [94]:
user_train = user_train[['userId', 'movie_count', 'rating_ave', "Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Horror","Mystery","Romance","Sci-Fi","Thriller"]]

In [95]:
df.head()

Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp
53190,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,1,5.0,964983536
53191,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,19,2.0,965703084
53192,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,45,4.0,1007995150
53193,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,64,2.0,1161620569
53194,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller,2000,68,2.0,1158535390


In [96]:
df.rating.values

array([5. , 2. , 4. , ..., 5. , 3.5, 5. ])

In [97]:
y_train = df.rating.values

In [98]:
y_train.shape

(22863,)

# modeling section

In [99]:
num_user_features = user_train.shape[1] - 3  # remove userid, rating count and ave rating during training
num_item_features = item_train.shape[1] - 1  # remove movie id at train time
#uvs = 3  # user genre vector start
#ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items

print(num_user_features)
print(num_item_features)

14
16


In [100]:
item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y_train

In [101]:
user_train.iloc[:, u_s:].isnull().sum()

Action         0
Adventure      0
Animation      0
Children       0
Comedy         0
Crime          0
Documentary    0
Drama          0
Fantasy        0
Horror         0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
dtype: int64

In [102]:
item_train.iloc[:, i_s:].isnull().sum()

year           0
ave_rating     0
Action         0
Adventure      0
Animation      0
Children       0
Comedy         0
Crime          0
Documentary    0
Drama          0
Fantasy        0
Horror         0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
dtype: int64

# data frame'leri 2d array'e çevir

In [103]:
item_train = item_train.to_numpy()
user_train = user_train.to_numpy()

In [104]:
scalerItem = StandardScaler()
scalerItem.fit(item_train[:, i_s:])
item_train[:, i_s:] = scalerItem.transform(item_train[:, i_s:])

scalerUser = StandardScaler()
scalerUser.fit(user_train[:, u_s:])
user_train[:, u_s:] = scalerUser.transform(user_train[:, u_s:])

In [105]:
scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))

In [106]:
item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (18290, 17)
movie/item test data shape: (4573, 17)


### create 2 neural networks (one of them is for items/movies, the other one is for the users), both with 3 layers. you feed the dataframes you prepared above to these neural networks. In return, they calculate the feature vectors for movies (Vm) and users (Vu). using these feature vectors, we make predictions. Vm*Vu gives you the rating of a user for that movie. 

In [107]:
num_outputs = 32
tf.random.set_seed(1)

user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)
])

item_NN = tf.keras.models.Sequential([  
      tf.keras.layers.Dense(256, activation='relu'),
      tf.keras.layers.Dense(128, activation='relu'),
      tf.keras.layers.Dense(num_outputs)
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 14)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 16)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 32)           40864       ['input_1[0][0]']                
                                                                                                  
 sequential_1 (Sequential)      (None, 32)           41376       ['input_2[0][0]']                
                                                                                              

In [108]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

In [109]:
tf.random.set_seed(1)
model.fit([tf.convert_to_tensor(user_train[:, u_s:], dtype=tf.float32), tf.convert_to_tensor(item_train[:, i_s:],dtype=tf.float32)], tf.convert_to_tensor(y_train, dtype=tf.float32), epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x253b13ade20>

### model evaluation

In [110]:
model.evaluate([tf.convert_to_tensor(user_test[:, u_s:], dtype=tf.float32), tf.convert_to_tensor(item_test[:, i_s:],dtype=tf.float32)], tf.convert_to_tensor(y_test, dtype=tf.float32))



0.11040615290403366

### prediction

In [111]:
new_user_id = 5000
new_rating_ave = 0.0
new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_rating_count = 3

user_vec = np.array([[new_user_id, new_rating_count,new_rating_ave, 
new_adventure,new_animation,new_childrens,new_comedy,
new_fantasy,new_romance,new_drama,new_action,new_crime,
new_thriller,new_horror,new_mystery,new_scifi,new_documentary]])

In [112]:
item_train = duplicated_item_train.copy()

In [150]:
item_train.sort_values(by = ["movieId"], ascending = True).head()

Unnamed: 0,movieId,year,ave_rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
0,3273,2000.0,2.7,0,0,0,1,0,0,0,0,0,1,1,1,0,0
1,3275,2000.0,4.2,0,0,0,0,0,0,1,1,1,1,0,0,0,0
2,3285,2000.0,3.1,1,0,0,0,0,0,1,0,0,0,0,0,0,0
3,3298,2000.0,3.7,0,0,0,0,0,0,1,0,1,1,0,0,0,0
4,3300,2000.0,3.6,0,0,0,0,0,0,0,0,0,1,1,0,1,0


In [114]:
item_train.drop_duplicates(inplace = True)

In [115]:
item_train = item_train.to_numpy()

### duplicate user vector to match it's size to item_train as we are going to feed these into our NN model as inputs (item_train, user_vecs)

In [116]:
user_vecs = pd.DataFrame(user_vec, columns = user_train_columns).sample(item_train.shape[0], replace = True, random_state = 1)

In [117]:
user_vecs.shape

(482, 17)

In [118]:
user_vecs = user_vecs.to_numpy()

In [119]:
item_vecs = item_train

In [120]:
suser_vecs = scalerUser.transform(user_vecs[:, u_s:])
sitem_vecs = scalerItem.transform(item_vecs[:, i_s:])

In [121]:
y_p = model.predict([suser_vecs, sitem_vecs])
y_pu = scalerTarget.inverse_transform(y_p)
sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first, argsotf orders ascanding (returns the indexes).
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  #using unscaled vectors for display



In [122]:
pd_pred_df = pd.DataFrame(sorted_items, columns = item_train_columns)
pd_pred_df["preds"] = sorted_ypu
pd_pred_df = pd_pred_df.merge(df.loc[:, ["movieId","title","genres"]], on = "movieId", how = "inner")
pd_pred_df = pd_pred_df[["movieId","ave_rating","preds","title","genres"]]
pd_pred_df = pd_pred_df.drop_duplicates()
pd_pred_df_new = pd_pred_df.copy()
pd_pred_df_new.head(15)

Unnamed: 0,movieId,ave_rating,preds,title,genres
0,5816.0,3.6,4.428673,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
102,106489.0,3.6,4.404529,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy|IMAX
127,30810.0,3.5,4.398796,"Life Aquatic with Steve Zissou, The (2004)",Adventure|Comedy|Fantasy
164,7373.0,3.4,4.369652,Hellboy (2004),Action|Adventure|Fantasy|Horror
205,98809.0,3.8,4.342022,"Hobbit: An Unexpected Journey, The (2012)",Adventure|Fantasy|IMAX
245,54259.0,3.6,4.260365,Stardust (2007),Adventure|Comedy|Fantasy|Romance
279,86880.0,3.2,4.256057,Pirates of the Caribbean: On Stranger Tides (2...,Action|Adventure|Fantasy|IMAX
303,53125.0,3.4,4.25513,Pirates of the Caribbean: At World's End (2007),Action|Adventure|Comedy|Fantasy
359,106072.0,3.3,4.253738,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX
380,74789.0,2.9,4.243067,Alice in Wonderland (2010),Adventure|Fantasy|IMAX


# Prediction for an Existing User

In [123]:
user_train = duplicated_user_train.copy()
item_train = duplicated_item_train.copy()

In [124]:
user_train.loc[user_train.userId == 2].head(1)

Unnamed: 0,userId,movie_count,rating_ave,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
1,2,20,4.0,4.2,0.0,0.0,4.2,0.0,0.0,4.0,4.0,4.1,3.9,3.0,4.0,3.9,0.0


In [125]:
item_train.sort_values(by = "movieId", ascending = True, inplace = True)
item_train.drop_duplicates(inplace = True)

In [126]:
user_info = user_train.loc[user_train.userId == 2].head(1)

### duplicating

In [127]:
user_info_df = user_info.sample(item_train.shape[0], replace = True, random_state = 1)

In [128]:
print(user_info_df.shape)
print(item_train.shape)

(482, 17)
(482, 17)


In [129]:
user_info_df = user_info_df.to_numpy()
item_train = item_train.to_numpy()
item_vecs = item_train

In [130]:
suser_vecs = scalerUser.transform(user_info_df[:, u_s:])
sitem_vecs = scalerItem.transform(item_vecs[:, i_s:])

In [131]:
y_p = model.predict([suser_vecs, sitem_vecs])
y_pu = scalerTarget.inverse_transform(y_p)
sorted_index = np.argsort(-y_pu, axis=0).reshape(-1).tolist()  #negate to get largest rating first
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  #using unscaled vectors for display



In [132]:
pd_pred_df = pd.DataFrame(sorted_items, columns = item_train_columns)
pd_pred_df["preds"] = sorted_ypu
pd_pred_df.loc[:,["movieId","year","ave_rating","preds"]]
pd_pred_df = pd_pred_df.merge(df.loc[:, ["movieId","title","genres"]], on = "movieId", how = "inner")
pd_pred_df = pd_pred_df[["movieId","ave_rating","preds","title","genres"]]
pd_pred_df = pd_pred_df.drop_duplicates()
pd_pred_df.head(15)

Unnamed: 0,movieId,ave_rating,preds,title,genres
0,34405.0,3.9,4.206367,Serenity (2005),Action|Adventure|Sci-Fi
50,68358.0,3.9,4.189864,Star Trek (2009),Action|Adventure|Sci-Fi|IMAX
109,8636.0,3.8,4.171078,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX
188,6502.0,4.0,4.159258,28 Days Later (2002),Action|Horror|Sci-Fi
246,59315.0,3.8,4.155405,Iron Man (2008),Action|Adventure|Sci-Fi
340,112852.0,4.1,4.154729,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi
399,49272.0,3.9,4.132594,Casino Royale (2006),Action|Adventure|Thriller
480,89745.0,3.9,4.127208,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
549,3793.0,3.7,4.111879,X-Men (2000),Action|Adventure|Sci-Fi
682,27773.0,4.1,4.088145,Old Boy (2003),Mystery|Thriller


In [133]:
user_train.loc[user_train.userId == 2].head(1)

Unnamed: 0,userId,movie_count,rating_ave,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,Documentary
1,2,20,4.0,4.2,0.0,0.0,4.2,0.0,0.0,4.0,4.0,4.1,3.9,3.0,4.0,3.9,0.0


In [134]:
#adventure, comedy, drama, action, crime, thriller, horror, mystery, sci fi

# finding similar movies

In [135]:
def sq_dist(a,b):
    """
    Returns the squared distance between two vectors
    Args:
      a (ndarray (n,)): vector with n features
      b (ndarray (n,)): vector with n features
    Returns:
      d (float) : distance
    """   
    a = np.array(a)
    b = np.array(b)
    
    d = np.sum((np.subtract(a,b))**2)
    
    return d

In [136]:
input_item_m = tf.keras.layers.Input(shape=(num_item_features))    # input layer
vm_m = item_NN(input_item_m)                                       # use the trained item_NN
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)                        # incorporate normalization as was done in the original model
model_m = tf.keras.Model(input_item_m, vm_m)                                
model_m.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 16)]              0         
                                                                 
 sequential_1 (Sequential)   (None, 32)                41376     
                                                                 
 tf.math.l2_normalize_2 (TFO  (None, 32)               0         
 pLambda)                                                        
                                                                 
Total params: 41,376
Trainable params: 41,376
Non-trainable params: 0
_________________________________________________________________


In [137]:
item_train = duplicated_item_train.copy()
item_train.sort_values(by = "movieId", ascending = True, inplace = True)
item_train.drop_duplicates(inplace = True)
item_vecs = item_train

In [138]:
scaled_item_vecs = scalerItem.transform(item_vecs.iloc[:, i_s:])
vms = model_m.predict(scaled_item_vecs)
print(f"size of all predicted movie feature vectors: {vms.shape}")





size of all predicted movie feature vectors: (482, 32)


In [139]:
vms.shape

(482, 32)

### select a movie

In [140]:
movie_choice = 54503

In [141]:
movie.loc[movie["movieId"] == movie_choice, :]

Unnamed: 0,movieId,title,genres,year
6535,54503,Superbad (2007),Comedy,2007


### first for loop creates an distance matrix (482*482). values represent the similarities between movies. small values mean movies are similar. 

### second loop creates a dictionary that stores most similar movies for each movie in a list. keys are the movies' indexes.

In [142]:
distance_matrix = np.zeros(482*482).reshape(482,482)

for i in range(vms.shape[0]):
    for j in range(vms.shape[0]):
        distance_matrix[i,j] = sq_dist(vms[i],vms[j])
        
np.fill_diagonal(distance_matrix, 999)

distance_dict = {}
for i in range(distance_matrix.shape[0]):
    distance_dict[i] = np.argsort(distance_matrix[i], axis=0).reshape(-1).tolist()
    
    
item_train = duplicated_item_train.copy()
item_train.sort_values(by = "movieId", ascending = True, inplace = True)
item_train.drop_duplicates(inplace = True)

In [143]:
movie_id_index_df = pd.DataFrame(item_train.movieId).reset_index()
movie_id_index_df.head()

Unnamed: 0,index,movieId
0,0,3273
1,1,3275
2,2,3285
3,3,3298
4,4,3300


### select the target movie's index

In [144]:
indexx = movie_id_index_df.loc[movie_id_index_df["movieId"] == movie_choice, "index"].values[0]

In [145]:
similar_df = movie_id_index_df.merge(movie, how="inner", on="movieId").loc[:,["index","movieId","title","genres"]]

In [146]:
similar_df.loc[similar_df["movieId"] == movie_choice , ["index","title"]]

Unnamed: 0,index,title
321,321,Superbad (2007)


In [147]:
similar_df.loc[similar_df["movieId"] == movie_choice , ["index"]].values[0]

array([321], dtype=int64)

In [148]:
similar_df.loc[similar_df["movieId"] == movie_choice , ["index"]]

distance_list = []
for i in range(similar_df.shape[0]):
    distan = distance_matrix[similar_df.loc[similar_df["movieId"] == movie_choice , ["index"]].values[0], similar_df.loc[i, ["index"]].values[0]]
    distance_list.append(distan)
    
similar_df["distances"] = distance_list

In [149]:
similar_df.iloc[distance_dict[indexx]].head(10)

Unnamed: 0,index,movieId,title,genres,distances
400,400,86833,Bridesmaids (2011),Comedy,[0.02654464915394783]
213,213,8641,Anchorman: The Legend of Ron Burgundy (2004),Comedy,[0.026757577434182167]
199,199,7451,Mean Girls (2004),Comedy,[0.026757579296827316]
221,221,8874,Shaun of the Dead (2004),Comedy|Horror,[0.03303453326225281]
336,336,58998,Forgetting Sarah Marshall (2008),Comedy|Romance,[0.04575501009821892]
475,475,134853,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy,[0.0522664412856102]
389,389,80549,Easy A (2010),Comedy|Romance,[0.05558941513299942]
421,421,93840,"Cabin in the Woods, The (2012)",Comedy|Horror|Sci-Fi|Thriller,[0.05697830766439438]
371,371,71535,Zombieland (2009),Action|Comedy|Horror,[0.06700518727302551]
290,290,46976,Stranger than Fiction (2006),Comedy|Drama|Fantasy|Romance,[0.06812727451324463]


In [206]:
#similar_df.iloc[distance_dict[indexx]].iloc[[similar_df.iloc[distance_dict[indexx]].shape[0]-1]]