In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from tensorflow import keras

### Read & clean the movie dataset

In [2]:
movies_df = pd.read_csv('./data/movielens_small/movies.csv')

movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [4]:
movies_df.set_index('movieId', inplace=True)

In [5]:
sub_movies_df = movies_df.title.str.extract(r'([^\(\)]+) ?(\((\d{4})\))?')

print(sub_movies_df.loc[5])
print(sub_movies_df.loc[171495])

movies_df['title'] = sub_movies_df[0]
movies_df['year'] = sub_movies_df[2].astype("Int32")

movies_df.head()

0    Father of the Bride Part II 
1                          (1995)
2                            1995
Name: 5, dtype: object
0    Cosmos
1       NaN
2       NaN
Name: 171495, dtype: object


Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
2,Jumanji,Adventure|Children|Fantasy,1995
3,Grumpier Old Men,Comedy|Romance,1995
4,Waiting to Exhale,Comedy|Drama|Romance,1995
5,Father of the Bride Part II,Comedy,1995


In [6]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9742 entries, 1 to 193609
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   9742 non-null   object
 1   genres  9742 non-null   object
 2   year    8656 non-null   Int32 
dtypes: Int32(1), object(2)
memory usage: 533.9+ KB


In [7]:
# remove movies that don't have a year

no_year_movies = movies_df[movies_df.year.isna()].index
movies_df.dropna(inplace=True)

In [8]:
movie_genres = movies_df.genres.str.split('|')
genres = movie_genres.explode().unique()

genres

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [9]:
invalid_genres = ['IMAX', '(no genres listed)']
genres = [genre for genre in genres if genre not in invalid_genres]

genres

['Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Fantasy',
 'Romance',
 'Drama',
 'Action',
 'Crime',
 'Thriller',
 'Horror',
 'Mystery',
 'Sci-Fi',
 'War',
 'Musical',
 'Documentary',
 'Western',
 'Film-Noir']

In [10]:
# remove movies that don't have any genre

no_genre = movie_genres.apply(lambda x: '(no genres listed)' in x)
no_genre_movies = movie_genres[no_genre].index

movies_df = movies_df[~no_genre]
movie_genres = movie_genres[~no_genre]

movies_df

Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
2,Jumanji,Adventure|Children|Fantasy,1995
3,Grumpier Old Men,Comedy|Romance,1995
4,Waiting to Exhale,Comedy|Drama|Romance,1995
5,Father of the Bride Part II,Comedy,1995
...,...,...,...
193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017
193583,No Game No Life: Zero,Animation|Comedy|Fantasy,2017
193585,Flint,Drama,2017
193587,Bungo Stray Dogs: Dead Apple,Action|Animation,2018


### Read and clean the ratings dataset

In [11]:
ratings = pd.read_csv('./data/movielens_small/ratings.csv')

ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [12]:
ratings.drop(columns=['timestamp'], inplace=True)

In [13]:
# remove the ratings for movies with no year/genre

removed_movies = set(no_year_movies.tolist()) | set(no_genre_movies.tolist())

ratings = ratings[ratings.movieId.apply(lambda movieId: movieId not in removed_movies)]

ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
4,1,50,5.0
5,1,70,3.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


### Create features for the movie dataset

In [14]:
len(movies_df)

8631

In [15]:
# Remove movies that no user has rated

rated_movies = ratings.movieId.unique()
movies_df = movies_df.loc[rated_movies]

len(movies_df)

8615

In [16]:
genre_df = pd.DataFrame(np.zeros((len(movies_df), len(genres))), columns=genres, index=movies_df.index)
genre_df.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,Western,Film-Noir
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
for i, movie_genre in enumerate(movie_genres.loc[rated_movies]):
    for genre in movie_genre:
        if genre not in invalid_genres:
            genre_df.iloc[i][genre] = 1

In [18]:
genre_df.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,Western,Film-Noir
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
70,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
genre_df['avgRating'] = ratings.groupby('movieId')['rating'].mean().round(1)

genre_df

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,Western,Film-Noir,avgRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.3
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9
50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.2
70,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5
160527,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
160836,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
163937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5


### Create the movie items dataset

In [20]:
movies = ratings.copy().join(genre_df, on='movieId', how='inner')
movies.head()

Unnamed: 0,userId,movieId,rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,Western,Film-Noir,avgRating
0,1,1,4.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9
516,5,1,4.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9
874,7,1,4.5,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9
1434,15,1,2.5,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9
1667,17,1,4.5,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9


In [21]:
movies.drop(columns=['userId', 'rating'], inplace=True)
movies.sort_index(inplace=True)

movies

Unnamed: 0,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,Western,Film-Noir,avgRating
0,1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9
1,3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.3
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9
4,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.2
5,70,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,166534,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.3
100832,168248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.1
100833,168250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.6
100834,168252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.3


### Create features for the users

In [22]:
user_df = ratings.copy()
user_df['genres'] = ratings.movieId.apply(lambda movieId: movie_genres.loc[movieId])
user_df.drop(columns=['movieId'], inplace=True)

user_df.head()

Unnamed: 0,userId,rating,genres
0,1,4.0,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,4.0,"[Comedy, Romance]"
2,1,4.0,"[Action, Crime, Thriller]"
4,1,5.0,"[Crime, Mystery, Thriller]"
5,1,3.0,"[Action, Comedy, Horror, Thriller]"


In [23]:
user_df = user_df.explode('genres').groupby(['userId', 'genres']).mean().round(1).reset_index()

user_df

Unnamed: 0,userId,genres,rating
0,1,Action,4.3
1,1,Adventure,4.4
2,1,Animation,4.7
3,1,Children,4.5
4,1,Comedy,4.3
...,...,...,...
9937,610,Romance,3.7
9938,610,Sci-Fi,3.7
9939,610,Thriller,3.5
9940,610,War,3.8


In [24]:
user_avg_ratings = user_df.pivot(index='userId', columns='genres', values='rating').fillna(0)
user_avg_ratings

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,4.3,4.4,4.7,4.5,4.3,4.3,0.0,4.5,4.3,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3
2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,3.8,0.0,4.0,4.5,3.9,3.7,4.5,3.5
3,3.2,2.5,0.5,0.5,1.0,0.5,0.0,0.8,3.4,0.0,4.6,0.0,0.5,0.0,0.5,4.1,4.0,0.5,0.0
4,3.3,3.7,4.0,3.8,3.5,3.8,4.0,3.5,3.7,4.0,4.3,3.0,4.0,3.6,3.4,2.9,3.6,4.0,3.8
5,3.1,3.2,4.3,4.1,3.3,3.8,0.0,3.8,4.3,0.0,3.0,3.7,4.4,4.0,2.8,2.5,3.6,3.3,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.1,3.4,3.6,3.4,3.5,3.6,3.8,3.7,3.5,3.8,3.3,3.1,3.7,3.8,3.7,3.6,3.5,3.7,3.2
607,3.8,3.5,3.3,3.4,3.4,3.9,0.0,4.0,3.5,0.0,4.1,5.0,3.6,4.6,3.5,3.2,4.1,4.4,4.0
608,3.3,3.2,3.1,2.4,2.7,3.6,3.0,3.4,3.0,3.8,3.3,4.0,2.8,3.6,2.9,3.3,3.5,3.6,2.6
609,3.1,3.2,3.0,3.0,3.3,3.5,3.0,3.3,3.0,0.0,3.5,3.0,0.0,0.0,3.2,3.0,3.3,3.3,4.0


### Create the user items dataset

In [25]:
users = ratings.copy().join(user_avg_ratings, on='userId', how='inner')

users.head()

Unnamed: 0,userId,movieId,rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,4.3,4.4,4.7,4.5,4.3,4.3,0.0,...,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3
1,1,3,4.0,4.3,4.4,4.7,4.5,4.3,4.3,0.0,...,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3
2,1,6,4.0,4.3,4.4,4.7,4.5,4.3,4.3,0.0,...,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3
4,1,50,5.0,4.3,4.4,4.7,4.5,4.3,4.3,0.0,...,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3
5,1,70,3.0,4.3,4.4,4.7,4.5,4.3,4.3,0.0,...,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3


In [26]:
users.drop(columns=['movieId', 'rating'], inplace=True)
users.sort_index(inplace=True)

users

Unnamed: 0,userId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,4.3,4.4,4.7,4.5,4.3,4.3,0.0,4.5,4.3,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3
1,1,4.3,4.4,4.7,4.5,4.3,4.3,0.0,4.5,4.3,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3
2,1,4.3,4.4,4.7,4.5,4.3,4.3,0.0,4.5,4.3,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3
4,1,4.3,4.4,4.7,4.5,4.3,4.3,0.0,4.5,4.3,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3
5,1,4.3,4.4,4.7,4.5,4.3,4.3,0.0,4.5,4.3,5.0,3.4,0.0,4.7,4.1,4.3,4.2,4.1,4.5,4.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,3.5,3.7,3.9,3.7,3.7,3.7,4.2,3.8,3.6,4.4,3.5,3.6,3.9,3.7,3.7,3.7,3.5,3.8,3.6
100832,610,3.5,3.7,3.9,3.7,3.7,3.7,4.2,3.8,3.6,4.4,3.5,3.6,3.9,3.7,3.7,3.7,3.5,3.8,3.6
100833,610,3.5,3.7,3.9,3.7,3.7,3.7,4.2,3.8,3.6,4.4,3.5,3.6,3.9,3.7,3.7,3.7,3.5,3.8,3.6
100834,610,3.5,3.7,3.9,3.7,3.7,3.7,4.2,3.8,3.6,4.4,3.5,3.6,3.9,3.7,3.7,3.7,3.5,3.8,3.6


### Prepare the input

In [27]:
# Y[nm * nu, 1]
# nm - number of movies
# nu - number of users
# Y[i][j] - the rating (from .5 to 5 in .5 increments)

Y = ratings['rating'].values.reshape(-1, 1)
print(f'Y shape: {Y.shape}')

# U[nm * nu, u_f] - user feature matrix
# u_f - number of user features

U = users.values[:, 1:]

print(f'U shape: {U.shape}')

# M[nm * nu, m_f] - movie feature matrix
# m_f - number of movie features

M = movies.values[:, 1:]

print(f'M shape: {M.shape}')

Y shape: (94001, 1)
U shape: (94001, 19)
M shape: (94001, 19)


In [28]:
user_scaler = StandardScaler()
movie_scaler = StandardScaler()
rating_scaler = MinMaxScaler((-1, 1))

U = user_scaler.fit_transform(U)
M = movie_scaler.fit_transform(M)
Y = rating_scaler.fit_transform(Y)

In [42]:
user_train, user_val, movie_train, movie_val, rating_train, rating_val = train_test_split(U, M, Y, test_size=.2, shuffle=True, random_state=1)

print(user_train.shape)
print(user_val.shape)

(75200, 19)
(18801, 19)


### Create the network

In [43]:
user_input = keras.layers.Input(shape=(U.shape[1], ))
movie_input = keras.layers.Input(shape=(M.shape[1], ))

user_network = keras.Sequential([
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(32),
])

movie_network = keras.Sequential([
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(32),
])

vu = user_network(user_input)
vu = tf.linalg.l2_normalize(vu, axis=1)

vm = movie_network(movie_input)
vm = tf.linalg.l2_normalize(vm, axis=1)

model_out = tf.matmul(vm, vu, transpose_b=True)

model = keras.Model([user_input, movie_input], model_out)

In [44]:
cost_fn = keras.losses.MeanSquaredError()

optimizer = keras.optimizers.Adam(learning_rate=1e-1)

model.compile(optimizer=optimizer, loss=cost_fn)

In [45]:
model.fit([user_train, movie_train], rating_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x15691826e90>

In [46]:
model.evaluate([user_val, movie_val], rating_val)



0.15602533519268036