In [1]:
import numpy 
import tensorflow as tf
from tensorflow import keras 
import pandas as pd 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

2025-01-12 11:12:13.627049: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
movies_df = pd.read_csv('./ml-latest-small/movies.csv')
ratings_df = pd.read_csv('./ml-latest-small/ratings.csv')

In [12]:
# So, the features we have from this df are movie, titele, and genres. 
# we will want to hot encode the move genres and then match the movie based on genre to the user rating 
print(movies_df.head(2))


# Each user will have a movie rating for any movies they have rated. Our goal is to take the movie 
# rating's and identify what movies we might reccomend to this user. The goal is to effectivley rate each movie and then provide 
# the highest movie rating as the next reccomended N movies. 
print(ratings_df.head(2))



   movieId             title                                       genres
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy
1        2    Jumanji (1995)                   Adventure|Children|Fantasy
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247


In [13]:
print(movies_df['genres'])

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9737                Action|Animation|Comedy|Fantasy
9738                       Animation|Comedy|Fantasy
9739                                          Drama
9740                               Action|Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object


In [14]:
# We need to encode the genrese into a one hot encoding for each movie 
movies_df['genres'] = movies_df['genres'].apply(lambda x: x.split('|'))



In [18]:
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_df['genres'])
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_)

# Concatenate the one-hot encoded genres back to the movies dataframe
movies_matrix = pd.concat([movies_df, genre_df], axis=1)

In [19]:
# So, now we are left with a combined data frame where the columsn 
# are movieId, title, genres(list), and then a matrix of one hot encoded values for each genre for each film. We refer to this
# matrix as the movies_matrix
print(movies_matrix)

      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9737   193581  Black Butler: Book of the Atlantic (2017)   
9738   193583               No Game No Life: Zero (2017)   
9739   193585                               Flint (2017)   
9740   193587        Bungo Stray Dogs: Dead Apple (2018)   
9741   193609        Andrew Dice Clay: Dice Rules (1991)   

                                                 genres  (no genres listed)  \
0     [Adventure, Animation, Children, Comedy, Fantasy]                   0   
1                        [Adventure, Children, Fantasy]                   0   
2                                     [Com

In [28]:
# At this stage, we want to compute the weighted genre matrix. The weighted genre matrix is the dot product 
# of the input user ratings and the movies matrix. 
print(ratings_df.head(1))
print(ratings_df.shape)

# the timestamp of the user rating does not offer any insightful information for this particular problem, so I am going to drop the col. 
ratings_df = ratings_df.drop('timestamp', axis=1)

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
(100836, 4)


In [29]:
print(ratings_df.head(1))

   userId  movieId  rating
0       1        1     4.0


In [32]:
# Not we are ready to create the weighted genre matrix. This matrix is created by multiplying the user input ratings with the movies matrix. 
# however, before we can compute the dot product, we will want to simplify the movies_matrix. 
print(ratings_df.head(1))
# and 
print(movies_matrix.head(1))
reduced_movies_matrix = movies_matrix.drop(['genres', 'title'], axis=1)


   userId  movieId  rating
0       1        1     4.0
   movieId             title  \
0        1  Toy Story (1995)   

                                              genres  (no genres listed)  \
0  [Adventure, Animation, Children, Comedy, Fantasy]                   0   

   Action  Adventure  Animation  Children  Comedy  Crime  ...  Film-Noir  \
0       0          1          1         1       1      0  ...          0   

   Horror  IMAX  Musical  Mystery  Romance  Sci-Fi  Thriller  War  Western  
0       0     0        0        0        0       0         0    0        0  

[1 rows x 43 columns]


In [36]:
# we can see from this matrix that movie 1 belongs to some of the 41 different catagories that exist. 
# we can now use this reduced matrix to computer the weighted movie matrix. 
print(reduced_movies_matrix.head(1))

   movieId  (no genres listed)  Action  Adventure  Animation  Children  \
0        1                   0       0          1          1         1   

   Comedy  Crime  Documentary  Drama  ...  Film-Noir  Horror  IMAX  Musical  \
0       1      0            0      0  ...          0       0     0        0   

   Mystery  Romance  Sci-Fi  Thriller  War  Western  
0        0        0       0         0    0        0  

[1 rows x 41 columns]


In [38]:
print(ratings_df[ratings_df["movieId"] == 1])

       userId  movieId  rating
0           1        1     4.0
516         5        1     4.0
874         7        1     4.5
1434       15        1     2.5
1667       17        1     4.5
...       ...      ...     ...
97364     606        1     2.5
98479     607        1     4.0
98666     608        1     2.5
99497     609        1     3.0
99534     610        1     5.0

[215 rows x 3 columns]


In [48]:
# To better understand the model. We are going to do the prediction for one user. You can note that 
# if we look at the movies that user 1 had rated, then we get about 232 movies. We want to take these rated movies 
# and multiply them by the movies matrix which represents the classes.
user_one_matrix_rated = ratings_df[ratings_df["userId"] == 1]
user_one_matrix_rated = user_one_matrix_rated[user_one_matrix_rated["rating"]> 0]

user_one_matrix_rated

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
227,1,3744,4.0
228,1,3793,5.0
229,1,3809,4.0
230,1,4006,4.0


In [51]:
# To do so, we need to only have the movies matrix that user one has rated. So, we need to pull out of the movies_matrix 
# the movies that user one has rated. 
user_one_matrix_rated_moviesId = user_one_matrix_rated['movieId']

# this list represents the rated movies by user 1. 
user_one_matrix_rated_moviesId

0         1
1         3
2         6
3        47
4        50
       ... 
227    3744
228    3793
229    3809
230    4006
231    5060
Name: movieId, Length: 232, dtype: int64

In [55]:
# Reduce movie matrix to only the movies rated by user 1 
user_one_reduced_movies_matrix = reduced_movies_matrix.loc[user_one_matrix_rated_moviesId]
user_one_reduced_movies_matrix

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
6,7,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
47,52,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
50,55,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3744,5214,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3793,5304,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3809,5333,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4006,5663,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [70]:
# now we have two matricies that we can compter the dot product for. Computer the dot product for user 1 
print(user_one_matrix_rated.shape)
print(user_one_reduced_movies_matrix.shape)
user_one_weight_movie_matrix = numpy.dot(user_one_matrix_rated['rating'], user_one_reduced_movies_matrix.drop('movieId', axis=1))

(232, 3)
(232, 41)


In [62]:
# this weighted matrix represents the weighted genre matrix and represents the interests of user 1 based on 
# what theyy have watched. 
user_one_weight_movie_matrix
user_one_weight_movie_matrix.shape

(40,)

In [63]:
# the above matrix represents a 40X1 matrix where the 1 is user1 and the 40 is the users preference per genre 
user_one_weight_movie_matrix

array([  0., 159., 110.,  44.,  78., 284., 116.,  25., 470.,  56.,  14.,
       130.,   0.,  34.,  67., 177.,  73., 211.,  44.,   6.,   0., 159.,
       110.,  44.,  78., 284., 116.,  25., 470.,  56.,  14., 130.,   0.,
        34.,  67., 177.,  73., 211.,  44.,   6.])

In [65]:
# now, we can take our reduced movie matrix, and multiply it by our candidate movie matrix. In this case, this matrix might be a reduced subset 
# of the movies matrix. 
reduced_movies_matrix.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
(no genres listed),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Action,0,0,0,0,0,1,0,0,1,1,...,1,0,0,0,0,1,0,0,1,0
Adventure,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
Animation,1,0,0,0,0,0,0,0,0,0,...,1,1,0,1,0,1,1,0,1,0
Children,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Comedy,1,0,1,1,1,0,1,0,0,0,...,1,0,1,0,0,1,1,0,0,1
Crime,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Documentary,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Drama,0,0,0,1,0,0,0,0,0,0,...,0,1,1,0,0,0,0,1,0,0


In [79]:
temp_reduced_movies_matrix = reduced_movies_matrix.drop('movieId', axis=1).T
user_one_all_movie_ratings = numpy.dot(user_one_weight_movie_matrix, temp_reduced_movies_matrix)

In [89]:
# this array now represents the rating of all 9742 movies for user one. We can now get the top 10 movies reccomendations. 
user_one_all_movie_ratings


AttributeError: 'numpy.ndarray' object has no attribute 'Len'

In [77]:
user_one_top_10_ratings = numpy.sort(user_one_all_movie_ratings)[-10:][::-1] 

In [None]:
print