In [1]:
import numpy as np
import pandas as pd
import os, time, re
import pickle, gzip, datetime

from datetime import datetime

now = datetime.now()

from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, mean_squared_error
# import lightgbm as lgb


import tensorflow as tf
import keras

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Activation, Dense, Dropout
from tensorflow.keras.layers import BatchNormalization, Input, Lambda
from tensorflow.keras.layers import Embedding, Flatten, dot
from tensorflow.keras import regularizers
from tensorflow.keras.losses import mse, binary_crossentropy



In [2]:
ratingDF = pd.read_csv('files/csv/ratings.csv')

In [3]:
ratingDF.userId = ratingDF.userId.astype(str).astype(int)
ratingDF.movieId = ratingDF.movieId.astype(str).astype(int)
ratingDF.rating = ratingDF.rating.astype(str).astype(float)
ratingDF.timestamp = ratingDF.timestamp.apply(lambda x: now.strftime("%m/%d/%Y, %H:%M:%S"))

In [4]:
n_users = ratingDF.userId.unique().shape[0]
n_movies = ratingDF.movieId.unique().shape[0]
n_ratings = len(ratingDF)
avg_ratings_per_user = n_ratings/n_users


print('Number of unique users: ', n_users)
print('Number of unique movies: ', n_movies)
print('Number of total ratings: ', n_ratings)
print('Average number of ratings per user: ', avg_ratings_per_user)

Number of unique users:  138493
Number of unique movies:  26744
Number of total ratings:  20000263
Average number of ratings per user:  144.4135299257002


In [5]:
movieIndex = ratingDF.groupby("movieId").count().sort_values(by= \
"rating",ascending=False)[0:1000].index
ratingDFX2 = ratingDF[ratingDF.movieId.isin(movieIndex)]
ratingDFX2.count()

userId       12840344
movieId      12840344
rating       12840344
timestamp    12840344
dtype: int64

In [6]:
ratingDFX2.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,"06/15/2024, 12:42:07"
1,1,29,3.5,"06/15/2024, 12:42:07"
2,1,32,3.5,"06/15/2024, 12:42:07"
3,1,47,3.5,"06/15/2024, 12:42:07"
4,1,50,3.5,"06/15/2024, 12:42:07"


In [7]:
userIndex = ratingDFX2.groupby("userId").count().sort_values(by= \
"rating",ascending=False).sample(n=1000, random_state=2018).index
ratingDFX3 = ratingDFX2[ratingDFX2.userId.isin(userIndex)]
ratingDFX3.count()

userId       90213
movieId      90213
rating       90213
timestamp    90213
dtype: int64

In [8]:
ratingDFX3.head()

Unnamed: 0,userId,movieId,rating,timestamp
4943,49,50,5.0,"06/15/2024, 12:42:07"
4944,49,163,3.5,"06/15/2024, 12:42:07"
4945,49,216,3.0,"06/15/2024, 12:42:07"
4946,49,296,5.0,"06/15/2024, 12:42:07"
4947,49,333,3.0,"06/15/2024, 12:42:07"


In [9]:
movies = ratingDFX3.movieId.unique()
moviesDF = pd.DataFrame(data=movies,columns=['originalMovieId'])
moviesDF['newMovieId'] = moviesDF.index+1

In [10]:
moviesDF.head()

Unnamed: 0,originalMovieId,newMovieId
0,50,1
1,163,2
2,216,3
3,296,4
4,333,5


In [11]:
users = ratingDFX3.userId.unique()
usersDF = pd.DataFrame(data=users,columns=['originalUserId'])
usersDF['newUserId'] = usersDF.index+1

In [12]:
usersDF.head()

Unnamed: 0,originalUserId,newUserId
0,49,1
1,260,2
2,311,3
3,319,4
4,499,5


In [13]:
ratingDFX3 = ratingDFX3.merge(moviesDF,left_on='movieId', \
right_on='originalMovieId')
ratingDFX3.drop(labels='originalMovieId', axis=1, inplace=True)
ratingDFX3 = ratingDFX3.merge(usersDF,left_on='userId', \
right_on='originalUserId')
ratingDFX3.drop(labels='originalUserId', axis=1, inplace=True)

In [14]:
ratingDFX3.head()

Unnamed: 0,userId,movieId,rating,timestamp,newMovieId,newUserId
0,49,50,5.0,"06/15/2024, 12:42:07",1,1
1,49,163,3.5,"06/15/2024, 12:42:07",2,1
2,49,216,3.0,"06/15/2024, 12:42:07",3,1
3,49,296,5.0,"06/15/2024, 12:42:07",4,1
4,49,333,3.0,"06/15/2024, 12:42:07",5,1


In [15]:
X_train, X_test = train_test_split(ratingDFX3.head(100),
test_size=0.10, shuffle=True, random_state=2018)
X_validation, X_test = train_test_split(X_test,
test_size=0.50, shuffle=True, random_state=2018)

In [16]:
X_train

Unnamed: 0,userId,movieId,rating,timestamp,newMovieId,newUserId
78,260,68954,4.5,"06/15/2024, 12:42:07",73,2
37,260,296,3.0,"06/15/2024, 12:42:07",4,2
5,49,475,4.5,"06/15/2024, 12:42:07",6,1
62,260,2329,4.0,"06/15/2024, 12:42:07",59,2
73,260,48774,5.0,"06/15/2024, 12:42:07",68,2
...,...,...,...,...,...,...
20,49,2541,3.5,"06/15/2024, 12:42:07",21,1
28,49,79132,4.5,"06/15/2024, 12:42:07",29,1
21,49,2571,4.0,"06/15/2024, 12:42:07",22,1
9,49,785,3.5,"06/15/2024, 12:42:07",10,1


In [17]:
print('Shape of train set:', X_train.shape)
print('Shape of validation set:',X_validation.shape)
print('Shape of test set: ',X_test.shape)

Shape of train set: (90, 6)
Shape of validation set: (5, 6)
Shape of test set:  (5, 6)


In [18]:
# Generate ratings matrix for train

#for now
n_users = 100
n_movies = 100

ratings_train = np.zeros((n_users, n_movies))
for row in X_train.itertuples():
    ratings_train[row[6]-1, row[5]-1] = row[3]

In [19]:
ratings_train

array([[5. , 3.5, 0. , ..., 0. , 0. , 0. ],
       [4. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 3. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [20]:
ratings_train.shape

(100, 100)

In [21]:
sparsity = float(len(ratings_train.nonzero()[0]))
sparsity /= (ratings_train.shape[0] * ratings_train.shape[1])
sparsity *= 100
print('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 0.90%


In [22]:
# Generate ratings matrix for validation
ratings_validation = np.zeros((n_users, n_movies))
for row in X_validation.itertuples():
    ratings_validation[row[6]-1, row[5]-1] = row[3]

In [23]:
actual_validation = ratings_validation[ratings_validation.nonzero()].flatten()

In [24]:
actual_validation

array([4., 4., 5., 3., 1.])

In [25]:
pred_validation = np.zeros((len(X_validation),1))
pred_validation[pred_validation==0] = 3.5
pred_validation

array([[3.5],
       [3.5],
       [3.5],
       [3.5],
       [3.5]])

In [26]:
print("Mean  Squared Error is : ", mean_squared_error(pred_validation, actual_validation))

Mean  Squared Error is :  1.85


In [27]:
ratings_validation_prediction = np.zeros((n_users, n_movies))
i = 0
for row in ratings_train:
    ratings_validation_prediction[i][ratings_validation_prediction[i]==0] = np.mean(row[row>0])
    i += 1

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [28]:
pred_validation = ratings_validation_prediction  [ratings_validation.nonzero()].flatten()
user_average = mean_squared_error(pred_validation, actual_validation)
print('Mean squared error using user average:', user_average)

Mean squared error using user average: 1.72925646902548


In [29]:
ratings_validation_prediction = np.zeros((n_users, n_movies)).T
i = 0
for row in ratings_train.T:
    ratings_validation_prediction[i][ratings_validation_prediction[i]==0] = np.mean(row[row>0])
    i += 1

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [30]:
ratings_validation_prediction = ratings_validation_prediction.T
pred_validation = ratings_validation_prediction[ratings_validation.nonzero()].flatten()
movie_average = mean_squared_error(pred_validation, actual_validation)
print('Mean squared error using movie average:', movie_average)

ValueError: Input contains NaN.

In [32]:
n_latent_factors = 1
user_input = Input(shape=[1], name='user')
user_embedding = Embedding(input_dim=n_users + 1, output_dim=n_latent_factors,
name='user_embedding')(user_input)
user_vec = Flatten(name='flatten_users')(user_embedding)
movie_input = Input(shape=[1], name='movie')
movie_embedding = Embedding(input_dim=n_movies + 1,
output_dim=n_latent_factors,
name='movie_embedding')(movie_input)
movie_vec = Flatten(name='flatten_movies')(movie_embedding)
product = dot([movie_vec, user_vec], axes=1)
model = Model(inputs=[user_input, movie_input], outputs=product)
model.compile('adam', 'mean_squared_error')

In [33]:
history = model.fit(x=[X_train.newUserId, X_train.newMovieId],
                    y=X_train.rating, epochs=1,
                    validation_data=([X_validation.newUserId,
                                      X_validation.newMovieId], X_validation.rating),
                    verbose=-1)

In [None]:
pd.Series(history.history['val_loss'][10:]).plot(logy=False)
plt.xlabel("Epoch")
plt.ylabel("Validation Error")
print('Minimum MSE: ', min(history.history['val_loss']))