In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

In [16]:
ratings_data = pd.read_csv('./podaci/rating.csv', parse_dates=['timestamp'])

In [17]:
filtered_ratings_df = ratings_data[ratings_data['rating'] > 3]
print(filtered_ratings_df.shape)

(12195566, 4)


In [18]:
rand_userIds = np.random.choice(filtered_ratings_df['userId'].unique(), 
                                size=int(len(filtered_ratings_df['userId'].unique())*0.3), 
                                replace=False)

ratings = filtered_ratings_df.loc[filtered_ratings_df['userId'].isin(rand_userIds)]

print('Broj recenzija za treniranje: {} kreiranih od strane {} korisnika'.format(len(ratings), len(rand_userIds)))
ratings.sample(5)

Broj recenzija za treniranje: 3678382 kreiranih od strane 41508 korisnika


Unnamed: 0,userId,movieId,rating,timestamp
2065358,13976,1333,4.0,2001-05-14 14:29:50
5504575,37847,81932,5.0,2011-02-23 17:25:49
16664415,115277,434,4.0,1996-05-08 22:17:00
19884412,137677,2826,4.0,2001-07-28 02:38:29
16757000,115918,520,4.0,2010-05-15 18:07:48


In [209]:
# user_review_counts = filtered_ratings_df['userId'].value_counts()
# movie_review_counts = filtered_ratings_df['movieId'].value_counts()

# selected_users = user_review_counts[user_review_counts > 70].index
# selected_movies = movie_review_counts[movie_review_counts > 200].index

# filtered_ratings_df = filtered_ratings_df[(filtered_ratings_df['userId'].isin(selected_users)) & (filtered_ratings_df['movieId'].isin(selected_movies))]

In [19]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
ratings.loc[:, 'userId'] = user_encoder.fit_transform(ratings['userId'])
ratings.loc[:, 'movieId'] = movie_encoder.fit_transform(ratings['movieId'])
print(ratings.shape)
print(ratings.head())

(3678382, 4)
     userId  movieId  rating           timestamp
236       0        0     4.0 1999-12-11 13:36:47
238       0       31     4.0 1999-12-11 13:14:07
239       0       49     5.0 1999-12-11 13:13:38
242       0      173     5.0 1999-12-11 13:32:13
244       0      220     5.0 1999-12-11 13:20:44


In [20]:
#filtered_ratings = ratings.dropna(inplace=True)
#filtered_ratings = ratings.reset_index(drop=True)
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

In [21]:
user_input = tf.keras.layers.Input(shape=(1,))
user_embedding = tf.keras.layers.Embedding(input_dim=len(user_encoder.classes_), output_dim=50, input_length=1)(user_input)
user_flatten = tf.keras.layers.Flatten()(user_embedding)

movie_input = tf.keras.layers.Input(shape=(1,))
movie_embedding = tf.keras.layers.Embedding(input_dim=len(movie_encoder.classes_), output_dim=50, input_length=1)(movie_input)
movie_flatten = tf.keras.layers.Flatten()(movie_embedding)

concatenated = tf.keras.layers.Concatenate()([user_flatten, movie_flatten])

dense_layer = tf.keras.layers.Dense(128, activation='relu')(concatenated)
output_layer = tf.keras.layers.Dense(1, activation='linear')(dense_layer)

In [22]:
model = tf.keras.Model(inputs=[user_input, movie_input], outputs=output_layer)

In [23]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16066817329620481170
xla_global_id: -1
]


In [215]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.summary()

Model: "model_16"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_33 (InputLayer)       [(None, 1)]                  0         []                            
                                                                                                  
 input_34 (InputLayer)       [(None, 1)]                  0         []                            
                                                                                                  
 embedding_34 (Embedding)    (None, 1, 50)                2075400   ['input_33[0][0]']            
                                                                                                  
 embedding_35 (Embedding)    (None, 1, 50)                938300    ['input_34[0][0]']            
                                                                                           

In [24]:
train_user_data = train_data['userId'].values
train_movie_data = train_data['movieId'].values
train_ratings_data = train_data['rating'].values

test_user_data = test_data['userId'].values
test_movie_data = test_data['movieId'].values
test_ratings_data = test_data['rating'].values

In [217]:
model.fit([train_user_data, train_movie_data], train_ratings_data, epochs=10, batch_size=64, validation_data=([test_user_data, test_movie_data], test_ratings_data))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x21c56cca4a0>

In [219]:
model.save('./modeli/model_1.h5')
model.save('./modeli/model_1.keras')

  saving_api.save_model(


In [25]:
loaded_model = tf.keras.models.load_model('./modeli/model_1.keras')

In [26]:
predictions = model.predict([test_data['userId'], test_data['movieId']])



In [27]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(test_data['rating'], predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 18.224523585134335
