In [2]:
pip install --upgrade scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
      Successfully uninstalled scikit-learn-1.6.0
Successfully installed scikit-learn-1.6.1


In [6]:
import numpy as np
import pandas as pd
from keras.layers import Embedding, Dot, Reshape, Dense, Input
from keras.models import Model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
from sklearn.model_selection import train_test_split

movies = pd.read_csv('movies.dat', sep='::', names=['movie_id', 'title', 'genres'], engine='python', encoding='ISO-8859-1')
ratings = pd.read_csv('ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python', encoding='ISO-8859-1')
users = pd.read_csv('users.dat', sep='::', names=['user_id', 'gender', 'age', 'occupation', 'zip'], engine='python', encoding='ISO-8859-1')

movie_mapping = {movie_id: index for index, movie_id in enumerate(movies['movie_id'])}
user_mapping = {user_id: index for index, user_id in enumerate(users['user_id'])}

ratings['user_index'] = ratings['user_id'].map(user_mapping)
ratings['movie_index'] = ratings['movie_id'].map(movie_mapping)

if ratings['user_index'].isna().any() or ratings['movie_index'].isna().any():
    print("Error: Some user_ids or movie_ids in the ratings data do not exist in the user or movie datasets.")
else:
    print("Mappings are consistent.")

embedding_size = 8
user_input = Input(shape=(1,), name='user_input')
movie_input = Input(shape=(1,), name='movie_input')

user_embedding = Embedding(input_dim=len(user_mapping), output_dim=embedding_size, input_length=1, name='user_embedding')(user_input)
movie_embedding = Embedding(input_dim=len(movie_mapping), output_dim=embedding_size, input_length=1, name='movie_embedding')(movie_input)

user_vector = Reshape((embedding_size,))(user_embedding)
movie_vector = Reshape((embedding_size,))(movie_embedding)

dot_product = Dot(axes=1)([user_vector, movie_vector])

output = Dense(1, activation='sigmoid', name='output')(dot_product)

model = Model(inputs=[user_input, movie_input], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy')

user_indices = ratings['user_index'].values
movie_indices = ratings['movie_index'].values
ratings_labels = (ratings['rating'] >= 4).astype(int).values

model.fit(
    x=[user_indices, movie_indices],
    y=ratings_labels,
    epochs=10,
    batch_size=256,
    verbose=1
)

train, test = train_test_split(ratings, test_size=0.2, random_state=42)

test_user_indices = test['user_index'].values
test_movie_indices = test['movie_index'].values
test_labels = test['rating'].values

predictions = model.predict([test_user_indices, test_movie_indices]).flatten()

# Convert predictions to binary (0 or 1) labels for classification metrics
predicted_labels = (predictions >= 0.5).astype(int)

accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(test_labels, predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(test_labels, predicted_labels, average='weighted', zero_division=0)
rmse = np.sqrt(mean_squared_error(test_labels, predictions))

print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")





Mappings are consistent.
Epoch 1/10




[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.6578
Epoch 2/10
[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.5248
Epoch 3/10
[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.5082
Epoch 4/10
[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.4936
Epoch 5/10
[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.4826
Epoch 6/10
[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 0.4737
Epoch 7/10
[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.4683
Epoch 8/10
[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.4647
Epoch 9/10
[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.4614
Epoch 10/10
[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1m