In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import regularizers

In [2]:
ratings = pd.read_csv('/Users/arun/Downloads/ml-latest-small/ratings.csv')
movies = pd.read_csv('/Users/arun/Downloads/ml-latest-small/movies.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)').astype(float)

In [6]:
movies['genres'] = movies['genres'].str.split('|')

In [7]:
genres_list = sorted(set(genre for sublist in movies['genres'] for genre in sublist))
for genre in genres_list:
    movies[genre] = movies['genres'].apply(lambda x: 1 if genre in x else 0)

In [8]:
movie_features = ratings.groupby('movieId').agg({'rating': ['mean', 'count']}).reset_index()
movie_features.columns = ['movieId', 'avg_rating', 'num_ratings']

In [9]:
movie_features = movie_features.merge(movies[['movieId', 'year'] + genres_list], on='movieId', how='left')

In [10]:
movie_features.fillna(0, inplace=True)

In [11]:
user_features = ratings.merge(movies[['movieId'] + genres_list], on='movieId')

In [12]:
user_features = user_features.groupby(['userId'] + genres_list).agg({'rating': 'mean'}).reset_index()

In [13]:
user_features = user_features.pivot(index='userId', columns=genres_list, values='rating').fillna(0).reset_index()

In [14]:
user_features.fillna(0, inplace=True)

In [15]:
user_features['avg_user_rating'] = ratings.groupby('userId')['rating'].mean().fillna(0).values

In [16]:
user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()

In [17]:
unique_user_ids = {uid: idx for idx, uid in enumerate(user_ids)}
unique_movie_ids = {mid: idx for idx, mid in enumerate(movie_ids)}

In [18]:
scaler_user = MinMaxScaler()
user_features_scaled = scaler_user.fit_transform(user_features.drop(columns=['userId']))

  user_features_scaled = scaler_user.fit_transform(user_features.drop(columns=['userId']))


In [19]:
scaler_movie = MinMaxScaler()
movie_features_scaled = scaler_movie.fit_transform(movie_features.drop(columns=['movieId']))

In [20]:
X_user = np.array([user_features_scaled[unique_user_ids[uid]] for uid in ratings['userId']])
X_movie = np.array([movie_features_scaled[unique_movie_ids[mid]] for mid in ratings['movieId']])
y = ratings['rating'].values

In [21]:
y = np.clip(y, 0, 5)

In [22]:
user_id_train = np.array([unique_user_ids[uid] for uid in ratings['userId']])
movie_id_train = np.array([unique_movie_ids[mid] for mid in ratings['movieId']])

In [23]:
X_user_train, X_user_test, X_movie_train, X_movie_test, y_train, y_test, user_id_train, user_id_test, movie_id_train, movie_id_test = train_test_split(
    X_user, X_movie, y, user_id_train, movie_id_train, test_size=0.2, random_state=42)

In [24]:
user_input = layers.Input(shape=(X_user_train.shape[1],))
movie_input = layers.Input(shape=(X_movie_train.shape[1],))
user_id_input = layers.Input(shape=(1,))
movie_id_input = layers.Input(shape=(1,))

In [25]:
user_embedding = layers.Embedding(input_dim=len(unique_user_ids), output_dim=16)(user_id_input)
movie_embedding = layers.Embedding(input_dim=len(unique_movie_ids), output_dim=16)(movie_id_input)
user_embedding = layers.Flatten()(user_embedding)
movie_embedding = layers.Flatten()(movie_embedding)

In [26]:
user_dense = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001))(user_input)
user_dense = layers.BatchNormalization()(user_dense)
user_dense = layers.Dropout(0.2)(user_dense)
user_dense = layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.001))(user_dense)

In [27]:
movie_dense = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001))(movie_input)
movie_dense = layers.BatchNormalization()(movie_dense)
movie_dense = layers.Dropout(0.2)(movie_dense)
movie_dense = layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.001))(movie_dense)

In [28]:
dot_product = layers.Dot(axes=1)([user_dense, movie_dense])

In [29]:
model = keras.Model(inputs=[user_input, movie_input, user_id_input, movie_id_input], outputs=dot_product)

In [30]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error')

In [31]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.00001)

In [32]:
history = model.fit(
    [X_user_train, X_movie_train, user_id_train, movie_id_train],
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=([X_user_test, X_movie_test, user_id_test, movie_id_test], y_test),
    callbacks=[early_stopping, reduce_lr]
)

Epoch 1/20
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 9.6761 - val_loss: 3.1614 - learning_rate: 1.0000e-04
Epoch 2/20
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 971us/step - loss: 2.9327 - val_loss: 2.4907 - learning_rate: 1.0000e-04
Epoch 3/20
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 2.3985 - val_loss: 2.0814 - learning_rate: 1.0000e-04
Epoch 4/20
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 928us/step - loss: 2.0431 - val_loss: 1.8040 - learning_rate: 1.0000e-04
Epoch 5/20
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 980us/step - loss: 1.7834 - val_loss: 1.6847 - learning_rate: 1.0000e-04
Epoch 6/20
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 902us/step - loss: 1.5968 - val_loss: 1.4873 - learning_rate: 1.0000e-04
Epoch 7/20
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 870us/step - 

In [33]:
def get_top_n_recommendations(user_id, n=10):
    if user_id not in unique_user_ids:
        raise ValueError("User ID is not valid.")
    
    user_index = unique_user_ids[user_id]
    movie_indices = np.arange(len(movie_ids))

    user_data = np.tile(user_features_scaled[user_index], (len(movie_indices), 1))
    movie_data = movie_features_scaled

    predicted_ratings = model.predict([user_data, movie_data, np.full(len(movie_ids), user_index), movie_indices])

    predicted_ratings = np.clip(predicted_ratings, 0, 5)

    predicted_ratings = np.round(predicted_ratings * 2) / 2

    predicted_ratings = [(movies['title'].iloc[i], predicted_ratings[i][0]) for i in range(len(movie_ids))]

    top_n_movies = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:n]
    
    for title, rating in top_n_movies:
        print(f"{title}: {rating:.1f}")

In [34]:
get_top_n_recommendations(user_id=2, n=10)

[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 594us/step
Melinda and Melinda (2004): 5.0
Slumdog Millionaire (2008): 5.0
Highlander: The Search for Vengeance (2007): 5.0
Dumb and Dumber To (2014): 5.0
City of Lost Children, The (Cité des enfants perdus, La) (1995): 4.5
Mad Dog and Glory (1993): 4.5
District 9 (2009): 4.5
Prime Suspect: The Lost Child (1995): 4.5
Freezer (2014): 4.5
Grumpier Old Men (1995): 4.0
