In [1]:
import pandas as pd

# Load the datasets
movies_df = pd.read_csv('movies_sample_train (2).csv')
users_df = pd.read_csv('users_sample_train (1).csv')

# Display the first few rows of each dataset to understand their structure
movies_df_head = movies_df.head()
users_df_head = users_df.head()

movies_df_shape = movies_df.shape
users_df_shape = users_df.shape

movies_df_head, users_df_head, movies_df_shape, users_df_shape

(   Unnamed: 0     tconst  averageRating  numVotes            directors  \
 0        4726  tt0015384            6.0      1515            nm0250873   
 1        6563  tt0019422            7.5     11821  nm0412650,nm0000370   
 2       10283  tt0024727            5.7      3186            nm0833965   
 3       10350  tt0024816            6.0       504            nm0478441   
 4       10387  tt0024865            6.9      2191            nm0291548   
 
                                    writers  titleType  \
 0                                  Unknown          1   
 1                      nm0000370,nm0412650          1   
 2                                nm0522871          0   
 3            nm0580648,nm0562372,nm0208691          0   
 4  nm0078667,nm0883328,nm0921995,nm0829330          0   
 
                      primaryTitle                   originalTitle  isAdult  \
 0               Diagonal Symphony             Symphonie diagonale        0   
 1                Steamboat Willie      

In [2]:
import ast

# Convert string representations of lists to actual lists in users_df
users_df['movie_ids'] = users_df['movie_ids'].apply(ast.literal_eval)
users_df['user_ratings'] = users_df['user_ratings'].apply(ast.literal_eval)

# For 'user_reviews', extract the tensor values
def parse_tensor(tensor_str):
    # Remove 'tensor([' and '])' and split by comma
    values = tensor_str.replace('tensor([', '').replace('])', '').strip()
    return [float(x) for x in values.split(',') if x.strip()]

users_df['user_reviews'] = users_df['user_reviews'].apply(parse_tensor)

# Check the processed data
users_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,movie_ids,user_ratings,user_reviews
0,142,ur0017155,"[tt0031657, tt0066952]","[6, 7]","[0.162, -0.43063, -0.21882, 0.35712, 0.37678, ..."
1,4831,ur0518971,"[tt0054013, tt0054903]","[7, 2]","[0.013101, -0.23237, 0.55675, 0.31472, 0.36427..."
2,7267,ur0881948,[tt0037929],[10],"[-0.13836, -0.24785, -0.50175, 0.072433, 0.112..."
3,11507,ur120947285,"[tt0024816, tt0024865, tt0026104, tt0026121, t...","[7, 10, 8, 4, 10, 6, 9, 3, 7, 5, 10, 7, 9, 10,...","[-0.48468, -0.3665, -0.19936, 0.077204, 0.0884..."
4,13529,ur1335529,"[tt0048573, tt0066732]","[4, 3]","[0.085681, 0.12978, -0.42136, 0.26619, 0.03643..."


In [3]:
# Function to split user_reviews if possible
def split_reviews(row):
    num_movies = len(row['movie_ids'])
    reviews = row['user_reviews']

    if len(reviews) % num_movies == 0:
        split_size = len(reviews) // num_movies
        return [reviews[i * split_size: (i + 1) * split_size] for i in range(num_movies)]
    else:
        return None  # Indicates problematic row

# Apply the function
users_df['split_reviews'] = users_df.apply(split_reviews, axis=1)

# Identify rows that couldn't be fixed
unfixable_rows = users_df[users_df['split_reviews'].isnull()]

# Remove unfixable rows and update user_reviews with split_reviews
users_df = users_df[users_df['split_reviews'].notnull()]
users_df['user_reviews'] = users_df['split_reviews']

# Drop the temporary column
users_df = users_df.drop(columns=['split_reviews'])

# Check the cleaned data
users_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_df['user_reviews'] = users_df['split_reviews']


Unnamed: 0.1,Unnamed: 0,user_id,movie_ids,user_ratings,user_reviews
0,142,ur0017155,"[tt0031657, tt0066952]","[6, 7]","[[0.162, -0.43063, -0.21882, 0.35712, 0.37678,..."
1,4831,ur0518971,"[tt0054013, tt0054903]","[7, 2]","[[0.013101, -0.23237, 0.55675, 0.31472, 0.3642..."
2,7267,ur0881948,[tt0037929],[10],"[[-0.13836, -0.24785, -0.50175, 0.072433, 0.11..."
3,11507,ur120947285,"[tt0024816, tt0024865, tt0026104, tt0026121, t...","[7, 10, 8, 4, 10, 6, 9, 3, 7, 5, 10, 7, 9, 10,...","[[-0.48468, -0.3665, -0.19936, 0.077204, 0.088..."
4,13529,ur1335529,"[tt0048573, tt0066732]","[4, 3]","[[0.085681, 0.12978, -0.42136, 0.26619, 0.0364..."


In [9]:
# Explode the user dataframe to have one row per movie rating and review
users_exploded = users_df.explode(['movie_ids', 'user_ratings', 'user_reviews'])

# Merge with the movies dataset on movie_ids (tconst in movies_df)
merged_df = pd.merge(users_exploded, movies_df, left_on='movie_ids', right_on='tconst', how='inner')



In [10]:
import numpy as np
# Function to pad tensors to length 768
def pad_tensor(tensor, target_length=768):
    if len(tensor) < target_length:
        return np.pad(tensor, (0, target_length - len(tensor)), 'constant')
    return tensor[:target_length]  # Truncate if longer than 768

from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Combine genre columns into a list for one-hot encoding
merged_df['genres'] = merged_df[['genre1', 'genre2', 'genre3']].values.tolist()
merged_df['genres'] = merged_df['genres'].apply(lambda x: [g for g in x if pd.notnull(g)])

# One-hot encode genres
mlb_genres = MultiLabelBinarizer()
genre_encoded = mlb_genres.fit_transform(merged_df['genres'])

# Process directors and writers (split by comma and one-hot encode)
merged_df['directors_list'] = merged_df['directors'].apply(lambda x: x.split(',') if pd.notnull(x) else [])
merged_df['writers_list'] = merged_df['writers'].apply(lambda x: x.split(',') if pd.notnull(x) else [])

mlb_directors = MultiLabelBinarizer()
mlb_writers = MultiLabelBinarizer()

directors_encoded = mlb_directors.fit_transform(merged_df['directors_list'])
writers_encoded = mlb_writers.fit_transform(merged_df['writers_list'])
# Apply padding
merged_df['user_reviews_padded'] = merged_df['user_reviews'].apply(pad_tensor)

# Concatenate all features
X = np.concatenate([
    np.vstack(merged_df['user_reviews_padded'].values),  # Padded user review embeddings
    genre_encoded,
    directors_encoded,
    writers_encoded
], axis=1)

# Target variable
y = merged_df['user_ratings'].astype(float).values

# Check the shape of the feature matrix and target vector
X.shape, y.shape

((33, 966), (33,))

In [13]:
merged_df.head()

Unnamed: 0,Unnamed: 0_x,user_id,movie_ids,user_ratings,user_reviews,Unnamed: 0_y,tconst,averageRating,numVotes,directors,...,startYear,endYear,runtimeMinutes,genre1,genre2,genre3,genres,directors_list,writers_list,user_reviews_padded
0,142,ur0017155,tt0031657,6,"[0.162, -0.43063, -0.21882, 0.35712, 0.37678, ...",15794,tt0031657,6.2,862,nm0115218,...,1939,,71.0,Crime,Mystery,,"[Crime, Mystery]",[nm0115218],"[nm0750822, nm0663763, nm0335455, nm0712692]","[0.162, -0.43063, -0.21882, 0.35712, 0.37678, ..."
1,142,ur0017155,tt0066952,7,"[-0.13202, 0.54789, -0.3016, 0.22824, -0.27197...",46107,tt0066952,5.7,3184,nm0446059,...,1970,,93.0,Drama,Fantasy,Horror,"[Drama, Fantasy, Horror]",[nm0446059],[nm0446059],"[-0.13202, 0.54789, -0.3016, 0.22824, -0.27197..."
2,4831,ur0518971,tt0054013,7,"[0.013101, -0.23237, 0.55675, 0.31472, 0.36427...",34727,tt0054013,6.3,707,"nm0883213,nm0523893",...,1961,,100.0,Adventure,Family,Fantasy,"[Adventure, Family, Fantasy]","[nm0883213, nm0523893]","[nm0292295, nm0762822, nm0883213, nm0001196]","[0.013101, -0.23237, 0.55675, 0.31472, 0.36427..."
3,4831,ur0518971,tt0054903,2,"[-0.12702, 0.49662, -0.30249, 0.56733, 0.03592...",35518,tt0054903,4.6,250,nm0040789,...,1962,,97.0,Adventure,,,[Adventure],[nm0040789],"[nm1597650, nm0220383, nm0366361, nm1290772, n...","[-0.12702, 0.49662, -0.30249, 0.56733, 0.03592..."
4,7267,ur0881948,tt0037929,10,"[-0.13836, -0.24785, -0.50175, 0.072433, 0.112...",21091,tt0037929,7.8,1788,"nm0053484,nm0360253",...,1945,,8.0,Animation,Comedy,Family,"[Animation, Comedy, Family]","[nm0053484, nm0360253]",[nm0542810],"[-0.13836, -0.24785, -0.50175, 0.072433, 0.112..."


In [14]:
merged_df.columns

Index(['Unnamed: 0_x', 'user_id', 'movie_ids', 'user_ratings', 'user_reviews',
       'Unnamed: 0_y', 'tconst', 'averageRating', 'numVotes', 'directors',
       'writers', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genre1', 'genre2', 'genre3',
       'genres', 'directors_list', 'writers_list', 'user_reviews_padded'],
      dtype='object')

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
# Build the neural network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, precision_score, recall_score, accuracy_score,r2_score

model = Sequential([
    Dense(512, activation='relu', input_shape=(X.shape[1],)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(1)  # Output layer
])

# Compile and train
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

history = model.fit(X_train, y_train, epochs=100, batch_size=4, validation_data=(X_test, y_test))

# Evaluate
# Evaluate
loss, mae = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

# RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Precision@K, Recall@K, Accuracy@K
K = 5
sorted_indices = np.argsort(y_pred.flatten())[::-1][:K]

y_true_top_k = (y_test[sorted_indices] >= 4).astype(int)  # Assuming rating >=4 is positive
y_pred_top_k = (y_pred.flatten()[sorted_indices] >= 4).astype(int)

precision_at_k = precision_score(y_true_top_k, y_pred_top_k, zero_division=0)
recall_at_k = recall_score(y_true_top_k, y_pred_top_k, zero_division=0)
accuracy_at_k = accuracy_score(y_true_top_k, y_pred_top_k)

print(f'Loss: {loss}, MAE: {mae}, R2: {r2}, rmse: {rmse}')
print(f'Precision@{K}: {precision_at_k}, Recall@{K}: {recall_at_k}, Accuracy@{K}: {accuracy_at_k}')



Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 158ms/step - loss: 38.8630 - mae: 5.6946 - val_loss: 37.4443 - val_mae: 5.4649
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 17.4201 - mae: 3.5958 - val_loss: 25.6735 - val_mae: 4.4713
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - loss: 12.6321 - mae: 2.8322 - val_loss: 21.0010 - val_mae: 4.1018
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 3.1067 - mae: 1.4420 - val_loss: 14.3149 - val_mae: 3.2071
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - loss: 2.7050 - mae: 1.3024 - val_loss: 15.4197 - val_mae: 3.4403
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 2.3071 - mae: 1.1480 - val_loss: 16.8575 - val_mae: 3.6213
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 1.8496 - m

In [23]:
# Recommendation Function
def recommend_movies(fav_genres, fav_directors, fav_writers, top_n=5):
    genre_vector = mlb_genres.transform([fav_genres])
    director_vector = mlb_directors.transform([fav_directors])
    writer_vector = mlb_writers.transform([fav_writers])

    recommendations = []
    for index, row in merged_df.iterrows():
        features = np.concatenate([
            row['user_reviews_padded'],
            genre_vector[0],
            director_vector[0],
            writer_vector[0]
        ]).reshape(1, -1)
        predicted_rating = model.predict(features)[0][0]
        recommendations.append((row['primaryTitle'], predicted_rating))

    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    return recommendations[:top_n]

In [39]:
# Example usage
fav_genres = ['Comedy', 'Drama', 'Action']
fav_directors = ['nm0412650', 'nm0000370']
fav_writers = ['nm0522871', 'nm0250873']

recommendations = recommend_movies(fav_genres, fav_directors, fav_writers)
for movie, score in recommendations:
    print(f"{movie}: {score}")
ratings = [rating for _, rating in recommendations]
print(np.mean(ratings))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40

In [25]:
def recommend_users(movie_genres, movie_directors, movie_writers, top_n=5):
    genre_vector = mlb_genres.transform([movie_genres])
    director_vector = mlb_directors.transform([movie_directors])
    writer_vector = mlb_writers.transform([movie_writers])

    recommendations = []
    for index, row in merged_df.iterrows():
        features = np.concatenate([
            row['user_reviews_padded'],
            genre_vector[0],
            director_vector[0],
            writer_vector[0]
        ]).reshape(1, -1)
        predicted_rating = model.predict(features)[0][0]
        recommendations.append((row['user_id'], predicted_rating))

    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    return recommendations[:top_n]

In [26]:
# Example usage for user recommendations
movie_genres = ['Action', 'Thriller']
movie_directors = ['nm0883213']
movie_writers = ['nm0522871']

user_recommendations = recommend_users(movie_genres, movie_directors, movie_writers)
for user, score in user_recommendations:
    print(f"User {user}: {score}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [36]:
merged_df[merged_df['user_id'] == 'ur0881948']

Unnamed: 0,Unnamed: 0_x,user_id,movie_ids,user_ratings,user_reviews,Unnamed: 0_y,tconst,averageRating,numVotes,directors,...,startYear,endYear,runtimeMinutes,genre1,genre2,genre3,genres,directors_list,writers_list,user_reviews_padded
4,7267,ur0881948,tt0037929,10,"[-0.13836, -0.24785, -0.50175, 0.072433, 0.112...",21091,tt0037929,7.8,1788,"nm0053484,nm0360253",...,1945,,8.0,Animation,Comedy,Family,"[Animation, Comedy, Family]","[nm0053484, nm0360253]",[nm0542810],"[-0.13836, -0.24785, -0.50175, 0.072433, 0.112..."
