In [None]:
# Dataset locations.
# NOTE(review): the "drive/MyDrive" prefix presumably requires Google Drive to be
# mounted before these paths resolve — confirm; no mount call is made here.

# Users dataset. Alternative source kept for reference:
#   "drive/MyDrive/MSUoA/users_data_bert_embeddings.csv"
training_user_data_loc = "drive/MyDrive/MSUoA/training_users_data.csv"

# Movies metadata dataset.
training_movies_data_loc = "drive/MyDrive/MSUoA/sample_movies.csv"

In [None]:
# declarations
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import ast
import os
import tensorflow as tf

from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, precision_score, recall_score, accuracy_score,r2_score

In [None]:
# Load the users table; the first CSV column becomes the index.
user_df = pd.read_csv(training_user_data_loc, index_col=0)

# Movie columns that are discarded right after loading.
unused_movie_columns = [
    'Unnamed: 0.1', 'Unnamed: 0', 'titleType', 'originalTitle',
    'isAdult', 'startYear', 'endYear', 'runtimeMinutes',
]

# Load the movies table, discard the unused columns and expose the
# 'averageRating' column under the name 'imdb_rating'.
movies_df = (
    pd.read_csv(training_movies_data_loc, index_col=0)
    .drop(columns=unused_movie_columns)
    .rename(columns={'averageRating': 'imdb_rating'})
)

In [None]:
# Treat the '\\N' placeholder and the 'Short' genre label as missing values.
# The cleaned Series is assigned back to its column: calling
# `movies_df[col].replace(..., inplace=True)` mutates an intermediate object,
# emits a FutureWarning, and will have no effect on `movies_df` from pandas 3.0 on.
for genre_col in ('genre1', 'genre2', 'genre3'):
    movies_df[genre_col] = (
        movies_df[genre_col].replace({'\\N': None}).replace({'Short': None})
    )
for people_col in ('writers', 'directors'):
    movies_df[people_col] = movies_df[people_col].replace({'\\N': None})

# Ratings and movie ids are stored as stringified Python literals; parse them.
# Non-string rating cells are passed through unchanged.
user_df['user_ratings'] = user_df['user_ratings'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
user_df['movie_ids'] = user_df['movie_ids'].apply(ast.literal_eval)

# Min-max scale the ratings over ALL users at once (one global min/max),
# by flattening every per-user list into a single column vector.
scaler = MinMaxScaler()
all_ratings = [rating for sublist in user_df['user_ratings'] for rating in sublist]
scaled_ratings = scaler.fit_transform(np.array(all_ratings).reshape(-1, 1)).flatten()

# Write the scaled values back, slicing the flat array by each row's list length.
# The index is reset first so that positional labels 0..n-1 are valid for `.at`.
user_df = user_df.reset_index(drop=True)
index = 0
for i, length in enumerate(user_df['user_ratings'].map(len)):
    user_df.at[i, 'user_ratings'] = scaled_ratings[index:index + length].tolist()
    index += length

# NOTE: the same `scaler` object is re-fitted on each call below, so after this
# cell it only holds the parameters of the last fit ('numVotes').
movies_df['imdb_rating'] = scaler.fit_transform(movies_df[['imdb_rating']])
movies_df['numVotes'] = scaler.fit_transform(movies_df[['numVotes']])


def parse_tensor(tensor_str):
    """Turn a stringified tensor such as 'tensor([0.1, 0.2])' into a list of floats.

    Every 'tensor([' and '])' marker is removed from the text, the remainder is
    split on commas, and blank pieces are skipped.
    """
    stripped = tensor_str.replace('tensor([', '').replace('])', '').strip()
    numbers = []
    for token in stripped.split(','):
        if token.strip():
            numbers.append(float(token))
    return numbers

# Parse every stringified review tensor into a flat list of floats.
user_df['user_reviews'] = user_df['user_reviews'].map(parse_tensor)


def split_reviews(row):
    """Split a row's flat review vector into one equally sized chunk per movie.

    Parameters
    ----------
    row : mapping-like
        Must provide 'movie_ids' (a sized sequence) and 'user_reviews'
        (a sliceable sequence), e.g. a DataFrame row.

    Returns
    -------
    list of sequences or None
        ``len(row['movie_ids'])`` consecutive slices of ``row['user_reviews']``,
        or ``None`` when the reviews cannot be divided evenly — including the
        case of a row with no movies at all.
    """
    num_movies = len(row['movie_ids'])
    reviews = row['user_reviews']

    # A row without movies cannot be split; checking this first also avoids the
    # ZeroDivisionError that the modulo below would raise for num_movies == 0.
    if num_movies == 0 or len(reviews) % num_movies != 0:
        return None  # Indicates problematic row

    split_size = len(reviews) // num_movies
    return [reviews[i * split_size: (i + 1) * split_size] for i in range(num_movies)]

# Split every user's flat review vector into one chunk per movie.
user_df['split_reviews'] = user_df.apply(split_reviews, axis=1)

# Rows for which split_reviews returned None (kept for inspection).
unfixable_rows = user_df[user_df['split_reviews'].isnull()]

# Keep only the rows that could be split. The explicit .copy() makes the
# filtered frame independent of the original, so the column assignment below
# does not operate on a slice (which can trigger SettingWithCopyWarning).
user_df = user_df[user_df['split_reviews'].notnull()].copy()
user_df['user_reviews'] = user_df['split_reviews']

# Drop the temporary column
user_df = user_df.drop(columns=['split_reviews'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_df['genre1'].replace({'\\N':None}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_df['genre2'].replace({'\\N':None}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [None]:
# One row per (user, movie): the three list-valued columns are exploded together.
users_exploded = user_df.explode(['movie_ids', 'user_ratings', 'user_reviews'])

# Attach the movie metadata; the inner join keeps only rows whose movie id
# matches a 'tconst' value in movies_df.
merged_df = users_exploded.merge(
    movies_df,
    how='inner',
    left_on='movie_ids',
    right_on='tconst',
)

# Function to pad tensors to length 768
def pad_tensor(tensor, target_length=768):
    """Return `tensor` as a list of exactly `target_length` values.

    Inputs shorter than `target_length` are right-padded with zeros; longer
    inputs are truncated.

    The result is always a plain Python list. Previously a padded input came
    back as a NumPy array while an exact-length or truncated input came back as
    a list slice, so one DataFrame column could hold both types and be written
    to CSV in two different textual formats.

    Parameters
    ----------
    tensor : sequence
        Sliceable, sized sequence of numbers.
    target_length : int, default 768
        Desired output length.
    """
    values = list(tensor[:target_length])  # Truncate if longer than target_length
    missing = target_length - len(values)
    if missing > 0:
        values.extend([0.0] * missing)
    return values


def _split_comma_list(cell):
    """Split a comma-separated cell into a list; missing cells give an empty list."""
    if pd.notnull(cell):
        return cell.split(',')
    return []


# Gather the three genre columns into one list per row, leaving out missing entries.
genre_rows = merged_df[['genre1', 'genre2', 'genre3']].to_numpy().tolist()
merged_df['genres'] = pd.Series(
    [[genre for genre in row if pd.notnull(genre)] for row in genre_rows],
    index=merged_df.index,
)

# Multi-hot encode the genres.
mlb_genres = MultiLabelBinarizer()
genre_encoded = mlb_genres.fit_transform(merged_df['genres'])

# Directors and writers are comma-separated id strings: split, then multi-hot encode.
merged_df['directors_list'] = merged_df['directors'].apply(_split_comma_list)
merged_df['writers_list'] = merged_df['writers'].apply(_split_comma_list)

mlb_directors = MultiLabelBinarizer()
directors_encoded = mlb_directors.fit_transform(merged_df['directors_list'])

mlb_writers = MultiLabelBinarizer()
writers_encoded = mlb_writers.fit_transform(merged_df['writers_list'])

# Bring every review vector to a common length.
merged_df['user_reviews_padded'] = merged_df['user_reviews'].apply(pad_tensor)

# Remove the raw columns that now have a processed counterpart, plus the join key.
merged_df = merged_df.drop(
    columns=['tconst', 'genre1', 'genre2', 'genre3', 'writers', 'directors', 'user_reviews']
)

In [None]:
# Preview the first rows of the merged table.
merged_df.head()

Unnamed: 0,user_id,movie_ids,user_ratings,imdb_rating,numVotes,primaryTitle,genres,directors_list,writers_list,user_reviews_padded
0,ur186072342,tt0015384,0.0,0.571429,0.000722,Diagonal Symphony,"[Animation, Music]",[nm0250873],[],"[0.26374, -0.070189, 0.17799, 0.18658, 0.42948..."
1,ur186072342,tt0019422,0.888889,0.766234,0.005652,Steamboat Willie,"[Animation, Comedy, Family]","[nm0412650, nm0000370]","[nm0000370, nm0412650]","[-0.15262, 0.20243, 0.73813, 0.19402, -0.02785..."
2,ur186072342,tt0029583,0.666667,0.779221,0.106084,Snow White and the Seven Dwarfs,"[Adventure, Animation, Family]","[nm0183183, nm0359457, nm0414144, nm0604392, n...","[nm0342278, nm0342303, nm0780799, nm0187232, n...","[-0.01072, -0.37331, 0.16924, 0.15325, -0.1456..."
3,ur186072342,tt0032910,1.0,0.766234,0.078725,Pinocchio,"[Adventure, Animation, Comedy]","[nm0272568, nm0373429, nm0414144, nm0455741, n...","[nm0172830, nm0780799, nm0257481, nm0810324, n...","[-0.043972, 0.28374, 0.13589, 0.57326, 0.14113..."
4,ur186072342,tt0040580,0.555556,0.584416,0.003482,Melody Time,"[Animation, Comedy, Family]","[nm0314671, nm0414144, nm0455741, nm0527217]","[nm0382548, nm0672093, nm0716206, nm0109205, n...","[0.54501, 0.83112, 0.45529, 0.43781, -0.010706..."


In [None]:
# (number of rows, number of columns) of the merged table.
merged_df.shape

(41360, 10)

In [None]:
# Persist the merged table as 'merged_dataset.csv' in the current working directory.
# With the default arguments the index is written as an extra, unnamed first column,
# and list-valued cells are stored as their string representation.
# NOTE(review): confirm that downstream readers use index_col=0 and parse the lists back.
merged_df.to_csv('merged_dataset.csv')