In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from datetime import datetime
import re
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
import requests
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')


KeyboardInterrupt: 

In [None]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')


In [None]:
links['description'] = ''
# Fetching descriptions from TMDB API
# Using https://www.codespeedy.com/fetch-tmdb-movie-data-using-python/
def fetch_description(tmdb_id, api_key):
    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
    params = {'api_key': api_key}
    response = requests.get(url, params=params)
    data = response.json()
    return data.get('overview', '') 
# Applyiing function to each row
api_key = 'ab591caa973b321e21e39ee9544ce7ed'  
for index, row in links.iterrows():
    description = fetch_description(row['tmdbId'], api_key)
    links.at[index, 'description'] = description

# Save & loading dataset with descriptions
links.to_csv('links_with_descriptions.csv', index=False)
links = pd.read_csv('links_with_descriptions.csv')
links.head()

Unnamed: 0,movieId,imdbId,tmdbId,description
0,1,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
1,2,113497,8844.0,When siblings Judy and Peter discover an encha...
2,3,113228,15602.0,A family wedding reignites the ancient feud be...
3,4,114885,31357.0,"Cheated on, mistreated and stepped on, the wom..."
4,5,113041,11862.0,Just when George Banks has recovered from his ...


## Data Cleaning

In [None]:
# Checking and removing NaNs
movies_nan = movies.isna().sum()
ratings_nan = ratings.isna().sum()
tags_nan = tags.isna().sum()
links_nan = links.isna().sum()
print("NaN counts in movies:\n", movies_nan)
print("\nNaN counts in ratings:\n", ratings_nan)
print("\nNaN counts in tags:\n", tags_nan)
print("\nNaN counts in links:\n", links_nan)

# TMDB ID the only column with NaNs
links['tmdbId'].fillna('Unknown', inplace=True)
links['description'].fillna('No Description', inplace=True)



NaN counts in movies:
 movieId    0
title      0
genres     0
dtype: int64

NaN counts in ratings:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

NaN counts in tags:
 userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

NaN counts in links:
 movieId          0
imdbId           0
tmdbId           8
description    125
dtype: int64


  links['tmdbId'].fillna('Unknown', inplace=True)


In [None]:
# Removing duplicates
ratings.drop_duplicates(subset=['userId', 'movieId'], inplace=True)
tags.drop_duplicates(subset=['userId', 'movieId', 'tag'], inplace=True)

# Data type conversion
movies['movieId'] = movies['movieId'].astype(int)
ratings['userId'] = ratings['userId'].astype(int)
ratings['movieId'] = ratings['movieId'].astype(int)

# Standardising text
movies['title'] = movies['title'].str.lower()
tags['tag'] = tags['tag'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s]', '', x).lower())
movies['genres'] = movies['genres'].str.split('|')

# Extract year from the title and create a new 'year' column
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)')

# Remove the year from the 'title' column
movies['title'] = movies['title'].str.rsplit(' (', n=1).str[0]

# Changing format of timestamp
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
tags['timestamp'] = pd.to_datetime(tags['timestamp'], unit='s')




## Data Merging

In [None]:
movie_ratings = pd.merge(movies, ratings, on='movieId')
movie_ratings_tags = pd.merge(movie_ratings, tags, on=['movieId', 'userId'], how='left')
final_dataset = pd.merge(movie_ratings_tags, links, on='movieId', how='left')

In [None]:
# Checking NaNs
final_dataset_nan = final_dataset.isna().sum()
print("NaN counts in movies:\n", final_dataset_nan)

# Counting non-NaN values in the 'tag' and 'timestamp_y' columns
non_nan_count_tag = final_dataset['tag'].notna().sum()
print("Number of non-NaN values in 'tag':", non_nan_count_tag)

non_nan_count_timestamp_y = final_dataset['timestamp_y'].notna().sum()
print("Number of non-NaN values in 'timestamp_y':", non_nan_count_timestamp_y)

# Fill NaN values in 'tag' with 'No Tag'
final_dataset['tag'].fillna('No Tag', inplace=True)

# Drop 'timestamp_y'
final_dataset.drop(columns=['timestamp_y'], inplace=True)
final_dataset = final_dataset.rename(columns={'timestamp_x': 'timestamp'})
final_dataset.head()

NaN counts in movies:
 movieId            0
title              0
genres             0
year              20
userId             0
rating             0
timestamp_x        0
tag            99201
timestamp_y    99201
imdbId             0
tmdbId             0
description        0
dtype: int64
Number of non-NaN values in 'tag': 3476
Number of non-NaN values in 'timestamp_y': 3476


Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp,tag,imdbId,tmdbId,description
0,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1,4.0,2000-07-30 18:45:03,No Tag,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
1,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,5,4.0,1996-11-08 06:36:02,No Tag,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
2,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,7,4.5,2005-01-25 06:52:26,No Tag,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
3,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,15,2.5,2017-11-13 12:59:30,No Tag,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
4,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,17,4.5,2011-05-18 05:28:03,No Tag,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."


## RS2- Neural Networks with trend analysis

In [None]:
#Preprocessing for NLP
def clean_genre_list(genre_list):
    # Join the list into a string
    genre_string = ' '.join(genre_list)
    # Clean the string
    return clean_text(genre_string)

# Apply the cleaning function to the 'genres' column
final_dataset['genres'] = final_dataset['genres'].apply(clean_genre_list)

def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove stop words (optional)
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatize words (optional)
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text
final_dataset['description'] = final_dataset['description'].apply(clean_text)
final_dataset['genres'] = final_dataset['genres'].apply(clean_text)
final_dataset['tag'] = final_dataset['tag'].apply(clean_text)
final_dataset['combined_features'] = final_dataset['genres'] + ' ' + final_dataset['tag'].fillna('') + ' ' + final_dataset['description']

In [None]:
from sklearn.preprocessing import LabelEncoder
user_id_encoder = LabelEncoder()
final_dataset['userId'] = user_id_encoder.fit_transform(final_dataset['userId'])

# Re-map movie IDs
movie_id_encoder = LabelEncoder()
final_dataset['movieId'] = movie_id_encoder.fit_transform(final_dataset['movieId'])

# Now your num_users and num_movies will be
num_users = final_dataset['userId'].nunique()
num_movies = final_dataset['movieId'].nunique()
# Check the max IDs to ensure they are within bounds
max_user_id = final_dataset['userId'].max()
max_movie_id = final_dataset['movieId'].max()
print(f"Max User ID: {max_user_id}, Number of Users: {num_users}")
print(f"Max Movie ID: {max_movie_id}, Number of Movies: {num_movies}")


Max User ID: 609, Number of Users: 610
Max Movie ID: 9723, Number of Movies: 9724


## NLP for descriptions

In [None]:
# Step 1: Import TensorFlow and other required libraries
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from sklearn.model_selection import train_test_split


# Define the model architecture
def setup_neural_network(num_users, num_movies, embedding_size, nlp_feature_dim):
    # User and Movie Embeddings
    user_input = Input(shape=(1,))
    
    user_embedding = Embedding(num_users+ 1, embedding_size, input_length=1)(user_input)
    user_vec = Flatten()(user_embedding)
    
    movie_input = Input(shape=(1,))
    movie_embedding = Embedding(num_movies + 1, embedding_size, input_length=1)(movie_input)
    movie_vec = Flatten()(movie_embedding)

    # NLP Feature Input for movie descriptions
    
    nlp_input = Input(shape=(nlp_feature_dim,))
    nlp_dense = Dense(embedding_size, activation='relu')(nlp_input)

    # Combine Features
    combined = Concatenate()([user_vec, movie_vec, nlp_dense])

    # Fully Connected Layers
    dense = Dense(128, activation='relu')(combined)
    prediction = Dense(1)(dense)

    # Create and compile the model
    model = Model(inputs=[user_input, movie_input, nlp_input], outputs=prediction)
    model.compile(optimizer='adam', loss='mean_squared_error')

    return model

num_users = final_dataset['userId'].nunique()
num_movies = final_dataset['movieId'].nunique()
embedding_size = 100 
nlp_feature_dim = 500


tfidf = TfidfVectorizer(max_features=nlp_feature_dim)
tfidf_matrix = tfidf.fit_transform(final_dataset['combined_features'].fillna(''))

train_data, test_data = train_test_split(final_dataset, test_size=0.2)

x_train = [
    train_data['userId'].values,
    train_data['movieId'].values,
    tfidf_matrix[train_data.index].toarray()
]
y_train = train_data['rating'].values
tfidf_matrix_test = tfidf.transform(final_dataset.loc[test_data.index, 'combined_features'].fillna(''))
# Prepare testing data
x_test = [
    test_data['userId'].values,
    test_data['movieId'].values,
    tfidf_matrix[test_data.index].toarray()
]
y_test = test_data['rating'].values
# Check the shape of the TF-IDF matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Ensure the feature dimension matches
print("NLP feature dimension:", nlp_feature_dim)

# Check the shapes of x_train and x_test arrays
print("Shapes of x_train arrays:", [arr.shape for arr in x_train])
print("Shapes of x_test arrays:", [arr.shape for arr in x_test])


model = setup_neural_network(num_users, num_movies, embedding_size, nlp_feature_dim)
model.fit(x_train, y_train, epochs=5, batch_size=32)

model.evaluate(x_test, y_test)




TF-IDF matrix shape: (102677, 500)
NLP feature dimension: 500
Shapes of x_train arrays: [(82141,), (82141,), (82141, 500)]
Shapes of x_test arrays: [(20536,), (20536,), (20536, 500)]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.7378079295158386

In [None]:
# Example: Aggregate ratings by month for each movie
ratings['month_year'] = ratings['timestamp'].dt.to_period('M')
monthly_ratings = ratings.groupby(['movieId', 'month_year']).size().unstack(fill_value=0)

# Normalize the data (optional)
# You can use Min-Max scaling or another appropriate method here
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Assuming monthly_ratings is your prepared time-series data
# Reshape data for LSTM (samples, time steps, features)
X = monthly_ratings.values.reshape((monthly_ratings.shape[0], monthly_ratings.shape[1], 1))
monthly_diff = monthly_ratings.shift(-1, axis=1) - monthly_ratings

# Define a threshold for what you consider as 'trending'
trend_threshold = 10  # Example threshold

# Create a binary trend indicator (1 for trending, 0 for not trending)
# Here, we consider a movie 'trending' if its rating count increases by trend_threshold from one month to the next
is_trending = (monthly_diff >= trend_threshold).any(axis=1).astype(int)

# Assign the binary trend indicator to y
y = is_trending.values
 # Define your target variable (e.g., future ratings, trend indicator)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build and train the LSTM model
lstm_model = build_lstm_model((X_train.shape[1], 1))
lstm_model.fit(X_train, y_train, epochs=10, batch_size=32)
# Predict trends
predicted_trends = lstm_model.predict(X)

# Integrate these trends with your recommendation system
# For example, adjust the predicted ratings based on the trend
# ...

# Proceed with generating ranked recommendations as before



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
def get_lstm_input(movie_id, monthly_ratings):
    print("Getting LSTM input for movie ID:")
    # print(movie_id)
    print(monthly_ratings)
    # print(monthly_ratings[monthly_ratings.movie_id == movie_id])

    lstm_input = monthly_ratings.loc[movie_id].values.reshape((1, -1, 1))
    return lstm_input

def get_unified_recommendations(user_id, model, lstm_model, tfidf, final_dataset, top_n=10):
    # Fetch movies not yet rated by the user
    user_movies = ratings[ratings['userId'] == user_id]['movieId']
    movies_to_predict = final_dataset[~final_dataset['movieId'].isin(user_movies)]

    # Prepare data for the NLP-based model
    tfidf_features = tfidf.transform(movies_to_predict['combined_features'].fillna('')).toarray()

    x_user_movies = [
        np.full(tfidf_features.shape[0], user_id),  # User ID array
        movies_to_predict['movieId'].values,        # Movie IDs
        tfidf_features                              # TF-IDF features
    ]

    # Predict ratings with the NLP-based model
    predicted_ratings = model.predict(x_user_movies).flatten()

    # Get trend scores for these movies
    # Assuming lstm_model is your trained LSTM model for trend prediction
    # You will need to prepare the input for the LSTM model as per its requirements
    # Predict trend scores

    monthly_ratings_reindexed = monthly_ratings.reset_index()

    lstm_predictions = []
    for movie_id in movies_to_predict['movieId']:
        lstm_input = get_lstm_input(movie_id, monthly_ratings_reindexed)
        trend_score = lstm_model.predict(lstm_input)
        lstm_predictions.append(trend_score.flatten()[0])

    lstm_predictions = np.array(lstm_predictions)

    print(predicted_ratings)

    print(lstm_predictions)


    # Combine the predictions (you can adjust how you combine these)
    combined_scores = predicted_ratings + lstm_predictions

    # Create a DataFrame for sorting and filtering
    recommendations = pd.DataFrame({
        'movieId': movies_to_predict['movieId'],
        'predicted_rating': combined_scores
    })

    # Sort by combined score and fetch top N movies
    top_recommendations = recommendations.sort_values(by='predicted_rating', ascending=False).head(top_n)

    # Fetch movie details from the movies DataFrame
    movie_details = movies[movies['movieId'].isin(top_recommendations['movieId'])]

    return movie_details

# Example usage
user_id = 123  # Replace with the user ID for whom you want recommendations
unified_recommendations = get_unified_recommendations(user_id, model, lstm_model, tfidf, final_dataset, top_n=10)

# Display the recommendations
print(unified_recommendations)

Getting LSTM input for movie ID:
month_year  movieId  1996-03  1996-04  1996-05  1996-06  1996-07  1996-08  \
0                 1        0        3        8        8        1        0   
1                 2        0        1        4        2        2        4   
2                 3        0        0        0        0        0        2   
3                 4        0        0        0        2        1        1   
4                 5        0        1        2        0        1        1   
...             ...      ...      ...      ...      ...      ...      ...   
9719         193581        0        0        0        0        0        0   
9720         193583        0        0        0        0        0        0   
9721         193585        0        0        0        0        0        0   
9722         193587        0        0        0        0        0        0   
9723         193609        0        0        0        0        0        0   

month_year  1996-09  1996-10  1996-11  ...

KeyboardInterrupt: 

# Splitting Data & Evaluation (Copied over from RS1)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Function to hide some of a user's ratings
def hide_user_ratings(user_id, ratings, hide_ratio=0.25):
    user_ratings = ratings[ratings['userId'] == user_id]
    hide_indices = np.random.choice(user_ratings.index, size=int(len(user_ratings) * hide_ratio), replace=False)
    hidden_ratings = user_ratings.loc[hide_indices]
    visible_ratings = user_ratings.drop(hide_indices)
    return visible_ratings, hidden_ratings

# Function to calculate MAP
def calculate_map(recommendations, hidden_ids):
    relevant = 0
    total_precision = 0
    for k, rec in enumerate(recommendations['movieId'], start=1):
        if rec in hidden_ids:
            relevant += 1
            total_precision += relevant / k
    return total_precision / len(hidden_ids) if hidden_ids else 0

# Function to calculate NDCG
def calculate_ndcg(recommendations, hidden_ids, k=10):
    dcg = sum(1 / np.log2(i + 2) for i, rec in enumerate(recommendations['movieId'].head(k)) if rec in hidden_ids)
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(hidden_ids), k)))
    return dcg / idcg if idcg > 0 else 0

# Function to calculate MRR
def calculate_mrr(recommendations, hidden_ids):
    for i, rec in enumerate(recommendations['movieId'], start=1):
        if rec in hidden_ids:
            return 1 / i
    return 0

# Function to calculate precision and recall
def calculate_precision_recall(recommendations, hidden_ratings, k=10):
    recommended_ids = set(recommendations['movieId'].head(k))
    hidden_ids = set(hidden_ratings['movieId'])
    hits = recommended_ids.intersection(hidden_ids)
    precision = len(hits) / len(recommended_ids)
    recall = len(hits) / len(hidden_ids) if hidden_ids else 0
    return precision, recall

# Split users into train and test sets
unique_users = ratings['userId'].unique()
train_users, test_users = train_test_split(unique_users, test_size=0.25, random_state=42)

# Initialize lists to store evaluation metrics for each user
precisions = []
recalls = []
mrrs = []
maps = []
ndcgs = []

# Evaluate recommendations for each user in the test set
for user_id in test_users:
    visible_ratings, hidden_ratings = hide_user_ratings(user_id, ratings)
    visible_ratings = pd.concat([ratings[ratings['userId'] != user_id], visible_ratings])

    recommendations = get_unified_recommendations(user_id, visible_ratings, trending_scores, tfidf_matrix, cosine_sim, movies, top_n=10)
    
    # Extract hidden_ids for the current user
    hidden_ids = set(hidden_ratings['movieId'])

    # Calculate metrics
    precision, recall = calculate_precision_recall(recommendations, hidden_ratings, k=10)
    mrr = calculate_mrr(recommendations, hidden_ids)
    map_score = calculate_map(recommendations, hidden_ids)
    ndcg = calculate_ndcg(recommendations, hidden_ids, k=10)

    # Append to lists
    precisions.append(precision)
    recalls.append(recall)
    mrrs.append(mrr)
    maps.append(map_score)
    ndcgs.append(ndcg)

# Calculate average of all metrics across all users
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_mrr = sum(mrrs) / len(mrrs)
avg_map = sum(maps) / len(maps)
avg_ndcg = sum(ndcgs) / len(ndcgs)

print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')
print(f'Average MRR: {avg_mrr}')
print(f'Average MAP: {avg_map}')
print(f'Average NDCG: {avg_ndcg}')
