In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from datetime import datetime
import re
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
import requests
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# from sklearn.surprise import Dataset, Reader
# from surprise.model_selection import train_test_split
# from surprise import SVD

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\betht\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\betht\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [39]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')


In [40]:
links['description'] = ''

# Define a function to fetch descriptions using TMDb API
def fetch_description(tmdb_id, api_key):
    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
    params = {'api_key': api_key}
    response = requests.get(url, params=params)
    data = response.json()
    return data.get('overview', '')  # Return the movie description

# Apply the function to each row in the DataFrame to fetch descriptions
api_key = 'ab591caa973b321e21e39ee9544ce7ed'  # Replace with your actual API key
for index, row in links.iterrows():
    description = fetch_description(row['tmdbId'], api_key)
    links.at[index, 'description'] = description

# Save/load your dataset with descriptions
links.to_csv('links_with_descriptions.csv', index=False)
links = pd.read_csv('links_with_descriptions.csv')
links.head()

Unnamed: 0,movieId,imdbId,tmdbId,description
0,1,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
1,2,113497,8844.0,When siblings Judy and Peter discover an encha...
2,3,113228,15602.0,A family wedding reignites the ancient feud be...
3,4,114885,31357.0,"Cheated on, mistreated and stepped on, the wom..."
4,5,113041,11862.0,Just when George Banks has recovered from his ...


## Data Cleaning

In [41]:
# Checking and removing NaNs
movies_nan = movies.isna().sum()
ratings_nan = ratings.isna().sum()
tags_nan = tags.isna().sum()
links_nan = links.isna().sum()
print("NaN counts in movies:\n", movies_nan)
print("\nNaN counts in ratings:\n", ratings_nan)
print("\nNaN counts in tags:\n", tags_nan)
print("\nNaN counts in links:\n", links_nan)

# TMDB ID the only column with NaNs
links['tmdbId'].fillna('Unknown', inplace=True)
links['description'].fillna('No Description', inplace=True)



NaN counts in movies:
 movieId    0
title      0
genres     0
dtype: int64

NaN counts in ratings:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

NaN counts in tags:
 userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

NaN counts in links:
 movieId          0
imdbId           0
tmdbId           8
description    124
dtype: int64


  links['tmdbId'].fillna('Unknown', inplace=True)


In [42]:
# Removing duplicates
ratings.drop_duplicates(subset=['userId', 'movieId'], inplace=True)
tags.drop_duplicates(subset=['userId', 'movieId', 'tag'], inplace=True)

# Data type conversion
movies['movieId'] = movies['movieId'].astype(int)
ratings['userId'] = ratings['userId'].astype(int)
ratings['movieId'] = ratings['movieId'].astype(int)

# Standardising text
movies['title'] = movies['title'].str.lower()
tags['tag'] = tags['tag'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s]', '', x).lower())
movies['genres'] = movies['genres'].str.split('|')

# Extract year from the title and create a new 'year' column
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)')

# Remove the year from the 'title' column
movies['title'] = movies['title'].str.rsplit(' (', n=1).str[0]

# Changing format of timestamp
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
tags['timestamp'] = pd.to_datetime(tags['timestamp'], unit='s')




## Data Merging

In [43]:
movie_ratings = pd.merge(movies, ratings, on='movieId')
movie_ratings_tags = pd.merge(movie_ratings, tags, on=['movieId', 'userId'], how='left')
final_dataset = pd.merge(movie_ratings_tags, links, on='movieId', how='left')
final_dataset.head()

Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp_x,tag,timestamp_y,imdbId,tmdbId,description
0,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1,4.0,2000-07-30 18:45:03,,NaT,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
1,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,5,4.0,1996-11-08 06:36:02,,NaT,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
2,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,7,4.5,2005-01-25 06:52:26,,NaT,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
3,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,15,2.5,2017-11-13 12:59:30,,NaT,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
4,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,17,4.5,2011-05-18 05:28:03,,NaT,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."


In [44]:
# Checking NaNs
final_dataset_nan = final_dataset.isna().sum()
print("NaN counts in movies:\n", final_dataset_nan)
# Counting non-NaN values in the 'tag' and 'timestamp_y' columns
non_nan_count_tag = final_dataset['tag'].notna().sum()
print("Number of non-NaN values in 'tag':", non_nan_count_tag)
non_nan_count_timestamp_y = final_dataset['timestamp_y'].notna().sum()
print("Number of non-NaN values in 'timestamp_y':", non_nan_count_timestamp_y)

# Fill NaN values in 'tag' with 'No Tag'
final_dataset['tag'].fillna('No Tag', inplace=True)

# Drop 'timestamp_y'
final_dataset.drop(columns=['timestamp_y'], inplace=True)
final_dataset = final_dataset.rename(columns={'timestamp_x': 'timestamp'})
final_dataset.head()

NaN counts in movies:
 movieId            0
title              0
genres             0
year              20
userId             0
rating             0
timestamp_x        0
tag            99201
timestamp_y    99201
imdbId             0
tmdbId             0
description        0
dtype: int64
Number of non-NaN values in 'tag': 3476
Number of non-NaN values in 'timestamp_y': 3476


Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp,tag,imdbId,tmdbId,description
0,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1,4.0,2000-07-30 18:45:03,No Tag,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
1,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,5,4.0,1996-11-08 06:36:02,No Tag,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
2,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,7,4.5,2005-01-25 06:52:26,No Tag,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
3,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,15,2.5,2017-11-13 12:59:30,No Tag,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."
4,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,17,4.5,2011-05-18 05:28:03,No Tag,114709,862.0,"Led by Woody, Andy's toys live happily in his ..."


## Feature Engineering- Timestamps to reflect trends

In [45]:
# import openai

# def get_movie_recommendations(user_summary):
#     prompt = f"Based on the following user preferences: {user_summary}, what are some movie recommendations?"
    
#     response = openai.Completion.create(
#       engine="text-davinci-004",
#       prompt=prompt,
#       max_tokens=100
#     )

#     return response.choices[0].text.strip()

# # Example usage
# user_summary = "User A likes sci-fi movies, especially those directed by Christopher Nolan. They enjoyed 'Inception' and 'Interstellar' but didn't like '2001: A Space Odyssey'."
# recommendations = get_movie_recommendations(user_summary)
# print(recommendations)


## RS2- Neural Networks with trend analysis

In [47]:
def clean_genre_list(genre_list):
    # Join the list into a string
    genre_string = ' '.join(genre_list)
    # Clean the string
    return clean_text(genre_string)

# Apply the cleaning function to the 'genres' column
final_dataset['genres'] = final_dataset['genres'].apply(clean_genre_list)

def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove stop words (optional)
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatize words (optional)
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text
final_dataset['description'] = final_dataset['description'].apply(clean_text)
final_dataset['genres'] = final_dataset['genres'].apply(clean_text)
final_dataset['tag'] = final_dataset['tag'].apply(clean_text)
final_dataset['combined_features'] = final_dataset['genres'] + ' ' + final_dataset['tag'].fillna('') + ' ' + final_dataset['description']

In [59]:
from sklearn.preprocessing import LabelEncoder
user_id_encoder = LabelEncoder()
ratings['userId'] = user_id_encoder.fit_transform(ratings['userId'])

# Re-map movie IDs
movie_id_encoder = LabelEncoder()
ratings['movieId'] = movie_id_encoder.fit_transform(ratings['movieId'])

# Now your num_users and num_movies will be
num_users = ratings['userId'].nunique()
num_movies = ratings['movieId'].nunique()
# Check the max IDs to ensure they are within bounds
max_user_id = ratings['userId'].max()
max_movie_id = ratings['movieId'].max()
print(f"Max User ID: {max_user_id}, Number of Users: {num_users}")
print(f"Max Movie ID: {max_movie_id}, Number of Movies: {num_movies}")


Max User ID: 609, Number of Users: 610
Max Movie ID: 9723, Number of Movies: 9724


In [60]:
# Step 1: Import TensorFlow and other required libraries
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from sklearn.model_selection import train_test_split


# Define the model architecture
def setup_neural_network(num_users, num_movies, embedding_size, nlp_feature_dim):
    # User and Movie Embeddings
    user_input = Input(shape=(1,))
    
    user_embedding = Embedding(num_users+ 1, embedding_size, input_length=1)(user_input)
    user_vec = Flatten()(user_embedding)
    
    movie_input = Input(shape=(1,))
    movie_embedding = Embedding(num_movies + 1, embedding_size, input_length=1)(movie_input)
    movie_vec = Flatten()(movie_embedding)

    # NLP Feature Input for movie descriptions
    
    nlp_input = Input(shape=(nlp_feature_dim,))
    nlp_dense = Dense(embedding_size, activation='relu')(nlp_input)

    # Combine Features
    combined = Concatenate()([user_vec, movie_vec, nlp_dense])

    # Fully Connected Layers
    dense = Dense(128, activation='relu')(combined)
    prediction = Dense(1)(dense)

    # Create and compile the model
    model = Model(inputs=[user_input, movie_input, nlp_input], outputs=prediction)
    model.compile(optimizer='adam', loss='mean_squared_error')

    return model

# Step 2: Prepare your data
# You'll need to process your data to match the input shape expected by the model.
# This typically involves encoding categorical data, normalizing inputs, and splitting into training and test sets.
# Step 2: Preparing the Data
# ... previous data processing steps ...



# Step 3: Train the model
# Replace `x_train` and `y_train` with your actual training data and targets.
num_users = final_dataset['userId'].nunique()
num_movies = final_dataset['movieId'].nunique()
embedding_size = 100 #apparently best between 50 and 200
nlp_feature_dim = 500


# Process NLP features from movie descriptions using TF-IDF
tfidf = TfidfVectorizer(max_features=nlp_feature_dim)
tfidf_matrix = tfidf.fit_transform(final_dataset['combined_features'].fillna(''))
# Split data into training and testing sets
train_data, test_data = train_test_split(final_dataset, test_size=0.2)
# Prepare training data
x_train = [
    train_data['userId'].values,
    train_data['movieId'].values,
    tfidf_matrix[train_data.index].toarray()
]
y_train = train_data['rating'].values
tfidf_matrix_test = tfidf.transform(final_dataset.loc[test_data.index, 'combined_features'].fillna(''))
# Prepare testing data
x_test = [
    test_data['userId'].values,
    test_data['movieId'].values,
    tfidf_matrix[test_data.index].toarray()
]
y_test = test_data['rating'].values


# Then follow with Steps 3, 4, and 5 as before.
model = setup_neural_network(num_users, num_movies, embedding_size, nlp_feature_dim)
model.fit(x_train, y_train, epochs=5, batch_size=32)

# Step 4: Evaluate the model
# Replace `x_test` and `y_test` with your actual test data and targets.
model.evaluate(x_test, y_test)

# Step 5: Generate recommendations
# Use the model to predict ratings for user-movie pairs that are not in the training data
# and generate a list of recommended movies for each user.


Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node 'model_11/embedding_23/embedding_lookup' defined at (most recent call last):
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\tornado\platform\asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\asyncio\base_events.py", line 1909, in _run_once
      handle._run()
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 542, in dispatch_queue
      await self.process_one()
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 531, in process_one
      await dispatch(*args)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\ipykernel\ipkernel.py", line 359, in execute_request
      await super().execute_request(stream, ident, parent)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 775, in execute_request
      reply_content = await reply_content
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\ipykernel\ipkernel.py", line 446, in do_execute
      res = shell.run_cell(
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3051, in run_cell
      result = self._run_cell(
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3106, in _run_cell
      result = runner(coro)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3311, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3493, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\betht\AppData\Local\Temp\ipykernel_1020\715321410.py", line 78, in <module>
      model.fit(x_train, y_train, epochs=5, batch_size=32)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\betht\anaconda3\envs\myenv\envs\tf\lib\site-packages\keras\layers\core\embedding.py", line 208, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'model_11/embedding_23/embedding_lookup'
indices[3,0] = 69757 is not in [0, 9725)
	 [[{{node model_11/embedding_23/embedding_lookup}}]] [Op:__inference_train_function_4242]

## Content- based filtering

In [None]:
def get_content_based_recommendations(title, cosine_sim, unrated_movies):
    # Get the index of the movie that matches the title
    idx = unrated_movies.index[unrated_movies['title'] == title].tolist()[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Filter the movies to only include those that have not been rated by the user
    similar_movies_ids = unrated_movies['movieId'].iloc[movie_indices]
    return pd.DataFrame({'movieId': similar_movies_ids, 'title': unrated_movies['title'].iloc[movie_indices]})


In [None]:
# Extra data cleaning on movies and tags datasets 
movies['genres'] = movies['genres'].astype(str)
tags['tag'] = tags['tag'].astype(str)

# Combine genres and tags 

movies = movies.merge(tags, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')
movies['metadata'] = movies['genres'] + ' ' + movies['tag']

# Create a TF-IDF Vectorizer and transform the metadata to a matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['metadata'])

# Calculate the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get recommendations based on content similarity
# def get_content_based_recommendations(title, cosine_sim=cosine_sim):
#     # Get the index of the movie that matches the title
#     idx = movies.index[movies['title'] == title].tolist()[0]

#     # Get the pairwise similarity scores of all movies with that movie
#     sim_scores = list(enumerate(cosine_sim[idx]))

#     # Sort the movies based on the similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

#     # Get the scores of the 10 most similar movies
#     sim_scores = sim_scores[1:11]

#     # Get the movie indices
#     movie_indices = [i[0] for i in sim_scores]
#     similar_movies_ids = movies['movieId'].iloc[movie_indices]
#     return pd.DataFrame({'movieId': similar_movies_ids, 'title': movies['title'].iloc[movie_indices]})
def get_content_based_recommendations(title, cosine_sim, unrated_movies):
    # Get the index of the movie that matches the title
    idx_list = unrated_movies.index[unrated_movies['title'] == title].tolist()
    
    # Check if index list is empty
    if not idx_list:
        return pd.DataFrame()  # Return empty DataFrame if no match found

    idx = idx_list[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices and validate them
    movie_indices = [i[0] for i in sim_scores if i[0] < len(unrated_movies)]

    # Filter the movies to only include those that have not been rated by the user
    similar_movies_ids = unrated_movies['movieId'].iloc[movie_indices]
    return pd.DataFrame({'movieId': similar_movies_ids, 'title': unrated_movies['title'].iloc[movie_indices]})






## Collaborative merged with trending scores

In [None]:

# Load the ratings dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the dataset
trainset, testset = train_test_split(data, test_size=0.25)

#Chosen algorithm
svd = SVD()
svd.fit(trainset)

# Predict ratings for the testset
predictions = svd.test(testset)

# Integrate trending scores in the recommendation phase
def get_collaborative_recommendations(user_id, trending_scores, top_n=10):
    # Predict ratings for all movies for the given user
    movie_ids = ratings['movieId'].unique()
    predicted_ratings = [svd.predict(user_id, movie_id).est for movie_id in movie_ids]

    # Combine predictions with trending scores
    movie_rating_trend = pd.DataFrame({'movieId': movie_ids, 'predicted_rating': predicted_ratings})
    movie_rating_trend = movie_rating_trend.merge(trending_scores, on='movieId')

    # Calculating a hybrid score
    movie_rating_trend['hybrid_score'] = movie_rating_trend['predicted_rating'] * movie_rating_trend['normalised_score']

    # Get top N recommendations
    top_recommendations = movie_rating_trend.sort_values(by='hybrid_score', ascending=False).head(top_n)
    return top_recommendations

# Example usage
user_id = 2  
trending_scores = weighted_rating_score[['movieId', 'normalised_score']]  
top_recommendations = get_collaborative_recommendations(user_id, trending_scores, top_n=10)


top_recommendations = top_recommendations.merge(movies[['movieId', 'title']], on='movieId')
print(top_recommendations[['title', 'hybrid_score']])


                                                title  hybrid_score
0                                            tomorrow      3.680851
1       when the levees broke: a requiem in four acts      3.679251
2                days of being wild (a fei jingjyuhn)      3.659327
3                                                 tag      3.649597
4                                        shadow world      3.629711
5                      kizumonogatari iii: cold blood      3.626845
6   too funny to fail: the life and death of the d...      3.610003
7               dc super hero girls: hero of the year      3.601991
8   dragon ball z: the return of cooler (doragon b...      3.600709
9                                          deadpool 2      3.597709
10                                         deadpool 2      3.597709
11                                         deadpool 2      3.597709


## combining the 3

In [None]:
def get_unified_recommendations(user_id, ratings, trending_scores, tfidf_matrix, cosine_sim, movies, top_n=10):
    # Filter out movies already rated by the user
    rated_movie_ids = ratings[ratings['userId'] == user_id]['movieId'].unique()
    unrated_movies = movies[~movies['movieId'].isin(rated_movie_ids)]

    # Get collaborative recommendations
    collaborative_recs = get_collaborative_recommendations(user_id, trending_scores, 5)
    collaborative_recs = collaborative_recs.merge(unrated_movies[['movieId', 'title']], on='movieId', how='left')

    # Initialize an empty DataFrame for content-based recommendations
    content_based_recs = pd.DataFrame()

    for movie_id in collaborative_recs['movieId']:
        if movie_id in rated_movie_ids:  # Skip if the movie is already rated
            continue

        title = unrated_movies[unrated_movies['movieId'] == movie_id]['title'].iloc[0]
        similar_movies = get_content_based_recommendations(title, cosine_sim, unrated_movies)

        if not similar_movies.empty:
            top_similar = similar_movies.head(1)
            if top_similar['movieId'].iloc[0] not in rated_movie_ids:  # Avoid recommending already rated movies
                content_based_recs = pd.concat([content_based_recs, top_similar])

    # Combine collaborative and content-based recommendations
    unified_recommendations = pd.concat([collaborative_recs, content_based_recs]).drop_duplicates(subset=['movieId'])

    # Sort by predicted rating (if available in your data)
    if 'predicted_rating' in unified_recommendations.columns:
        unified_recommendations = unified_recommendations.sort_values(by='predicted_rating', ascending=False)

    # Limit to top N recommendations
    unified_recommendations = unified_recommendations.head(top_n)

    return unified_recommendations

# Example usage
user_id = 4  # Replace with an actual user ID from your dataset
unified_recommendations = get_unified_recommendations(user_id, ratings, trending_scores, tfidf_matrix, cosine_sim, movies, top_n=10)

# Display the recommendations
print(unified_recommendations)


       movieId  predicted_rating  normalised_score  hybrid_score  \
3       179135          3.863665          0.957879      3.700924   
0       117364          3.823615          0.982147      3.755353   
1       193609          3.759400          0.994013      3.736894   
2        95473          3.740229          0.991033      3.706692   
4       193585          3.724638          0.993019      3.698636   
7216     32298               NaN               NaN           NaN   
27          19               NaN               NaN           NaN   
10388   112727               NaN               NaN           NaN   
144        110               NaN               NaN           NaN   
107         77               NaN               NaN           NaN   

                                                   title  
3                                         blue planet ii  
0                                                virunga  
1                           andrew dice clay: dice rules  
2      dragon b

# Splitting Data

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Function to hide some of a user's ratings
def hide_user_ratings(user_id, ratings, hide_ratio=0.25):
    user_ratings = ratings[ratings['userId'] == user_id]
    hide_indices = np.random.choice(user_ratings.index, size=int(len(user_ratings) * hide_ratio), replace=False)
    hidden_ratings = user_ratings.loc[hide_indices]
    visible_ratings = user_ratings.drop(hide_indices)
    return visible_ratings, hidden_ratings

# Function to calculate MAP
def calculate_map(recommendations, hidden_ids):
    relevant = 0
    total_precision = 0
    for k, rec in enumerate(recommendations['movieId'], start=1):
        if rec in hidden_ids:
            relevant += 1
            total_precision += relevant / k
    return total_precision / len(hidden_ids) if hidden_ids else 0

# Function to calculate NDCG
def calculate_ndcg(recommendations, hidden_ids, k=10):
    dcg = sum(1 / np.log2(i + 2) for i, rec in enumerate(recommendations['movieId'].head(k)) if rec in hidden_ids)
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(hidden_ids), k)))
    return dcg / idcg if idcg > 0 else 0

# Function to calculate MRR
def calculate_mrr(recommendations, hidden_ids):
    for i, rec in enumerate(recommendations['movieId'], start=1):
        if rec in hidden_ids:
            return 1 / i
    return 0

# Function to calculate precision and recall
def calculate_precision_recall(recommendations, hidden_ratings, k=10):
    recommended_ids = set(recommendations['movieId'].head(k))
    hidden_ids = set(hidden_ratings['movieId'])
    hits = recommended_ids.intersection(hidden_ids)
    precision = len(hits) / len(recommended_ids)
    recall = len(hits) / len(hidden_ids) if hidden_ids else 0
    return precision, recall

# Split users into train and test sets
unique_users = ratings['userId'].unique()
train_users, test_users = train_test_split(unique_users, test_size=0.25, random_state=42)

# Initialize lists to store evaluation metrics for each user
precisions = []
recalls = []
mrrs = []
maps = []
ndcgs = []

# Evaluate recommendations for each user in the test set
for user_id in test_users:
    visible_ratings, hidden_ratings = hide_user_ratings(user_id, ratings)
    visible_ratings = pd.concat([ratings[ratings['userId'] != user_id], visible_ratings])

    recommendations = get_unified_recommendations(user_id, visible_ratings, trending_scores, tfidf_matrix, cosine_sim, movies, top_n=10)
    
    # Extract hidden_ids for the current user
    hidden_ids = set(hidden_ratings['movieId'])

    # Calculate metrics
    precision, recall = calculate_precision_recall(recommendations, hidden_ratings, k=10)
    mrr = calculate_mrr(recommendations, hidden_ids)
    map_score = calculate_map(recommendations, hidden_ids)
    ndcg = calculate_ndcg(recommendations, hidden_ids, k=10)

    # Append to lists
    precisions.append(precision)
    recalls.append(recall)
    mrrs.append(mrr)
    maps.append(map_score)
    ndcgs.append(ndcg)

# Calculate average of all metrics across all users
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_mrr = sum(mrrs) / len(mrrs)
avg_map = sum(maps) / len(maps)
avg_ndcg = sum(ndcgs) / len(ndcgs)

print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')
print(f'Average MRR: {avg_mrr}')
print(f'Average MAP: {avg_map}')
print(f'Average NDCG: {avg_ndcg}')


Average Precision: 0.012872185911401601
Average Recall: 0.00445609150211172
Average MRR: 0.02051302002282394
Average MAP: 0.002469412758747754
Average NDCG: 0.011087682307154166
