In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
movies = pd.read_csv("../data/processed/cleaned_movies.csv")
links = pd.read_csv('../data/processed/cleaned_links.csv')
ratings = pd.read_csv('../data/processed/cleaned_ratings.csv')
tags = pd.read_csv('../data/processed/cleaned_tags.csv')

In [3]:
df =  pd.read_csv("../data/processed/combined_movie_data.csv")

In [4]:
df.head()

Unnamed: 0,movieId,title,genres,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Fantasy,genre_Romance,genre_Drama,...,avg_normalized_rating,normalized_rating_std,first_rating_year,last_rating_year,all_tags,num_unique_tags,imdbId,tmdbId,has_tags,rating_popularity
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,1,1,1,1,0,0,...,0.693023,0.23381,1996.0,2018.0,pixar|pixar|fun,2.0,114709,862,1,0.652439
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,0,1,0,1,0,0,...,0.555844,0.241186,1996.0,2018.0,game|Robin Williams|fantasy|magic board game,4.0,113497,8844,1,0.332317
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,1,0,1,0,...,0.516484,0.271574,1996.0,2017.0,old|moldy,2.0,113228,15602,1,0.155488
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,1,0,1,1,...,0.265306,0.209121,1996.0,2009.0,no_tags,0.0,114885,31357,0,0.018293
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,1,0,0,0,...,0.45481,0.245146,1996.0,2018.0,pregnancy|remake,2.0,113041,11862,1,0.146341


# Content-Based Filtering

Preprocess the genres

In [5]:
# Preprocess the genres
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genres'].str.split('|'))

Fill NaN in numeric columns with median (for robustness)

In [6]:
df['average_rating'] = df['average_rating'].fillna(df['average_rating'].median())
df['rating_count'] = df['rating_count'].fillna(df['rating_count'].median())
df['release_year'] = df['release_year'].fillna(df['release_year'].median())
df['last_rating_year'] = df['last_rating_year'].fillna(df['last_rating_year'].median())

Preprocess the average_rating and rating_count

In [7]:
# simple version
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['average_rating', 'rating_count']])

In [8]:
# advanced version
from datetime import datetime

# Log transformation (handle skewed rating count)
df['rating_count_log'] = np.log1p(df['rating_count'])  # log1p avoids log(0) issues
# Normalize log-transformed count
scaler = StandardScaler()
df['rating_count_scaled'] = scaler.fit_transform(df[['rating_count_log']])

# Compute Bayesian Average Rating
C = df['average_rating'].mean()  # Mean rating across all movies
m = df['rating_count'].quantile(0.50)  # Only consider movies with at least median count
df['bayesian_avg_rating'] = ((df['rating_count'] * df['average_rating']) + (m * C)) / (df['rating_count'] + m)

# Calculate rating recency (years since last rating)
current_year = datetime.now().year
df['rating_recency'] = current_year - df['last_rating_year']
# Normalize it
df['rating_recency_scaled'] = scaler.fit_transform(df[['rating_recency']])

# Weighted popularity score
df['popularity_score'] = df['average_rating'] * np.log1p(df['rating_count'])
# Normalize it
df['popularity_scaled'] = scaler.fit_transform(df[['popularity_score']])


Preprocess the Release Year

In [9]:
# Ensure 'Release Year' is numeric
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')
# Normalize the release year
scaler_year = MinMaxScaler()
df['release_year_scaled'] = scaler.fit_transform(df[['release_year']])

Preprocess the Tags

In [10]:
# use Word2Vec TF-IDF

# Fill missing tags with an empty string
df['all_tags'] = df['all_tags'].fillna('')
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=100)  # Limit to 100 features for efficiency
# Transform the tags column into numerical features
tag_matrix = tfidf.fit_transform(df['all_tags'])  # Convert tags into TF-IDF features
# Convert to dense array (optional)
tag_matrix = tag_matrix.toarray()

In [11]:
# use Word2Vec
from gensim.models import Word2Vec

df['Tag_List'] = df['all_tags'].apply(lambda x: x.split(','))

# Train Word2Vec model
w2v_model = Word2Vec(df['Tag_List'], vector_size=50, window=5, min_count=1, workers=4)

# Create tag vectors for each movie
def get_tag_vector(tags):
    vectors = [w2v_model.wv[tag] for tag in tags if tag in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(50)  # 50 is vector size

# Apply the function to generate tag embeddings
tag_vectors = np.vstack(df['Tag_List'].apply(get_tag_vector))

Combine features

In [12]:
features = np.hstack((
    genre_matrix, 
    tag_vectors, 
    df[['bayesian_avg_rating', 'rating_count_scaled', 'rating_recency_scaled', 'popularity_scaled', 'release_year_scaled']].values,
))

Train model

In [13]:
# KNN
knn = NearestNeighbors(n_neighbors=10, metric='cosine')  # Increased n_neighbors
knn.fit(features)

In [14]:
def get_recommendations_by_id(movie_id, df, knn_model, features, num_recommendations=5):
    """
    Retrieve recommended movies similar to the given movie using its unique ID.

    Parameters:
        movie_id (int): ID of the movie for which recommendations are needed.
        df (pd.DataFrame): DataFrame containing movie information with a 'movieId' column.
        knn_model (NearestNeighbors): Trained KNN model.
        features (np.array): Feature matrix.
        num_recommendations (int): Number of recommendations to return.

    Returns:
        list: Recommended movie titles.
    """
    # Get movie index using the movie id
    movie_idx = df[df['movieId'] == movie_id].index.tolist()
    if not movie_idx:
        print("Movie not found!")
        return []
    
    movie_idx = movie_idx[0]
    
    # Find similar movies using KNN
    distances, indices = knn_model.kneighbors(features[movie_idx].reshape(1, -1))
    
    # Get recommended movies (excluding the movie itself)
    recommended_movies = df.iloc[indices.flatten()[1:num_recommendations + 1]]['title'].tolist()
    
    return recommended_movies


In [15]:
def calculate_predicted_score(movie_idx, df, knn_model, features):
    """
    Calculate the predicted rating for a movie based on its nearest neighbors.

    Parameters:
        movie_idx (int): Index of the movie in the dataframe.
        df (pd.DataFrame): DataFrame containing movie information.
        knn_model (NearestNeighbors): Trained KNN model.
        features (np.array): Feature matrix.

    Returns:
        float: Predicted rating for the given movie.
    """
    distances, indices = knn_model.kneighbors(features[movie_idx].reshape(1, -1))

    # Extract neighbor ratings (excluding itself)
    neighbor_ratings = df.iloc[indices.flatten()[1:]]['average_rating'].values
    similarities = 1 - distances.flatten()[1:]  # Convert cosine distance to similarity

    # Compute weighted average rating
    weighted_sum = np.sum(similarities * neighbor_ratings)
    sum_of_weights = np.sum(np.abs(similarities))

    # Return predicted rating (fallback to movie's original rating if no weights)
    return weighted_sum / sum_of_weights if sum_of_weights != 0 else df.iloc[movie_idx]['average_rating']

In [16]:
movie_name = "Toy Story (1995)"

In [17]:
# Suppose you want recommendations for the movie with ID 1
movie_id = 1
recommendations = get_recommendations_by_id(movie_id, df, knn, features, num_recommendations=10)
print(f"Recommendations for movie ID {movie_id}:\n {recommendations}")

Recommendations for movie ID 1:
 ['Monsters, Inc. (2001)', 'Toy Story 2 (1999)', 'Shrek (2001)', 'Finding Nemo (2003)', "Bug's Life, A (1998)", 'Ice Age (2002)', 'Aladdin (1992)', "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)", 'Jumanji (1995)']


In [18]:
predicted_score = calculate_predicted_score(df[df['title'] == movie_name].index[0], df, knn, features)
print(f"Predicted Score for '{movie_name}':", predicted_score)

Predicted Score for 'Toy Story (1995)': 3.760774057193294


# Save the Model

In [19]:
# with word2vec
import os
import numpy as np
import pickle
from gensim.models import Word2Vec
from sklearn.neighbors import NearestNeighbors

# Define save path
save_path = "../models/word2vec"
os.makedirs(save_path, exist_ok=True)  # Create directory if it doesn't exist

# Save Word2Vec model
w2v_model.save(os.path.join(save_path, "word2vec.model"))

# Save tag vectors
np.save(os.path.join(save_path, "tag_vectors.npy"), tag_vectors)

# Save KNN model
with open(os.path.join(save_path, "knn_model.pkl"), "wb") as f:
    pickle.dump(knn, f)

# Save DataFrame structure (for restoring index-movie mapping)
df[['movieId', 'title', 'average_rating', 'rating_count']].to_csv(
    os.path.join(save_path, "movies_metadata.csv"), index=False
)

# Save combined feature matrix
np.save(os.path.join(save_path, "features.npy"), features)


In [20]:
save_path = "../models/"
os.makedirs(save_path, exist_ok=True)  # Create directory if it doesn't exist

In [21]:


# Exporting the model and additional components for production
import pickle

with open(f"{save_path}/knn_model.pkl", 'wb') as f:
    pickle.dump(knn, f)

with open(f"{save_path}/tfidf_model.pkl", 'wb') as f:
    pickle.dump(tfidf, f)

with open(f"{save_path}/tag_vectors.pkl", 'wb') as f:
    pickle.dump(tag_vectors, f)

with open(f"{save_path}/features.pkl", 'wb') as f:
    pickle.dump(features, f)