### Import Libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import gzip
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from datetime import datetime
import ast
import warnings
warnings.filterwarnings('ignore')


### Load Cleaned Data

In [2]:
# Read the two pre-cleaned CSV exports (title metadata, then user interaction log).
titles, interactions = (
    pd.read_csv(csv_name)
    for csv_name in ('titles_cleaned.csv', 'interactions_cleaned.csv')
)

## Feature Engineering

### Convert String Representation of Lists Back to Lists

In [3]:
def str_to_list(x):
    """Parse the string form of a list (as written by ``to_csv``) back into a list.

    Parameters
    ----------
    x : object
        Usually a string such as ``"['drama', 'thriller']"``. Values that are
        already lists are returned unchanged, so re-running the conversion
        cell does not wipe out previously parsed columns.

    Returns
    -------
    list
        The parsed list. Tuples/sets are converted to lists, a single string
        literal is wrapped in a one-element list, and anything that cannot be
        parsed (NaN, malformed text, non-sequence literals) becomes
        ``['Unknown']``.
    """
    # Already parsed (e.g. the cell was executed twice): keep the data as is.
    if isinstance(x, list):
        return x
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval raises these for NaN/None, malformed or overly complex input.
        return ['Unknown']
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set, frozenset)):
        return list(parsed)
    if isinstance(parsed, str):
        # A bare string would later be iterated character by character.
        return [parsed]
    return ['Unknown']

# Columns that were serialized as list strings in the CSV and need parsing back.
multivalued_columns = ['GENRE_TMDB', 'DIRECTOR', 'ACTOR', 'PRODUCER']
for column_name in multivalued_columns:
    parsed_column = titles[column_name].map(str_to_list)
    titles[column_name] = parsed_column


In [4]:
# Inspect the frame after the list-valued columns have been converted above.
titles

Unnamed: 0,TITLE_ID,ORIGINAL_TITLE,ORIGINAL_LANGUAGE,RELEASE_DURATION_DAYS,GENRE_TMDB,DIRECTOR,ACTOR,PRODUCER
0,tm1282307,L'ultima notte di Amore,it,484.0,"[drama, thriller]",[Andrea Di Stefano],"[Pierfrancesco Favino, Linda Caridi, Antonio G...","[Benedetto Habib, Daniel Campos Pavoncelli, Fa..."
1,tm1338500,Bird Box Barcelona,es,357.0,"[horror, scifi, thriller]","[David Pastor, Àlex Pastor]","[Mario Casas, Georgina Campbell, Diego Calva, ...","[Adrián Guerra, Chris Morgan, Dylan Clark, Núr..."
2,ts371824,Steeltown Murders,en,417.0,"[crime, drama, history, thriller]",[Marc Evans],"[Scott Arthur, Sion Alun Davies, Keith Allen, ...",[Hannah Thomas]
3,tm123363,Expend4bles,en,294.0,"[action, thriller, war]",[Scott Waugh],"[Jason Statham, Sylvester Stallone, 50 Cent, M...","[Jason Statham, Jeffrey Greenstein, Jonathan Y..."
4,tm1045025,65,en,491.0,"[action, drama, scifi, thriller]","[Bryan Woods, Scott Beck]","[Adam Driver, Ariana Greenblatt, Chloe Coleman...","[Bryan Woods, Deborah Liebling, Sam Raimi, Sco..."
...,...,...,...,...,...,...,...,...
20624,ts21325,Hunter,en,14535.0,"[action, crime, drama, thriller]","[David Soul, Tony Mordente, Gus Trikonis, Jame...","[Fred Dryer, Stepfanie Kramer, Charles Hallaha...","[Frank Lupo, Fred Dryer, George Geiger, Lawren..."
20625,tm1382322,Strange Darling,en,3720.0,"[horror, thriller]",[JT Mollner],"[Willa Fitzgerald, Kyle Gallner, Jason Patric,...","[Bill Block, Giovanni Ribisi, Roy Lee, Steven ..."
20626,ts21242,Mission: Impossible,en,21111.0,"[action, crime, drama, thriller]","[Tom Gries, Leonard J. Horn, Seymour Robbie, H...","[Peter Graves, Greg Morris, Peter Lupus, Bob J...","[Allan Balter, Barry Crane, Joseph Gantman, Ro..."
20627,ts37497,Popeye the Sailor,en,23401.0,"[animation, comedy, family, romance]","[Jack Kinney, Paul Fennell, Bob Bemiller, Tom ...","[Jack Mercer, Mae Questel, Jackson Beck]",[Al Brodax]


### Handle Less Frequent Categories
We'll group less frequent directors, actors, and producers under a single `'other'` label to reduce dimensionality.

In [5]:
def get_top_items(column, min_count, df=None):
    """Return the values of a list-valued column that occur at least ``min_count`` times.

    Parameters
    ----------
    column : str
        Name of a column whose cells are lists of items.
    min_count : int
        Minimum number of occurrences for an item to be kept.
    df : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``titles`` frame,
        which keeps existing two-argument calls working.

    Returns
    -------
    list
        Items whose count is ``>= min_count``, in ``value_counts`` order
        (most frequent first).
    """
    frame = titles if df is None else df
    # Explode only the requested column instead of duplicating every other
    # column of the frame for each list element.
    item_counts = frame[column].explode().value_counts()
    return item_counts[item_counts >= min_count].index.tolist()

# Minimum number of appearances a person needs to keep an individual category.
director_min_count, actor_min_count, producer_min_count = 5, 10, 5

# Names that reach the threshold for each credit type.
top_directors = get_top_items(column='DIRECTOR', min_count=director_min_count)
top_actors = get_top_items(column='ACTOR', min_count=actor_min_count)
top_producers = get_top_items(column='PRODUCER', min_count=producer_min_count)


#### Replace Less Frequent Items

In [6]:
def replace_less_frequent(items, top_items):
    """Map every entry of ``items`` that is not in ``top_items`` to ``'other'``.

    Returns a new list of the same length and order; entries found in
    ``top_items`` are kept unchanged.
    """
    replaced = []
    for name in items:
        if name in top_items:
            replaced.append(name)
        else:
            replaced.append('other')
    return replaced

# Apply the function
# The top-item collections are lists; testing membership against a list is a
# linear scan that would be repeated for every credit of every title. Convert
# each one to a set once so each lookup is a constant-time hash probe.
top_director_set = set(top_directors)
top_actor_set = set(top_actors)
top_producer_set = set(top_producers)

titles['DIRECTOR'] = titles['DIRECTOR'].apply(lambda x: replace_less_frequent(x, top_director_set))
titles['ACTOR'] = titles['ACTOR'].apply(lambda x: replace_less_frequent(x, top_actor_set))
titles['PRODUCER'] = titles['PRODUCER'].apply(lambda x: replace_less_frequent(x, top_producer_set))


### Create Combined Feature ('SOUP')

In [7]:
# Build one text document per title: the title itself followed by the
# space-joined genres, directors, actors and producers.
soup_text = titles['ORIGINAL_TITLE']
for list_text in [titles[list_column].apply(' '.join)
                  for list_column in ('GENRE_TMDB', 'DIRECTOR', 'ACTOR', 'PRODUCER')]:
    soup_text = soup_text + ' ' + list_text
titles['SOUP'] = soup_text

## Content-Based Filtering using Soup Features
### Version 1: Optimized
We will optimize the content-based filtering using the following techniques:

- Limiting vocabulary size
- Using HashingVectorizer (imported above, but not applied in the cells below)
- Applying TruncatedSVD

#### Vectorization with Limited Vocabulary Size

In [8]:
# Keep only the 5000 most frequent terms (English stop words removed).
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
soup_documents = titles['SOUP']
tfidf_matrix = tfidf.fit_transform(soup_documents)
print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")

# Pairwise title-to-title similarity; with a single argument the matrix is
# compared against itself.
cosine_sim_tfidf = cosine_similarity(tfidf_matrix)


TF-IDF Matrix Shape: (20629, 5000)


#### Dimensionality Reduction with TruncatedSVD

In [9]:
# Apply TruncatedSVD to reduce dimensions
# (TruncatedSVD comes from the notebook's import cell; it is not re-imported here.)
svd = TruncatedSVD(n_components=100, random_state=42)
tfidf_matrix_svd = svd.fit_transform(tfidf_matrix)
print(f"Reduced TF-IDF Matrix Shape: {tfidf_matrix_svd.shape}")

# Title-to-title similarity in the reduced 100-dimensional space.
cosine_sim_svd = cosine_similarity(tfidf_matrix_svd, tfidf_matrix_svd)

Reduced TF-IDF Matrix Shape: (20629, 100)


## Approach 2: Separate Features Similarity Search

### Vectorization of Individual Features

In [10]:
# One independent binarizer per list-valued column so each keeps its own classes_.
mlb_genre = MultiLabelBinarizer()
mlb_director = MultiLabelBinarizer()
mlb_actor = MultiLabelBinarizer()
mlb_producer = MultiLabelBinarizer()

# Multi-hot encode genres, directors, actors and producers.
genre_matrix = mlb_genre.fit_transform(titles['GENRE_TMDB'])
director_matrix = mlb_director.fit_transform(titles['DIRECTOR'])
actor_matrix = mlb_actor.fit_transform(titles['ACTOR'])
producer_matrix = mlb_producer.fit_transform(titles['PRODUCER'])

# Bag-of-words counts over the original title text (top 5000 terms).
count_vectorizer = CountVectorizer(max_features=5000, stop_words='english')
title_matrix = count_vectorizer.fit_transform(titles['ORIGINAL_TITLE'])


In [11]:
def reduce_dimensionality(matrix, n_components=10):
    """Project ``matrix`` onto ``n_components`` dimensions with TruncatedSVD.

    A new ``TruncatedSVD`` (``random_state=42``) is fitted on every call and
    the transformed matrix is returned.
    """
    return TruncatedSVD(
        n_components=n_components,
        random_state=42,
    ).fit_transform(matrix)

# Compress each high-cardinality feature matrix to the default 10 components.
# Unpacking the map evaluates the calls in the listed order.
(director_matrix_reduced,
 actor_matrix_reduced,
 producer_matrix_reduced,
 title_matrix_reduced) = map(
    reduce_dimensionality,
    (director_matrix, actor_matrix, producer_matrix, title_matrix),
)


### Compute Similarity Matrices

In [None]:
# Define weights
w_genre = 0.5
w_director = 0.2
w_actor = 0.2
w_producer = 0.1
w_title = 0.1

# Ensure weights sum to 1 (the raw values above do not)
total_weight = w_genre + w_director + w_actor + w_producer + w_title
w_genre /= total_weight
w_director /= total_weight
w_actor /= total_weight
w_producer /= total_weight
w_title /= total_weight

# Compute and combine the per-feature similarity matrices.
# Every cosine_similarity result is a dense (n_titles x n_titles) array, so the
# weighted sum is accumulated one feature at a time: at most two such arrays
# are alive at once instead of five inputs plus the result. The terms are
# added in the order genre, director, actor, producer, title.
weighted_features = (
    (w_genre, genre_matrix),
    (w_director, director_matrix_reduced),
    (w_actor, actor_matrix_reduced),
    (w_producer, producer_matrix_reduced),
    (w_title, title_matrix_reduced),
)

cosine_sim_separate = None
for feature_weight, feature_matrix in weighted_features:
    feature_sim = cosine_similarity(feature_matrix)
    feature_sim *= feature_weight  # scale in place, no extra copy
    if cosine_sim_separate is None:
        cosine_sim_separate = feature_sim
    else:
        cosine_sim_separate += feature_sim
# Drop the extra reference to the last per-feature matrix so it can be freed.
del feature_sim


## Collaborative Filtering
We will implement collaborative filtering using the Surprise library, ensuring a time-wise train-test split to prevent data leakage.

### Prepare Data
Assign Ratings Based on Interaction Types

In [None]:
# Implicit-feedback strength assigned to each interaction type.
interaction_weights = {'likelist_addition': 3, 'seenlist_addition': 2,
                       'watchlist_addition': 1, 'clickout_provider': 0.5}

# Interaction types missing from the mapping become NaN and are dropped.
mapped_ratings = interactions['INTERACTION_TYPE'].map(interaction_weights)
interactions['RATING'] = mapped_ratings
interactions.dropna(subset=['RATING'], inplace=True)
interactions['RATING'] = interactions['RATING'].astype(float)


In [None]:
# Parse timestamps so the rows can be ordered chronologically.
interactions['COLLECTOR_TSTAMP'] = pd.to_datetime(interactions['COLLECTOR_TSTAMP'])
interactions_sorted = interactions.sort_values(by='COLLECTOR_TSTAMP')

# The 80th-percentile timestamp separates training (earlier) from test (later).
event_times = interactions_sorted['COLLECTOR_TSTAMP']
cutoff_date = event_times.quantile(0.8)

train_data = interactions_sorted.loc[event_times <= cutoff_date]
test_data = interactions_sorted.loc[event_times > cutoff_date]


In [None]:
from surprise import Dataset, Reader, SVD, accuracy

# Ratings produced by the interaction mapping range from 0.5 to 3.
reader = Reader(rating_scale=(0.5, 3))
rating_columns = ['BE_ID', 'TITLE_ID', 'RATING']

# Training side: a Surprise trainset built from every training row.
train_dataset = Dataset.load_from_df(train_data[rating_columns], reader)
trainset = train_dataset.build_full_trainset()

# Test side: plain (user, item, rating) tuples.
testset = list(zip(*(test_data[column_name] for column_name in rating_columns)))


In [None]:
# Initialize and train the SVD algorithm
# random_state is pinned so repeated runs start from the same seed.
algo = SVD(random_state=42)
algo.fit(trainset)


In [None]:
# Test the algorithm
# Predict every (user, item, rating) tuple of the later-in-time test split and
# compare the estimates with the mapped ratings.
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
print(f"Test RMSE: {rmse:.4f}")


In [None]:
# Distinct user ids and title ids present in the interaction log
# (order of first appearance).
unique_users = pd.unique(interactions['BE_ID'])
unique_items = pd.unique(interactions['TITLE_ID'])

def get_svd_recommendations(user_id, n=10):
    """Recommend the ``n`` unseen titles with the highest predicted rating.

    Parameters
    ----------
    user_id : object
        Raw user id (a ``BE_ID`` value).
    n : int, default 10
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame
        Columns ``TITLE_ID``, ``ORIGINAL_TITLE`` and ``GENRE_TMDB``, ordered
        from best to worst predicted rating. Recommended ids that have no row
        in ``titles`` are omitted, so fewer than ``n`` rows may be returned.
    """
    # Read the user's history from the trainset the model was actually fitted
    # on. The notebook-level name ``train_data`` is rebound to a different
    # split later in the notebook, so looking it up at call time could exclude
    # the wrong items.
    fitted_trainset = algo.trainset
    try:
        inner_uid = fitted_trainset.to_inner_uid(user_id)
    except ValueError:
        # User was not part of training: nothing to exclude.
        interacted_items = set()
    else:
        interacted_items = {
            fitted_trainset.to_raw_iid(inner_iid)
            for inner_iid, _ in fitted_trainset.ur[inner_uid]
        }

    # Predict ratings for all items not yet interacted with
    predictions = [
        algo.predict(user_id, iid)
        for iid in unique_items
        if iid not in interacted_items
    ]

    # Sort predictions by estimated rating and keep the top N ids
    predictions.sort(key=lambda pred: pred.est, reverse=True)
    top_n_iids = [pred.iid for pred in predictions[:n]]

    # isin() returns rows in the order of ``titles``; restore the ranking so
    # callers that slice the result get the best predictions first.
    rank_of = {iid: rank for rank, iid in enumerate(top_n_iids)}
    recommended_titles = titles[titles['TITLE_ID'].isin(top_n_iids)]
    ranked_positions = (
        recommended_titles['TITLE_ID'].map(rank_of).to_numpy().argsort(kind='stable')
    )
    recommended_titles = recommended_titles.iloc[ranked_positions]
    return recommended_titles[['TITLE_ID', 'ORIGINAL_TITLE', 'GENRE_TMDB']]


In [None]:
# Lookup from TITLE_ID to the row position in ``titles``.
# Series.drop_duplicates() would deduplicate the *values* (row positions, which
# are already unique) and leave repeated TITLE_IDs in the index; a repeated id
# would then make ``indices[title_id]`` return a Series instead of a scalar.
# Deduplicate on the index instead, keeping the first occurrence.
indices = pd.Series(titles.index, index=titles['TITLE_ID'])
indices = indices[~indices.index.duplicated(keep='first')]


## Evaluation Metrics

In [None]:
def precision_at_k(recommended_items, relevant_items, k):
    """Fraction of ``k`` accounted for by relevant items among the first ``k`` recommendations.

    Duplicates inside the first ``k`` recommendations count once, and the
    denominator is always ``k`` even if fewer items were recommended.
    """
    hits = set(recommended_items[:k]) & set(relevant_items)
    return len(hits) / k

def recall_at_k(recommended_items, relevant_items, k):
    """Share of the distinct relevant items found in the first ``k`` recommendations.

    Returns ``0`` when there are no relevant items.
    """
    relevant = set(relevant_items)
    if len(relevant) > 0:
        found = relevant & set(recommended_items[:k])
        return len(found) / len(relevant)
    return 0

def f1_at_k(precision, recall):
    """Harmonic mean of ``precision`` and ``recall``; ``0`` when both are zero."""
    denominator = precision + recall
    if denominator != 0:
        return 2 * (precision * recall) / denominator
    return 0


### Prepare Data for Evaluation

In [None]:
# Consider 'likelist_addition' and 'seenlist_addition' as positive interactions.
# .copy() makes this an independent frame, so the timestamp assignment below
# does not write into a slice of ``interactions``.
positive_interactions = interactions[
    interactions['INTERACTION_TYPE'].isin(['likelist_addition', 'seenlist_addition'])
].copy()

# Create user-item interaction matrix
user_item_matrix = positive_interactions.groupby('BE_ID')['TITLE_ID'].apply(list).reset_index()

# Time-wise train-test split
positive_interactions['COLLECTOR_TSTAMP'] = pd.to_datetime(positive_interactions['COLLECTOR_TSTAMP'])
positive_interactions = positive_interactions.sort_values('COLLECTOR_TSTAMP')

# Reuse the ``cutoff_date`` computed for the collaborative-filtering split
# rather than taking a new quantile over the positive subset. A separate cutoff
# could fall earlier than the one the SVD model was trained with, putting
# interactions the model has already seen into the evaluation "test" set.
train_data = positive_interactions[positive_interactions['COLLECTOR_TSTAMP'] <= cutoff_date]
test_data = positive_interactions[positive_interactions['COLLECTOR_TSTAMP'] > cutoff_date]

# Create dictionaries for train and test interactions (user id -> set of title ids)
train_interactions = train_data.groupby('BE_ID')['TITLE_ID'].apply(set).to_dict()
test_interactions = test_data.groupby('BE_ID')['TITLE_ID'].apply(set).to_dict()


In [None]:
# Lookup from TITLE_ID to the row position in ``titles``.
# Deduplicate on the TITLE_ID index (first occurrence wins).
# Series.drop_duplicates() would only compare the values, i.e. the row
# positions, which are already unique, so repeated ids would survive and
# ``indices[title_id]`` could return a Series instead of a scalar.
indices = pd.Series(titles.index, index=titles['TITLE_ID'])
indices = indices[~indices.index.duplicated(keep='first')]


In [None]:
def get_recommendations(title_id, cosine_sim, n=10):
    """Return the ids of the ``n`` titles most similar to ``title_id``.

    Parameters
    ----------
    title_id : object
        ``TITLE_ID`` of the query title.
    cosine_sim : array-like
        Square similarity matrix whose rows/columns follow the row order of
        ``titles``.
    n : int, default 10
        Number of neighbours to return.

    Returns
    -------
    list
        ``TITLE_ID`` values ordered from most to least similar, never
        containing the query title itself. Empty if ``title_id`` is unknown.
    """
    if title_id not in indices:
        return []
    idx = indices[title_id]
    if not np.isscalar(idx):
        # Repeated TITLE_ID in the lookup: use its first row.
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()
    # Stable descending order, so ties keep their row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query by position rather than assuming it is ranked first:
    # another title with an identical feature vector can tie with it at the top.
    ranked = ranked[ranked != idx][:n]
    return titles.iloc[ranked]['TITLE_ID'].tolist()

def evaluate_algorithm(cosine_sim, algorithm_name, seed_interactions=None):
    """Print Precision@10, Recall@10 and F1@10 for an item-similarity matrix.

    For up to 1000 users with test interactions, recommendations are generated
    from the user's seed titles and compared with the titles the user
    interacted with in the test period.

    Parameters
    ----------
    cosine_sim : array-like
        Title-to-title similarity matrix passed to ``get_recommendations``.
    algorithm_name : str
        Label used in the printed report.
    seed_interactions : dict, optional
        Mapping of user id to the titles used as queries. Defaults to
        ``train_interactions`` so that recommendations are derived from the
        user's history *before* the cutoff and scored against later
        interactions. Pass ``test_interactions`` to reproduce the previous
        behaviour, where test titles served as both query and ground truth.
    """
    k = 10
    if seed_interactions is None:
        seed_interactions = train_interactions

    precision_list = []
    recall_list = []
    f1_list = []

    # Evaluate on a sample of users
    sample_users = list(test_interactions.keys())[:1000]  # Limit for computational purposes
    for user in sample_users:
        relevant_items = test_interactions[user]

        # Sets iterate in hash order, which can differ between kernel sessions;
        # sort the seeds so the evaluation is reproducible.
        seed_titles = sorted(seed_interactions.get(user, ()))
        per_seed_recs = [get_recommendations(title_id, cosine_sim) for title_id in seed_titles]

        # Merge the ranked lists round-robin (best neighbour of every seed
        # first) while dropping duplicates. Converting to a set here would
        # discard the ranking, making the top-k slice arbitrary.
        recommended_items = []
        already_added = set()
        longest = max((len(recs) for recs in per_seed_recs), default=0)
        for rank in range(longest):
            for recs in per_seed_recs:
                if rank < len(recs) and recs[rank] not in already_added:
                    already_added.add(recs[rank])
                    recommended_items.append(recs[rank])

        if recommended_items:
            precision = precision_at_k(recommended_items, relevant_items, k)
            recall = recall_at_k(recommended_items, relevant_items, k)
            f1 = f1_at_k(precision, recall)
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

    # Compute average metrics (NaN when no user could be evaluated)
    avg_precision = np.mean(precision_list) if precision_list else float('nan')
    avg_recall = np.mean(recall_list) if recall_list else float('nan')
    avg_f1 = np.mean(f1_list) if f1_list else float('nan')

    print(f"Evaluation Results for {algorithm_name}:")
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")
    print(f"F1-Score@{k}: {avg_f1:.4f}")
    print("\n")


def evaluate_svd(algorithm_name):
    """Print average Precision@10, Recall@10 and F1@10 for the SVD recommender.

    The first 1000 users of ``test_interactions`` are scored; users for whom
    no recommendation is returned are skipped.
    """
    k = 10
    per_user_scores = []

    # Limit for computational purposes
    for user in list(test_interactions.keys())[:1000]:
        recommended_titles = get_svd_recommendations(user)['TITLE_ID'].tolist()
        if not recommended_titles:
            continue
        relevant_items = test_interactions[user]
        precision = precision_at_k(recommended_titles, relevant_items, k)
        recall = recall_at_k(recommended_titles, relevant_items, k)
        per_user_scores.append((precision, recall, f1_at_k(precision, recall)))

    # Average each metric over the evaluated users
    avg_precision = np.mean([scores[0] for scores in per_user_scores])
    avg_recall = np.mean([scores[1] for scores in per_user_scores])
    avg_f1 = np.mean([scores[2] for scores in per_user_scores])

    print(f"Evaluation Results for {algorithm_name}:")
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")
    print(f"F1-Score@{k}: {avg_f1:.4f}")
    print("\n")


In [None]:
# Content-based variants: soup features (raw TF-IDF, then SVD-reduced) and the
# weighted combination of separate features.
content_based_runs = [
    (cosine_sim_tfidf, "Content-Based (Soup Features-TFIDF)"),
    (cosine_sim_svd, "Content-Based (Soup Features-SVD)"),
    (cosine_sim_separate, "Content-Based (Separate Features)"),
]
for similarity_matrix, run_label in content_based_runs:
    evaluate_algorithm(similarity_matrix, run_label)

# Collaborative filtering
evaluate_svd("Collaborative Filtering (SVD)")


In [None]:
title_id = 'tm107473'

# Content-based neighbours of the title (separate-features similarity)
recs_separate = get_recommendations(title_id, cosine_sim_separate)
print(f"Recommendations for Title ID {title_id} using Content-Based (Separate Features):")
is_recommended = titles['TITLE_ID'].isin(recs_separate)
display(titles.loc[is_recommended, ['TITLE_ID', 'ORIGINAL_TITLE', 'GENRE_TMDB']])

# Collaborative filtering for the user of the first test row
# (replace with a valid user ID if available)
user_id = test_data['BE_ID'].iloc[0]
recs_svd = get_svd_recommendations(user_id)
print(f"Recommendations for User ID {user_id} using Collaborative Filtering:")
display(recs_svd)


In [None]:
title_id = 'tm50355'

# Content-based neighbours of the title (separate-features similarity)
recs_separate = get_recommendations(title_id, cosine_sim_separate)
print(f"Recommendations for Title ID {title_id} using Content-Based (Separate Features):")
is_recommended = titles['TITLE_ID'].isin(recs_separate)
display(titles.loc[is_recommended, ['TITLE_ID', 'ORIGINAL_TITLE', 'GENRE_TMDB']])

# Collaborative filtering for the user of the second test row
# (replace with a valid user ID if available)
user_id = test_data['BE_ID'].iloc[1]
recs_svd = get_svd_recommendations(user_id)
print(f"Recommendations for User ID {user_id} using Collaborative Filtering:")
display(recs_svd)


In [None]:
title_id = 'ts89259'

# Content-based neighbours of the title (separate-features similarity)
recs_separate = get_recommendations(title_id, cosine_sim_separate)
print(f"Recommendations for Title ID {title_id} using Content-Based (Separate Features):")
is_recommended = titles['TITLE_ID'].isin(recs_separate)
display(titles.loc[is_recommended, ['TITLE_ID', 'ORIGINAL_TITLE', 'GENRE_TMDB']])

# Collaborative filtering for the user of the third test row
# (replace with a valid user ID if available)
user_id = test_data['BE_ID'].iloc[2]
recs_svd = get_svd_recommendations(user_id)
print(f"Recommendations for User ID {user_id} using Collaborative Filtering:")
display(recs_svd)
