In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.impute import SimpleImputer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import jaccard_score
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.layers import Input, Embedding, Concatenate, Dense, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from scipy.sparse import csr_matrix
import warnings; warnings.simplefilter('ignore')

In [47]:
md = pd. read_csv('movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [48]:
md.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [49]:
# Treating genre
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [50]:
numerical_columns = md.select_dtypes(include=['number']).columns.tolist()
print(numerical_columns)

['revenue', 'runtime', 'vote_average', 'vote_count']


In [51]:
object_cols = md.select_dtypes(include=['object']).columns.tolist()
print(object_cols)

['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'spoken_languages', 'status', 'tagline', 'title', 'video']


In [52]:
#filling the missing values with unknown
md[object_cols] = md[object_cols].fillna('unknown')

In [53]:
# replacing null vlaues with mean value for numerical columns
imputer = SimpleImputer(strategy='mean')
md[numerical_columns] = imputer.fit_transform(md[numerical_columns])

In [54]:
md.isnull().sum()

adult                    0
belongs_to_collection    0
budget                   0
genres                   0
homepage                 0
id                       0
imdb_id                  0
original_language        0
original_title           0
overview                 0
popularity               0
poster_path              0
production_companies     0
production_countries     0
release_date             0
revenue                  0
runtime                  0
spoken_languages         0
status                   0
tagline                  0
title                    0
video                    0
vote_average             0
vote_count               0
dtype: int64

In [55]:
md['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

## Average vote count

In [56]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244864294197862

In [57]:
m = vote_counts.quantile(0.95)
m

433.75

In [58]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [59]:
# qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified = md[(md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(45466, 6)

In [60]:
def wr(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [61]:
qualified['wr'] = qualified.apply(wr, axis=1)
Top_movies = qualified.sort_values('wr', ascending=False).head(250)

## Moive Analysis

### Top Movies

In [62]:
Top_movies.head(20)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917633
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905923
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897163
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881817
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871856
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.868731
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.864073
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.862001
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.86073
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.852003


In [63]:

qualified['popularity'] = pd.to_numeric(qualified['popularity'], errors='coerce')
qualified['vote_count'] = pd.to_numeric(qualified['vote_count'], errors='coerce')

qualified.dropna(subset=['popularity', 'vote_count'], inplace=True)

# Sort by popularity
top_movies_by_popularity = qualified.sort_values(by='popularity', ascending=False)

# Sort by vote count
top_movies_by_vote_count = qualified.sort_values(by='vote_count', ascending=False)

print("Top movies by popularity:")
print(top_movies_by_popularity[['title', 'popularity']].head())

Top movies by popularity:
                      title  popularity
30700               Minions  547.488298
33356          Wonder Woman  294.337037
42222  Beauty and the Beast  287.253654
43644           Baby Driver  228.032744
24455            Big Hero 6  213.849907


In [64]:
print("\nTop movies by vote count:")
print(top_movies_by_vote_count[['title', 'vote_count']].head())


Top movies by vote count:
                 title  vote_count
15480        Inception       14075
12481  The Dark Knight       12269
14551           Avatar       12114
17818     The Avengers       12000
26564         Deadpool       11444


## Genre Analysis

### Most Frequent Genre

In [65]:
all_genres = [genre for sublist in md['genres'] for genre in sublist]
genre_counts = Counter(all_genres)

# Find the most frequent genre
most_frequent_genre = genre_counts.most_common(1)[0]

print(f"The most frequent genre is '{most_frequent_genre[0]}' with {most_frequent_genre[1]} occurrences.")

The most frequent genre is 'Drama' with 20265 occurrences.


### Most Popular Genre

In [66]:
md['popularity'] = pd.to_numeric(md['popularity'], errors='coerce')
md.dropna(subset=['popularity'], inplace=True)

# Create a new DataFrame to store genres with their corresponding popularity
genre_popularity = []

for i, row in md.iterrows():
    for genre in row['genres']:
        genre_popularity.append({'genre': genre, 'popularity': row['popularity']})

genre_popularity_df = pd.DataFrame(genre_popularity)
genre_popularity_df['popularity'] = pd.to_numeric(genre_popularity_df['popularity'], errors='coerce')
genre_popularity_df.dropna(subset=['popularity'], inplace=True)
genre_popularity_sum = genre_popularity_df.groupby('genre')['popularity'].sum()

# Find the most popular genre
most_popular_genre = genre_popularity_sum.idxmax()
most_popular_genre_popularity = genre_popularity_sum.max()

print(f"The most popular genre is '{most_popular_genre}' with a total popularity of {most_popular_genre_popularity}.")

The most popular genre is 'Drama' with a total popularity of 61025.144194.


In [67]:
qualified=qualified.iloc[:len(qualified)//2]
qualified['genres'] = qualified['genres'].apply(lambda x: ','.join(x))

In [68]:
qualified.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
0,Toy Story,1995,5415,7,21.946943,"Animation,Comedy,Family",6.869837
1,Jumanji,1995,2413,6,17.015539,"Adventure,Fantasy,Family",5.884942
2,Grumpier Old Men,1995,92,6,11.7129,"Romance,Comedy",5.377004
3,Waiting to Exhale,1995,34,6,3.859495,"Comedy,Drama,Romance",5.299754
4,Father of the Bride Part II,1995,173,5,8.387519,Comedy,5.175047


### Content Based Filtering

In [69]:
# TF-IDF Vectorizer for genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix_genres = tfidf.fit_transform(qualified['genres'].fillna(''))

# Compute cosine similarity for genres
cosine_sim_genres = cosine_similarity(tfidf_matrix_genres, tfidf_matrix_genres)

In [70]:
qualified_10 = qualified.head(1000)
# Function to generate recommendations based on cosine similarity
def user_interactions(title, qualified, sim_matrix, top_n=10):
    # Check if title exists in the DataFrame
    if title not in qualified['title'].values:
        return []
    
    # Find the index of the movie
    idx = qualified.index[qualified['title'] == title].tolist()[0]

    # Get the similarity scores for the movie
    sim_scores = sim_matrix[idx]

    # Sort the movies based on the similarity scores
    movie_indices_scores = list(enumerate(sim_scores))
    sorted_movies = sorted(movie_indices_scores, key=lambda x: x[1], reverse=True)

    # Exclude the movie itself
    top_movies = sorted_movies[1:top_n+1]

    # Get the indices of the top similar movies
    movie_indices = [i for i, _ in top_movies]
    
    # Ensure indices are within the bounds of the DataFrame
    movie_indices = [i for i in movie_indices if i < len(qualified)]

    if not movie_indices:
        return []

    # Get the titles of the top similar movies
    similar_movies = qualified.iloc[movie_indices]['title'].tolist()
    
    return similar_movies

# Automatically generate the user_interactions_dict for the first 10 movies
user_interactions_dict = {}
for title in qualified_10['title']:
    user_interactions_dict[title] = user_interactions(title, qualified_10, cosine_sim_genres, top_n=10)

# Convert to DataFrame
user_interactions_df = pd.DataFrame(list(user_interactions_dict.items()), columns=['Movie', 'Recommendations'])


In [71]:

# Function to calculate similarity based on popularity and weighted rating
def calculate_numerical_similarity(df):
    # Normalize popularity and wr
    df['popularity_norm'] = (df['popularity'] - df['popularity'].min()) / (df['popularity'].max() - df['popularity'].min())
    df['wr_norm'] = (df['wr'] - df['wr'].min()) / (df['wr'].max() - df['wr'].min())
    
    # Calculate Pearson correlation coefficient between popularity and wr
    pearson_corr = pearsonr(df['popularity_norm'], df['wr_norm'])[0]
    
    # Adjust to similarity score (1 - correlation to get higher values for higher similarity)
    similarity_score = 1 - abs(pearson_corr)
    return similarity_score

In [72]:
# Compute similarity scores based on genres, popularity, and weighted rating
def compute_similarity_matrix(qualified):
    num_movies = len(qualified['title'])
    sim_matrix = np.zeros((num_movies, num_movies))
    
    # Compute genre similarity
    sim_matrix += cosine_sim_genres
    
    # Compute numerical (popularity and weighted rating) similarity
    numerical_similarity = calculate_numerical_similarity(qualified)
    sim_matrix += numerical_similarity
    
    return sim_matrix

In [73]:
def get_recommendations(title, qualified, sim_matrix, top_n=3):
    # Check if title exists in the dataset
    if title in qualified['title'].values:
        # Get the index of the movie that matches the title
        idx = qualified.index[qualified['title'] == title].tolist()[0]
        
        # Check if idx is within the bounds of sim_matrix
        if idx >= len(sim_matrix) or idx < 0:
            return "Invalid index or index out of bounds for sim_matrix."
        
        # Get the pairwise similarity scores of all movies with that movie
        sim_scores = sim_matrix[idx]
        
        # Enumerate through the similarity scores and their indices
        movie_indices_scores = list(enumerate(sim_scores))
        
        # Sort movies based on the similarity scores in descending order
        movie_indices_scores_sorted = sorted(movie_indices_scores, key=lambda x: x[1], reverse=True)
        
        # Exclude the movie itself from recommendations
        similar_movies = movie_indices_scores_sorted[1:]
        
        # Get top n similar movies
        top_similar_movies = similar_movies[:top_n]
        
        # Get movie indices from the top n similar movies
        movie_indices = [i for i, _ in top_similar_movies]
        
        # Return the top n most similar movie titles
        recommendations = qualified.iloc[movie_indices]['title']
        
        if len(recommendations) == 0:
            return "No recommendations found."
        
        return recommendations.values.tolist()
    
    else:
        print(f"Movie title '{title}' not found in dataset. Recommending top {top_n} movies instead.")
        
        # Get top n movies based on some criteria (e.g., popularity, rating)
        top_movies = qualified.sort_values(by='popularity', ascending=False).head(top_n)['title']
        
        return top_movies.tolist()



In [74]:
similarity_matrix = compute_similarity_matrix(qualified)
print("Recommendations for 'Toy Story':")
print(get_recommendations('Toy Story', qualified, similarity_matrix))

Recommendations for 'Toy Story':
['Oliver & Company', 'A Close Shave', 'The Wrong Trousers']


In [75]:
def create_actual_ratings_dict(qualified, recommendations):
    # Create a dictionary mapping movie titles to their 'wr' values
    wr_dict = qualified.set_index('title')['wr'].to_dict()
    
    # Initialize the actual ratings dictionary
    actual_ratings_dict = {}
    
    for movie, recs in recommendations.items():
        # Create a dictionary of recommended movies with their 'wr' values
        actual_ratings_dict[movie] = {rec: wr_dict.get(rec, 0) for rec in recs if rec in wr_dict}
    
    return actual_ratings_dict

recommendations = {}
titles = qualified['title'].tolist()

for idx, title in enumerate(titles):
    # Get similarity scores for the current movie
    sim_scores = list(enumerate(similarity_matrix[idx]))
    # Sort movies based on similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the top recommendations (excluding the movie itself)
    top_recs = [titles[i] for i, score in sim_scores if titles[i] != title][:5]
    recommendations[title] = top_recs
    


In [76]:
# Generate the actual ratings dictionary
actual_ratings_dict = create_actual_ratings_dict(qualified, recommendations)

In [77]:
import numpy as np

def calculate_accuracy(user_interactions_dict, qualified, sim_matrix, actual_ratings_dict, top_n=10):
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    total_squared_error = 0
    num_ratings = 0

    for movie, actual_recommendations in user_interactions_dict.items():
        # Get recommendations for the movie
        recommended_movies = get_recommendations(movie, qualified, sim_matrix, top_n=top_n)
        
        # Compute true positives, false positives, and false negatives
        actual_set = set(actual_recommendations)
        recommended_set = set(recommended_movies)
        
        true_positives += len(actual_set.intersection(recommended_set))
        false_positives += len(recommended_set - actual_set)
        false_negatives += len(actual_set - recommended_set)
        
        # Compute RMSE
        if movie in actual_ratings_dict:
            predicted_ratings = [qualified.loc[qualified['title'] == rec, 'wr'].values[0] if not qualified.loc[qualified['title'] == rec, 'wr'].empty else 0 for rec in recommended_movies]
            actual_ratings = [actual_ratings_dict[movie].get(rec, 0) for rec in recommended_movies]
            
            if actual_ratings:
                squared_errors = [(p - a) ** 2 for p, a in zip(predicted_ratings, actual_ratings)]
                total_squared_error += sum(squared_errors)
                num_ratings += len(actual_ratings)

    # Calculate precision, recall, F1 score, accuracy, and RMSE
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = true_positives / (true_positives + false_positives + false_negatives) if (true_positives + false_positives + false_negatives) > 0 else 0
    rmse = np.sqrt(total_squared_error / num_ratings) if num_ratings > 0 else 0
    
    return precision, recall, f1_score, accuracy, rmse


precision, recall, f1_score, accuracy, rmse = calculate_accuracy(user_interactions_dict, qualified, similarity_matrix, actual_ratings_dict, top_n=10)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"RMSE: {rmse:.2f}")


Precision: 0.63
Recall: 1.00
F1 Score: 0.78
Accuracy: 0.63
RMSE: 4.28


In [78]:
# clear similarity metrix
del similarity_matrix
del cosine_sim_genres

## Item-Based Collaborative Filtering

In [79]:
qualified = qualified.head(1000)

In [80]:
# Load user ratings data
import pandas as pd
ratings_path = "ratings.csv"
movies_metadata_path = "movies_metadata.csv"

# Load datasets
temp = pd.read_csv(ratings_path)
temp2 = pd.read_csv(movies_metadata_path)

# Select relevant columns
temp2 = temp2[['id', 'original_title']]
temp2['id'] = pd.to_numeric(temp2['id'], errors='coerce')

# Merge datasets
user_ratings_df = temp.merge(temp2, left_on='movieId', right_on='id', how='left')
user_ratings_df = user_ratings_df.iloc[:len(user_ratings_df) // 2]
user_ratings_df.dropna(inplace=True)
user_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,id,original_title
0,1,110,1.0,1425941529,110.0,Trois couleurs : Rouge
1,1,147,4.5,1425942435,147.0,Les Quatre Cents Coups
2,1,858,5.0,1425941523,858.0,Sleepless in Seattle
4,1,1246,5.0,1425941556,1246.0,Rocky Balboa
5,1,1968,4.0,1425942148,1968.0,Fools Rush In


In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, precision_score, recall_score, f1_score, accuracy_score
import numpy as np


user_ratings_df=user_ratings_df.head(1000)
# Split the data into training and test sets
train_df, test_df = train_test_split(user_ratings_df, test_size=0.2, random_state=42)

# Create the user-item matrix for the training set
train_user_item_matrix = train_df.pivot_table(index='userId', columns='movieId', values='rating')
train_user_item_matrix.fillna(0, inplace=True)

# Compute item-item similarity using cosine similarity
item_similarity = cosine_similarity(train_user_item_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=train_user_item_matrix.columns, columns=train_user_item_matrix.columns)

In [82]:
def predict_rating(user_id, movie_id):
    # Check if the user_id exists in the training data
    if user_id not in train_user_item_matrix.index:
        return np.nan
    
    # Check if the movie_id exists in the similarity matrix
    if movie_id not in item_similarity_df.index:
        return np.nan
    
    similar_movies = item_similarity_df[movie_id]
    user_ratings = train_user_item_matrix.loc[user_id]
    
    # Filter out movies the user hasn't rated
    relevant_ratings = user_ratings[user_ratings > 0]
    
    if relevant_ratings.empty:
        return np.nan  # Return NaN if no relevant ratings are available
    
    # Calculate the weighted sum of ratings
    weighted_sum = np.dot(similar_movies.loc[relevant_ratings.index], relevant_ratings)
    sum_of_weights = similar_movies.loc[relevant_ratings.index].sum()
    
    # Return the predicted rating (weighted average)
    return weighted_sum / sum_of_weights if sum_of_weights != 0 else np.nan

# Generate predictions for the test set
test_df['predicted_rating'] = test_df.apply(lambda row: predict_rating(row['userId'], row['movieId']), axis=1)

# Drop rows where prediction couldn't be made (NaN values)
test_df = test_df.dropna(subset=['predicted_rating'])

# Extract actual and predicted ratings
actual_ratings = test_df['rating'].values
predicted_ratings = test_df['predicted_rating'].values

# Calculate RMSE
mse = mean_squared_error(actual_ratings, predicted_ratings)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

# Convert ratings to binary relevance (1 for relevant, 0 for not relevant)
threshold = 3.5
actual_relevant = [1 if rating >= threshold else 0 for rating in actual_ratings]
predicted_relevant = [1 if rating >= threshold else 0 for rating in predicted_ratings]

# Calculate Precision, Recall, F1-Score, and Accuracy
precision = precision_score(actual_relevant, predicted_relevant)
recall = recall_score(actual_relevant, predicted_relevant)
f1 = f1_score(actual_relevant, predicted_relevant)
accuracy = accuracy_score(actual_relevant, predicted_relevant)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'Accuracy: {accuracy}')

RMSE: 1.0960155612124431
Precision: 0.7441860465116279
Recall: 0.9142857142857143
F1-Score: 0.8205128205128205
Accuracy: 0.7052631578947368


In [83]:
# Function to recommend movies for a given user
def recommend_movies(user_id, num_recommendations=10):
    user_ratings = train_user_item_matrix.loc[user_id]
    unrated_movies = user_ratings[user_ratings == 0].index
    
    predicted_ratings = []
    for movie_id in unrated_movies:
        predicted_rating = predict_rating(user_id, movie_id)
        if not np.isnan(predicted_rating):
            predicted_ratings.append((movie_id, predicted_rating))
    
    # Sort movies by predicted rating in descending order
    predicted_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Get top N recommended movies
    recommended_movie_ids = [movie_id for movie_id, _ in predicted_ratings[:num_recommendations]]
    
    # Return the original titles of these movies
    recommended_movies = user_ratings_df[user_ratings_df['movieId'].isin(recommended_movie_ids)]['original_title'].unique()
    return recommended_movies

# Example usage: Recommend movies for a specific userId
user_id_to_recommend = 1  # Replace with your actual userId
recommended_movies = recommend_movies(user_id_to_recommend, num_recommendations=10)

print("\nRecommended movies:")
for movie in recommended_movies:
    print(movie)


Recommended movies:
Dancer in the Dark
Miami Vice
The Lord of the Rings
Le Grand Bleu
The Killing
Das weisse Rauschen
Barton Fink
Festen
Magnolia
Walk the Line


In [84]:
qualified.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr,popularity_norm,wr_norm
0,Toy Story,1995,5415,7,21.946943,"Animation,Comedy,Family",6.869837,0.118587,0.760166
1,Jumanji,1995,2413,6,17.015539,"Adventure,Fantasy,Family",5.884942,0.091941,0.534731
2,Grumpier Old Men,1995,92,6,11.7129,"Romance,Comedy",5.377004,0.063289,0.418467
3,Waiting to Exhale,1995,34,6,3.859495,"Comedy,Drama,Romance",5.299754,0.020854,0.400785
4,Father of the Bride Part II,1995,173,5,8.387519,Comedy,5.175047,0.045321,0.37224


## Matrix Factorization (SVD) + Sentiment Recommandation

In [85]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')
nlp = spacy.load("en_core_web_sm")
overview= pd.read_csv("movies_metadata.csv")
overview.columns
temp = overview[['original_title','overview']]

advanced_matrix = qualified.merge(temp, left_on='title', right_on='original_title', how='left')
# Dropping the 'original_title' column if it's no longer needed
advanced_matrix.drop(columns=['original_title'], inplace=True)
advanced_matrix=advanced_matrix.iloc[:len(advanced_matrix)//2]
advanced_matrix.head()
advanced_matrix['genres'] = advanced_matrix['genres'].fillna('')
# Convert genres into binary features
mlb = MultiLabelBinarizer()
genres_matrix = pd.DataFrame(
    mlb.fit_transform(advanced_matrix['genres'].str.split(',')),
    columns=mlb.classes_,
    index=advanced_matrix.index
)


# Load user ratings data
ratings_path = "ratings.csv"
movies_metadata_path = "movies_metadata.csv"

# Load datasets
temp = pd.read_csv(ratings_path)
temp2 = pd.read_csv(movies_metadata_path)

# Select relevant columns
temp2 = temp2[['id', 'original_title']]
temp2['id'] = pd.to_numeric(temp2['id'], errors='coerce')

# Merge datasets
user_ratings_df = temp.merge(temp2, left_on='movieId', right_on='id', how='left')
user_ratings_df = user_ratings_df.iloc[:len(user_ratings_df) // 2]
user_ratings_df.dropna(inplace=True)
user_ratings_df.head()


# Step 1: Extract the columns
advanced_matrix['title'] = advanced_matrix['title'].str.lower()
user_ratings_df['original_title'] = user_ratings_df['original_title'].str.lower()
col1 = advanced_matrix['title']
col2 = user_ratings_df['original_title']
# Step 2: Convert to sets
set1 = set(col1)
set2 = set(col2)
# Step 3: Find the intersection
common_items = set1.intersection(set2)
print(f"Number of common items {len(common_items)}")

advanced_matrix = advanced_matrix[advanced_matrix['title'].isin(common_items)]
user_ratings_df = user_ratings_df[user_ratings_df['original_title'].isin(common_items)]

# Normalize numerical features
scaler = MinMaxScaler()
numerical_features = ['vote_count', 'vote_average', 'popularity', 'wr']
advanced_matrix[numerical_features] = scaler.fit_transform(advanced_matrix[numerical_features])

# Handle year as a categorical feature
advanced_matrix['year'] = pd.Categorical(advanced_matrix['year'])
year_dummies = pd.get_dummies(advanced_matrix['year'], prefix='year')

# Combine all features into a single user_item_matrix
user_item_matrix = pd.concat([advanced_matrix[numerical_features], genres_matrix, year_dummies], axis=1)
user_item_matrix.fillna(0, inplace=True)

# Perform Truncated SVD
svd = TruncatedSVD(n_components=min(10, user_item_matrix.shape[1]), random_state=42)
predicted_ratings = svd.fit_transform(user_item_matrix)

# Initialize spaCy NLP model
nlp = spacy.load('en_core_web_sm')

# Function to extract keywords from movie overviews
def extract_keywords(text, top_n=3):
    doc = nlp(text)
    keywords = [token.text for token in doc if token.is_alpha and not token.is_stop]
    keywords = pd.Series(keywords).value_counts().head(top_n).index.tolist()
    return keywords

# Function to perform sentiment analysis on overviews
def analyze_sentiment(text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']

# Interpret sentiment score
def interpret_sentiment(score):
    if score >= 0.5:
        return "strongly positive"
    elif score > 0:
        return "positive"
    elif score == 0:
        return "neutral"
    elif score > -0.5:
        return "negative"
    else:
        return "strongly negative"

# Adding keywords and sentiment to the dataset
advanced_matrix['overview'] = advanced_matrix['overview'].fillna('')
advanced_matrix['keywords'] = advanced_matrix['overview'].apply(extract_keywords)
advanced_matrix['sentiment'] = advanced_matrix['overview'].apply(analyze_sentiment)
advanced_matrix = advanced_matrix.set_index('title')

# Compute cosine similarity matrix based on content features
content_similarity = cosine_similarity(user_item_matrix)



# Check for duplicates
duplicates = user_ratings_df[user_ratings_df.duplicated(subset=['userId', 'original_title'], keep=False)]
if not duplicates.empty:
    print("Duplicates found. Handling duplicates by taking the mean rating.")
    # Aggregate duplicates by taking the mean rating
    user_ratings_df = user_ratings_df.groupby(['userId', 'original_title'], as_index=False).mean()

# Create user-item interaction matrix
user_item_interaction = user_ratings_df.pivot(index='userId', columns='original_title', values='rating').fillna(0)

# Convert user-item interaction to a sparse matrix
user_item_sparse = csr_matrix(user_item_interaction.values)

# Calculate cosine similarity between items (movies)
item_similarity = cosine_similarity(user_item_sparse.T)
print("Complete")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/arjun/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Number of common items 133
Duplicates found. Handling duplicates by taking the mean rating.
Complete


In [86]:
advanced_matrix

Unnamed: 0_level_0,year,vote_count,vote_average,popularity,genres,wr,popularity_norm,wr_norm,overview,keywords,sentiment
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
toy story,1995,0.624567,0.875,0.155700,"Animation,Comedy,Family",0.688430,0.118587,0.760166,"Led by Woody, Andy's toys live happily in his ...","[Andy, Buzz, Woody]",0.1280
jumanji,1995,0.278316,0.750,0.120713,"Adventure,Fantasy,Family",0.381226,0.091941,0.534731,When siblings Judy and Peter discover an encha...,"[game, Alan, siblings]",-0.4588
heat,1995,0.217532,0.875,0.127165,"Action,Crime,Drama,Thriller",0.626666,0.096854,0.714842,"Obsessive master thief, Neil McCauley leads a ...","[Obsessive, master, end]",-0.8750
heat,1995,0.217532,0.875,0.127165,"Action,Crime,Drama,Thriller",0.626666,0.096854,0.714842,"Former child star Joe Davis (Joe Dallesandro),...","[Joe, Jessica, meets]",-0.6369
heat,1995,0.217532,0.875,0.127165,"Action,Crime,Drama,Thriller",0.626666,0.096854,0.714842,Reynolds plays an ex-soldier-of-fortunish char...,"[Reynolds, plays, ex]",-0.1280
...,...,...,...,...,...,...,...,...,...,...,...
mr. jones,1993,0.004037,0.625,0.046673,"Drama,Romance",0.175873,0.035553,0.384036,The story about the relationship between a man...,"[story, relationship, manic]",0.1689
mr. jones,1993,0.004037,0.625,0.046673,"Drama,Romance",0.175873,0.035553,0.384036,Scott (Jon Foster) is a filmmaker in need of i...,"[Jones, Penny, Scott]",0.8860
mrs. doubtfire,1993,0.188927,0.875,0.079208,"Comedy,Drama,Family",0.614413,0.060331,0.705850,"Loving but irresponsible dad Daniel Hillard, e...","[Daniel, Loving, ex]",-0.6858
the piano,1993,0.033449,0.875,0.052753,"Drama,Romance",0.400936,0.040183,0.549194,"After a long voyage from Scotland, pianist Ada...","[Ada, Alisdair, long]",-0.5209


In [87]:
user_ratings_df

Unnamed: 0,userId,original_title,movieId,rating,timestamp,id
0,2,four rooms,5.0,3.0,8.670392e+08,5.0
1,2,interview with the vampire,628.0,4.0,8.670393e+08,628.0
2,2,mrs. doubtfire,788.0,1.0,8.670393e+08,788.0
3,3,once were warriors,527.0,4.0,1.048077e+09,527.0
4,4,don juan demarco,1909.0,4.0,1.042675e+09,1909.0
...,...,...,...,...,...,...
218123,135160,what's eating gilbert grape,1587.0,2.5,1.253177e+09,1587.0
218124,135161,"faster, pussycat! kill! kill!",315.0,3.0,8.280009e+08,315.0
218125,135161,jurassic park,329.0,2.0,8.280009e+08,329.0
218126,135163,once were warriors,527.0,4.5,1.457791e+09,527.0


In [88]:
def hybrid_recommendations(movie_title, user_id, top_n=5):
    """
    Recommend movies using a hybrid method (excluding content-based filtering). Defaults to SVD-based recommendations if the user ID is not found.
    """
    # Check if the movie exists in the DataFrame index
    if movie_title not in advanced_matrix.index:
        raise ValueError(f"Movie '{movie_title}' not found in the dataset.")
    
    # Find the index of the movie in the DataFrame
    movie_idx = advanced_matrix.index.get_loc(movie_title)
    
    # Collaborative filtering recommendations
    collaborative_indices = []
    
    if user_id not in user_item_interaction.index:
        print(f"User ID '{user_id}' not found. Defaulting to SVD-based recommendations.")
        collaborative_recommendations = []
    else:
        user_rated_items = user_item_interaction.loc[user_id, :]
        user_rated_items = user_rated_items[user_rated_items > 0]

        if user_rated_items.empty:
            print(f"User ID '{user_id}' has no ratings. Defaulting to SVD-based recommendations.")
        else:
            # Map movie titles to indices
            movie_indices = []
            for title in user_rated_items.index:
                if title in advanced_matrix.index:
                    movie_indices.append(advanced_matrix.index.get_loc(title))
                else:
                    print(f"Warning: Movie '{title}' not found in the advanced_matrix. Skipping it.")

            if len(movie_indices) > 0:
                try:
                    # Check the dimensions of item_similarity
                    print(f"item_similarity shape: {item_similarity.shape}")
                    
                    # Filter indices to ensure they are within bounds
                    valid_indices = [i for i in movie_indices if i < item_similarity.shape[0]]
                    if len(valid_indices) == 0:
                        print("No valid indices for collaborative filtering.")
                        collaborative_recommendations = []
                    else:
                        similarity_scores = item_similarity[np.ix_(valid_indices, range(item_similarity.shape[1]))].sum(axis=0)
                        collaborative_indices = np.argsort(similarity_scores)[::-1]
                        # Handle out-of-bounds indices by filtering valid indices only
                        collaborative_indices = [i for i in collaborative_indices if i < len(advanced_matrix.index)]
                except ValueError as e:
                    print(f"Error in collaborative filtering: {e}")
                    collaborative_recommendations = []

    # SVD-based predicted ratings for the selected movie
    if user_id in user_item_interaction.index:
        user_idx = user_item_interaction.index.get_loc(user_id)
        svd_scores = predicted_ratings[user_idx, :]
    else:
        # Default to average scores from SVD
        svd_scores = np.mean(predicted_ratings, axis=0)
        
    svd_indices = np.argsort(svd_scores)[::-1]
    
    # Handle out-of-bounds indices by filtering valid indices only
    svd_indices = [i for i in svd_indices if i < len(advanced_matrix.index)]
    
    # Combine recommendations from both methods if available
    all_recommendations = np.unique(np.concatenate((collaborative_indices, svd_indices)))

    # Generate explanations for each recommended movie
    recommendations = []
    for idx in all_recommendations[:top_n]:
        if idx == movie_idx:
            continue  # Skip the original movie

        movie = advanced_matrix.iloc[idx]
        similar_movie = advanced_matrix.iloc[movie_idx]
        
        explanation = (
            f"This movie is recommended because you enjoyed '{similar_movie.name}' "
            f"({movie['year']}). The movie's sentiment score is {movie['sentiment']:.2f}, "
            f"indicating a {interpret_sentiment(movie['sentiment'])} sentiment. Additionally, "
            f"other users with similar tastes also liked this movie."
        )
        
        recommendations.append((movie.name, explanation))
    
    return recommendations


In [90]:
# Example user_id and movie_title
user_id = 2  # Replace with the user ID you want to get recommendations for
movie_title = 'casino'  # Replace with the title you want recommendations for

# Call the function and get recommendations
try:
    recommendations = hybrid_recommendations(
        movie_title=movie_title,
        user_id=user_id,
        top_n=10 # Number of recommendations you want to get
    )
    
    # Print recommendations
    for movie, explanation in recommendations:
        print(f"Recommended Movie: {movie}")
        print(f"Explanation: {explanation}")
        print()
        
except ValueError as e:
    print(e)


item_similarity shape: (133, 133)
Recommended Movie: toy story
Explanation: This movie is recommended because you enjoyed 'casino' (1995). The movie's sentiment score is 0.13, indicating a positive sentiment. Additionally, other users with similar tastes also liked this movie.

Recommended Movie: jumanji
Explanation: This movie is recommended because you enjoyed 'casino' (1995). The movie's sentiment score is -0.46, indicating a negative sentiment. Additionally, other users with similar tastes also liked this movie.

Recommended Movie: heat
Explanation: This movie is recommended because you enjoyed 'casino' (1995). The movie's sentiment score is -0.88, indicating a strongly negative sentiment. Additionally, other users with similar tastes also liked this movie.

Recommended Movie: heat
Explanation: This movie is recommended because you enjoyed 'casino' (1995). The movie's sentiment score is -0.64, indicating a strongly negative sentiment. Additionally, other users with similar tastes a