In [1]:
import numpy as np
import pandas as pd
import ast
import nltk
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load datasets
movies_df = pd.read_csv("datasets/tmdb_5000_movies.csv")
credits_df = pd.read_csv("datasets/tmdb_5000_credits.csv")

In [3]:
movies_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
credits_df.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
# Data cleaning and preparation for movies_df
movies_df_clean = movies_df[[
    "genres", "id", "original_title", "overview",
    "production_companies", "production_countries",
    "release_date", "vote_average", "vote_count"
]]

In [6]:
movies_df_clean = movies_df_clean.dropna()

In [7]:
def extract_names_from_json(json_str):
    try:
        if not pd.isna(json_str):
            names = [entry["name"] for entry in ast.literal_eval(json_str)]
            return ",".join(names)
        return np.nan
    except:
        return np.nan

In [8]:
movies_df_clean["genres"] = movies_df_clean["genres"].apply(extract_names_from_json)
movies_df_clean["production_companies"] = movies_df_clean["production_companies"].apply(extract_names_from_json)
movies_df_clean["production_countries"] = movies_df_clean["production_countries"].apply(extract_names_from_json)

In [9]:
# Data cleaning and preparation for credits_df
def extract_cast_names(json_str):
    names = [entry["name"] for entry in ast.literal_eval(json_str)]
    return ",".join(names)

In [10]:
def extract_crew_names(json_str):
    crew_roles = ["Producer", "Director", "Writer", "Novel", "Screenplay"]
    names = [entry["name"] for entry in ast.literal_eval(json_str) if entry["job"] in crew_roles]
    return ",".join(names)

In [11]:
credits_df["cast_names"] = credits_df["cast"].apply(extract_cast_names)
credits_df["crew_names"] = credits_df["crew"].apply(extract_crew_names)

In [12]:
credits_df_clean = credits_df[["movie_id", "cast_names", "crew_names"]]
credits_df_clean.rename(columns={"movie_id": "id"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  credits_df_clean.rename(columns={"movie_id": "id"}, inplace=True)


In [13]:
# Merge datasets
merged_df = pd.merge(movies_df_clean, credits_df_clean, on='id', how='inner')

In [14]:
# Data preparation for content-based recommendations
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
content_df = merged_df[[
    "genres", "original_title", "overview",
    "production_companies", "production_countries",
    "cast_names", "crew_names"
]]


In [16]:
def tokenize_column(column):
    return str(column).lower().split(",") if not pd.isna(column) else ""

In [17]:
content_df["genres"] = content_df["genres"].apply(tokenize_column)
content_df["overview"] = content_df["overview"].apply(tokenize_column)
content_df["production_companies"] = content_df["production_companies"].apply(tokenize_column)
content_df["production_countries"] = content_df["production_countries"].apply(tokenize_column)
content_df["cast_names"] = content_df["cast_names"].apply(tokenize_column)
content_df["crew_names"] = content_df["crew_names"].apply(tokenize_column)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df["genres"] = content_df["genres"].apply(tokenize_column)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df["overview"] = content_df["overview"].apply(tokenize_column)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df["production_companies"] = content_df["production_compani

In [18]:
def clean_names(name_list):
    return [name.replace(" ", "") for name in name_list]

In [19]:
content_df["cast_names"] = content_df["cast_names"].apply(clean_names)
content_df["crew_names"] = content_df["crew_names"].apply(clean_names)
content_df["production_companies"] = content_df["production_companies"].apply(clean_names)
content_df["production_countries"] = content_df["production_countries"].apply(clean_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df["cast_names"] = content_df["cast_names"].apply(clean_names)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df["crew_names"] = content_df["crew_names"].apply(clean_names)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df["production_companies"] = content_df["production_com

In [20]:
def extract_keywords(text):
    if isinstance(text, str):  # Check if text is a string
        rake = Rake()
        rake.extract_keywords_from_text(text)
        return list(rake.get_word_degrees().keys())
    return []

In [21]:
content_df["overview"] = content_df["overview"].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
content_df["overview"] = content_df["overview"].apply(extract_keywords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df["overview"] = content_df["overview"].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df["overview"] = content_df["overview"].apply(extract_keywords)


In [22]:
content_df.set_index('original_title', inplace=True)

In [23]:
content_df["combined_features"] = content_df.apply(lambda row: [item for sublist in row for item in sublist], axis=1)
content_df["combined_features"] = content_df["combined_features"].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df["combined_features"] = content_df.apply(lambda row: [item for sublist in row for item in sublist], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df["combined_features"] = content_df["combined_features"].apply(lambda x: " ".join(x))


In [24]:
content_df

Unnamed: 0_level_0,genres,overview,production_companies,production_countries,cast_names,crew_names,combined_features
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Avatar,"[action, adventure, fantasy, science fiction]","[22nd, century, paraplegic, marine, dispatched...","[ingeniousfilmpartners, twentiethcenturyfoxfil...","[unitedstatesofamerica, unitedkingdom]","[samworthington, zoesaldana, sigourneyweaver, ...","[jamescameron, jamescameron, jamescameron, jam...",action adventure fantasy science fiction 22nd ...
Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[captain, barbossa, long, believed, dead, come...","[waltdisneypictures, jerrybruckheimerfilms, se...",[unitedstatesofamerica],"[johnnydepp, orlandobloom, keiraknightley, ste...","[goreverbinski, jerrybruckheimer, tedelliott, ...",adventure fantasy action captain barbossa long...
Spectre,"[action, adventure, crime]","[cryptic, message, bond, ’, past, sends, trail...","[columbiapictures, danjaq, b24]","[unitedkingdom, unitedstatesofamerica]","[danielcraig, christophwaltz, léaseydoux, ralp...","[sammendes, johnlogan, barbarabroccoli, robert...",action adventure crime cryptic message bond ’ ...
The Dark Knight Rises,"[action, crime, drama, thriller]","[following, death, district, attorney, harvey,...","[legendarypictures, warnerbros., dcentertainme...",[unitedstatesofamerica],"[christianbale, michaelcaine, garyoldman, anne...","[charlesroven, christophernolan, christopherno...",action crime drama thriller following death di...
John Carter,"[action, adventure, science fiction]","[john, carter, war, weary, former, military, c...",[waltdisneypictures],[unitedstatesofamerica],"[taylorkitsch, lynncollins, samanthamorton, wi...","[andrewstanton, andrewstanton, colinwilson, ji...",action adventure science fiction john carter w...
...,...,...,...,...,...,...,...
El Mariachi,"[action, crime, thriller]","[el, mariachi, wants, play, guitar, carry, fam...",[columbiapictures],"[mexico, unitedstatesofamerica]","[carlosgallardo, jaimedehoyos, petermarquardt,...","[robertrodriguez, robertrodriguez, carlosgalla...",action crime thriller el mariachi wants play g...
Newlyweds,"[comedy, romance]","[newlywed, couple, honeymoon, upended, arrival...",[],[],"[edwardburns, kerrybishé, marshadietlein, cait...","[edwardburns, edwardburns, edwardburns, willia...",comedy romance newlywed couple honeymoon upend...
"Signed, Sealed, Delivered","[comedy, drama, romance, tv movie]","[signed, sealed, delivered, introduces, dedica...","[frontstreetpictures, museentertainmententerpr...",[unitedstatesofamerica],"[ericmabius, kristinbooth, crystallowe, geoffg...","[harveykahn, marthawilliamson, scottsmith]",comedy drama romance tv movie signed sealed de...
Shanghai Calling,[],"[ambitious, new, york, attorney, sam, sent, sh...",[],"[unitedstatesofamerica, china]","[danielhenney, elizacoupe, billpaxton, alanruc...","[danielhsia, danielhsia]",ambitious new york attorney sam sent shanghai...


In [25]:
content_df["combined_features"].iloc[0]

'action adventure fantasy science fiction 22nd century paraplegic marine dispatched moon pandora unique mission becomes torn following orders protecting alien civilization ingeniousfilmpartners twentiethcenturyfoxfilmcorporation duneentertainment lightstormentertainment unitedstatesofamerica unitedkingdom samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez giovanniribisi joeldavidmoore cchpounder wesstudi lazalonso dileeprao mattgerald seananthonymoran jasonwhyte scottlawrence kellykilgour jamespatrickpitt seanpatrickmurphy peterdillon kevindorman kelsonhenderson davidvanhorn jacobtomuri michaelblain-rozgay joncurry lukehawker woodyschultz petermensah soniayee jahnelcurfman ilramchoi kylawarren lisaroumain debrawilson chrismala taylorkibby jodielandau julielamm cullenb.madden josephbradymadden frankietorres austinwilson sarawilson tamicawashington-miller lucybriant nathanmeister gerryblair matthewchamberlain paulyates wraywilson jamesgaylyn melvinlenoclarkiii carvon

In [36]:
# Build content-based recommendation model
features_df = pd.DataFrame(content_df["combined_features"])
vectorizer = CountVectorizer()
feature_matrix = vectorizer.fit_transform(features_df["combined_features"])
cosine_sim_matrix = cosine_similarity(feature_matrix, feature_matrix)

In [37]:
with open('models/cosine_sim.pkl', 'wb') as file:
    pickle.dump(cosine_sim_matrix, file)

In [38]:
title_indices = pd.Series(features_df.index)

In [39]:
def get_content_based_recommendations(title, cosine_sim_matrix=cosine_sim_matrix):
    idx = title_indices[title_indices == title].index[0]
    similarity_scores = pd.Series(cosine_sim_matrix[idx]).sort_values(ascending=False)
    top_10_indices = list(similarity_scores.iloc[1:11].index)
    return [features_df.index[i] for i in top_10_indices]

In [40]:
print(get_content_based_recommendations("The Yards"))

['The Immigrant', "Amidst the Devil's Wings", 'We Own the Night', 'The Way of the Gun', 'Double Jeopardy', 'Romeo Is Bleeding', 'Broken City', 'Two Lovers', 'Kiss of Death', 'Blood Ties']


In [47]:
# Based on production companies
df_production = merged_df[[
    "original_title", "id", "production_companies",
    "vote_average", "vote_count"
]]

In [49]:
def clean_production_names(name_str):
    if pd.isna(name_str):
        return ""
    else:
        return " ".join([name.replace(" ", "") for name in name_str.split(",")])

In [51]:
df_production["production_companies"] = df_production["production_companies"].apply(clean_production_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_production["production_companies"] = df_production["production_companies"].apply(clean_production_names)


In [53]:
# Compute weighted rating
mean_vote = df_production['vote_average'].mean()
vote_threshold = df_production['vote_count'].quantile(0.75)

In [55]:
def compute_weighted_rating(row, vote_threshold=vote_threshold, mean_vote=mean_vote):
    votes = row['vote_count']
    rating = row['vote_average']
    return (votes/(votes + vote_threshold) * rating) + (vote_threshold/(vote_threshold + votes) * mean_vote)

In [57]:
df_production['weighted_rating'] = df_production.apply(compute_weighted_rating, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_production['weighted_rating'] = df_production.apply(compute_weighted_rating, axis=1)


In [59]:
vectorizer_prod = CountVectorizer()
prod_feature_matrix = vectorizer_prod.fit_transform(df_production['production_companies'])
cosine_sim_prod_matrix = cosine_similarity(prod_feature_matrix, prod_feature_matrix)

In [61]:
with open('models/cosine_sim_prod.pkl', 'wb') as file:
    pickle.dump(cosine_sim_prod_matrix, file)

In [63]:
df_weighted_ratings = df_production[["original_title", "id", "weighted_rating"]]

In [65]:
df_weighted_ratings.to_csv("datasets/main_df.csv", index = False)

In [67]:
def get_combined_recommendations(title, cosine_sim_prod_matrix=cosine_sim_prod_matrix, df_production=df_weighted_ratings):
    idx = df_production.index[df_production['original_title'] == title][0]
    sim_scores = list(enumerate(cosine_sim_prod_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:50]  # Get top 50 similar movies

    # Adjust scores based on weighted rating
    movie_indices = [i[0] for i in sim_scores]
    movies = df_production.iloc[movie_indices].copy()
    
    movies = movies.sort_values('weighted_rating', ascending=False)
    movies = movies[movies["original_title"] != title]
    movies = movies[movies["weighted_rating"] > 5.8]
    movies = movies[['original_title', "id", 'weighted_rating']]
    random_movies = df_production.sample(n=10)
    return list(random_movies["original_title"]), list(random_movies["id"])

In [69]:
print(get_combined_recommendations("The Avengers", df_production=df_weighted_ratings))

(['The Thief and the Cobbler', 'Dreamgirls', 'The Pet', 'The Postman', 'Inescapable', '4: Rise of the Silver Surfer', 'Civil Brand', 'A Streetcar Named Desire', 'Last Action Hero', 'Free Style'], [26672, 1125, 52462, 9922, 121676, 1979, 30246, 702, 9593, 39055])
