In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [3]:
# Load the pre-processed movie metadata; index_col=False keeps pandas from using the first column as the index.
df = pd.read_csv("../data/processed/movie_data_processed.csv", index_col= False)
# Replace missing release years with the placeholder string "Unknown".
df["Year"] = df["Year"].fillna("Unknown")
df.head(1)

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,imdbRating,imdbVotes,imdbID,BoxOffice,wr
0,Breaking Bad,Unknown,TV-MA,49 min,"Crime, Drama, Thriller",Unknown,Vince Gilligan,"Bryan Cranston, Aaron Paul, Anna Gunn",A chemistry teacher diagnosed with inoperable ...,"English, Spanish",United States,Won 16 Primetime Emmys. 169 wins & 269 nominat...,https://m.media-amazon.com/images/M/MV5BMzU5ZG...,"[{'Source': 'Internet Movie Database', 'Value'...",9.5,2225876,tt0903747,Unknown,9.465909


In [4]:
# Inspect column dtypes and non-null counts of the movie frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6667 entries, 0 to 6666
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       6667 non-null   object 
 1   Year        6667 non-null   object 
 2   Rated       6667 non-null   object 
 3   Runtime     6667 non-null   object 
 4   Genre       6667 non-null   object 
 5   Director    6667 non-null   object 
 6   Writer      6667 non-null   object 
 7   Actors      6667 non-null   object 
 8   Plot        6667 non-null   object 
 9   Language    6667 non-null   object 
 10  Country     6667 non-null   object 
 11  Awards      6667 non-null   object 
 12  Poster      6667 non-null   object 
 13  Ratings     6667 non-null   object 
 14  imdbRating  6667 non-null   float64
 15  imdbVotes   6667 non-null   int64  
 16  imdbID      6667 non-null   object 
 17  BoxOffice   6667 non-null   object 
 18  wr          6667 non-null   float64
dtypes: float64(2), int64(1), ob

# 1. Simple Recommender

> Đưa ra các đề xuất tổng quát cho mọi người dùng, dựa trên mức độ phổ biến và/hoặc thể loại phim. Ý tưởng cơ bản đằng sau hệ thống này là những bộ phim nổi tiếng hơn và được giới phê bình đánh giá cao hơn sẽ có xác suất được khán giả bình thường thích cao hơn. Một ví dụ có thể là IMDB Top 250

In [5]:
def build_chart(df, genre, percentile=0.85, top_n=250):
    """Rank the movies of one genre by a weighted rating.

    The weighted rating of a movie is

        wr = v / (v + m) * R + m / (v + m) * C

    where ``v`` is the movie's ``imdbVotes``, ``R`` its ``imdbRating``,
    ``m`` the ``percentile`` quantile of the vote counts inside the genre and
    ``C`` the mean rating inside the genre.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Title``, ``Year``, ``Genre``,
        ``imdbVotes`` and ``imdbRating``.
    genre : str
        Pattern searched in ``Genre`` with ``Series.str.contains`` (pandas
        interprets it as a regular expression by default).
    percentile : float, default 0.85
        Quantile of the vote counts used as the qualification threshold ``m``.
    top_n : int, default 250
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Year``, ``Genre``, ``imdbVotes``, ``imdbRating``
        and ``wr``, sorted by ``wr`` descending. Empty (but with all six
        columns) when no row matches ``genre``.
    """
    # Keep only the rows whose Genre contains the requested genre.
    in_genre = df[df['Genre'].str.contains(genre, na=False)]

    # Numeric views of the non-missing votes and ratings.
    vote_counts = in_genre['imdbVotes'].dropna().astype('int')
    vote_averages = in_genre['imdbRating'].dropna().astype('float')

    # Genre-wide mean rating and vote threshold.
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # Rows with enough votes and no missing vote/rating value.
    is_qualified = (
        (in_genre['imdbVotes'] >= m)
        & in_genre['imdbVotes'].notnull()
        & in_genre['imdbRating'].notnull()
    )
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the input (avoids SettingWithCopyWarning).
    qualified = in_genre.loc[
        is_qualified, ['Title', 'Year', 'Genre', 'imdbVotes', 'imdbRating']
    ].copy()

    qualified['imdbVotes'] = qualified['imdbVotes'].astype('int')
    qualified['imdbRating'] = qualified['imdbRating'].astype('float')

    # Vectorised weighted rating; unlike a row-wise apply this also works
    # when `qualified` is empty (e.g. the genre matched nothing).
    votes = qualified['imdbVotes']
    rating = qualified['imdbRating']
    qualified['wr'] = votes / (votes + m) * rating + m / (m + votes) * C

    # Best weighted rating first, truncated to the requested chart size.
    return qualified.sort_values('wr', ascending=False).head(top_n)


In [6]:
# Build the chart for the 'Drama' genre and show its first ten rows.
result = build_chart(df, 'Drama', percentile=0.85)
result.head(10)

Unnamed: 0,Title,Year,Genre,imdbVotes,imdbRating,wr
0,Breaking Bad,Unknown,"Crime, Drama, Thriller",2225876,9.5,9.316451
1,The Shawshank Redemption,1994-01-01,Drama,2945396,9.3,9.171003
4,Game of Thrones,Unknown,"Action, Adventure, Drama",2359163,9.2,9.049111
5,The Godfather,1972-01-01,"Crime, Drama",2048917,9.2,9.028265
3,Chernobyl,2019-01-01,"Drama, History, Thriller",902682,9.3,8.93132
14,The Dark Knight,2008-01-01,"Action, Crime, Drama",2919777,9.0,8.888878
15,The Lord of the Rings: The Return of the King,2003-01-01,"Action, Adventure, Drama",2015572,9.0,8.84346
2,Band of Brothers,2001-01-01,"Drama, History, War",542582,9.4,8.825132
11,Sherlock,Unknown,"Crime, Drama, Mystery",1020664,9.1,8.799367
16,Schindler's List,1993-01-01,"Biography, Drama, History",1474085,9.0,8.792726


In [7]:
# Same chart for the 'Action' genre (rebinds `result`).
result = build_chart(df, 'Action', percentile=0.85)
result.head(10)

Unnamed: 0,Title,Year,Genre,imdbVotes,imdbRating,wr
4,Game of Thrones,Unknown,"Action, Adventure, Drama",2359163,9.2,8.907483
14,The Dark Knight,2008-01-01,"Action, Crime, Drama",2919777,9.0,8.779428
15,The Lord of the Rings: The Return of the King,2003-01-01,"Action, Adventure, Drama",2015572,9.0,8.695464
22,The Lord of the Rings: The Fellowship of the Ring,2001-01-01,"Action, Adventure, Drama",2045291,8.9,8.614186
28,Inception,2010-01-01,"Action, Adventure, Sci-Fi",2592712,8.8,8.579365
32,The Lord of the Rings: The Two Towers,2002-01-01,"Action, Adventure, Drama",1819421,8.8,8.501072
40,The Matrix,1999-01-01,"Action, Sci-Fi",2087394,8.7,8.448508
41,Star Wars: Episode V - The Empire Strikes Back,1980-01-01,"Action, Adventure, Fantasy",1400371,8.7,8.350345
58,Star Wars: Episode IV - A New Hope,1977-01-01,"Action, Adventure, Fantasy",1474225,8.6,8.284043
13,Attack on Titan,Unknown,"Animation, Action, Adventure",551083,9.1,8.266332


# 2. Content based

In [8]:
# Use a TF-IDF vectorizer to turn the Plot column into a feature matrix, ignoring English stop words.
tf = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tf.fit_transform(df['Plot'])

In [9]:
# Shape of the TF-IDF matrix produced by fit_transform on df['Plot'].
tfidf_matrix.shape

(6667, 16321)

In [11]:
# Build a Series used to look up a movie's row index from its Title.
# NOTE(review): nothing here enforces unique titles; a duplicated Title would make
# `indices[title]` return several values instead of one — verify against the data.
indices = pd.Series(df.index, index=df['Title'])
indices

Title
Breaking Bad                   0
The Shawshank Redemption       1
Band of Brothers               2
Chernobyl                      3
Game of Thrones                4
                            ... 
Suspect Zero                6662
The Reef                    6663
Silent Night                6664
Absentia                    6665
Life of Crime               6666
Length: 6667, dtype: int64

In [54]:
def get_content_based_recommendations(movie_title, similarity_scores, top_n=10, data=None):
    """Return the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Title to look up in the ``Title`` column.
    similarity_scores : 2-D array-like
        Square similarity matrix whose row/column order matches the row order
        of the movie frame.
    top_n : int, default 10
        Number of neighbours to return.
    data : pandas.DataFrame, optional
        Movie frame to use. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        ``Title``, ``imdbID``, ``Year``, ``Genre``, ``imdbVotes``,
        ``imdbRating`` and ``wr`` of the ``top_n`` most similar movies,
        sorted by ``wr`` descending.

    Raises
    ------
    KeyError
        If ``movie_title`` is not present in the ``Title`` column.
    """
    frame = df if data is None else data

    # Resolve the title to a row *position*. When several rows share the
    # title the first one is used, so the lookup always yields one scalar.
    matches = np.flatnonzero((frame['Title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise KeyError(movie_title)
    movie_idx = int(matches[0])

    # Similarity of every movie to the selected one.
    row_scores = np.asarray(similarity_scores[movie_idx], dtype=float).ravel()

    # Highest similarity first; a stable sort keeps ties in row order.
    ranked = np.argsort(-row_scores, kind='stable')

    # Drop the query movie explicitly: it is not guaranteed to rank first
    # when another movie has an identical feature vector.
    neighbours = ranked[ranked != movie_idx][:top_n]

    columns = ['Title', 'imdbID', 'Year', 'Genre', 'imdbVotes', 'imdbRating', 'wr']
    return frame.iloc[neighbours][columns].sort_values(by='wr', ascending=False)

In [14]:
# Compute the pairwise similarity between every pair of movies, based on their text features (here the Plot represented as TF-IDF vectors).
similarity = cosine_similarity(tfidf_matrix)

In [56]:
# Example: content-based neighbours of 'The Dark Knight' using the TF-IDF plot similarity.
get_content_based_recommendations('The Dark Knight', similarity)

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
213,Batman Begins,tt0372784,2005-01-01,"Action, Crime, Drama",1601743,8.2,8.169103
221,Kill Bill: Vol. 1,tt0266697,2003-01-01,"Action, Crime, Thriller",1214597,8.2,8.159417
565,The Batman,tt1877830,2022-01-01,"Action, Crime, Drama",828211,7.8,7.750532
681,The Penguin,tt15435876,2024-01-01,"Crime, Drama, Fantasy",33845,8.8,7.649955
698,Gotham,tt3749900,Unknown,"Action, Crime, Drama",243328,7.8,7.640803
1124,Batman,tt0096895,1989-01-01,"Action, Adventure",412616,7.5,7.41715
2194,Batman Returns,tt0103776,1992-01-01,"Action, Crime, Fantasy",332740,7.1,7.021463
5220,Batman: The Killing Joke,tt4853102,2016-01-01,"Animation, Action, Crime",62497,6.4,6.237243
6337,Becky,tt10314450,2020-01-01,"Action, Crime, Drama",25862,6.0,5.884377
6414,Domino,tt0421054,2005-01-01,"Action, Biography, Crime",69894,5.9,5.863342


# BEGIN

In [17]:
# Load the pre-processed user tags.
tags_df = pd.read_csv('../data/processed/tags_processed.csv')

In [18]:
# Inspect column dtypes and non-null counts of the tag table.
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 713408 entries, 0 to 713407
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   userId   713408 non-null  int64 
 1   movieId  713408 non-null  int64 
 2   tag      713408 non-null  object
 3   imdbId   713408 non-null  object
dtypes: int64(2), object(2)
memory usage: 21.8+ MB


In [19]:
# Collect all tags that share an imdbId into one list per movie.
tags_grouped = tags_df.groupby('imdbId')['tag'].apply(list).reset_index()
# The raw tag table is not used after grouping; delete it.
del tags_df
# Rename the key to 'imdbID' so it matches the column name the merge on the movie frame uses.
tags_grouped.rename(columns={"imdbId": "imdbID"}, inplace=True)
tags_grouped

Unnamed: 0,imdbID,tag
0,tt0000417,"[classic, experimental, sci-fi, black and whit..."
1,tt0000439,"[less than 300 ratings, not available from Net..."
2,tt0010323,"[german expressionism, psychology, serial kill..."
3,tt0012349,"[Tumey's DVDs, charity, orphan, poverty, silen..."
4,tt0013442,"[Tumey's DVDs, vampire, vampires, gothic, F.W...."
...,...,...
5599,tt9806192,"[animation, beautiful, cinematography, dreamli..."
5600,tt9812474,"[grief, atmospheric, fantasy, A24, atmospheric..."
5601,tt9845564,"[convoy, rapist, revenge, trapped inside, winter]"
5602,tt9873892,"[cheesy, cliche, Black, Blaxploitation, satire]"


In [20]:
# Left-join the tag lists onto the movie frame; movies without a match get NaN in 'tag'.
df = df.merge(tags_grouped, on='imdbID', how='left')
df.head(1)

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,imdbRating,imdbVotes,imdbID,BoxOffice,wr,tag
0,Breaking Bad,Unknown,TV-MA,49 min,"Crime, Drama, Thriller",Unknown,Vince Gilligan,"Bryan Cranston, Aaron Paul, Anna Gunn",A chemistry teacher diagnosed with inoperable ...,"English, Spanish",United States,Won 16 Primetime Emmys. 169 wins & 269 nominat...,https://m.media-amazon.com/images/M/MV5BMzU5ZG...,"[{'Source': 'Internet Movie Database', 'Value'...",9.5,2225876,tt0903747,Unknown,9.465909,


In [21]:
# Flatten the per-movie tag lists into one long Series (one entry per movie/tag pair,
# indexed by the movie's row label). Series.explode does this directly instead of
# materialising a wide DataFrame with one column per tag position; movies without
# tags (NaN) are dropped, as .stack() did.
tag_counts = df['tag'].explode().dropna()
tag_counts.name = 'tag'

In [22]:
# Replace the flat tag Series with the number of occurrences of each tag, then show the five most frequent.
tag_counts = tag_counts.value_counts()
tag_counts[:5]

tag
sci-fi         4623
atmospheric    4171
action         3805
funny          3318
comedy         3208
Name: count, dtype: int64

In [23]:
# Keep only the tags that occur more than once.
tag_counts = tag_counts[tag_counts > 1]

In [24]:
def filter_tags(x):
    """Return the items of ``x`` for which ``item in tag_counts`` holds, in their original order."""
    return [candidate for candidate in x if candidate in tag_counts]

In [25]:
# Movies that received no tags have NaN in 'tag'; replace it with an empty string.
df["tag"] = df["tag"].fillna('')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6667 entries, 0 to 6666
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       6667 non-null   object 
 1   Year        6667 non-null   object 
 2   Rated       6667 non-null   object 
 3   Runtime     6667 non-null   object 
 4   Genre       6667 non-null   object 
 5   Director    6667 non-null   object 
 6   Writer      6667 non-null   object 
 7   Actors      6667 non-null   object 
 8   Plot        6667 non-null   object 
 9   Language    6667 non-null   object 
 10  Country     6667 non-null   object 
 11  Awards      6667 non-null   object 
 12  Poster      6667 non-null   object 
 13  Ratings     6667 non-null   object 
 14  imdbRating  6667 non-null   float64
 15  imdbVotes   6667 non-null   int64  
 16  imdbID      6667 non-null   object 
 17  BoxOffice   6667 non-null   object 
 18  wr          6667 non-null   float64
 19  tag         6667 non-null  

In [31]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# === Step 1: Load and preprocess data ===
# Clean the columns that will be encoded (lists are joined into one string, everything else is cast to str).
df['Director'] = df['Director'].fillna('').astype(str)
df['Actors'] = df['Actors'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))
df['Genre'] = df['Genre'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))
df['tag'] = df['tag'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))
df['Plot'] = df['Plot'].fillna('')

# === Step 2: Build the weighted feature strings ===
# The director string is repeated three times; the other features are used as-is.
df['Director_str'] = df['Director'].apply(lambda x: (x + ' ') * 3)
df['Actors_str'] = df['Actors']
df['Genre_str'] = df['Genre']
df['Tag_str'] = df['tag']
df['Plot_str'] = df['Plot']

# === Step 3: Encode each feature with Sentence-BERT ===
print("Encoding with Sentence-BERT... this may take a while")
model = SentenceTransformer('all-MiniLM-L6-v2')

dir_vecs = model.encode(df['Director_str'].tolist(), show_progress_bar=True)
act_vecs = model.encode(df['Actors_str'].tolist(), show_progress_bar=True)
gen_vecs = model.encode(df['Genre_str'].tolist(), show_progress_bar=True)
tag_vecs = model.encode(df['Tag_str'].tolist(), show_progress_bar=True)
plot_vecs = model.encode(df['Plot_str'].tolist(), show_progress_bar=True)

# === Step 4: Combine the vectors into a single vector per movie ===
# Weighted sum of the five embeddings (the weights add up to 1.0).
combined_vecs = (
    0.3 * dir_vecs +
    0.2 * act_vecs +
    0.2 * gen_vecs +
    0.1 * tag_vecs +
    0.2 * plot_vecs
)

# === Step 5: Compute cosine similarity ===
similarity_matrix = cosine_similarity(combined_vecs)

# === Step 6: Movie recommendation function ===
def get_combined_recommendations(title, similarity_matrix, df, top_n=10):
    """Return the ``top_n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``df['Title']``; the first matching row is used.
    similarity_matrix : 2-D array-like
        Square similarity matrix whose row/column order matches the row order
        of ``df``.
    df : pandas.DataFrame
        Movie frame containing ``Title``, ``imdbID``, ``Year``, ``Genre``,
        ``imdbVotes``, ``imdbRating`` and ``wr``.
    top_n : int, default 10
        Number of neighbours to return.

    Returns
    -------
    pandas.DataFrame
        The selected rows (seven columns listed above), sorted by ``wr``
        descending.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``df['Title']``.
    """
    # Work with row positions, not index labels: the similarity matrix and
    # .iloc are positional, so labels only coincide for a default RangeIndex.
    positions = np.flatnonzero((df['Title'] == title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"title {title!r} not found in df['Title']")
    idx = int(positions[0])

    sim_row = np.asarray(similarity_matrix[idx], dtype=float).ravel()

    # Highest similarity first; a stable sort keeps ties in row order.
    ranked = np.argsort(-sim_row, kind='stable')

    # Remove the query movie by position rather than assuming it ranks first.
    movie_indices = ranked[ranked != idx][:top_n]

    columns = ['Title', 'imdbID', 'Year', 'Genre', 'imdbVotes', 'imdbRating', 'wr']
    return df.iloc[movie_indices][columns].sort_values(by='wr', ascending=False)

Encoding with Sentence-BERT... this may take a while


Batches: 100%|██████████████████████████████████████████████████████████| 209/209 [00:01<00:00, 113.82it/s]
Batches: 100%|██████████████████████████████████████████████████████████| 209/209 [00:01<00:00, 124.95it/s]
Batches: 100%|██████████████████████████████████████████████████████████| 209/209 [00:01<00:00, 173.02it/s]
Batches: 100%|███████████████████████████████████████████████████████████| 209/209 [00:10<00:00, 20.54it/s]
Batches: 100%|███████████████████████████████████████████████████████████| 209/209 [00:02<00:00, 70.03it/s]


In [32]:
# Example: ten neighbours of "Iron Man" using the combined Sentence-BERT similarity.
get_combined_recommendations("Iron Man", similarity_matrix, df, top_n=10)

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
615,Predator,tt0093773,1987-01-01,"Action, Adventure, Horror",464312,7.8,7.713389
1257,Spider-Man: Homecoming,tt2250912,2017-01-01,"Action, Adventure, Sci-Fi",735722,7.4,7.355247
1729,Chef,tt2883512,2014-01-01,"Adventure, Comedy, Drama",236529,7.3,7.176203
2036,Iron Man 3,tt1300854,2013-01-01,"Action, Adventure, Sci-Fi",911694,7.1,7.070248
2210,Elf,tt0319343,2003-01-01,"Adventure, Comedy, Family",313983,7.1,7.017057
2718,Iron Man 2,tt1228705,2010-01-01,"Action, Sci-Fi",882110,6.9,6.873783
4514,Last Action Hero,tt0107362,1993-01-01,"Action, Adventure, Comedy",165599,6.5,6.41654
5302,Zathura: A Space Adventure,tt0406375,2005-01-01,"Action, Adventure, Comedy",117385,6.3,6.216878
5847,The Sorcerer's Apprentice,tt0963966,2010-01-01,"Action, Adventure, Family",171558,6.1,6.061552
6400,Conan the Destroyer,tt0087078,1984-01-01,"Action, Adventure, Fantasy",86492,5.9,5.869037


# END

In [66]:
# Columns displayed when inspecting the metadata used for the recommender.
reco_features = ['Title', 'Director', 'Actors', 'tag', 'Genre']

In [67]:
def cleanUpData(data):
    """Lower-case a value and strip its spaces.

    A string is returned lower-cased with every space removed; a list gets
    the same treatment applied to each element; any other input (e.g. a
    missing value) yields an empty string.
    """
    def _normalize(text):
        return str.lower(text.replace(" ", ""))

    if isinstance(data, str):
        return _normalize(data)
    if isinstance(data, list):
        return list(map(_normalize, data))
    # Neither a list nor a string: treat as missing.
    return ''

In [68]:
# Apply data cleanup to reco features
modified_features = ['Director', 'Actors', 'tag', 'Genre']

# Each column is overwritten in place with its cleaned (lower-cased, space-free) version.
for feature in modified_features:
    df[feature] = df[feature].apply(cleanUpData)
    
df[reco_features].head(5)

Unnamed: 0,Title,Director,Actors,tag,Genre
0,Breaking Bad,unknown,"bryancranston,aaronpaul,annagunn",,"crime,drama,thriller"
1,The Shawshank Redemption,frankdarabont,"timrobbins,morganfreeman,bobgunton","[basedonabook, morganfreeman, twistending, fri...",drama
2,Band of Brothers,unknown,"scottgrimes,damianlewis,ronlivingston","[accurate, gritty, war, worldwarii, notamovie,...","drama,history,war"
3,Chernobyl,unknown,"jessiebuckley,jaredharris,stellanskarsgård",,"drama,history,thriller"
4,Game of Thrones,unknown,"emiliaclarke,peterdinklage,kitharington",,"action,adventure,drama"


In [69]:
# Convert the comma-separated string columns into lists (non-string values become an empty list).
df['Director'] = df['Director'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df['Actors'] = df['Actors'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df['Genre'] = df['Genre'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df[reco_features].head(1)

Unnamed: 0,Title,Director,Actors,tag,Genre
0,Breaking Bad,[unknown],"[bryancranston, aaronpaul, annagunn]",,"[crime, drama, thriller]"


In [70]:
def createSoup(data):
    """Join a movie's director, actors, tags and genres into one string.

    Parameters
    ----------
    data : mapping (e.g. a DataFrame row)
        Must provide ``'Director'``, ``'Actors'`` and ``'Genre'`` as
        iterables of strings and ``'tag'`` as either an iterable of strings
        or a plain string.

    Returns
    -------
    str
        ``"<directors> <actors> <tags> <genres>"`` with each group joined by
        single spaces. Duplicate tags are removed while keeping their
        first-seen order, so the result is deterministic.
    """
    raw_tags = data['tag']
    if isinstance(raw_tags, str):
        # A string tag (e.g. the '' placeholder for "no tags") must be
        # treated as whitespace-separated words, not iterated character by
        # character.
        tags = raw_tags.split()
    else:
        try:
            tags = list(raw_tags)
        except TypeError:
            # Non-iterable value such as NaN/None: no tags.
            tags = []

    director = ' '.join(data['Director'])
    actors = ' '.join(data['Actors'])
    # dict.fromkeys de-duplicates like set() but preserves order.
    tag = ' '.join(dict.fromkeys(tags))
    genre = ' '.join(data['Genre'])

    # Concatenate everything into a single string.
    return f"{director} {actors} {tag} {genre}"

# Example: soup string of the first row.
createSoup(df.iloc[0,:])

'unknown bryancranston aaronpaul annagunn  crime drama thriller'

In [71]:
# Create a new feature Soup with mixed data (one string per movie, built row by row with createSoup)
df['soup'] = df.apply(createSoup, axis=1)

# Extend the display columns with the new 'soup' feature.
reco_features = ['Title', 'Director', 'Actors', 'tag', 'Genre', 'soup']
df[reco_features].head(5)

Unnamed: 0,Title,Director,Actors,tag,Genre,soup
0,Breaking Bad,[unknown],"[bryancranston, aaronpaul, annagunn]",,"[crime, drama, thriller]",unknown bryancranston aaronpaul annagunn crim...
1,The Shawshank Redemption,[frankdarabont],"[timrobbins, morganfreeman, bobgunton]","[basedonabook, morganfreeman, twistending, fri...",[drama],frankdarabont timrobbins morganfreeman bobgunt...
2,Band of Brothers,[unknown],"[scottgrimes, damianlewis, ronlivingston]","[accurate, gritty, war, worldwarii, notamovie,...","[drama, history, war]",unknown scottgrimes damianlewis ronlivingston ...
3,Chernobyl,[unknown],"[jessiebuckley, jaredharris, stellanskarsgård]",,"[drama, history, thriller]",unknown jessiebuckley jaredharris stellanskars...
4,Game of Thrones,[unknown],"[emiliaclarke, peterdinklage, kitharington]",,"[action, adventure, drama]",unknown emiliaclarke peterdinklage kitharingto...


In [72]:
# Define a CountVectorizer Object
from sklearn.feature_extraction.text import CountVectorizer
cntVec = CountVectorizer(stop_words='english')

# Remove NaN from soup with empty strings
df['soup'] = df['soup'].fillna('')

# Construct CountVectorizer matrix by fitting and transforming the data
cntVec_matrix = cntVec.fit_transform(df['soup'])

print("Shape of CountVectorizer matrix =", cntVec_matrix.shape)

# Most frequently occurring tokens
words = cntVec.get_feature_names_out()
# Summing a sparse matrix over axis 0 yields a 1 x n_features matrix; flatten it
# so every token maps to a plain integer instead of a one-element list.
counts = np.asarray(cntVec_matrix.sum(axis=0)).ravel().tolist()
# The vectorizer was fitted on the metadata soup, not on the plot text.
print("Most frequently occurring tokens in the soup:")
word_count = dict(sorted(zip(words, counts), key=lambda x : x[1], reverse=True)[:20])
print(word_count)

Shape of CountVectorizer matrix = (6667, 73883)
Most frequently occuring words in plot overview:
{'drama': [4422], 'comedy': [3018], 'action': [2355], 'crime': [1790], 'romance': [1580], 'adventure': [1559], 'thriller': [1354], 'nudity': [1132], 'mystery': [996], 'horror': [906], 'fi': [881], 'unknown': [879], 'murder': [863], 'sci': [863], 'violence': [805], 'fantasy': [763], 'animation': [712], 'funny': [708], 'biography': [691], 'basedonabook': [684]}


In [73]:
# Find recommendations based on Cosine Similarity
# Note: this rebinds `similarity` (earlier computed from the TF-IDF plot matrix) to the
# similarity of the CountVectorizer soup matrix; later cells that read `similarity` see this one.
similarity = cosine_similarity(cntVec_matrix)
get_content_based_recommendations('Spectre', similarity)

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
570,Skyfall,tt1074638,2012-01-01,"[action, adventure, thriller]",739511,7.8,7.744757
759,Mission: Impossible - Fallout,tt4912910,2018-01-01,"[action, adventure, thriller]",385191,7.7,7.60151
1340,Mission: Impossible - Rogue Nation,tt2381249,2015-01-01,"[action, adventure, thriller]",416465,7.4,7.322539
1563,No Time to Die,tt2382320,2021-01-01,"[action, adventure, thriller]",453414,7.3,7.232876
2802,Extraction II,tt12263384,2023-01-01,"[action, crime, thriller]",155496,7.0,6.853877
4017,Jason Bourne,tt4196776,2016-01-01,"[action, thriller]",245974,6.6,6.534086
4750,Plane,tt5884796,2023-01-01,"[action, adventure, thriller]",84481,6.5,6.351961
5305,War,tt7430722,2019-01-01,"[action, adventure, thriller]",34154,6.5,6.215271
5907,Copshop,tt5748448,2021-01-01,"[action, thriller]",40435,6.2,6.045134
6367,Power Rangers,tt3717490,2017-01-01,"[action, adventure, fantasy]",115015,5.9,5.875561


# 3. Item-based Collaborative Filtering

- Based on users' ratings, we build a user–movie interaction matrix and use it to compute the correlation between movies;  
 we then keep the 10 movies with the highest correlation coefficient.

In [74]:
# Load the pre-processed user ratings.
df_ratings = pd.read_csv('../data/processed/ratings_processed.csv')
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21201140 entries, 0 to 21201139
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   userId  int64  
 1   rating  float64
 2   imdbID  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 485.3+ MB


In [75]:
# Attach the movie Title to every rating; the inner join drops ratings whose imdbID is not in df.
df_ratings = df_ratings.merge(df[['imdbID', 'Title']], on= 'imdbID', how= 'inner')

df_ratings.head()

Unnamed: 0,userId,rating,imdbID,Title
0,1,8.0,tt0114388,Sense and Sensibility
1,3,10.0,tt0114388,Sense and Sensibility
2,15,9.0,tt0114388,Sense and Sensibility
3,28,8.0,tt0114388,Sense and Sensibility
4,29,8.0,tt0114388,Sense and Sensibility


In [76]:
# Create User-Item interaction matrix (rows: userId, columns: Title, values: rating)
# pivot_table aggregates with the mean by default, so several ratings that fall on the
# same (userId, Title) cell are averaged.
# NOTE(review): movies sharing a Title would therefore be merged into one column — verify.
matrix = df_ratings.pivot_table(index='userId', columns='Title', values='rating')

# Free memory
del df_ratings

matrix.head()

Title,10 Cloverfield Lane,10 Things I Hate About You,12 Angry Men,12 Monkeys,12 Years a Slave,127 Hours,13 Going on 30,1408,1917,2 Fast 2 Furious,...,You've Got Mail,Young Frankenstein,Young Guns,Zero Dark Thirty,Zodiac,Zombieland,Zoolander,Zootopia,eXistenZ,xXx
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,10.0,10.0,,,,,,,...,2.0,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,4.0,
5,,,,,,,,,,,...,,,,,,,,,,


In [77]:
def get_collaborative_filtering_recommendations(movie, top_n=10, ratings_matrix=None, data=None):
    """Return the movies whose user ratings correlate best with ``movie``.

    Parameters
    ----------
    movie : str
        Column label (movie title) in the user-item matrix.
    top_n : int, default 10
        Number of correlated titles to select.
    ratings_matrix : pandas.DataFrame, optional
        User-item matrix (rows: users, columns: titles). Defaults to the
        notebook-level ``matrix``.
    data : pandas.DataFrame, optional
        Movie frame used to build the result. Defaults to the notebook-level
        ``df``.

    Returns
    -------
    pandas.DataFrame
        ``Title``, ``imdbID``, ``Year``, ``Genre``, ``imdbVotes``,
        ``imdbRating`` and ``wr`` of every row whose ``Title`` is among the
        selected titles, sorted by ``wr`` descending.

    Raises
    ------
    KeyError
        If ``movie`` is not a column of the user-item matrix.
    """
    ratings = matrix if ratings_matrix is None else ratings_matrix
    frame = df if data is None else data

    # Ratings every user gave to the query movie.
    movie_user_rating = ratings[movie]

    # Correlation of each movie's rating column with the query movie's.
    correlations = ratings.corrwith(movie_user_rating)

    # Remove the query movie by label (on ties it is not necessarily ranked
    # first) and movies with an undefined correlation.
    correlations = correlations.drop(labels=movie, errors='ignore').dropna()

    top_titles = correlations.sort_values(ascending=False, kind='stable').head(top_n).index

    columns = ['Title', 'imdbID', 'Year', 'Genre', 'imdbVotes', 'imdbRating', 'wr']
    return frame[frame['Title'].isin(top_titles)][columns].sort_values(by='wr', ascending=False)

In [78]:
# Example: item-based collaborative-filtering neighbours of 'Spectre'.
get_collaborative_filtering_recommendations('Spectre')

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
391,Casino Royale,tt0381061,2006-01-01,"[action, adventure, thriller]",702997,8.0,7.936341
570,Skyfall,tt1074638,2012-01-01,"[action, adventure, thriller]",739511,7.8,7.744757
1340,Mission: Impossible - Rogue Nation,tt2381249,2015-01-01,"[action, adventure, thriller]",416465,7.4,7.322539
2849,Mission: Impossible III,tt0317919,2006-01-01,"[action, adventure, thriller]",395751,6.9,6.843139
3906,Live and Let Die,tt0070328,1973-01-01,"[action, adventure, thriller]",116799,6.7,6.557166
4299,Quantum of Solace,tt0830515,2008-01-01,"[action, adventure, mystery]",475392,6.5,6.468695
4451,Tomorrow Never Dies,tt0120347,1997-01-01,"[action, adventure, thriller]",206514,6.5,6.431592
4622,Diamonds Are Forever,tt0066995,1971-01-01,"[action, adventure, thriller]",115496,6.5,6.385759
4791,The World Is Not Enough,tt0143145,1999-01-01,"[action, adventure, thriller]",211725,6.4,6.341898
5802,Die Another Day,tt0246460,2002-01-01,"[action, adventure, thriller]",231107,6.1,6.070657


# 4. User-based Collaborative Filtering

In [33]:
# Load the pre-processed user ratings again for the user-based approach.
df_ratings = pd.read_csv('../data/processed/ratings_processed.csv')
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21076831 entries, 0 to 21076830
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   userId  int64  
 1   rating  float64
 2   imdbID  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 482.4+ MB


In [34]:
# Attach the movie Title to every rating; the inner join drops ratings whose imdbID is not in df.
df_ratings = df_ratings.merge(df[['imdbID', 'Title']], on= 'imdbID', how= 'inner')

df_ratings.head()

Unnamed: 0,userId,rating,imdbID,Title
0,1,8.0,tt0114388,Sense and Sensibility
1,1,2.0,tt0113627,Leaving Las Vegas
2,1,4.0,tt0112682,The City of Lost Children
3,1,10.0,tt0114746,12 Monkeys
4,1,4.0,tt0112431,Babe


In [37]:
# Build the User-Item matrix (rows: userId, columns: imdbID, values: rating).
user_item_matrix = df_ratings.pivot_table(index='userId', columns='imdbID', values='rating')

In [None]:
# Compute the similarity between users (cosine similarity)

# Temporarily fill NaN (unrated) entries with 0 (could be improved later)
user_item_filled = user_item_matrix.fillna(0)

# Compute cosine similarity and label rows/columns with the user ids
user_similarity = cosine_similarity(user_item_filled)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_filled.index, columns=user_item_filled.index)

In [None]:
def predict_rating(user_id, movie_id, k=2):
    """Predict the rating ``user_id`` would give to ``movie_id``.

    The prediction is the similarity-weighted average of the ratings given to
    the movie by the ``k`` users most similar to ``user_id`` (according to
    ``user_similarity_df``), among the users who rated the movie.

    Parameters
    ----------
    user_id
        Row/column label in ``user_similarity_df``.
    movie_id
        Column label in ``user_item_matrix``.
    k : int, default 2
        Number of neighbours used.

    Returns
    -------
    float
        The predicted rating, or ``np.nan`` when the movie is unknown, no
        other user rated it, or the selected neighbours have zero total
        similarity.
    """
    if movie_id not in user_item_matrix.columns:
        return np.nan

    # Ratings of the users who rated this movie.
    movie_ratings = user_item_matrix[movie_id].dropna()

    # The target user must not act as their own neighbour (their similarity
    # of 1.0 would otherwise dominate when they already rated the movie).
    movie_ratings = movie_ratings.drop(labels=user_id, errors='ignore')
    if movie_ratings.empty:
        return np.nan  # no data

    similarities = user_similarity_df.loc[user_id, movie_ratings.index]

    # Keep the k most similar users.
    top_k_users = similarities.sort_values(ascending=False).head(k)
    weight_total = top_k_users.sum()

    # Guard the division: a zero (or NaN) weight sum carries no information.
    if not weight_total > 0:
        return np.nan

    top_k_ratings = movie_ratings.loc[top_k_users.index]

    # Similarity-weighted average of the neighbours' ratings.
    return np.dot(top_k_ratings, top_k_users) / weight_total


In [None]:
def recommend_movies(user_id, top_n=3):
    """Return the ``top_n`` unrated movies with the highest predicted rating.

    Every movie for which ``user_id`` has no rating in ``user_item_matrix`` is
    scored with ``predict_rating``; movies whose prediction is NaN are
    skipped. The result is a list of ``(movie_id, predicted_rating)`` tuples,
    best prediction first.
    """
    own_ratings = user_item_matrix.loc[user_id]
    # Movies the user has not rated yet.
    candidates = own_ratings.index[own_ratings.isna().to_numpy()]

    scored = ((movie_id, predict_rating(user_id, movie_id)) for movie_id in candidates)
    usable = [pair for pair in scored if not np.isnan(pair[1])]

    # Sort by predicted score, highest first.
    ranked = sorted(usable, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


In [None]:
# Example: the two best predicted movies for user 2.
recommendations = recommend_movies(user_id=2, top_n=2)
print(recommendations)

# 5. Weighted-Mixed Hybrid

- Weighted means movies are ranked based on weighted average score.
- Mixed means we take both recommendations lists as the result.

In [79]:
def get_hybrid_recommendations(movie):
    """Merge the content-based and collaborative-filtering recommendations for ``movie``.

    Both lists are concatenated, ordered by ``wr`` descending, and rows with a
    repeated ``Title`` are dropped (the first, i.e. highest-``wr``, occurrence
    is kept). The returned frame has a fresh 0..n-1 index.
    """
    candidate_lists = [
        get_content_based_recommendations(movie, similarity),
        get_collaborative_filtering_recommendations(movie),
    ]

    # Stack the two lists, then rank the union by weighted rating.
    merged = pd.concat(candidate_lists, ignore_index=True)
    ranked = merged.sort_values(by='wr', ascending=False)

    return ranked.drop_duplicates(['Title'], ignore_index=True)

In [80]:
# Example: hybrid recommendations for 'Spectre'.
get_hybrid_recommendations('Spectre')

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
0,Casino Royale,tt0381061,2006-01-01,"[action, adventure, thriller]",702997,8.0,7.936341
1,Skyfall,tt1074638,2012-01-01,"[action, adventure, thriller]",739511,7.8,7.744757
2,Mission: Impossible - Fallout,tt4912910,2018-01-01,"[action, adventure, thriller]",385191,7.7,7.60151
3,Mission: Impossible - Rogue Nation,tt2381249,2015-01-01,"[action, adventure, thriller]",416465,7.4,7.322539
4,No Time to Die,tt2382320,2021-01-01,"[action, adventure, thriller]",453414,7.3,7.232876
5,Extraction II,tt12263384,2023-01-01,"[action, crime, thriller]",155496,7.0,6.853877
6,Mission: Impossible III,tt0317919,2006-01-01,"[action, adventure, thriller]",395751,6.9,6.843139
7,Live and Let Die,tt0070328,1973-01-01,"[action, adventure, thriller]",116799,6.7,6.557166
8,Jason Bourne,tt4196776,2016-01-01,"[action, thriller]",245974,6.6,6.534086
9,Quantum of Solace,tt0830515,2008-01-01,"[action, adventure, mystery]",475392,6.5,6.468695
