🎬 Content-Based Movie Recommender (Eq. 3 – Cosine Similarity)

In [1]:
import pandas as pd
import numpy as np
import ast
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

# Download the two NLTK packages used by this notebook: 'punkt' and
# 'stopwords' (word_tokenize and stopwords are imported above).
# The return value of the last call is what the cell displays.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab'
# for word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')


  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OKTAVIAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OKTAVIAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# 🔄 Load the full movie dataset
# Read each CSV, keep only the columns the recommender uses, and cast the
# join key to str so both frames share the same key dtype.
movies = (
    pd.read_csv('../Dataset/movies_metadata.csv', low_memory=False, on_bad_lines='skip')
    .loc[:, ['id', 'title', 'genres', 'overview']]
    .astype({'id': str})
)
credits = (
    pd.read_csv('../Dataset/credits.csv', on_bad_lines='skip')
    .loc[:, ['id', 'cast', 'crew']]
    .astype({'id': str})
)

# Default (inner) merge on the movie id, then renumber rows 0..n-1 so that
# positional loops over range(len(df)) line up with the index labels.
df = movies.merge(credits, on='id').reset_index(drop=True)

In [3]:
# Display the merged movies/credits frame (id, title, genres, overview, cast, crew).
df

Unnamed: 0,id,title,genres,overview,cast,crew
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
...,...,...,...,...,...,...
45533,439050,Subdue,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",Rising and falling between a man and woman.,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de..."
45534,111109,Century of Birthing,"[{'id': 18, 'name': 'Drama'}]",An artist struggles to finish his work while a...,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de..."
45535,67758,Betrayal,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...","When one of her hits goes wrong, a professiona...","[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de..."
45536,227506,Satan Triumphant,[],"In a small town live two brothers, one a minis...","[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de..."


In [4]:
# Ekstraksi fitur
def parse_genres(x):
    """Return the lower-cased genre names held in one ``genres`` cell.

    Args:
        x: Either the raw cell text (a Python-literal list of dicts that
            each carry a ``'name'`` key) or an already-parsed list of such
            dicts / of plain genre-name strings. Accepting parsed input
            keeps the function idempotent, so re-running a cell that
            overwrites the column in place does not wipe it.

    Returns:
        list[str]: lower-cased genre names, or ``[]`` when the value is
        missing (e.g. NaN) or cannot be parsed.
    """
    try:
        entries = ast.literal_eval(x) if isinstance(x, str) else x
        return [
            (entry['name'] if isinstance(entry, dict) else entry).lower()
            for entry in entries
        ]
    # Only data-related failures are treated as "no genres"; interrupts and
    # other BaseExceptions propagate instead of being silently swallowed.
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError,
            RecursionError, MemoryError):
        return []

def extract_cast(x):
    """Return the lower-cased names of the first five cast entries.

    Args:
        x: Either the raw ``cast`` cell text (a Python-literal list of
            dicts that each carry a ``'name'`` key) or an already-parsed
            list of such dicts.

    Returns:
        list[str]: at most five lower-cased names in their original order,
        or ``[]`` when the value is missing (e.g. NaN) or cannot be parsed.
    """
    try:
        entries = ast.literal_eval(x) if isinstance(x, str) else x
        return [entry['name'].lower() for entry in entries[:5]]
    # Only data-related failures are treated as "no cast"; interrupts and
    # other BaseExceptions propagate instead of being silently swallowed.
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError,
            RecursionError, MemoryError):
        return []

def extract_director(x):
    """Return the lower-cased names of crew entries whose job is director.

    Args:
        x: Either the raw ``crew`` cell text (a Python-literal list of
            dicts that each carry ``'job'`` and ``'name'`` keys) or an
            already-parsed list of such dicts.

    Returns:
        list[str]: lower-cased names of every entry whose ``'job'`` equals
        ``'director'`` case-insensitively, or ``[]`` when the value is
        missing (e.g. NaN) or cannot be parsed.
    """
    try:
        entries = ast.literal_eval(x) if isinstance(x, str) else x
        return [
            entry['name'].lower()
            for entry in entries
            if entry['job'].lower() == 'director'
        ]
    # Only data-related failures are treated as "no director"; interrupts
    # and other BaseExceptions propagate instead of being silently swallowed.
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError,
            RecursionError, MemoryError):
        return []

# Build the list-valued feature columns: (target column, source column, extractor).
# 'genres' is overwritten in place, so re-running this cell hands the
# already-parsed lists back to parse_genres; 'actors' and 'director' are
# new columns derived from 'cast' and 'crew'.
for _target_col, _source_col, _extractor in (
    ('genres', 'genres', parse_genres),
    ('actors', 'cast', extract_cast),
    ('director', 'crew', extract_director),
):
    df[_target_col] = df[_source_col].apply(_extractor)

In [5]:
# 🔠 Preprocessing for overview and title
# Module-level helpers read by preprocess(): a Porter stemmer and the
# English stop-word list held as a set for membership tests.
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Turn a raw text value into a list of stemmed, stop-word-free tokens.

    Missing values (anything ``pd.isna`` reports as NA) yield ``[]``.
    Otherwise the text is lower-cased, every character that is neither a
    word character nor whitespace is removed, the result is tokenised with
    ``word_tokenize``, tokens found in ``stop_words`` are dropped, and the
    survivors are stemmed with ``stemmer``.
    """
    if pd.isna(text):
        return []

    # Punctuation is deleted rather than replaced by a space, so hyphenated
    # words collapse into a single token.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    return [
        stemmer.stem(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]

# Tokenise and stem the two free-text columns: (raw column, token column).
for _raw_col, _tokens_col in (
    ('overview', 'processed_overview'),
    ('title', 'processed_title'),
):
    df[_tokens_col] = df[_raw_col].apply(preprocess)

# Adding list-valued Series concatenates the lists row by row, giving one
# flat token list per movie (title tokens, genres, actors, directors).
df['combined_features'] = (
    df['processed_title']
    + df['genres']
    + df['actors']
    + df['director']
)

In [6]:
# Display the per-movie token lists produced by preprocess() on 'overview'.
df['processed_overview']

0        [led, woodi, andi, toy, live, happili, room, a...
1        [sibl, judi, peter, discov, enchant, board, ga...
2        [famili, wed, reignit, ancient, feud, nextdoor...
3        [cheat, mistreat, step, women, hold, breath, w...
4        [georg, bank, recov, daughter, wed, receiv, ne...
                               ...                        
45533                             [rise, fall, man, woman]
45534    [artist, struggl, finish, work, storylin, cult...
45535    [one, hit, goe, wrong, profession, assassin, e...
45536    [small, town, live, two, brother, one, minist,...
45537    [50, year, decriminalis, homosexu, uk, directo...
Name: processed_overview, Length: 45538, dtype: object

In [7]:
# df['processed_overview'].to_csv('../Dataset/processed_overview.csv', index=False)

In [8]:
# Train a Word2Vec model on the tokenised overviews (one token list per movie).
w2v_model = Word2Vec(
    sentences=df['processed_overview'],
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

def vectorize(tokens, model=None):
    """Average the word vectors of ``tokens`` into a single document vector.

    Args:
        tokens: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``word in wv`` and
            ``wv[word]``) and ``vector_size``. Defaults to the notebook's
            global ``w2v_model``.

    Returns:
        numpy.ndarray: element-wise mean of the vectors of all tokens found
        in ``model.wv``; a zero vector of length ``model.vector_size`` when
        no token is found, so the fallback always matches the model's
        dimensionality instead of a hard-coded size.
    """
    if model is None:
        model = w2v_model

    keyed_vectors = model.wv
    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# One vector per movie: vectorize() is applied to each row's token list.
df['overview_vector'] = df['processed_overview'].apply(vectorize)


In [9]:
# Display the per-movie overview vectors computed by vectorize().
df['overview_vector']

0        [0.0037347148, 0.39871886, 0.012044003, 0.1057...
1        [-0.05740323, 0.52452916, 0.012260048, 0.12117...
2        [-0.04186253, 0.3692822, 0.021012068, 0.080965...
3        [-0.077397, 0.3663318, 0.07392146, -0.02218168...
4        [-0.0825778, 0.40412983, 0.14116475, 0.0680067...
                               ...                        
45533    [-0.43334383, 0.3011059, 0.28941178, 0.5905124...
45534    [0.07383479, 0.14254008, 0.044676155, 0.204063...
45535    [-0.20957623, 0.6073796, 0.25488672, -0.134353...
45536    [-0.016793976, 0.43047723, 0.013039132, 0.2913...
45537    [0.14799684, 0.445178, 0.014537633, 0.19494481...
Name: overview_vector, Length: 45538, dtype: object

In [10]:
# Show the mean Word2Vec vector of every movie overview:
# one row per title, one column per embedding dimension.
mean_vectors = pd.DataFrame(data=df['overview_vector'].tolist(), index=df['title'])
print('Mean vector (Word2Vec) untuk setiap film:', mean_vectors, sep='\n')

Mean vector (Word2Vec) untuk setiap film:
                                  0         1         2         3         4    \
title                                                                           
Toy Story                    0.003735  0.398719  0.012044  0.105765  0.018964   
Jumanji                     -0.057403  0.524529  0.012260  0.121170  0.150143   
Grumpier Old Men            -0.041863  0.369282  0.021012  0.080966  0.079218   
Waiting to Exhale           -0.077397  0.366332  0.073921 -0.022182  0.220247   
Father of the Bride Part II -0.082578  0.404130  0.141165  0.068007  0.174480   
...                               ...       ...       ...       ...       ...   
Subdue                      -0.433344  0.301106  0.289412  0.590512  0.160068   
Century of Birthing          0.073835  0.142540  0.044676  0.204063  0.046037   
Betrayal                    -0.209576  0.607380  0.254887 -0.134354  0.432750   
Satan Triumphant            -0.016794  0.430477  0.013039  0.291327

In [11]:
# Collect every distinct token that occurs in any processed overview and
# report how many there are.
all_words = {token for tokens in df['processed_overview'] for token in tokens}
print("Jumlah kata unik di overview:", len(all_words))

Jumlah kata unik di overview: 68500


In [12]:
# df_kata_unik = pd.DataFrame(all_words)
# df_kata_unik.to_csv('../Dataset/unique_words_overview.csv', index=False, header=False)

In [13]:
# Table of Word2Vec vectors for the unique overview tokens:
# index = token, columns = embedding dimensions.
word_vectors = {
    token: w2v_model.wv[token]
    for token in all_words
    if token in w2v_model.wv
}
df_vectors = pd.DataFrame(word_vectors).transpose()
df_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
murdera,-0.001492,0.007752,0.000944,0.000670,0.005468,-0.003319,0.005239,0.006286,0.000968,-0.005628,...,0.003755,-0.000645,-0.005542,0.005985,-0.005743,-0.000265,-0.001756,0.004602,0.000254,-0.002043
marwan,0.009618,0.000504,-0.003979,0.006018,0.001273,-0.001563,-0.001576,0.021422,0.001248,0.004228,...,0.005302,0.013235,0.015307,0.007718,0.013226,0.014277,-0.003418,-0.015788,0.006392,0.001662
fishermen,0.018783,0.126734,-0.003339,0.039937,0.000645,-0.128714,0.051017,0.215781,0.049296,-0.016886,...,0.019514,0.103152,0.099662,0.035150,0.073800,0.105720,0.042226,-0.009587,0.006408,0.040192
interfer,0.002043,0.140704,0.008855,0.051198,0.048027,-0.220244,0.087263,0.316866,-0.025008,-0.126730,...,0.071118,0.115416,0.154375,0.001561,0.110115,0.207278,0.025367,-0.102069,0.048237,0.056781
möter,0.004209,0.011069,0.006783,0.003420,0.002491,-0.013261,0.001609,0.022725,0.007637,-0.004999,...,0.004371,0.010220,0.009756,0.003305,0.005500,0.006963,-0.003606,-0.006366,-0.001636,-0.006115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
towm,0.002109,-0.002285,0.001958,-0.002562,0.003866,-0.000555,-0.004622,0.002324,-0.003074,0.001638,...,-0.000232,0.000058,0.004799,-0.006225,0.001177,0.002457,-0.002065,-0.005841,0.001842,0.004026
sleight,-0.008451,-0.005276,0.004534,0.004324,0.000439,-0.005696,0.013565,0.021565,-0.000305,-0.019065,...,-0.001798,0.017491,0.013580,-0.021294,0.004972,0.030600,0.011195,-0.015813,0.007211,0.010653
droomt,0.004829,0.011508,-0.002292,0.002031,0.000328,-0.009746,0.003439,0.024589,0.006408,-0.002166,...,-0.000602,0.019128,0.018103,0.006131,0.006411,0.014215,-0.003312,-0.003172,0.006821,-0.001687
batala,-0.000973,0.017137,0.002877,0.001646,0.002793,-0.016555,0.014940,0.036826,0.013968,0.008931,...,0.009681,0.014323,0.011921,-0.011813,0.014894,-0.001676,0.007269,-0.015582,0.015526,0.008565


In [14]:
# df_vectors.to_csv('../Dataset/word_vectors_overview.csv', index=True, header=True)

In [15]:
def jaccard_similarity(list1, list2):
    """Jaccard index of two collections, compared as sets.

    Returns ``|A ∩ B| / |A ∪ B|`` where A and B are the sets of distinct
    items in ``list1`` and ``list2``; returns ``0.0`` when both are empty.
    """
    left = set(list1)
    right = set(list2)
    union = left.union(right)
    if not union:
        return 0.0
    return len(left.intersection(right)) / len(union)


In [16]:
def recommend(title_input, top_n=10,
              alpha=1, beta=1, gamma=1, theta=1, delta=1,
              verbose=True):
    """
    Recommend the movies most similar to ``title_input``.

    Genres, actors, directors and title tokens are compared with Jaccard
    similarity; overview vectors are compared with cosine similarity
    (Eq. 3). The five similarities are combined into one weighted score.

    Args:
        title_input: Movie title to look up (case-insensitive). When several
            rows share the title, the first one is used as the target.
        top_n: Number of recommendations to return.
        alpha, beta, gamma, theta, delta: Weights for the genre, actor,
            director, title and overview similarities respectively.
        verbose: When True (default) print a header and the per-movie
            similarity breakdown; pass False to suppress that output.

    Returns:
        list[tuple[str, float]]: ``(title, score)`` pairs sorted by
        descending score, score rounded to 3 decimals. ``[]`` (after
        printing a message) when the title is not found.
    """
    title_input = title_input.lower()

    # Lower-case the title column once and reuse the mask for both the
    # existence check and the lookup.
    is_match = (df['title'].str.lower() == title_input).to_numpy()
    if not is_match.any():
        print(f"Film '{title_input}' tidak ditemukan.")
        return []

    # Row position of the first matching title.
    idx = int(np.flatnonzero(is_match)[0])

    # Pull the columns out once; plain list indexing is far cheaper than a
    # df.loc lookup per row and per feature inside the loop.
    titles = df['title'].tolist()
    genres = df['genres'].tolist()
    actors = df['actors'].tolist()
    directors = df['director'].tolist()
    title_tokens = df['processed_title'].tolist()

    # Target features
    genre_i = genres[idx]
    actors_i = actors[idx]
    director_i = directors[idx]
    title_i = title_tokens[idx]

    # One batched cosine-similarity call (target vs. every movie) instead of
    # one call per row.
    overview_matrix = np.vstack(df['overview_vector'].tolist())
    overview_sims = cosine_similarity(overview_matrix[idx:idx + 1], overview_matrix)[0]

    results = []

    if verbose:
        print(f"\n📊 Menghitung kemiripan dengan '{titles[idx]}':")

    for j in range(len(titles)):
        if j == idx:
            continue

        genre_sim = jaccard_similarity(genre_i, genres[j])
        actor_sim = jaccard_similarity(actors_i, actors[j])
        director_sim = jaccard_similarity(director_i, directors[j])
        title_sim = jaccard_similarity(title_i, title_tokens[j])
        overview_sim = float(overview_sims[j])

        # The weighted sum is divided by the number of features (5), not by
        # the sum of the weights, so non-default weights rescale the score.
        final_score = (
            alpha * genre_sim
            + beta * actor_sim
            + gamma * director_sim
            + theta * title_sim
            + delta * overview_sim
        ) / 5

        if verbose:
            print(f"- {titles[j]}:\n  Genre={genre_sim:.3f}, Actor={actor_sim:.3f}, Director={director_sim:.3f}, Title={title_sim:.3f}, Overview={overview_sim:.3f}")
            print(f"  → Final Score: {final_score:.3f}\n")

        results.append((j, final_score))

    # sorted() is stable, so tied scores keep their original row order.
    top = sorted(results, key=lambda pair: pair[1], reverse=True)[:top_n]
    return [(titles[j], round(float(score), 3)) for j, score in top]


In [17]:
# Ask for the ten closest matches to "Toy Story"; recommend() returns
# (title, score) pairs sorted best-first.
hasil = recommend(title_input="Toy Story", top_n=10)

# Print the ranked list, one movie per line.
print("\n🎬 Top 10 Rekomendasi (menggunakan Eq. 3 – cosine):")
for title, sim in hasil:
    print(f"- {title} (Similarity: {sim})")



📊 Menghitung kemiripan dengan 'Toy Story':
- Jumanji:
  Genre=0.200, Actor=0.000, Director=0.000, Title=0.000, Overview=0.960
  → Final Score: 0.232

- Grumpier Old Men:
  Genre=0.250, Actor=0.000, Director=0.000, Title=0.000, Overview=0.940
  → Final Score: 0.238

- Waiting to Exhale:
  Genre=0.200, Actor=0.000, Director=0.000, Title=0.000, Overview=0.939
  → Final Score: 0.228

- Father of the Bride Part II:
  Genre=0.333, Actor=0.000, Director=0.000, Title=0.000, Overview=0.907
  → Final Score: 0.248

- Heat:
  Genre=0.000, Actor=0.000, Director=0.000, Title=0.000, Overview=0.943
  → Final Score: 0.189

- Sabrina:
  Genre=0.250, Actor=0.000, Director=0.000, Title=0.000, Overview=0.930
  → Final Score: 0.236

- Tom and Huck:
  Genre=0.167, Actor=0.000, Director=0.000, Title=0.000, Overview=0.919
  → Final Score: 0.217

- Sudden Death:
  Genre=0.000, Actor=0.000, Director=0.000, Title=0.000, Overview=0.926
  → Final Score: 0.185

- GoldenEye:
  Genre=0.000, Actor=0.000, Director=0.00