In [2]:
import pandas as pd
import zipfile
import urllib.request
import os

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
# MovieLens 100k
url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# Nama file 
zip_filename = 'movielens.zip'
# Folder tujuan untuk mengekstrak data
extract_folder = 'data'

# Membuat folder 'data' jika belum ada
if not os.path.exists(extract_folder):
    os.makedirs(extract_folder)
    print(f"Folder '{extract_folder}' created.")

# Mengunduh file jika belum ada
if not os.path.exists(zip_filename):
    print(f"Downloading dataset from {url}...")
    urllib.request.urlretrieve(url, zip_filename)
    print("Download complete.")
else:
    print("Dataset zip file already exists.")

Folder 'data' created.
Downloading dataset from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip...
Download complete.


In [4]:
# Ekstrak file zip
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)
    print(f"Files extracted to '{extract_folder}' folder.")
    
# Dataset diekstrak ke dalam subfolder, kita perlu path lengkapnya
data_path = os.path.join(extract_folder, 'ml-latest-small')
print(f"Data path: {data_path}")

Files extracted to 'data' folder.
Data path: data\ml-latest-small


In [5]:
# Memuat data ratings dan movies
ratings = pd.read_csv(os.path.join(data_path, 'ratings.csv'))
movies = pd.read_csv(os.path.join(data_path, 'movies.csv'))

print("Data loaded successfully!")

Data loaded successfully!


In [6]:
# Menampilkan 5 baris pertama dari tabel ratings
print("Tabel Ratings:")
ratings.head()

Tabel Ratings:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# Menampilkan 5 baris pertama dari tabel movies
print("\nTabel Movies:")
movies.head()


Tabel Movies:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Menghitung jumlah pengguna dan film unik
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()
n_ratings = len(ratings)

print(f"Jumlah User Unik: {n_users}")
print(f"Jumlah Film Unik: {n_movies}")
print(f"Jumlah Total Rating: {n_ratings}")
print(f"Rata-rata Rating per User: {round(n_ratings/n_users, 2)}")
print(f"Rata-rata Rating per Film: {round(n_ratings/n_movies, 2)}")

Jumlah User Unik: 610
Jumlah Film Unik: 9724
Jumlah Total Rating: 100836
Rata-rata Rating per User: 165.3
Rata-rata Rating per Film: 10.37


In [9]:
# Menggabungkan dataframe ratings dan movies berdasarkan 'movieId'
df = pd.merge(ratings, movies, on='movieId')

# Tampilkan 5 baris pertama dari dataframe yang sudah digabung
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [10]:
# Menghitung jumlah rating untuk setiap film dan mengurutkannya
most_rated = df.groupby('title')['rating'].count().sort_values(ascending=False)

# Menampilkan 10 film dengan rating terbanyak
print("10 Film Paling Banyak Diberi Rating:")
most_rated.head(10)

10 Film Paling Banyak Diberi Rating:


title
Forrest Gump (1994)                          329
Shawshank Redemption, The (1994)             317
Pulp Fiction (1994)                          307
Silence of the Lambs, The (1991)             279
Matrix, The (1999)                           278
Star Wars: Episode IV - A New Hope (1977)    251
Jurassic Park (1993)                         238
Braveheart (1995)                            237
Terminator 2: Judgment Day (1991)            224
Schindler's List (1993)                      220
Name: rating, dtype: int64

In [11]:
# Menghitung rata-rata rating untuk setiap film dan mengurutkannya
average_rating = df.groupby('title')['rating'].mean().sort_values(ascending=False)

# Menampilkan 10 film dengan rata-rata rating tertinggi
print("10 Film dengan Rata-rata Rating Tertinggi:")
average_rating.head(10)

10 Film dengan Rata-rata Rating Tertinggi:


title
Karlson Returns (1970)                                                         5.0
Zeitgeist: Moving Forward (2011)                                               5.0
Dream of Light (a.k.a. Quince Tree Sun, The) (Sol del membrillo, El) (1992)    5.0
Dragons: Gift of the Night Fury (2011)                                         5.0
12 Angry Men (1997)                                                            5.0
Justice League: Doom (2012)                                                    5.0
Junior and Karlson (1968)                                                      5.0
Jump In! (2007)                                                                5.0
Human Condition III, The (Ningen no joken III) (1961)                          5.0
Louis Theroux: Law & Disorder (2008)                                           5.0
Name: rating, dtype: float64

In [12]:
# Membuat dataframe baru yang berisi rata-rata rating dan jumlah rating
ratings_mean_count = pd.DataFrame(df.groupby('title')['rating'].mean())
ratings_mean_count['rating_counts'] = pd.DataFrame(df.groupby('title')['rating'].count())

# Filter film dengan jumlah rating > 50, lalu urutkan berdasarkan rata-rata rating
min_ratings = 50
best_movies = ratings_mean_count[ratings_mean_count['rating_counts'] > min_ratings].sort_values('rating', ascending=False)

print(f"Film Terbaik (dengan lebih dari {min_ratings} ratings):")
best_movies.head(10)

Film Terbaik (dengan lebih dari 50 ratings):


Unnamed: 0_level_0,rating,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Shawshank Redemption, The (1994)",4.429022,317
"Godfather, The (1972)",4.289062,192
Fight Club (1999),4.272936,218
Cool Hand Luke (1967),4.27193,57
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964),4.268041,97
Rear Window (1954),4.261905,84
"Godfather: Part II, The (1974)",4.25969,129
"Departed, The (2006)",4.252336,107
Goodfellas (1990),4.25,126
Casablanca (1942),4.24,100


In [13]:
# Membuat user-item matrix menggunakan pivot_table
# index: baris (pengguna)
# columns: kolom (judul film)
# values: nilai di dalam sel (rating)
user_item_matrix = df.pivot_table(index='userId', columns='title', values='rating')

# Tampilkan beberapa baris dan kolom pertama dari matrix
user_item_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Memeriksa dimensi dari matrix
print(f"Bentuk dari User-Item Matrix: {user_item_matrix.shape}")

Bentuk dari User-Item Matrix: (610, 9719)


In [15]:
# Mengisi nilai NaN (Not a Number) dengan 0
user_item_matrix_filled = user_item_matrix.fillna(0)

# Tampilkan kembali matrix yang sudah diisi
user_item_matrix_filled.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Menghitung cosine similarity antar pengguna
# Inputnya adalah matrix kita, di mana setiap baris adalah seorang pengguna
user_similarity = cosine_similarity(user_item_matrix_filled)

# Hasilnya adalah sebuah matrix persegi (user x user)
# Mari kita ubah menjadi DataFrame agar lebih mudah dibaca
user_similarity_df = pd.DataFrame(user_similarity,
                                  index=user_item_matrix_filled.index,
                                  columns=user_item_matrix_filled.index)

# Tampilkan matriks kesamaan untuk 5 pengguna pertama
print("User Similarity Matrix:")
user_similarity_df.head()

User Similarity Matrix:


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,1.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,1.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792


In [18]:
def get_recommendations(user_id, num_recommendations=10):
    """
    Memberikan rekomendasi film untuk seorang pengguna berdasarkan pengguna lain yang mirip.
    
    Parameters:
    - user_id (int): ID dari pengguna yang ingin diberi rekomendasi.
    - num_recommendations (int): Jumlah film yang ingin direkomendasikan.
    
    Returns:
    - a pandas DataFrame containing recommended movies.
    """
    print(f"Mencari rekomendasi untuk User ID: {user_id}...")
    
    # 1. Temukan pengguna yang paling mirip
    # Ambil baris dari user_id di matriks kesamaan, urutkan, dan buang pengguna itu sendiri (skor 1.0)
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).drop(user_id).head(num_recommendations)
    
    # 2. Ambil daftar film yang sudah ditonton oleh pengguna target
    # Kita menggunakan user_item_matrix (sebelum diisi 0) karena nilai 0 berarti belum ditonton
    watched_movies_target = user_item_matrix.loc[user_id]
    watched_movies_target = watched_movies_target[watched_movies_target > 0].index
    
    # 3. Kumpulkan film dari pengguna-pengguna yang mirip
    recommendations = set()
    for similar_user_id in similar_users.index:
        # Ambil film yang sudah ditonton oleh pengguna mirip
        watched_movies_similar = user_item_matrix.loc[similar_user_id]
        # Pilih film yang dirating tinggi (misal > 3.5) oleh pengguna mirip
        recommended_for_target = watched_movies_similar[watched_movies_similar > 3.5].index
        # Tambahkan ke dalam set rekomendasi
        recommendations.update(recommended_for_target)
        
    # 4. Buang film yang sudah pernah ditonton oleh pengguna target
    final_recommendations = recommendations.difference(watched_movies_target)
    
    # 5. Mengambil detail (judul, genre) dan mengembalikan top-N rekomendasi
    top_recommendations = movies[movies['title'].isin(list(final_recommendations))].head(num_recommendations)
    
    return top_recommendations

In [19]:
# Uji coba fungsi untuk user ID 1
rekomendasi_untuk_user_1 = get_recommendations(1)
rekomendasi_untuk_user_1

Mencari rekomendasi untuk User ID: 1...


Unnamed: 0,movieId,title,genres
9,10,GoldenEye (1995),Action|Adventure|Thriller
15,16,Casino (1995),Crime|Drama
18,19,Ace Ventura: When Nature Calls (1995),Comedy
20,21,Get Shorty (1995),Comedy|Crime|Thriller
28,29,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
31,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
32,34,Babe (1995),Children|Drama
35,39,Clueless (1995),Comedy|Romance
37,41,Richard III (1995),Drama|War
40,44,Mortal Kombat (1995),Action|Adventure|Fantasy


In [20]:
# Membuat folder 'saved_model' jika belum ada
if not os.path.exists('saved_model'):
    os.makedirs('saved_model')
    print("Folder 'saved_model' created.")

# Menyimpan dataframe yang dibutuhkan
movies.to_csv('saved_model/movies_cleaned.csv', index=False)
user_item_matrix.to_csv('saved_model/user_item_matrix.csv')
user_similarity_df.to_csv('saved_model/user_similarity.csv')

print("DataFrames berhasil disimpan!")

Folder 'saved_model' created.
DataFrames berhasil disimpan!


In [22]:
# Transpose user_item_matrix agar film menjadi baris dan pengguna menjadi kolom
item_user_matrix = user_item_matrix_filled.T

# Hitung cosine similarity antar film (item)
item_similarity = cosine_similarity(item_user_matrix)

# Ubah menjadi DataFrame agar mudah dibaca
item_similarity_df = pd.DataFrame(item_similarity,
                                  index=item_user_matrix.index,
                                  columns=item_user_matrix.index)

# Simpan matriks ini untuk digunakan di aplikasi
item_similarity_df.to_csv('saved_model/item_similarity.csv')

print("Item-Item Similarity Matrix berhasil dibuat dan disimpan!")
item_similarity_df.head()

Item-Item Similarity Matrix berhasil dibuat dan disimpan!


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,1.0,0.857493,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.857493,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
