## Collaborative Filtering Approach

### 1. Import Module

In [27]:
import pandas as pd
import numpy as np

### 2. Import Data

In [28]:
# Define column name 
m_cols = ['MovieIDS', 'Title', 'Genre']
r_cols = ['UserID', 'MovieIDs', 'Ratings', 'Timestamp']
u_cols = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']

# Read csv of movies, ratings and users
df_movies = pd.read_csv('../data/movies.csv', sep='::', names=m_cols, encoding='latin-1', index_col=None, engine='python')
df_ratings = pd.read_csv('../data/ratings.csv', sep='::', names=r_cols, encoding='latin-1', index_col=None, engine='python')
df_users = pd.read_csv('../data/users.csv', sep='::', names=u_cols, encoding='latin-1', index_col=None, engine='python')

In [29]:
# Cek shape dari dataframe
print(f"Shape dari movies : {df_movies.shape}")
print(f"Shape dari rating : {df_ratings.shape}")
print(f"Shape dari user : {df_users.shape}")

Shape dari movies : (3883, 3)
Shape dari rating : (1000209, 4)
Shape dari user : (6040, 5)


### 3. Pengecekan `"df_ratings"`

In [30]:
# Menampilkan df_ratings
df_ratings.head()

Unnamed: 0,UserID,MovieIDs,Ratings,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [31]:
df_ratings.describe()

Unnamed: 0,UserID,MovieIDs,Ratings,Timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


In [32]:
# Drop kolom yang tidak dibutuhkan untuk pemodelan 'Timestamp'
df_ratings = df_ratings.drop('Timestamp', axis=1)

In [33]:
# Cek tipe dari masing masing kolom
df_ratings.dtypes

UserID      int64
MovieIDs    int64
Ratings     int64
dtype: object

Untuk memodelkan rating dengan skala 0-5, diperlukan nilai float untuk mengakomodir angka desimal

In [34]:
# Konversi kolom "Ratings" menjadi float
df_ratings["Ratings"] = df_ratings["Ratings"].astype(float)

Handling terhadap data duplikat

In [35]:
# Menghitung record yang duplikat
df_ratings.duplicated().sum()

0

### 4. Membuat fungsi `import_rating_data`

In [36]:
def import_rating_data(path, frac=0.01):
    """
    Fungsi untuk import rating data, pengecekan terhadap shape, mengganti tipe kolom rating menjadi float, dan duplikasi data

    Parameters
    ----------
    path : str
        Lokasi (path) data ratings disimpan

    Returns
    -------
    rating_df : pandas DataFrame
        Sample dari rating data    
    """
    # Load data
    r_cols = ['UserID', 'MovieIDs', 'Ratings', 'Timestamp']
    df_rating_raw = pd.read_csv(path, sep='::', names=r_cols, encoding='latin-1', index_col=None, engine='python')
    print('Shape data awal :', df_rating_raw.shape)

    # Drop kolom timestamp
    df_rating = df_rating_raw.drop(columns=['Timestamp'], axis=1)
    print('Shape data drop kolom :', df_rating.shape)

    # Typecasting kolom 'Rating' menjadi float
    df_rating["Ratings"] = df_rating["Ratings"].astype(float)

    # Sample movie
    # Mengambil movieID yang unik pada sample data
    unique_movie_id = df_rating['MovieIDs'].sample(frac=0.01)
    sample_data = df_rating.loc[df_rating['MovieIDs'].isin(unique_movie_id)]

    return sample_data


In [37]:
# Import rating data
df_ratings = import_rating_data(path = '../data/ratings.csv')

Shape data awal : (1000209, 4)
Shape data drop kolom : (1000209, 3)


In [38]:
# Cek tabel df_rating
df_ratings.head()

Unnamed: 0,UserID,MovieIDs,Ratings
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0


### Pemodelan Recommender System : Collaborative Filterting

#### User to User Collaborative Filtering

In [39]:
# Pivot rating data untuk mendapatkan interaksi rating user dan item
pivot_df_ratings = df_ratings.pivot(index= 'UserID', columns='MovieIDs', values='Ratings')

In [40]:
# Mengecek pivot_df_ratings
pivot_df_ratings.head()

MovieIDs,1,2,3,4,5,6,7,8,10,11,...,3927,3930,3932,3934,3939,3946,3948,3949,3950,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [41]:
# Mengecek shape dari pivot_df_ratings
pivot_df_ratings.shape

(6040, 2297)

In [42]:
# Total rating data null
pivot_df_ratings.isnull().sum().sum()

12937827

#### Membuat fungsi `utility_data_preprocessing`

In [43]:
def utility_data_preprocessing(path):
    """"
    Fungsi untuk melakukan pivot dari df_rating menjadi bentuk utility metrics

    Parameters
    ----------
    path : str
        Lokasi (path) menyimpan rating data

    Returns
    -------
    pivot_df_ratings : pandas DataFrame
        rating data dalam belum tabel yang sudah di pivot
    """
    # Import data

    df_rating = import_rating_data(path)

    # Pivot tabel
    pivot_df_ratings = df_rating.pivot(index= 'UserID', columns='MovieIDs', values='Ratings')

    # Shape data yang telah di pivot
    print('Shaped data setelah di pivot : ', pivot_df_ratings.shape)

    # Menghitung missing values
    print('Jumlah missing values pada utility metrics :', pivot_df_ratings.isnull().sum().sum())

    # Return pivot_df_ratings
    return pivot_df_ratings


In [44]:
# Output fungsi pivot data
pivot_df_ratings = utility_data_preprocessing(path = '../data/ratings.csv')

Shape data awal : (1000209, 4)
Shape data drop kolom : (1000209, 3)
Shaped data setelah di pivot :  (6040, 2243)
Jumlah missing values pada utility metrics 12616533


In [45]:
pivot_df_ratings.head()

MovieIDs,1,2,3,4,5,6,7,9,10,11,...,3930,3932,3933,3937,3942,3943,3946,3948,3949,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


Berikut merupakan fungsi untuk prediksi rating


$$
\begin{align*}
\hat{r_{ui}} = \text{baseline}_{ui} + \frac{\sum_{j \in N(u)} (\text{Similarity}(u,j) \cdot (r_{ji}-\text{baseline}_{ji}) }{\sum_{j \in N(u)} \text{Similarity}(u,j)}
& \\ \\
\text{baseline}_{ui} = \mu + \text{userbias}_{u} + \text{itembias}_{i}
\end{align*}
$$


dengan :    

- $\text{baseline}_{ui}$ : baseline ratings dari user **u** untuk item **i**
- $\hat{r_{ui}}$ : prediksi rating dari user **u** untuk item **i**
- $N(u)$ : Tetangga dari user **u**


$$
\begin{align*}
\text{userbias}_{u} = \mu - \text{user_average}_{u}
& \\ \\
\text{itembias}_{i} = \mu - \text{item_average}_{i}
\end{align*}
$$

with :    

- $\mu$ : global mean
- $\text{user_average}_{u}$ : rata-rata rating dari user **u**
- $\text{item_average}_{i}$ : rata-rata rating dari item **i**


#### Menghitung baseline prediksi

In [46]:
pivot_df_ratings.head()

MovieIDs,1,2,3,4,5,6,7,9,10,11,...,3930,3932,3933,3937,3942,3943,3946,3948,3949,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [52]:
# Menghitung baseline rating pada user 5 dan movie 6
UserID = 5
MovieIDs = 6

# Menghitung Global Mean
global_mean = df_ratings['Ratings'].mean()

# Menghitung User Mean
user_mean = pivot_df_ratings.loc[UserID,:].mean()

# Menghitung Item Mean
item_mean = pivot_df_ratings.loc[:, MovieIDs].mean()

# Output dari baseline
print(f'UserID {UserID} Mean : {round(user_mean,2)}')
print(f'MovieIDs {MovieIDs} Mean : {round(item_mean,2)}')
print(f'Global Mean : {round(global_mean,2)}')

UserID 5 Mean : 3.17
MovieIDs 6 Mean : 3.88
Global Mean : 3.61


In [54]:
# Menghitung bias dari user
bias_user = global_mean - user_mean

# Menghitung bias dari item
bias_item = global_mean - item_mean

# Output dari bias user dan item
print(f'UserID {UserID} Bias : {round(bias_user,2)}')
print(f'MovieIDs {MovieIDs} Bias : {round(bias_item,2)}')

UserID 5 Bias : 0.44
MovieIDs 6 Bias : -0.27


In [55]:
# Menghitung total dari baseline
baseline_user_item = global_mean + bias_user + bias_item

# Output dari total baseline
print(f'Total baseline rating untuk prediksi UserID {UserID} dan MovieIDs {MovieIDs} adalah {round(baseline_user_item,2)}')

Total baseline rating untuk prediksi UserID 5 dan MovieIDs 6 adalah 3.79


#### Membuat fungsi `calculate_baseline_prediction`

In [58]:
def calculate_baseline_prediction(pivot_df_ratings, userid, movieid,
                                  df_ratings=df_ratings):
    """"
    Fungsi untuk menghitung nilai baseline prediksi dari user dan movie

    Parameters
    ---------
    pivot_df_ratings : pandas Dataframe
        DataFrame rating dari user dan movies yang telah dilakukan pivot table

    userid : int
        UserID dari user yang akan dihitung biasnya

    movieid : int
        MovieIDs dari film yang akan dihitung biasnya

    df_ratings : pandas Dataframe
        DataFrame rating user dan film

    Returns
    -------
    baseline_user_item : int
        Nilai baseline terhadap prediksi user dan item
    """
    # Menghitung global mean
    global_mean = df_ratings['Ratings'].mean()

    # Menghitung User Mean
    user_mean = pivot_df_ratings.loc[userid,:].mean()

    # Menghitung Item Mean
    item_mean = pivot_df_ratings.loc[:, movieid].mean()

    # Menghitung bias dari user
    bias_user = global_mean - user_mean

    # Menghitung bias dari item
    bias_item = global_mean - item_mean

    # Menghitung total dari baseline
    baseline_user_item = global_mean + bias_user + bias_item

    return baseline_user_item

In [61]:
# Validasi fungsi terhadap perhitungan semula
baseline_user_item_func = calculate_baseline_prediction(pivot_df_ratings=pivot_df_ratings,
                                                   userid=UserID, movieid=MovieIDs)
# Output dari total baseline
print(f'Total baseline rating untuk prediksi UserID {UserID} dan MovieIDs {MovieIDs} adalah {round(baseline_user_item_func,2)}')

Total baseline rating untuk prediksi UserID 5 dan MovieIDs 6 adalah 3.79


#### Mencari Tetangga Terdekat

In [63]:
# Mencari 5 tetangga terdekat menggunakan Cosine Similarity
user_mean_ = pivot_df_ratings.mean(axis=0)
user_mean_

MovieIDs
1       4.146846
2       3.201141
3       3.016736
4       2.729412
5       3.006757
          ...   
3943    3.052083
3946    2.260000
3948    3.635731
3949    4.115132
3952    3.780928
Length: 2243, dtype: float64

In [65]:
# Kurangi rating dengan user rating 
rating_subtract_user_mean = (pivot_df_ratings - user_mean_).fillna(0)
rating_subtract_user_mean.head()

MovieIDs,1,2,3,4,5,6,7,9,10,11,...,3930,3932,3933,3937,3942,3943,3946,3948,3949,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.853154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,-1.878723,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
# Lakukan looping untuk menghitung seluruh similarity
from sklearn.metrics.pairwise import cosine_similarity

# Gunakan progress bar
from tqdm import tqdm

In [67]:
# Menghitung similarity score
n_user = len(rating_subtract_user_mean.index)
similarity_val = np.zeros(n_user)

# Menghitung rating vektor dari user 5
target_user = rating_subtract_user_mean.loc[UserID].values.reshape(1,-1)

# Iterasi terhadap seluruh user
for i, neigh in enumerate(tqdm(rating_subtract_user_mean.index)):
    # Mencari user vektor
    user_neigh = rating_subtract_user_mean.loc[neigh].values.reshape(1,-1)

    # Menghitung similarities (Menggunakan cosine similarity)
    similar_i = cosine_similarity(target_user, user_neigh)

    similarity_val[i] = similar_i

100%|██████████| 6040/6040 [00:01<00:00, 3413.78it/s]


In [68]:
# Urutkan similarity_val secara descending
sort_index = np.argsort(similarity_val)[::-1]

# Jumlah nilai terdekat yang dicari
n = 5

# Mendapatkan nilai terdekat
n_similarity = rating_subtract_user_mean.index[sort_index[1: n+1]]
n_similarity

Int64Index([3821, 1150, 3899, 4161, 4574], dtype='int64', name='UserID')

In [84]:
print(f'Berikut {5} User yang memiliki kesamaan dengan User ID {UserID} :')
for i in range(len(n_similarity)):
    print(f'- User {n_similarity[i]}')

Berikut 5 User yang memiliki kesamaan dengan User ID 5 :
- User 3821
- User 1150
- User 3899
- User 4161
- User 4574


#### Membuat fungsi `get_n_neigh`

In [87]:
def get_n_neigh(rating_subtract_user_mean, userid, n=5):
    """
    Fungsi yang digunakan untuk mencari n user dengan nilai terdekat

    Parameters
    ----------
    rating_subtract_user_mean : pandas DataFrame,
        Rating dari user yang telah dikurangi dengan bias dari user
    
    userid : int,
        ID dari user pada utility matrix    

    n : int,
        Jumlah n user yang nilainya paling dekat dengan userid

    Returns
    -------
    n_similarities : dict,
        n_similarity --> Berisi n user dengan id termirip dengan userid berdasarkan rating
        similarity_val --> Nilai rating dari n user tersebut
    """
    # Menghitung similarity score
    n_user = len(rating_subtract_user_mean.index)
    similarity_val = np.zeros(n_user)

    # Menghitung rating vektor dari user 5
    target_user = rating_subtract_user_mean.loc[userid].values.reshape(1,-1)

    # Iterasi terhadap seluruh user
    for i, neigh in enumerate(tqdm(rating_subtract_user_mean.index)):
        # Mencari user vektor
        user_neigh = rating_subtract_user_mean.loc[neigh].values.reshape(1,-1)

        # Menghitung similarities (Menggunakan cosine similarity)
        similar_i = cosine_similarity(target_user, user_neigh)

        similarity_val[i] = similar_i

    # Urutkan similarity_val secara descending
    sort_index = np.argsort(similarity_val)[::-1]

    # Urutkan similarity val secara descending
    similarity_val = np.sort(similarity_val)[::-1]

    # Mendapatkan nilai terdekat
    n_similarity = rating_subtract_user_mean.index[sort_index[1: n+1]].tolist()

    # Mendapatkan n_similarity_val
    n_similarity_val = list(similarity_val[1:n+1])

    # Buat dictionary n_similarities
    n_similarities = {
            'n_similarity' : n_similarity,
            'n_similarity_val' : n_similarity_val
        }

    # Return n_similarities
    return n_similarities
 

In [91]:
list_neigh = get_n_neigh(rating_subtract_user_mean= rating_subtract_user_mean, userid= 5, n=5)
list_neigh

100%|██████████| 6040/6040 [00:01<00:00, 3247.45it/s]


{'n_similarity': [3821, 1150, 3899, 4161, 4574],
 'n_similarity_val': [0.17973251808242668,
  0.17289586872650187,
  0.17145119673589823,
  0.16858456566451308,
  0.16706617376309474]}

#### Melakukan Prediksi Rating pada Utility Matrix

In [90]:
pivot_df_ratings.head()

MovieIDs,1,2,3,4,5,6,7,9,10,11,...,3930,3932,3933,3937,3942,3943,3946,3948,3949,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [95]:
# Menghitung baseline dari u, j
# Definisikan placeholder
n = 5
total_rating_sim = 0
sim_sum = 0

# Iterasi untuk seluruh n user
for i in range(n):
    # Mendapatkan rating dari n-user
    neigh_rating = pivot_df_ratings.loc[list_neigh['n_similarity'][i],6]
    print(neigh_rating)

    # Abaikan nilai NaN
    if np.isnan(neigh_rating):
        continue

    # Menghitung nilai baseline (j, i)
    baseline = calculate_baseline_prediction(pivot_df_ratings= pivot_df_ratings,
                                             userid = list_neigh['n_similarity'][i], movieid=6)
    
    # Kurangi baseline dengan rating
    substracted_rating = neigh_rating - baseline

    # Kalikan dengan similarity
    rating_sim = list_neigh['n_similarity_val'][i]*substracted_rating

    # Jumlah similarity dikalikan dengan rating
    total_rating_sim += rating_sim

    # Jumlah total similarity
    sim_sum += list_neigh['n_similarity_val'][i]

rating_user_item_predicted = baseline + (total_rating_sim / sim_sum)

print(f'Prediksi rating untuk UserID {UserID}, dan MovieID {MovieIDs} adalah {round(rating_user_item_predicted,2)}')


3.0
2.0
nan
3.0
nan
Prediksi rating untuk UserID 5, dan MovieID 6 adalah 2.86


#### Membuat fungsi `rating_predicted_item`

In [101]:
def rating_predicted_item(userid, movieid, pivot_df_ratings, list_neigh, n,
                          max_rating = 5, min_rating = 1):
    """
    Fungsi untuk melakukan prediksi rating pada UserID dan MoviesIDs

    Parameters
    ---------
    userid : int,
        ID dari user pada utility matrix

    movieid : int,
        ID dari movie pada utility matrix

    pivot_df_ratings : pandas DataFrame,
        DataFrame rating dari user dan movies yang telah dilakukan pivot table

    list_neigh : list,
        n user terdekat dengan userid untuk rating movieid

    n : int,
        Jumlah n tetangga terdekat dengan userid

    max_rating : int (default=5),
        Rating maksimal yang dapat diberikan oleh User
    
    min_rating : int (default=1),
        Rating minimal yang dapati diberikan oleh User

    Returns
    -------
    rating_user_item_predicted : int,
        Prediksi rating dari movie pada userid dan movie tersebut   
    """
    # Menghapus user mean pada rating
    user_mean_ = pivot_df_ratings.mean(axis=0)
    rating_subtract_user_mean = (pivot_df_ratings - user_mean_).fillna(0)

    # Menghitung baseline (u, i)
    baseline = calculate_baseline_prediction(pivot_df_ratings= pivot_df_ratings,
                                            userid = userid, movieid= movieid)
    
    total_rating_sim = 0
    sim_sum = 0
    # Iterasi untuk mendapatkan jumlah rating
    for i in range(n):
    # Mendapatkan rating dari n-user
        neigh_rating = pivot_df_ratings.loc[list_neigh['n_similarity'][i],movieid]
        print(neigh_rating)

        # Abaikan nilai NaN
        if np.isnan(neigh_rating):
            continue

        # Menghitung nilai baseline (j, i)
        baseline = calculate_baseline_prediction(pivot_df_ratings= pivot_df_ratings,
                                                userid = list_neigh['n_similarity'][i], movieid=movieid)
        
        # Kurangi baseline dengan rating
        substracted_rating = neigh_rating - baseline

        # Kalikan dengan similarity
        rating_sim = list_neigh['n_similarity_val'][i]*substracted_rating

        # Jumlah similarity dikalikan dengan rating
        total_rating_sim += rating_sim

        # Jumlah total similarity
        sim_sum += list_neigh['n_similarity_val'][i]
    
    # Defence terhadap ZeroDivisionError
    try :
        rating_user_item_predicted = baseline + (total_rating_sim / sim_sum)

    except ZeroDivisionError:
        rating_user_item_predicted = baseline

    # Cek terhadap batas dari rating (max_rating / min_rating)
    if rating_user_item_predicted > max_rating:
        rating_user_item_predicted = max_rating

    elif rating_user_item_predicted < min_rating:
        rating_user_item_predicted = min_rating

    return round(rating_user_item_predicted,2)

In [102]:
# Melakukan pengecekan fungsi `rating_predicted_item`
rating_predicted_item(userid = 5, movieid = 6,
                      pivot_df_ratings = pivot_df_ratings,
                      list_neigh = list_neigh, n = 5)

3.0
2.0
nan
3.0
nan


2.86

#### Generate Recommendation

In [106]:
user_id = 1
# Membuat placeholder (dataframe) untuk hasil prediksi
df_pred = pd.DataFrame()

# Membuat placeholder (list) untuk hasil prediksi
pred_rating = []

# Iterasi untuk semua movie unrated
masked = np.isnan(pivot_df_ratings.loc[user_id])
pivot_df_ratings

for mov in tqdm(pivot_df_ratings.columns[masked]):
    # Prediksi rating movie
    predictions = rating_predicted_item(userid= user_id, movieid= mov,
                                        pivot_df_ratings= pivot_df_ratings,
                                        list_neigh= list_neigh, n = n)
    
    # Tambahkan rating ke placeholder
    pred_rating.append(predictions)

# Assign ke MovieID
df_pred['MovieIDs'] = pivot_df_ratings.columns[masked]

# Assign hasil dari prediksi
df_pred['pred_rating'] = pred_rating


  0%|          | 1/2191 [00:00<23:21,  1.56it/s]

2.0
nan
nan
nan
nan


  0%|          | 2/2191 [00:01<23:36,  1.55it/s]

nan
nan
nan
nan
nan


  0%|          | 3/2191 [00:01<23:47,  1.53it/s]

nan
2.0
nan
nan
nan


  0%|          | 4/2191 [00:02<23:43,  1.54it/s]

nan
nan
nan
nan
nan


  0%|          | 5/2191 [00:03<24:21,  1.50it/s]

3.0
2.0
nan
3.0
nan


  0%|          | 6/2191 [00:03<24:06,  1.51it/s]

nan
nan
nan
nan
nan


  0%|          | 7/2191 [00:04<23:57,  1.52it/s]

nan
nan
nan
nan
nan


  0%|          | 8/2191 [00:05<23:51,  1.52it/s]

nan
2.0
nan
2.0
nan


  0%|          | 9/2191 [00:05<23:46,  1.53it/s]

nan
nan
nan
nan
nan


  0%|          | 10/2191 [00:06<23:48,  1.53it/s]

nan
nan
nan
nan
nan


  1%|          | 11/2191 [00:07<23:39,  1.54it/s]

nan
nan
nan
nan
nan


  1%|          | 12/2191 [00:07<23:38,  1.54it/s]

nan
nan
nan
nan
nan


  1%|          | 13/2191 [00:08<23:36,  1.54it/s]

nan
nan
nan
1.0
nan


  1%|          | 14/2191 [00:09<23:30,  1.54it/s]

3.0
3.0
nan
nan
nan


  1%|          | 15/2191 [00:09<23:41,  1.53it/s]

nan
2.0
nan
nan
nan


  1%|          | 16/2191 [00:10<24:11,  1.50it/s]

2.0
3.0
nan
nan
nan


  1%|          | 17/2191 [00:11<23:54,  1.52it/s]

nan
nan
nan
nan
nan


  1%|          | 18/2191 [00:11<23:53,  1.52it/s]

nan
nan
nan
1.0
nan


  1%|          | 19/2191 [00:12<23:52,  1.52it/s]

2.0
2.0
nan
3.0
nan


  1%|          | 20/2191 [00:13<23:43,  1.53it/s]

nan
3.0
nan
nan
nan


  1%|          | 21/2191 [00:13<23:55,  1.51it/s]

nan
nan
nan
nan
nan


  1%|          | 22/2191 [00:14<23:41,  1.53it/s]

nan
nan
nan
nan
nan


  1%|          | 23/2191 [00:15<23:37,  1.53it/s]

3.0
3.0
nan
nan
nan


  1%|          | 24/2191 [00:15<23:29,  1.54it/s]

nan
5.0
nan
nan
nan


  1%|          | 25/2191 [00:16<23:23,  1.54it/s]

nan
1.0
nan
nan
nan


  1%|          | 26/2191 [00:17<23:56,  1.51it/s]

2.0
4.0
nan
nan
nan


  1%|          | 27/2191 [00:17<23:48,  1.51it/s]

nan
1.0
nan
nan
nan


  1%|▏         | 28/2191 [00:18<23:42,  1.52it/s]

3.0
3.0
nan
5.0
3.0


  1%|▏         | 29/2191 [00:19<23:33,  1.53it/s]

3.0
1.0
nan
nan
nan


  1%|▏         | 30/2191 [00:19<23:35,  1.53it/s]

nan
4.0
nan
nan
nan


  1%|▏         | 31/2191 [00:20<23:31,  1.53it/s]

nan
nan
nan
3.0
nan


  1%|▏         | 32/2191 [00:20<23:31,  1.53it/s]

nan
2.0
3.0
nan
nan


  2%|▏         | 33/2191 [00:21<23:25,  1.54it/s]

nan
nan
nan
nan
nan


  2%|▏         | 34/2191 [00:22<23:24,  1.54it/s]

nan
4.0
nan
nan
nan


  2%|▏         | 35/2191 [00:22<23:44,  1.51it/s]

3.0
3.0
nan
nan
nan


  2%|▏         | 36/2191 [00:23<23:38,  1.52it/s]

3.0
4.0
nan
nan
4.0


  2%|▏         | 36/2191 [00:24<24:04,  1.49it/s]


KeyboardInterrupt: 