## Collaborative Filtering Approach

### 1. Import Module

In [65]:
import pandas as pd
import numpy as np

### 2. Import Data

In [66]:
# Define column name 
m_cols = ['MovieIDS', 'Title', 'Genre']
r_cols = ['UserID', 'MovieIDs', 'Ratings', 'Timestamp']
u_cols = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']

# Read csv of movies, ratings and users
df_movies = pd.read_csv('../data/movies.csv', sep='::', names=m_cols, encoding='latin-1', index_col=None, engine='python')
df_ratings = pd.read_csv('../data/ratings.csv', sep='::', names=r_cols, encoding='latin-1', index_col=None, engine='python')
df_users = pd.read_csv('../data/users.csv', sep='::', names=u_cols, encoding='latin-1', index_col=None, engine='python')

In [67]:
# Cek shape dari dataframe
print(f"Shape dari movies : {df_movies.shape}")
print(f"Shape dari rating : {df_ratings.shape}")
print(f"Shape dari user : {df_users.shape}")

Shape dari movies : (3883, 3)
Shape dari rating : (1000209, 4)
Shape dari user : (6040, 5)


### 3. Pengecekan `"df_ratings"`

In [68]:
# Menampilkan df_ratings
df_ratings.head()

Unnamed: 0,UserID,MovieIDs,Ratings,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [69]:
df_ratings.describe()

Unnamed: 0,UserID,MovieIDs,Ratings,Timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


In [70]:
# Drop kolom yang tidak dibutuhkan untuk pemodelan 'Timestamp'
df_ratings = df_ratings.drop('Timestamp', axis=1)

In [71]:
# Cek tipe dari masing masing kolom
df_ratings.dtypes

UserID      int64
MovieIDs    int64
Ratings     int64
dtype: object

Untuk memodelkan rating dengan skala 0-5, diperlukan nilai float untuk mengakomodir angka desimal

In [72]:
# Konversi kolom "Ratings" menjadi float
df_ratings["Ratings"] = df_ratings["Ratings"].astype(float)

Handling terhadap data duplikat

In [87]:
# Menghitung record yang duplikat
df_ratings.duplicated().sum()

0

### 4. Membuat fungsi `import_rating_data`

In [88]:
def import_rating_data(path, frac=5e-5):
    """
    Fungsi untuk import rating data, pengecekan terhadap shape, mengganti tipe kolom rating menjadi float, dan duplikasi data

    Parameters
    ----------
    path : str
        Lokasi (path) data ratings disimpan

    Returns
    -------
    rating_df : pandas DataFrame
        Sample dari rating data    
    """
    # Load data
    r_cols = ['UserID', 'MovieIDs', 'Ratings', 'Timestamp']
    df_rating_raw = pd.read_csv(path, sep='::', names=r_cols, encoding='latin-1', index_col=None, engine='python')
    print('Shape data awal :', df_rating_raw.shape)

    # Drop kolom timestamp
    df_rating = df_rating_raw.drop(columns=['Timestamp'], axis=1)
    print('Shape data drop kolom :', df_rating.shape)

    # Typecasting kolom 'Rating' menjadi float
    df_rating["Ratings"] = df_rating["Ratings"].astype(float)

    # Sample movie
    # Mengambil movieID yang unik pada sample data
    unique_movie_id = df_rating['MovieIDs'].sample(frac=5e-5)
    sample_data = df_rating.loc[df_rating['MovieIDs'].isin(unique_movie_id)]

    return sample_data


In [89]:
# Import rating data
df_ratings = import_rating_data(path = '../data/ratings.csv')

Shape data awal : (1000209, 4)
Shape data drop kolom : (1000209, 3)


In [90]:
# Cek tabel df_rating
df_ratings.shape

(45073, 3)

### Pemodelan Recommender System : Collaborative Filterting

#### User to User Collaborative Filtering

In [91]:
# Pivot rating data untuk mendapatkan interaksi rating user dan item
pivot_df_ratings = df_ratings.pivot(index= 'UserID', columns='MovieIDs', values='Ratings')

In [92]:
# Mengecek pivot_df_ratings
pivot_df_ratings.head()

MovieIDs,34,253,292,296,457,527,661,908,1005,1036,...,2988,3175,3253,3386,3471,3504,3695,3699,3703,3705
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,5.0,3.0,,,,...,,,,,,,,,,
2,,,3.0,,4.0,,,,,,...,,,,,5.0,,,2.0,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,4.0,,,4.0,,,,4.0,,,...,,,,2.0,,,,,,


In [93]:
# Mengecek shape dari pivot_df_ratings
pivot_df_ratings.shape

(5766, 50)

In [94]:
# Total rating data null
pivot_df_ratings.isnull().sum().sum()

243227

#### Membuat fungsi `utility_data_preprocessing`

In [95]:
def utility_data_preprocessing(path):
    """"
    Fungsi untuk melakukan pivot dari df_rating menjadi bentuk utility metrics

    Parameters
    ----------
    path : str
        Lokasi (path) menyimpan rating data

    Returns
    -------
    pivot_df_ratings : pandas DataFrame
        Rating data dalam belum tabel yang sudah di pivot
    """
    # Import data

    df_rating = import_rating_data(path)

    # Pivot tabel
    pivot_df_ratings = df_rating.pivot(index= 'UserID', columns='MovieIDs', values='Ratings')

    # Shape data yang telah di pivot
    print('Shaped data setelah di pivot : ', pivot_df_ratings.shape)

    # Menghitung missing values
    print('Jumlah missing values pada utility metrics :', pivot_df_ratings.isnull().sum().sum())

    # Return pivot_df_ratings
    return pivot_df_ratings


In [96]:
# Output fungsi pivot data
pivot_df_ratings = utility_data_preprocessing(path = '../data/ratings.csv')

Shape data awal : (1000209, 4)
Shape data drop kolom : (1000209, 3)
Shaped data setelah di pivot :  (5855, 50)
Jumlah missing values pada utility metrics : 247169


In [97]:
pivot_df_ratings.head()

MovieIDs,20,28,235,356,435,454,489,585,589,914,...,3186,3269,3355,3481,3504,3535,3684,3685,3698,3807
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,3.0,...,4.0,,,,,,,,,
2,,,3.0,5.0,,,,,4.0,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,1.0,,,,,,,...,,,,,,,,,,


Berikut merupakan fungsi untuk prediksi rating


$$
\begin{align*}
\hat{r_{ui}} = \text{baseline}_{ui} + \frac{\sum_{j \in N(u)} (\text{Similarity}(u,j) \cdot (r_{ji}-\text{baseline}_{ji}) }{\sum_{j \in N(u)} \text{Similarity}(u,j)}
& \\ \\
\text{baseline}_{ui} = \mu + \text{userbias}_{u} + \text{itembias}_{i}
\end{align*}
$$


dengan :    

- $\text{baseline}_{ui}$ : baseline ratings dari user **u** untuk item **i**
- $\hat{r_{ui}}$ : prediksi rating dari user **u** untuk item **i**
- $N(u)$ : Tetangga dari user **u**


$$
\begin{align*}
\text{userbias}_{u} = \mu - \text{user_average}_{u}
& \\ \\
\text{itembias}_{i} = \mu - \text{item_average}_{i}
\end{align*}
$$

with :    

- $\mu$ : global mean
- $\text{user_average}_{u}$ : rata-rata rating dari user **u**
- $\text{item_average}_{i}$ : rata-rata rating dari item **i**


#### Menghitung baseline prediksi

In [98]:
pivot_df_ratings.head()

MovieIDs,20,28,235,356,435,454,489,585,589,914,...,3186,3269,3355,3481,3504,3535,3684,3685,3698,3807
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,3.0,...,4.0,,,,,,,,,
2,,,3.0,5.0,,,,,4.0,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,1.0,,,,,,,...,,,,,,,,,,


In [106]:
# Menghitung baseline rating pada user 5 dan movie 6
UserID = 2
MovieIDs = 235

# Menghitung Global Mean
global_mean = df_ratings['Ratings'].mean()

# Menghitung User Mean
user_mean = pivot_df_ratings.loc[UserID,:].mean()

# Menghitung Item Mean
item_mean = pivot_df_ratings.loc[:, MovieIDs].mean()

# Output dari baseline
print(f'UserID {UserID} Mean : {round(user_mean,2)}')
print(f'MovieIDs {MovieIDs} Mean : {round(item_mean,2)}')
print(f'Global Mean : {round(global_mean,2)}')

UserID 2 Mean : 3.71
MovieIDs 235 Mean : 3.7
Global Mean : 3.89


In [107]:
# Menghitung bias dari user
bias_user = global_mean - user_mean

# Menghitung bias dari item
bias_item = global_mean - item_mean

# Output dari bias user dan item
print(f'UserID {UserID} Bias : {round(bias_user,2)}')
print(f'MovieIDs {MovieIDs} Bias : {round(bias_item,2)}')

UserID 2 Bias : 0.18
MovieIDs 235 Bias : 0.19


In [108]:
# Menghitung total dari baseline
baseline_user_item = global_mean + bias_user + bias_item

# Output dari total baseline
print(f'Total baseline rating untuk prediksi UserID {UserID} dan MovieIDs {MovieIDs} adalah {round(baseline_user_item,2)}')

Total baseline rating untuk prediksi UserID 2 dan MovieIDs 235 adalah 4.26


#### Membuat fungsi `calculate_baseline_prediction`

In [109]:
def calculate_baseline_prediction(pivot_df_ratings, userid, movieid,
                                  df_ratings=df_ratings):
    """"
    Fungsi untuk menghitung nilai baseline prediksi dari user dan movie

    Parameters
    ---------
    pivot_df_ratings : pandas Dataframe
        DataFrame rating dari user dan movies yang telah dilakukan pivot table

    userid : int
        UserID dari user yang akan dihitung biasnya

    movieid : int
        MovieIDs dari film yang akan dihitung biasnya

    df_ratings : pandas Dataframe
        DataFrame rating user dan film

    Returns
    -------
    baseline_user_item : int
        Nilai baseline terhadap prediksi user dan item
    """
    # Menghitung global mean
    global_mean = df_ratings['Ratings'].mean()

    # Menghitung User Mean
    user_mean = pivot_df_ratings.loc[userid,:].mean()

    # Menghitung Item Mean
    item_mean = pivot_df_ratings.loc[:, movieid].mean()

    # Menghitung bias dari user
    bias_user = global_mean - user_mean

    # Menghitung bias dari item
    bias_item = global_mean - item_mean

    # Menghitung total dari baseline
    baseline_user_item = global_mean + bias_user + bias_item

    return baseline_user_item

In [110]:
# Validasi fungsi terhadap perhitungan semula
baseline_user_item_func = calculate_baseline_prediction(pivot_df_ratings=pivot_df_ratings,
                                                   userid=UserID, movieid=MovieIDs)
# Output dari total baseline
print(f'Total baseline rating untuk prediksi UserID {UserID} dan MovieIDs {MovieIDs} adalah {round(baseline_user_item_func,2)}')

Total baseline rating untuk prediksi UserID 2 dan MovieIDs 235 adalah 4.26


#### Mencari Tetangga Terdekat

In [111]:
# Mencari 5 tetangga terdekat menggunakan Cosine Similarity
user_mean_ = pivot_df_ratings.mean(axis=0)
user_mean_

MovieIDs
20      2.537500
28      4.055866
235     3.696872
356     4.087967
435     2.606004
454     3.547330
489     2.474227
585     2.906699
589     4.058513
914     4.154088
919     4.247963
953     4.299040
1032    3.697143
1097    3.965183
1210    4.022893
1276    4.253763
1285    3.815691
1294    4.124659
1333    3.882674
1343    3.671033
1391    2.900372
1449    4.147186
1645    3.435835
1831    2.584708
1923    3.904449
1967    3.637184
1968    3.879792
2010    4.082474
2028    4.337354
2133    3.275416
2249    3.192469
2333    3.813149
2396    4.127480
2706    3.709863
2917    4.031746
2948    3.889754
3016    3.158837
3113    2.629433
3129    3.701389
3175    3.771412
3186    3.477958
3269    3.203354
3355    2.875839
3481    3.928623
3504    4.020349
3535    3.219048
3684    3.374302
3685    3.592661
3698    3.198630
3807    3.273585
dtype: float64

In [112]:
# Kurangi rating dengan user rating 
rating_subtract_user_mean = (pivot_df_ratings - user_mean_).fillna(0)
rating_subtract_user_mean.head()

MovieIDs,20,28,235,356,435,454,489,585,589,914,...,3186,3269,3355,3481,3504,3535,3684,3685,3698,3807
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.154088,...,0.522042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,-0.696872,0.912033,0.0,0.0,0.0,0.0,-0.058513,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,-3.087967,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
# Lakukan looping untuk menghitung seluruh similarity
from sklearn.metrics.pairwise import cosine_similarity

# Gunakan progress bar
from tqdm import tqdm

In [114]:
# Menghitung similarity score
n_user = len(rating_subtract_user_mean.index)
similarity_val = np.zeros(n_user)

# Menghitung rating vektor dari user 5
target_user = rating_subtract_user_mean.loc[UserID].values.reshape(1,-1)

# Iterasi terhadap seluruh user
for i, neigh in enumerate(tqdm(rating_subtract_user_mean.index)):
    # Mencari user vektor
    user_neigh = rating_subtract_user_mean.loc[neigh].values.reshape(1,-1)

    # Menghitung similarities (Menggunakan cosine similarity)
    similar_i = cosine_similarity(target_user, user_neigh)

    similarity_val[i] = similar_i

100%|██████████| 5855/5855 [00:01<00:00, 4463.93it/s]


In [115]:
# Urutkan similarity_val secara descending
sort_index = np.argsort(similarity_val)[::-1]

# Jumlah nilai terdekat yang dicari
n = 5

# Mendapatkan nilai terdekat
n_similarity = rating_subtract_user_mean.index[sort_index[1: n+1]]
n_similarity

Int64Index([2580, 14, 2502, 79, 2834], dtype='int64', name='UserID')

In [116]:
print(f'Berikut {5} User yang memiliki kesamaan dengan User ID {UserID} :')
for i in range(len(n_similarity)):
    print(f'- User {n_similarity[i]}')

Berikut 5 User yang memiliki kesamaan dengan User ID 2 :
- User 2580
- User 14
- User 2502
- User 79
- User 2834


#### Membuat fungsi `get_n_neigh`

In [117]:
def get_n_neigh(rating_subtract_user_mean, userid, n=5):
    """
    Fungsi yang digunakan untuk mencari n user dengan nilai terdekat

    Parameters
    ----------
    rating_subtract_user_mean : pandas DataFrame,
        Rating dari user yang telah dikurangi dengan bias dari user
    
    userid : int,
        ID dari user pada utility matrix    

    n : int,
        Jumlah n user yang nilainya paling dekat dengan userid

    Returns
    -------
    n_similarities : dict,
        n_similarity --> Berisi n user dengan id termirip dengan userid berdasarkan rating
        similarity_val --> Nilai rating dari n user tersebut
    """
    # Menghitung similarity score
    n_user = len(rating_subtract_user_mean.index)
    similarity_val = np.zeros(n_user)

    # Menghitung rating vektor dari user 5
    target_user = rating_subtract_user_mean.loc[userid].values.reshape(1,-1)

    # Iterasi terhadap seluruh user
    for i, neigh in enumerate(tqdm(rating_subtract_user_mean.index)):
        # Mencari user vektor
        user_neigh = rating_subtract_user_mean.loc[neigh].values.reshape(1,-1)

        # Menghitung similarities (Menggunakan cosine similarity)
        similar_i = cosine_similarity(target_user, user_neigh)

        similarity_val[i] = similar_i

    # Urutkan similarity_val secara descending
    sort_index = np.argsort(similarity_val)[::-1]

    # Urutkan similarity val secara descending
    similarity_val = np.sort(similarity_val)[::-1]

    # Mendapatkan nilai terdekat
    n_similarity = rating_subtract_user_mean.index[sort_index[1: n+1]].tolist()

    # Mendapatkan n_similarity_val
    n_similarity_val = list(similarity_val[1:n+1])

    # Buat dictionary n_similarities
    n_similarities = {
            'n_similarity' : n_similarity,
            'n_similarity_val' : n_similarity_val
        }

    # Return n_similarities
    return n_similarities
 

In [118]:
list_neigh = get_n_neigh(rating_subtract_user_mean= rating_subtract_user_mean, userid=2, n=5)
list_neigh

  0%|          | 0/5855 [00:00<?, ?it/s]

100%|██████████| 5855/5855 [00:01<00:00, 4390.29it/s]


{'n_similarity': [2580, 14, 2502, 79, 2834],
 'n_similarity_val': [0.853325994145441,
  0.843861248775576,
  0.8419274541862345,
  0.8419274541862345,
  0.8419274541862345]}

#### Melakukan Prediksi Rating pada Utility Matrix

In [119]:
pivot_df_ratings.head()

MovieIDs,20,28,235,356,435,454,489,585,589,914,...,3186,3269,3355,3481,3504,3535,3684,3685,3698,3807
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,3.0,...,4.0,,,,,,,,,
2,,,3.0,5.0,,,,,4.0,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,1.0,,,,,,,...,,,,,,,,,,


In [144]:
# Menghitung baseline dari u, j
# Definisikan placeholder
n = 5
total_rating_sim = 0
sim_sum = 0

# Iterasi untuk seluruh n user
for i in range(n):
    # Mendapatkan rating dari n-user
    neigh_rating = pivot_df_ratings.loc[list_neigh['n_similarity'][i],235]
    print(neigh_rating)

    # Abaikan nilai NaN
    if np.isnan(neigh_rating):
        continue

    # Menghitung nilai baseline (j, i)
    baseline = calculate_baseline_prediction(pivot_df_ratings= pivot_df_ratings,
                                             userid = list_neigh['n_similarity'][i], movieid=235)
    
    # Kurangi baseline dengan rating
    substracted_rating = neigh_rating - baseline

    # Kalikan dengan similarity
    rating_sim = list_neigh['n_similarity_val'][i]*substracted_rating

    # Jumlah similarity dikalikan dengan rating
    total_rating_sim += rating_sim

    # Jumlah total similarity
    sim_sum += list_neigh['n_similarity_val'][i]

rating_user_item_predicted = baseline + (total_rating_sim / sim_sum)

print(f'Prediksi rating untuk UserID {UserID}, dan MovieID {MovieIDs} adalah {round(rating_user_item_predicted,2)}')


nan
nan
nan
nan
nan


ZeroDivisionError: division by zero

#### Membuat fungsi `rating_predicted_item`

In [36]:
def rating_predicted_item(userid, movieid, pivot_df_ratings, list_neigh, n,
                          max_rating = 5, min_rating = 1):
    """
    Fungsi untuk melakukan prediksi rating pada UserID dan MoviesIDs

    Parameters
    ---------
    userid : int,
        ID dari user pada utility matrix

    movieid : int,
        ID dari movie pada utility matrix

    pivot_df_ratings : pandas DataFrame,
        DataFrame rating dari user dan movies yang telah dilakukan pivot table

    list_neigh : list,
        n user terdekat dengan userid untuk rating movieid

    n : int,
        Jumlah n tetangga terdekat dengan userid

    max_rating : int (default=5),
        Rating maksimal yang dapat diberikan oleh User
    
    min_rating : int (default=1),
        Rating minimal yang dapati diberikan oleh User

    Returns
    -------
    rating_user_item_predicted : int,
        Prediksi rating dari movie pada userid dan movie tersebut   
    """
    # Menghapus user mean pada rating
    user_mean_ = pivot_df_ratings.mean(axis=0)
    rating_subtract_user_mean = (pivot_df_ratings - user_mean_).fillna(0)

    # Menghitung baseline (u, i)
    baseline = calculate_baseline_prediction(pivot_df_ratings= pivot_df_ratings,
                                            userid = userid, movieid= movieid)
    
    total_rating_sim = 0
    sim_sum = 0
    # Iterasi untuk mendapatkan jumlah rating
    for i in range(n):
    # Mendapatkan rating dari n-user
        neigh_rating = pivot_df_ratings.loc[list_neigh['n_similarity'][i],movieid]
        print(neigh_rating)

        # Abaikan nilai NaN
        if np.isnan(neigh_rating):
            continue

        # Menghitung nilai baseline (j, i)
        baseline = calculate_baseline_prediction(pivot_df_ratings= pivot_df_ratings,
                                                userid = list_neigh['n_similarity'][i], movieid=movieid)
        
        # Kurangi baseline dengan rating
        substracted_rating = neigh_rating - baseline

        # Kalikan dengan similarity
        rating_sim = list_neigh['n_similarity_val'][i]*substracted_rating

        # Jumlah similarity dikalikan dengan rating
        total_rating_sim += rating_sim

        # Jumlah total similarity
        sim_sum += list_neigh['n_similarity_val'][i]
    
    # Defence terhadap ZeroDivisionError
    try :
        rating_user_item_predicted = baseline + (total_rating_sim / sim_sum)

    except ZeroDivisionError:
        rating_user_item_predicted = baseline

    # Cek terhadap batas dari rating (max_rating / min_rating)
    if rating_user_item_predicted > max_rating:
        rating_user_item_predicted = max_rating

    elif rating_user_item_predicted < min_rating:
        rating_user_item_predicted = min_rating

    return round(rating_user_item_predicted,2)

In [121]:
# Melakukan pengecekan fungsi `rating_predicted_item`
rating_predicted_item(userid = 2, movieid = 235,
                      pivot_df_ratings = pivot_df_ratings,
                      list_neigh = list_neigh, n = 5)

nan
nan
nan
nan
nan


4.26

#### Generate Recommendation

In [124]:
user_id = 2
# Membuat placeholder (dataframe) untuk hasil prediksi
df_pred = pd.DataFrame()

# Membuat placeholder (list) untuk hasil prediksi
pred_rating = []

# Iterasi untuk semua movie unrated
masked = np.isnan(pivot_df_ratings.loc[user_id])
pivot_df_ratings

for mov in tqdm(pivot_df_ratings.columns[masked]):
    # Prediksi rating movie
    predictions = rating_predicted_item(userid= user_id, movieid= mov,
                                        pivot_df_ratings= pivot_df_ratings,
                                        list_neigh= list_neigh, n = n)
    
    # Tambahkan rating ke placeholder
    pred_rating.append(predictions)

# Assign ke MovieID
df_pred['MovieIDs'] = pivot_df_ratings.columns[masked]

# Assign hasil dari prediksi
df_pred['pred_rating'] = pred_rating


 21%|██        | 9/43 [00:00<00:00, 79.40it/s]

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


100%|██████████| 43/43 [00:00<00:00, 103.59it/s]

nan
nan
nan
nan
nan
4.0
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan





In [125]:
# Urutkan nilai rating secara descending
n_item = 5
df_pred = (df_pred.
           sort_values('pred_rating', ascending=False).
           head(n_item))

df_pred

Unnamed: 0,MovieIDs,pred_rating
0,20,5.0
16,1391,5.0
2,435,5.0
4,489,5.0
5,585,5.0


#### Membuat fungsi `recommendation_movies`

In [128]:
def recommendation_movies(pivot_df_ratings, userid, n, n_item,
                          recommend_seen = False):
    """
    Fungsi untuk mendapatkan rekomendasi berdasarkan userid

    Parameters
    ----------
    pivot_df_ratings : pandas DataFrame,
        Rating data dalam belum tabel yang sudah di pivot

    userid : int,
        UserID dari user yang akan dihitung biasnya

    n : int,
        Jumlah n user yang nilainya paling dekat dengan userid

    n_item : int,
        Jumlah item dalam integer
    
    recommend_seen : bool,
        Nilai default adalah False
    """
    # Mencari nilai tetangga terdekat
    neigh_data = get_n_neigh(rating_subtract_user_mean=rating_subtract_user_mean,
                             userid=userid, n=n)
    
    # Membuat placeholder (dataframe) untuk hasil prediksi
    df_pred = pd.DataFrame()

    # Membuat placeholder (list) untuk hasil prediksi
    pred_rating = []

    # Iterasi untuk semua movie unrated
    masked = np.isnan(pivot_df_ratings.loc[userid])
    item_to_predict = pivot_df_ratings.columns[masked]

    if recommend_seen :
        item_to_predict = pivot_df_ratings.columns

    # Iterasi untuk semua movie
    for mov in tqdm(item_to_predict):
        # Prediksi rating movie
        predictions = rating_predicted_item(userid = userid, movieid = mov,
                                            pivot_df_ratings = pivot_df_ratings,
                                            list_neigh= list_neigh, n = n)
        
        # Tambahkan rating ke placeholder
        pred_rating.append(predictions)

    # Assign ke MovieID
    df_pred['MovieIDs'] = pivot_df_ratings.columns[masked]

    # Assign hasil dari prediksi
    df_pred['pred_rating'] = pred_rating

    df_pred = (df_pred
            .sort_values('pred_rating',ascending=False)
            .head(n_item))

    return df_pred

In [132]:
user_5_rec = recommendation_movies(pivot_df_ratings=pivot_df_ratings, userid=2, n=5, n_item=5,
                                   recommend_seen=False)

user_5_rec

  0%|          | 0/5855 [00:00<?, ?it/s]

100%|██████████| 5855/5855 [00:01<00:00, 3849.59it/s]
 28%|██▊       | 12/43 [00:00<00:00, 116.15it/s]

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
4.0
nan
nan
nan
nan


 84%|████████▎ | 36/43 [00:00<00:00, 96.95it/s] 

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


100%|██████████| 43/43 [00:00<00:00, 99.46it/s]

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan





Unnamed: 0,MovieIDs,pred_rating
0,20,5.0
16,1391,5.0
2,435,5.0
4,489,5.0
5,585,5.0


#### Konversi ke Movie Data

In [137]:
def import_movie_data(movie_path, m_cols):
    """
    Fungsi yang digunakan untuk import movie.csv 

    Parameters
    ----------
    movie_path : str
        Path movie.csv

    m_cols : list
        Nama kolom dari movies.csv
    Returns
    -------
    movie_df : pandas DataFrame
        DataFrame dari movie
    """
    # Import data
    movie_df = pd.read_csv(movie_path, sep='::', names=m_cols, encoding='latin-1', index_col=None, engine='python')
    
    print('Shape dari movie_df :', movie_df.shape)
    return movie_df

In [140]:
# Import movie.csv
movie_path = '../data/movies.csv'
m_cols = ['MovieIDs', 'Title', 'Genre']

# Ambil DataFrame dari movies.csv
movie_df = import_movie_data(movie_path = movie_path, m_cols = m_cols)
movie_df.head()

Shape dari movie_df : (3883, 3)


Unnamed: 0,MovieIDs,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [141]:
user_5_rec['MovieName'] = movie_df.loc[user_5_rec['MovieIDs'], 'Title'].values
user_5_rec['Genres'] = movie_df.loc[user_5_rec['MovieIDs'], 'Genre'].values

user_5_rec

Unnamed: 0,MovieIDs,pred_rating,MovieName,Genres
0,20,5.0,Get Shorty (1995),Action|Comedy|Drama
16,1391,5.0,Mother (1996),Comedy
2,435,5.0,Dangerous Game (1993),Drama
4,489,5.0,Menace II Society (1993),Action|Crime|Drama
5,585,5.0,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller


#### Train Recommender System