## Collaborative Filtering Approach

### 1. Import Module

In [31]:
import pandas as pd
import numpy as np

### 2. Import Data

In [32]:
# Column labels handed to `names=` for each of the three input files.
# NOTE(review): the movie-id column is spelled 'MovieIDS' here but 'MovieIDs'
# in r_cols — confirm whether later joins depend on that difference.
m_cols = ['MovieIDS', 'Title', 'Genre']
r_cols = ['UserID', 'MovieIDs', 'Ratings', 'Timestamp']
u_cols = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']

# Parser options shared by all three files. The '::' delimiter is longer than
# one character, so it is parsed with the python engine.
_csv_opts = dict(sep='::', encoding='latin-1', index_col=None, engine='python')

# Load movies, ratings and users into separate DataFrames
df_movies = pd.read_csv('../data/movies.csv', names=m_cols, **_csv_opts)
df_ratings = pd.read_csv('../data/ratings.csv', names=r_cols, **_csv_opts)
df_users = pd.read_csv('../data/users.csv', names=u_cols, **_csv_opts)

In [13]:
# Report the (rows, columns) shape of each loaded DataFrame, one per line
print(
    f"Shape dari movies : {df_movies.shape}",
    f"Shape dari rating : {df_ratings.shape}",
    f"Shape dari user : {df_users.shape}",
    sep="\n",
)

Shape dari movies : (3883, 3)
Shape dari rating : (1000209, 4)
Shape dari user : (6040, 5)


### 3. Pengecekan `"df_ratings"`

In [33]:
# Preview the first five rows of df_ratings
df_ratings.head(n=5)

Unnamed: 0,UserID,MovieIDs,Ratings,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_ratings.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,UserID,MovieIDs,Ratings
count,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564
std,1728.413,1096.041,1.117102
min,1.0,1.0,1.0
25%,1506.0,1030.0,3.0
50%,3070.0,1835.0,4.0
75%,4476.0,2770.0,4.0
max,6040.0,3952.0,5.0


In [34]:
# Drop the 'Timestamp' column, which is not needed for modelling.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# running it again is a no-op instead of raising KeyError.
df_ratings = df_ratings.drop(columns='Timestamp', errors='ignore')

In [36]:
# Check the dtype of each column in df_ratings
df_ratings.dtypes

UserID      int64
MovieIDs    int64
Ratings     int64
dtype: object

Untuk memodelkan rating dengan skala 0-5, diperlukan nilai float untuk mengakomodasi angka desimal.

In [38]:
# Cast the "Ratings" column to 64-bit float
df_ratings["Ratings"] = df_ratings["Ratings"].astype("float64")

Penanganan data duplikat

In [39]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
df_ratings.duplicated(keep="first").sum()

0

### 4. Membuat fungsi `import_rating_data`

In [None]:
def import_data_data(path, frac=0.01):
    """
    Fungsi untuk import rating data, pengecekan terhadap shape, dan duplikasi data

    Parameters
    ----------
    path : str
        Lokasi (path) data ratings disimpan

    Returns
    -------
    rating_df : pandas DataFrame
        Sample dari rating data    
    """
    # Load data
    df_rating_raw = pd.read_csv(path, sep='::', names=r_cols, encoding='latin-1', index_col=None, engine='python')
