# Import Libraries

In [1]:
import os
import pickle

import numpy as np
import optuna
import pandas as pd
from surprise import accuracy, Dataset, Reader, AlgoBase, SVD, KNNBaseline
from surprise.model_selection import cross_validate

# Load Dataset

In [2]:
# Single place to repoint the notebook at another copy of the dataset.
# NOTE(review): this is still a machine-specific absolute path — consider a
# repo-relative or configurable location.
DATA_DIR = '/home/adhitizki/playground/pacmann/pacmann_recsys/data/data_2'

# Ratings table (user_id, anime_id, rating) and the accompanying anime.csv table
df_rating = pd.read_csv(f'{DATA_DIR}/rating.csv')
df_desc = pd.read_csv(f'{DATA_DIR}/anime.csv')

# Data Preprocessing

In [3]:
# Thresholds used by the filters below
RATING_LOWER_BOUND = -1      # exclusive: rows with rating <= -1 are dropped
MIN_RATINGS_PER_USER = 10    # inclusive: users with fewer ratings are dropped

# Keep only rows whose rating is greater than -1
# (presumably -1 marks "watched but not rated" — verify against the dataset description)
df_rating = df_rating[df_rating['rating'] > RATING_LOWER_BOUND]

# Count the remaining ratings per user and keep users with at least 10 of them
user_id = df_rating['user_id'].value_counts()
user_id = user_id[user_id >= MIN_RATINGS_PER_USER].index
df_rating = df_rating[df_rating['user_id'].isin(user_id)]

# Re-number the rows once, after all filtering is done; neither filter depends
# on the index, so an intermediate reset would only add a full-frame copy
df_rating = df_rating.reset_index(drop=True)

In [4]:
# Inspect the filtered ratings; for a long frame pandas shows only the first and last rows
df_rating

Unnamed: 0,user_id,anime_id,rating
0,3,20,8
1,3,154,6
2,3,170,9
3,3,199,10
4,3,225,9
...,...,...,...
6278354,73515,13659,8
6278355,73515,14345,7
6278356,73515,16512,7
6278357,73515,17187,9


In [5]:
# Tell Surprise that ratings are expressed on a 1-to-10 scale
reader = Reader(rating_scale=(1, 10))

# Surprise expects exactly three columns, in the order user / item / rating
rating_triplets = df_rating[['user_id', 'anime_id', 'rating']].copy()
utility_data = Dataset.load_from_df(rating_triplets, reader)

In [6]:
### credit pacmann

import copy


def train_test_split(utility_data, test_size=0.2, random_state=42):
    """
    Shuffle the ratings of a Surprise dataset and split them into train and test parts.

    ref: https://surprise.readthedocs.io/en/stable/FAQ.html#split-data-for-unbiased-estimation-py

    Parameters
    ----------
    utility_data : Surprise utility data
        Dataset holding all ratings; it is deep-copied and left untouched

    test_size : float, default=0.2
        Fraction of the shuffled ratings placed in the test part

    random_state : int, default=42
        Seed passed to ``np.random.seed`` (this reseeds NumPy's global generator)

    Returns
    -------
    full_data : Surprise utility data
        Copy of ``utility_data`` whose ``raw_ratings`` hold only the training part

    train_data : Surprise format
        Trainset built from the training ratings

    test_data : Surprise format
        Testset constructed from the held-out ratings
    """
    # Seed the global NumPy generator so the shuffle below is repeatable
    np.random.seed(random_state)

    # Work on a private copy; its rating list is shuffled in place
    dataset = copy.deepcopy(utility_data)
    shuffled = dataset.raw_ratings
    np.random.shuffle(shuffled)

    # The first (1 - test_size) share goes to training, the remainder is held out
    cut = int((1 - test_size) * len(shuffled))
    held_out = shuffled[cut:]
    dataset.raw_ratings = shuffled[:cut]

    # Build the trainset first, then the testset, from the same dataset copy
    trainset = dataset.build_full_trainset()
    testset = dataset.construct_testset(held_out)

    return dataset, trainset, testset


In [7]:
# Hold out 20% of the ratings; `full_data` keeps only the training ratings
full_data, train_data, test_data = train_test_split(
    utility_data,
    test_size=0.2,
    random_state=42,
)

# Modelling

### - Mean Prediction (Baseline)

In [8]:
### credit pacmann

class MeanPrediction(AlgoBase):
    '''Baseline algorithm: predicts the trainset's global mean rating for every (user, item) pair.'''

    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        '''Store the trainset through ``AlgoBase.fit`` and return ``self``.

        Returning ``self`` follows Surprise's convention for custom algorithms,
        so ``MeanPrediction().fit(trainset)`` yields the fitted model instead of ``None``.
        '''
        AlgoBase.fit(self, trainset)
        return self

    def estimate(self, u, i):
        '''Return the global mean rating of the fitted trainset; ``u`` and ``i`` are ignored.'''
        return self.trainset.global_mean

In [9]:
# 5-fold cross-validation of the global-mean baseline on the training ratings
model_baseline = MeanPrediction()
cv_baseline = cross_validate(
    algo=model_baseline,
    data=full_data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

# Average the per-fold test RMSE into a single score
rmse_cv_baseline = cv_baseline['test_rmse'].mean()

Evaluating RMSE of algorithm MeanPrediction on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5695  1.5720  1.5722  1.5711  1.5722  1.5714  0.0010  
Fit time          0.00    1.98    1.96    2.00    2.00    1.59    0.80    
Test time         25.39   28.37   21.80   25.73   22.45   24.75   2.39    


In [10]:
# Create a fresh baseline instance and fit it on the training split
model_baseline = MeanPrediction()
model_baseline.fit(train_data)

In [11]:
# Predict the held-out test ratings with the baseline, then compute their RMSE
# (accuracy.rmse also prints the score, as seen in the cell output)
prediction = model_baseline.test(test_data)
rmse_baseline = accuracy.rmse(prediction)

RMSE: 1.5714


In [12]:
# Persist the fitted baseline. Create the target folder first so the cell does
# not raise FileNotFoundError when `model/` is missing (e.g. on a fresh checkout).
os.makedirs('model', exist_ok=True)
with open('model/model_baseline.pkl', 'wb') as f:
    pickle.dump(model_baseline, f)

### - SVD

In [13]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of an SVD model.

    Parameters
    ----------
    trial : Optuna trial
        Supplies the hyperparameter suggestions for this evaluation.

    Returns
    -------
    float
        Mean of the per-fold test RMSE obtained on ``full_data``.
    """
    # `step` is given by keyword: Optuna declares it keyword-only and
    # deprecates passing it positionally.
    n_factors = trial.suggest_int('n_factors', 10, 100, step=10)
    n_epochs = trial.suggest_int('n_epochs', 20, 100, step=20)

    # Log-scaled search between 1e-4 and 1 for both learning rate and regularisation.
    # NOTE(review): in the logged run, trials with lr_all of about 0.05 and 0.76
    # scored ~2.70 RMSE, worse than the mean baseline — consider a lower upper bound.
    lr_all = trial.suggest_float('lr_all', 1e-4, 1, log=True)
    reg_all = trial.suggest_float('reg_all', 1e-4, 1, log=True)

    # Create an SVD model with the suggested hyperparameters
    model = SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all)

    cv_results = cross_validate(model, full_data, measures=['RMSE'], cv=5, verbose=False)

    return cv_results['test_rmse'].mean()

# Search for the SVD hyperparameters that minimise the value returned by `objective`
study_svd = optuna.create_study(direction='minimize')
study_svd.optimize(
    objective,
    n_trials=10,
    timeout=3600,  # one hour, expressed in seconds
    n_jobs=3,
)

[I 2023-09-10 09:38:31,505] A new study created in memory with name: no-name-db333670-cf0e-4922-8d63-88537ea64d2e
[I 2023-09-10 10:35:31,271] Trial 1 finished with value: 2.7033869197111366 and parameters: {'n_factors': 10, 'n_epochs': 100, 'lr_all': 0.7606123446416528, 'reg_all': 0.0817987957195958}. Best is trial 1 with value: 2.7033869197111366.
[I 2023-09-10 10:37:21,061] Trial 2 finished with value: 1.256672632072373 and parameters: {'n_factors': 80, 'n_epochs': 60, 'lr_all': 0.0001361232602013595, 'reg_all': 0.0010238315064886156}. Best is trial 2 with value: 1.256672632072373.
[I 2023-09-10 10:37:24,518] Trial 0 finished with value: 1.166146896270838 and parameters: {'n_factors': 40, 'n_epochs': 80, 'lr_all': 0.0011428791120303447, 'reg_all': 0.09327928792477466}. Best is trial 0 with value: 1.166146896270838.
[I 2023-09-10 11:17:06,518] Trial 3 finished with value: 2.70338679735483 and parameters: {'n_factors': 70, 'n_epochs': 20, 'lr_all': 0.04585890337146581, 'reg_all': 0.005

In [14]:
# Rebuild SVD with the best trial's hyperparameters and fit it on the training split
model_svd = SVD(**study_svd.best_trial.params)
model_svd.fit(train_data)

# Score the tuned model on the held-out test split (this overwrites the baseline's `prediction`)
prediction = model_svd.test(test_data)
rmse_svd = accuracy.rmse(prediction)

RMSE: 1.1575


In [15]:
# Persist the fitted SVD model. Create the target folder first so the cell does
# not raise FileNotFoundError when `model/` is missing (e.g. on a fresh checkout).
os.makedirs('model', exist_ok=True)
with open('model/model_svd.pkl', 'wb') as f:
    pickle.dump(model_svd, f)

# Evaluation

In [17]:
# One record per model: its cross-validated RMSE and the hyperparameters behind it
summary_rows = [
    {
        'Model': 'Baseline',
        'CV Performance - RMSE': rmse_cv_baseline,
        'Model Configuration': 'N/A',
    },
    {
        'Model': 'SVD',
        'CV Performance - RMSE': study_svd.best_value,
        'Model Configuration': study_svd.best_trial.params,
    },
]
summary_df = pd.DataFrame(summary_rows)

summary_df

Unnamed: 0,Model,CV Performance - RMSE,Model Configuration
0,Baseline,1.571381,
1,SVD,1.166147,"{'n_factors': 40, 'n_epochs': 80, 'lr_all': 0...."


# Generate Prediction

In [25]:
# AnimeRecSys comes from the project-local `prediction` module (not shown in this notebook).
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from prediction import AnimeRecSys

# NOTE(review): `user_id` was previously bound to the index of active users in the
# preprocessing cell; here it is rebound to a single user id.
user_id = 10
# NOTE(review): the output's row index differs from `anime_id` — confirm that
# prediction.py joins `name`/`genre` on `anime_id` rather than on row position.
AnimeRecSys(user_id).top_predict()

Unnamed: 0,user_id,anime_id,predict_rating,name,genre
712,10,820,9.337494,Prince of Tennis: Another Story - Messages Fro...,"Comedy, School, Shounen, Sports"
3911,10,5114,9.161407,Pokemon Fushigi no Dungeon: Magnagate to Mugen...,"Adventure, Fantasy, Kids"
5783,10,9969,9.146776,Phantasm,"Dementia, Music"
5510,10,9253,9.145071,Kitty to Mimmy no Happy Birthday,"Fantasy, Kids"
6198,10,11061,9.119065,Renai Boukun,"Comedy, Romance, School, Supernatural"
805,10,918,9.111605,One Piece Movie 6: Omatsuri Danshaku to Himits...,"Adventure, Comedy, Fantasy, Shounen"
3480,10,4181,8.998173,Bayonetta: Bloody Fate,"Action, Demons, Fantasy, Super Power"
2607,10,2904,8.984767,Captain Future,"Action, Adventure, Sci-Fi, Shounen"
9,10,19,8.927449,Code Geass: Hangyaku no Lelouch,"Action, Mecha, Military, School, Sci-Fi, Super..."
2619,10,2921,8.890382,Ochamegami Monogatari: Korokoro Pollon,"Comedy, Magic, Parody"


: 