# Fazendo download do dataset

In [1]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, KFold, GridSearchCV
from surprise import SVD
from surprise import accuracy
import pandas as pd

In [2]:
df = pd.read_csv("animelist.csv" , usecols=['rating','anime_id', 'user_id'], nrows=175000)

In [3]:
df['rating'] = pd.to_numeric(df['rating'], downcast='unsigned')
df['anime_id'] = pd.to_numeric(df['anime_id'], downcast='unsigned')
df['user_id'] = pd.to_numeric(df['user_id'], downcast='unsigned')

In [4]:
df.sample(10)

Unnamed: 0,user_id,anime_id,rating
157402,526,24873,0
110406,372,29095,8
163109,546,28891,7
77902,275,1704,3
117964,402,35716,0
39407,146,10457,0
44431,156,4722,8
16669,60,5214,0
79067,281,3712,8
35025,129,13125,10


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 3 columns):
user_id     175000 non-null uint16
anime_id    175000 non-null uint16
rating      175000 non-null uint8
dtypes: uint16(2), uint8(1)
memory usage: 854.6 KB


In [6]:
df.describe()

Unnamed: 0,user_id,anime_id,rating
count,175000.0,175000.0,175000.0
mean,299.080383,16273.015514,4.068537
std,165.413132,13737.22437,3.977567
min,0.0,1.0,0.0
25%,155.0,2969.0,0.0
50%,306.0,12189.0,5.0
75%,440.0,30503.0,8.0
max,590.0,48456.0,10.0


In [7]:
user_extras = {"Jorge":"userlist.csv", "Juan":"juanlist.csv", "Lucas":"disneylist.csv", "Arthur":"furrylist.csv", "Felippe":"mlklist.csv", "Guilherme":"spaghettilist.csv", "Liu":"rekeesuilist.csv", "Tutu":"lolilist.csv"}

In [8]:
initial_max = df["user_id"].max()
for i in user_extras.items():
    max_user_id = df["user_id"].max()
    df_temp = pd.read_csv(i[1], usecols=['animeId','score'])
    df_temp["user_id"] = max_user_id + 1
    user_extras[i[0]] = max_user_id + 1
    df_temp = df_temp.rename(columns={"score": "rating", "animeId": "anime_id"})
    df = df.append(df_temp, sort=True)
df = df.reset_index(drop=True)

In [9]:
print(user_extras)

{'Jorge': 591, 'Juan': 592, 'Lucas': 593, 'Arthur': 594, 'Felippe': 595, 'Guilherme': 596, 'Liu': 597, 'Tutu': 598}


In [10]:
df.tail()

Unnamed: 0,anime_id,rating,user_id
176010,35073,7,598
176011,26243,6,598
176012,28927,6,598
176013,27775,7,598
176014,527,7,598


In [11]:
df["user_id"].value_counts(normalize=True)

436    0.014760
146    0.013402
240    0.012823
446    0.011408
172    0.010596
         ...   
449    0.000006
396    0.000006
106    0.000006
203    0.000006
443    0.000006
Name: user_id, Length: 545, dtype: float64

In [12]:
df["anime_id"].value_counts(normalize=True)

1535     0.002403
16498    0.002085
5114     0.002023
1575     0.001920
11757    0.001915
           ...   
2483     0.000006
434      0.000006
34979    0.000006
1713     0.000006
2047     0.000006
Name: anime_id, Length: 9283, dtype: float64

In [13]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 10))

# The columns must correspond to user id, item id and ratings (in that order).
df = Dataset.load_from_df(df[['user_id', 'anime_id', 'rating']], reader)

In [14]:
from collections import defaultdict

trainset = df.build_full_trainset()
algo = SVD(n_epochs=15, lr_all=0.004, reg_all=0.15)
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

accuracy.rmse(predictions, verbose=True)

RMSE: 1.8434


1.843361706086956

In [15]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def get_user_top_n(predictions, user_id, n=10):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        if uid == user_id:
            top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [16]:
top_n = get_top_n(predictions, n=10)
df_animes = pd.read_csv("anime.csv" , usecols=['MAL_ID', 'Name'])


In [19]:
df_custom = pd.DataFrame(columns=['User','Name'])
key_list = list(user_extras.keys())
for i in range(initial_max+1, max_user_id+2) :
    reco_list = top_n[i]
    for j in reco_list:
        anime = df_animes.loc[df_animes['MAL_ID'] == j[0]]
        df_custom = df_custom.append({
             "Name": anime["Name"].values[0],
             "User": key_list[i-initial_max-1]
              }, ignore_index=True)
df_custom = df_custom.sort_values("User")
df_custom.tail(30)

Unnamed: 0,User,Name
62,Liu,Tenki no Ko
64,Liu,Detective Conan Movie 07: Crossroad in the Anc...
65,Liu,Ansatsu Kyoushitsu 2nd Season
66,Liu,Baccano! Specials
67,Liu,Ansatsu Kyoushitsu
60,Liu,Baccano!
68,Liu,Dragon Ball Z Movie 07: Kyokugen Battle!! Sand...
69,Liu,Shingeki no Kyojin: The Final Season
63,Liu,Isekai Quartet 2
61,Liu,Grand Blue


In [None]:
from surprise import SVD, Dataset, NormalPredictor, Reader, accuracy
from surprise.model_selection import cross_validate, KFold, GridSearchCV

param_grid = {'n_epochs': [10, 15, 25], 'lr_all': [0.002, 0.004, 0.008],
              'reg_all': [0.03, 0.09, 0.15]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(df)


# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])


In [None]:
estim = gs.best_estimator['rmse']

In [None]:
estim.fit(trainset)
predictions_best = estim.test(testset)
accuracy.rmse(predictions_best, verbose=True)

In [None]:
df_custom = pd.DataFrame(columns=['User','Name', 'Rating'])
key_list = list(user_extras.keys())
for i in range(initial_max+1, max_user_id+2) :
    reco_list = top_n[i]
    for j in reco_list:
        anime = df_animes.loc[df_animes['MAL_ID'] == j[0]]
        df_custom = df_custom.append({
             "Name": anime["Name"].values[0],
             "Rating":  j[1],
             "User": key_list[i-initial_max-1]
              }, ignore_index=True)
df_custom = df_custom.sort_values("User")
df_custom