In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds

from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

from surprise.prediction_algorithms import knns
from surprise.prediction_algorithms import SVD

from surprise.similarities import cosine, msd, pearson

from surprise import accuracy
from surprise import Reader
from surprise import Dataset


# Preprocessing

In [2]:
# Read the anime metadata CSV from the local dataset folder.
anime_df = pd.read_csv(
    filepath_or_buffer='./anime-recommendations-database/anime.csv',
)
# Preview the first 50 rows of the frame.
anime_df.head(n=50)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855
7,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,110,9.11,80679
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,9.1,72534
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13,9.11,81109


In [3]:
# Inspect the column dtypes and non-null counts (together with the preview in
# the previous cell) to decide which unnecessary or extraneous data to remove.
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
anime_id    12294 non-null int64
name        12294 non-null object
genre       12232 non-null object
type        12269 non-null object
episodes    12294 non-null object
rating      12064 non-null float64
members     12294 non-null int64
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [4]:
# Read the per-user ratings CSV (columns: user_id, anime_id, rating).
rating_df = pd.read_csv(
    filepath_or_buffer='./anime-recommendations-database/rating.csv',
)
# Preview the first five rows (the default for head()).
rating_df.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


The two datasets need to be joined on `anime_id`. The combined data will then be reshaped into a format that is compatible with `surprise`.

In [5]:
# Stack the two frames vertically (row-wise) with a fresh 0..n-1 index and
# alphabetically sorted columns.
# NOTE(review): this appends rows; it does not match rows on anime_id.
# Columns present in only one frame are left empty for the other frame's rows
# (see user_id in the preview), and the two frames' `rating` columns end up in
# one shared column.
df_row = pd.concat(
    objs=(anime_df, rating_df),
    axis=0,
    ignore_index=True,
    sort=True,
)
df_row.head(n=5)

Unnamed: 0,anime_id,episodes,genre,members,name,rating,type,user_id
0,32281,1,"Drama, Romance, School, Supernatural",200630.0,Kimi no Na wa.,9.37,Movie,
1,5114,64,"Action, Adventure, Drama, Fantasy, Magic, Mili...",793665.0,Fullmetal Alchemist: Brotherhood,9.26,TV,
2,28977,51,"Action, Comedy, Historical, Parody, Samurai, S...",114262.0,Gintama°,9.25,TV,
3,9253,24,"Sci-Fi, Thriller",673572.0,Steins;Gate,9.17,TV,
4,9969,51,"Action, Comedy, Historical, Parody, Samurai, S...",151266.0,Gintama&#039;,9.16,TV,


With the two dataframes combined, the next step is to merge on `anime_id` so that every row corresponds to the correct anime.

In [None]:
# Attach the anime metadata to every user rating by matching on anime_id.
#
# The merge uses anime_df directly rather than df_row: df_row already has the
# rating rows appended to it, so merging it with rating_df again would pair
# every rating row with every other rating row for the same anime_id
# (a many-to-many blow-up).
#
# Both inputs carry a `rating` column, so explicit suffixes keep them apart:
#   rating_anime - the value from anime.csv (presumably the title's overall
#                  score; confirm against the dataset description)
#   rating_user  - the individual user's value from rating.csv
df_merge = pd.merge(
    anime_df,
    rating_df,
    on='anime_id',
    how='inner',  # keep only ratings whose anime_id exists in the catalogue
    suffixes=('_anime', '_user'),
)
df_merge.head(15)

In [None]:
# Build the Surprise dataset of (user, item, rating) triples.

# Reader() defaults to a 1-5 rating scale. User scores in this dataset are
# understood to be on a 1-10 scale (TODO confirm with
# rating_df['rating'].describe()).
reader = Reader(rating_scale=(1, 10))

# rating_df contains -1 values (visible in rating_df.head()). Per the dataset
# description these appear to mean "watched but not rated", so they are
# excluded rather than treated as real scores.
# load_from_df expects exactly three columns in (user, item, rating) order.
rating_explicit_df = rating_df.loc[
    rating_df['rating'] != -1,
    ['user_id', 'anime_id', 'rating'],
]
rating_data = Dataset.load_from_df(rating_explicit_df, reader)

# anime_df has no user column, so it cannot form a Surprise dataset by itself.
# The name is kept as an alias of the ratings dataset so that any later cell
# referring to `anime_data` still resolves.
anime_data = rating_data

# Dataset.split(n_folds=...) is no longer available in current Surprise
# releases; a KFold object is the replacement. Pass it as `cv=anime_split` to
# cross_validate / GridSearchCV, or iterate over anime_split.split(rating_data).
anime_split = KFold(n_splits=5, random_state=42)  # fixed seed for reproducible folds