Nama : Muhammad Aldi Surya Putra \
Asal : Universitas Pendidikan Indonesia

Anime Recommendation

# Mengambil dataset dari kaggle

## Menghubungkan colab dan kaggle

In [31]:
! pip install -q kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

## Download dan unzip dataset

In [32]:
! kaggle datasets download -d CooperUnion/anime-recommendations-database
! unzip anime-recommendations-database.zip -d anime

anime-recommendations-database.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  anime-recommendations-database.zip
replace anime/anime.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: anime/anime.csv         
replace anime/rating.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: anime/rating.csv        


In [33]:
# Import library yang dibutuhkan

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Data Exploratory

In [34]:
df = pd.read_csv('/content/anime/anime.csv')
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


## Mengecek informasi terkait dataset

In [35]:
print("*Jumlah kolom dan index*")
print(f"Index : {df.shape[0]}\n Kolom : {df.shape[1]}")
print("\n")
print("*Kolom pada dataframe*")
print(df.columns)
print("\n")
print("*Info pada kolom*")
print(df.info())
print("\n")
print("*Info dataframe*")
print(df.describe())

*Jumlah kolom dan index*
Index : 12294
 Kolom : 7


*Kolom pada dataframe*
Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')


*Info pada kolom*
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


*Info dataframe*
           anime_id        rating       members
count  12294.000000  12064.000000  1.229400e+04
mean   14058.221653      6.473902  1.807134e+04
std    11455.294701      1.026746  5.482068e+04
min        1.000000      1.670000  5.000000e+00
25%     3484.250000     

# Data Preparation

## Mengatasi missing value pada dataset

### Jumlah missing value pada tiap kolom dataset

In [36]:
print(df.isnull().sum())

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


### Drop data yang missing

In [37]:
df.dropna(inplace=True)

In [38]:
print(df.isnull().sum())

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


## Menghapus kolom - kolom yang tidak dibutuhkan

In [39]:
df.drop(['type', 'episodes', 'members'], axis=1, inplace=True)
df

Unnamed: 0,anime_id,name,genre,rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.25
3,9253,Steins;Gate,"Sci-Fi, Thriller",9.17
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",9.16
...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,4.15
12290,5543,Under World,Hentai,4.28
12291,5621,Violence Gekiga David no Hoshi,Hentai,4.88
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,4.98


## Sorting data berdasarkan 'anime_id'

In [40]:
df = df.sort_values('anime_id', ignore_index=True)

In [41]:
df

Unnamed: 0,anime_id,name,genre,rating
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",8.82
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",8.40
2,6,Trigun,"Action, Comedy, Sci-Fi",8.32
3,7,Witch Hunter Robin,"Action, Drama, Magic, Mystery, Police, Superna...",7.36
4,8,Beet the Vandel Buster,"Adventure, Fantasy, Shounen, Supernatural",7.06
...,...,...,...,...
12012,34476,Platonic Chain: Ansatsu Jikkouchuu,"Sci-Fi, Slice of Life",1.67
12013,34490,Sushi Azarashi,Comedy,3.00
12014,34503,Kochinpa! Dainiki,Comedy,3.40
12015,34514,Pokemon Generations,"Action, Adventure, Fantasy, Game, Kids",7.21


# Model Development

## Melakukan perhitungan IDF pada data kolom 'genre'

In [42]:
vectorizer = TfidfVectorizer()
genre_name = vectorizer.fit(df['genre'])
genre_name.get_feature_names()

['action',
 'adventure',
 'ai',
 'arts',
 'cars',
 'comedy',
 'dementia',
 'demons',
 'drama',
 'ecchi',
 'fantasy',
 'fi',
 'game',
 'harem',
 'hentai',
 'historical',
 'horror',
 'josei',
 'kids',
 'life',
 'magic',
 'martial',
 'mecha',
 'military',
 'music',
 'mystery',
 'of',
 'parody',
 'police',
 'power',
 'psychological',
 'romance',
 'samurai',
 'school',
 'sci',
 'seinen',
 'shoujo',
 'shounen',
 'slice',
 'space',
 'sports',
 'super',
 'supernatural',
 'thriller',
 'vampire',
 'yaoi',
 'yuri']

## Fit dan Transform kolom 'genre' menjadi vector

In [43]:
vectors = vectorizer.fit_transform(df['genre'])

## Mengubah vektor menjadi matriks

In [44]:
vectors.todense()

matrix([[0.31832632, 0.34131325, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.30162321, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.4901204 , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.32821436, 0.35191532, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

## Menghitung Cosine Similiarity pada matriks

In [45]:
cos_sim = cosine_similarity(vectors)

In [46]:
cos_sim.shape

(12017, 12017)

## Membuat Dataframe yang berisi Cosine Similiarity tiap anime

In [47]:
cos_sim_df = pd.DataFrame(vectors.todense(),
             columns=genre_name.get_feature_names(),
             index=df.name
).sample(len(genre_name.get_feature_names()), axis=1).sample(10, axis=0)

cos_sim_df

Unnamed: 0_level_0,power,drama,super,comedy,of,magic,supernatural,romance,harem,hentai,sci,vampire,school,space,samurai,life,shounen,mecha,slice,action,yaoi,horror,game,josei,ai,thriller,military,parody,psychological,police,martial,mystery,fi,shoujo,seinen,sports,adventure,ecchi,demons,yuri,dementia,kids,arts,historical,music,fantasy,cars
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
Zero no Tsukaima: Princesses no Rondo Picture Drama,0.0,0.274676,0.0,0.192524,0.0,0.369924,0.0,0.305905,0.454943,0.0,0.0,0.0,0.325522,0.0,0.0,0.0,0.0,0.0,0.0,0.241726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.259182,0.386895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.262361,0.0
Tales of the Abyss,0.0,0.597326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.563631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.570545,0.0
Beast Saga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.443362,0.0,0.0,0.0,0.0,0.0,0.467991,0.0,0.0,0.39431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.443362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.482043,0.0,0.0,0.0,0.0,0.0
Kinnikuman: Kessen! Shichinin no Seigi Choujin vs. Uchuu Nobushi,0.0,0.0,0.0,0.330097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.491906,0.0,0.0,0.414459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.690861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Souten no Ken Specials,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338678,0.0,0.0,0.285356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.556674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.556674,0.429065,0.0,0.0,0.0
Kanashiki Mongoose,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.636363,0.0,0.0,0.77139,0.0,0.0
Yuusha Tokkyuu Might Gaine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.393463,0.0,0.0,0.0,0.0,0.0,0.41532,0.504635,0.0,0.349931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.393463,0.0,0.0,0.0,0.375201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kangetsu Ittou: Akuryou Kiri,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.598025,0.0,0.0,0.0,0.0,0.273122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.532808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.532808,0.0,0.0,0.0,0.0
Busou Renkin,0.0,0.0,0.0,0.281099,0.0,0.0,0.498308,0.0,0.0,0.0,0.0,0.0,0.475287,0.0,0.0,0.0,0.41889,0.0,0.0,0.352939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.383067,0.0
Huckleberry Finn Monogatari,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Membuat fungsi untuk mendapatkan rekomendasi anime

In [50]:
def anime_recommendations(animeName, similarity_data=cos_sim_df, items=df[['name', 'genre']], k=15):
    # Mengambil data dengan menggunakan argpartition 
    index = similarity_data.loc[:,animeName].to_numpy().argpartition(
        range(-1, -k, -1))
    
    # Mengambil data dengan similarity terbesar dari index yang ada
    closest = similarity_data.columns[index[-1:-(k+2):-1]]
 
    return pd.DataFrame(closest).merge(items).head(k)

# Testing Model

## Melihat hasil rekomendasi 

In [51]:
anime_recommendation(df, 'One Piece')

10866    One Piece: Episode of Sabo - 3 Kyoudai no Kizu...
8028     One Piece: Episode of Merry - Mou Hitori no Na...
11                                               One Piece
430                                      One Piece Movie 1
994                One Piece: Umi no Heso no Daibouken-hen
5264                                       One Piece Recap
1128     One Piece: Oounabara ni Hirake! Dekkai Dekkai ...
7456                            One Piece: Glorious Island
469             Dragon Ball Movie 1: Shen Long no Densetsu
11162                   One Piece: Adventure of Nebulandia
5515                One Piece Film: Strong World Episode 0
11808            One Piece Film: Gold Episode 0 - 711 ver.
3329     One Piece Movie 9: Episode of Chopper Plus - F...
431          One Piece Movie 2: Nejimaki-jima no Daibouken
Name: name, dtype: object