# **Content Based Recommendation System**

In [None]:
# Import libraries
import pandas as pd
import numpy as np

## **Preprocessing the Data**

### Anime Dataset

In [None]:
# Load the cleaned anime catalogue from disk and preview its first rows
ANIME_CSV = "cleaned_anime.csv"
anime_df = pd.read_csv(ANIME_CSV)
anime_df.head(n=5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64.0,9.26,793665
1,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.25,114262
2,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24.0,9.17,673572
3,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.16,151266
4,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10.0,9.15,93351


In [None]:
# Remove the columns that are not needed for the recommender
unused_columns = ["type", "episodes", "rating", "members"]
anime_df = anime_df.drop(columns=unused_columns)
anime_df.head()

Unnamed: 0,anime_id,name,genre
0,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
1,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S..."
2,9253,Steins;Gate,"Sci-Fi, Thriller"
3,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S..."
4,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports"


In [None]:
# Discard every anime whose "genre" entry is missing
anime_df = anime_df.dropna(subset=["genre"])

In [None]:
# The genre strings are inconsistently delimited: some rows use ", " and others ",".
# Split on the comma and strip the whitespace around every entry so that any amount
# of spacing is tolerated, and skip empty entries (e.g. from a trailing comma).
def _split_genres(raw_genres):
    """Turn a comma-separated genre string into a list of clean genre names.

    Values that are not strings (missing values, or lists produced by a previous
    run of this cell) are returned unchanged, so the cell can be re-run safely.
    """
    if not isinstance(raw_genres, str):
        return raw_genres
    return [name.strip() for name in raw_genres.split(",") if name.strip()]


# Convert the genre column into lists so the genres can be one-hot encoded.
anime_df["genre"] = anime_df["genre"].map(_split_genres)

anime_df.head()

Unnamed: 0,anime_id,name,genre
0,5114,Fullmetal Alchemist: Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic, Mil..."
1,28977,Gintama°,"[Action, Comedy, Historical, Parody, Samurai, ..."
2,9253,Steins;Gate,"[Sci-Fi, Thriller]"
3,9969,Gintama&#039;,"[Action, Comedy, Historical, Parody, Samurai, ..."
4,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"[Comedy, Drama, School, Shounen, Sports]"


In [None]:
# One-hot encode the genre lists with scikit-learn's MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)
genre_indicators = mlb.fit_transform(anime_df["genre"])

# Wrap the sparse indicator matrix in a dataframe that shares anime_df's index,
# with one column per genre class learned by the binarizer
genre_frame = pd.DataFrame.sparse.from_spmatrix(
    genre_indicators,
    index=anime_df.index,
    columns=mlb.classes_,
)

# Attach the indicator columns, then drop the original genre column
anime_df = anime_df.join(genre_frame).drop("genre", axis=1)

In [None]:
# Preview the anime dataframe now that the genre column has been replaced by per-genre columns
anime_df.head()

Unnamed: 0,anime_id,name,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
0,5114,Fullmetal Alchemist: Brotherhood,1,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,28977,Gintama°,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,9253,Steins;Gate,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,9969,Gintama&#039;,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0


### Ratings Dataset

In [None]:
# Load the cleaned user ratings from disk and preview the first rows
RATING_CSV = "cleaned_rating.csv"
rating_df = pd.read_csv(RATING_CSV)
rating_df.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [None]:
# Discard every row that contains a missing value
rating_df = rating_df.dropna()

In [None]:
# Count the missing values left in each column of the ratings dataframe
rating_df.isna().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

## **Building the Recommendation System**

In [None]:
# Use the random library to pick a random user id
import random

# Set random seed (for reproducibility)
random.seed(10)

# Draw from the ids that actually occur in the ratings dataset. Picking an integer
# between the minimum and maximum id could land on an id that has no rows at all
# (ids need not be contiguous, and rows with missing ratings were dropped above).
candidate_user_ids = sorted(rating_df["user_id"].unique().tolist())
user = random.choice(candidate_user_ids)
user

4681

In [None]:
# Collect the ratings of a single user, renumber the rows from 0 and drop the
# user_id column, which is not needed once the rows are filtered.
# NOTE(review): the id 4271 is hard-coded here rather than taken from the randomly
# drawn `user` above -- TODO confirm which was intended; later cells may assume
# this specific user.
user_df = (
    rating_df[rating_df["user_id"] == 4271]
    .reset_index(drop=True)
    .drop("user_id", axis=1)
)

In [None]:
# Show the selected user's ratings (anime_id and rating columns)
user_df

Unnamed: 0,anime_id,rating
0,138,10.0
1,139,10.0
2,1535,8.0
3,1604,8.0
4,2251,9.0
5,6746,10.0
6,8129,10.0
7,9863,10.0
8,9969,10.0
9,11771,10.0


In [None]:
# Genre rows of every anime in anime_df that the user has rated
rated_mask = anime_df["anime_id"].isin(user_df["anime_id"])
user_genre_df = anime_df[rated_mask]
user_genre_df

Unnamed: 0,anime_id,name,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
3,9969,Gintama&#039;,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
28,1535,Death Note,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
48,16894,Kuroko no Basket 2nd Season,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
58,2251,Baccano!,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
59,16498,Shingeki no Kyojin,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
68,13601,Psycho-Pass,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71,18115,Magi: The Kingdom of Magic,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
87,11771,Kuroko no Basket,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
112,6746,Durarara!!,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
117,1604,Katekyo Hitman Reborn!,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [None]:
# Order the genre rows by anime_id and renumber them from 0 so that each row lines up
# with the same anime in the user's ratings dataframe
# (assumes user_df is itself ordered by anime_id -- TODO confirm)
user_genre_df = user_genre_df.sort_values("anime_id").reset_index(drop=True)
user_genre_df

Unnamed: 0,anime_id,name,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
0,1535,Death Note,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,1604,Katekyo Hitman Reborn!,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,2251,Baccano!,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,6746,Durarara!!,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,8129,Kuragehime,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,9863,SKET Dance,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,9969,Gintama&#039;,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,11771,Kuroko no Basket,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
8,11843,Danshi Koukousei no Nichijou,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
9,13601,Psycho-Pass,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Keep only the ratings whose anime has a row in user_genre_df, i.e. is categorised
# by at least one genre. Filtering by anime_id instead of dropping fixed row positions
# works for any user and leaves the result unchanged when the cell is re-run.
has_genre_row = user_df["anime_id"].isin(user_genre_df["anime_id"])

# Sort by anime_id and renumber from 0 so the rows line up with user_genre_df,
# which is ordered the same way.
user_df = user_df[has_genre_row].sort_values("anime_id").reset_index(drop=True)

# The dot product further down pairs the two frames row by row, so fail loudly
# here if they do not describe the same anime in the same order.
assert user_df["anime_id"].tolist() == user_genre_df["anime_id"].tolist(), (
    "user_df and user_genre_df are not aligned on anime_id"
)
user_df

Unnamed: 0,anime_id,rating
0,1535,8.0
1,1604,8.0
2,2251,9.0
3,6746,10.0
4,8129,10.0
5,9863,10.0
6,9969,10.0
7,11771,10.0
8,11843,10.0
9,13601,10.0


In [None]:
# Strip the identifier columns so only the genre indicator columns remain
identifier_columns = ["anime_id", "name"]
user_genre_matrix = user_genre_df.drop(columns=identifier_columns)
user_genre_matrix

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
8,0,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Vector of the user's ratings, one entry per row of user_df
user_df["rating"]

0      8.0
1      8.0
2      9.0
3     10.0
4     10.0
5     10.0
6     10.0
7     10.0
8     10.0
9     10.0
10    10.0
11    10.0
12    10.0
13    10.0
14    10.0
Name: rating, dtype: float64

In [None]:
# Genre weights: dot product of the transposed genre matrix (genres x rated anime)
# with the user's rating vector, giving one accumulated score per genre
genre_by_anime = user_genre_matrix.T
weights = genre_by_anime.dot(user_df["rating"])

weights

Action           87.0
Adventure        20.0
Cars              0.0
Comedy           87.0
Dementia          0.0
Demons            0.0
Drama            10.0
Ecchi             0.0
Fantasy          30.0
Game              0.0
Harem             0.0
Historical       19.0
Horror            0.0
Josei            10.0
Kids              0.0
Magic            20.0
Martial Arts      0.0
Mecha             0.0
Military          0.0
Music             0.0
Mystery          27.0
Parody           10.0
Police           18.0
Psychological    18.0
Romance           0.0
Samurai          10.0
School           50.0
Sci-Fi           20.0
Seinen            9.0
Shoujo            0.0
Shoujo Ai         0.0
Shounen          88.0
Shounen Ai        0.0
Slice of Life    20.0
Space             0.0
Sports           20.0
Super Power      28.0
Supernatural     27.0
Thriller          8.0
Vampire           0.0
dtype: float64

In [None]:
# Genre indicator table for the anime dataframe, indexed by anime_id with the
# name column removed
recommendation_table = anime_df.set_index("anime_id").drop("name", axis=1)
recommendation_table.head()

Unnamed: 0_level_0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5114,1,1,0,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0
28977,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9253,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9969,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
32935,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [None]:
# Weighted average per anime: multiply each genre column by that genre's weight,
# add the products up across the row, then divide by the sum of all weights
weighted_genres = recommendation_table * weights
recommendation_series = weighted_genres.sum(axis=1) / weights.sum()
recommendation_series.head()

anime_id
5114     0.400943
28977    0.504717
9253     0.044025
9969     0.504717
32935    0.400943
dtype: float64

In [None]:
# Rank the anime from highest to lowest score and show the first ten
recommendations = recommendation_series.sort_values(ascending=False)
recommendations.iloc[:10]

anime_id
231      0.734277
25157    0.611635
1186     0.600629
296      0.600629
225      0.597484
1536     0.580189
11703    0.577044
7088     0.569182
6811     0.564465
249      0.564465
dtype: float64

## **Final Result**

In [None]:
# anime_ids of the ten best-scoring recommendations, best first
top_ids = recommendations.head(10).index

# Pull those anime out of the anime dataset into a new dataframe indexed by anime_id
recommendations_df = anime_df.loc[anime_df["anime_id"].isin(top_ids)].set_index("anime_id")

# Look the ids up in ranked order so the names are output from best to worst score
recommendations_df.loc[top_ids, ["name"]]

Unnamed: 0_level_0,name
anime_id,Unnamed: 1_level_1
231,Asagiri no Miko
25157,Trinity Seven
1186,Battle Athletess Daiundoukai (TV)
296,Dragon Drive
225,Dragon Ball GT
1536,Busou Renkin
11703,Code:Breaker
7088,Ichiban Ushiro no Daimaou
6811,InuYasha: Kanketsu-hen
249,InuYasha
