# Anime Recommendation System
- User-based Collaborative Filtering Technique
- KNN Algorithm using Euclidean distance metric 

## Import Modules & Libraries

In [None]:
import json # Well-formatted Dictionary
import opendatasets as od # Datasets Download from Kaggle
import pandas as pd # Data processing: CSV file I/O
import random # Random Generation of Test User-Rating Data Dictionary
import scipy as sc # Sparse matrix
from sklearn.neighbors import NearestNeighbors # Neighbors-based classification and Searches
import statistics # Mean Calculations
import sweetviz as sv # Exploratory Data Analysis

## Download Datasets

In [None]:
# Kaggle page of the anime recommendations dataset
url = 'https://www.kaggle.com/CooperUnion/anime-recommendations-database'
# Download the dataset through opendatasets.
# NOTE(review): presumably asks for Kaggle credentials and writes into a local folder named after the dataset - confirm
od.download(url)

## Load Datasets as DataFrames
Datasets link: https://www.kaggle.com/CooperUnion/anime-recommendations-database

| file | size | rows | columns |
| --- | --- | --- | --- |
| anime.csv | 914kb | 12294 | 7 |
| rating.csv | 106MB | 7813737 | 3 | 

This data set contains information on user preference data from 73,516 users on 12,294 anime.

In [None]:
# Folder that holds the downloaded CSV files (the trailing slash is relied on by the concatenations below)
DIR = 'anime-recommendations-database/'

# Anime catalogue: ids and member counts are parsed as integers
animes = pd.read_csv(
    DIR + "anime.csv",
    encoding='utf-8-sig',
    dtype={'anime_id': int, 'members': int},
)

# User ratings: every column is parsed as an integer
ratings = pd.read_csv(DIR + "rating.csv", dtype=int)

## Explore and Analyse Data

In [None]:
animes.head() # Show the first 5 rows of the anime DataFrame (DF)

In [None]:
ratings.head() # Show the first 5 rows of the rating DataFrame

In [None]:
animes.info() # Print a summary (columns, dtypes, memory usage) of the anime DF

In [None]:
ratings.info() # Print a summary (columns, dtypes, memory usage) of the rating DF

In [None]:
# Number of missing values in each column of the anime DF
animes.isna().sum()

In [None]:
# Number of missing values in each column of the rating DF
ratings.isna().sum()

In [None]:
# Number of distinct users that appear in the rating DF
ratings['user_id'].nunique()

In [None]:
# Number of distinct anime ids in the anime DF
animes['anime_id'].nunique()

In [None]:
# Average number of episodes per anime.
# Non-numeric episode entries are coerced to NaN and dropped; filling them with 0
# would count them as zero-episode anime and pull the average down.
episodes = pd.to_numeric(animes['episodes'], errors='coerce').dropna()
episodes.mean()

In [None]:
# Number of rating rows per user (rows with rating -1 are still included at this point)
anime_watched_per_user = ratings.groupby('user_id')['rating'].count()
# Average number of anime rows per user
statistics.mean(anime_watched_per_user.tolist())

In [None]:
# Histogram of the number of anime rows per user, 50 bins over the range -100..500
anime_watched_per_user.hist(bins=50, range=(-100,500), figsize=(8,5))

In [None]:
# Number of rating rows per anime
ratings_per_anime = ratings.groupby('anime_id')['rating'].count()
# Average number of rating rows per anime (a count, not an average rating value)
statistics.mean(ratings_per_anime.tolist())

In [None]:
# Histogram of the number of rating rows per anime, 50 bins over the range -100..500
ratings_per_anime.hist(bins=50, range=(-100,500), figsize=(8,5))

In [None]:
# Horizontal bar chart of how often each rating value occurs.
# The -1 "watched but not rated" marker has not been filtered out yet at this point.
ratings.rating.value_counts(sort=True).plot(kind='barh', xlabel = 'Anime Rating', figsize=(8,5))

In [None]:
# Pie chart (with an accompanying table) of the number of anime per type
animes.type.value_counts(sort=True).plot(kind='pie', xlabel = 'Type of Anime', table=True, figsize=(6,6))

In [None]:
# Count how many anime carry each genre.
# The 'genre' column holds comma-separated genre names, so each cell is split
# and every individual genre is counted once per anime.
num_anime_per_genre = {}
for genre_chunk in animes['genre'].dropna():  # skip anime without genre info instead of counting a "nan" genre
    granular_genre = [x.strip() for x in str(genre_chunk).split(',')]
    for genre in granular_genre:
        # The first occurrence must count as 1, not 0
        num_anime_per_genre[genre] = num_anime_per_genre.get(genre, 0) + 1

In [None]:
# Horizontal bar chart of the number of anime per genre, built from the genre counter dictionary
pd.DataFrame(num_anime_per_genre.items(), columns=['Genre', 'Number of Anime']).sort_values(by='Number of Anime', ascending=False).plot(kind='barh', x='Genre', y='Number of Anime', figsize=(10,10))

In [None]:
# Horizontal bar chart of the 30 anime with the highest 'rating' value (x axis is log-scaled)
animes.sort_values(by='rating', ascending=False)[0:30].plot(kind='barh', x='name', y='rating', logx=True, figsize=(10,10))

In [None]:
# Generate an exploratory data analysis report for the anime DF with the sweetviz library
anime_report = sv.analyze([animes, 'Anime'])
# Write the report to anime_report.html
anime_report.show_html('anime_report.html', layout='vertical', scale=1.0)

In [None]:
# Generate an exploratory data analysis report for the rating DF with the sweetviz library
rating_report = sv.analyze([ratings, 'Anime Rating']) 
# Write the report to rating_report.html
rating_report.show_html('rating_report.html', layout='vertical', scale=1.0)

In [None]:
# Side-by-side sweetviz comparison of the anime DF and the rating DF, with "anime_id" passed as the third argument
anime_rating_comparison = sv.compare([animes, 'Anime'],[ratings, 'Anime Rating'], "anime_id")
# Write to a dedicated file so the rating report generated earlier (rating_report.html) is not overwritten
anime_rating_comparison.show_html('anime_rating_comparison.html', layout='vertical', scale=1.0)

## Pre-process Data
*rating* (DF) => *rating* (feature) value (1-10): **-1** means the user watched the anime but did not rate it; such rows carry no rating information and are treated as invalid.

In [None]:
animes.dropna(inplace=True, subset=['genre']) # Drop, in place, anime rows whose genre is missing

In [None]:
ratings = ratings[ratings.rating != -1] # Keep only actual ratings: drop rows whose rating value is -1

In [None]:
ratings = ratings[ratings.anime_id.isin(animes['anime_id'])] # Keep only ratings whose anime_id exists in the anime DF

In [None]:
ratings.head() # First 5 rows of the cleaned rating DF

In [None]:
ratings.info() # Summary of the rating DF after cleaning

In [None]:
animes.info() # Summary of the anime DF after cleaning

## Merge Data: (Anime + Rating) DFs
The two DataFrames are merged on their common feature (anime_id).

In [None]:
# Only the id and the title are needed from the anime DF
animes_ = animes[['anime_id', 'name']]
# Join every rating row with the title of the anime it refers to
anime_rating_merge = animes_.merge(ratings, on='anime_id')

In [None]:
anime_rating_merge # Display the merged DF

## Pivot and Reshape the Merged DF
Reshaping/pivoting the table generates numerous NaN (Not a Number) values, one for every anime a user has not rated. They must be cleaned to avoid exceptions in further processing, so the NaN values are replaced by 0.

In [None]:
# One row per user_id, one column per anime name, cells hold the rating; missing user/anime pairs are filled with 0.
# NOTE(review): pivot_table aggregates repeated (user_id, name) pairs with its default aggregation (mean) - confirm this is intended
user_anime_pivot = anime_rating_merge.pivot_table(index = ['user_id'], columns = ['name'], values = 'rating').fillna(0)

In [None]:
user_anime_pivot.info() # Print a summary of the User-Anime pivot table

In [None]:
user_anime_pivot.head() # First 5 rows of the pivot table

## Convert Pivoted DF to Matrix
Conversion of User-Anime pivot table to a Compressed Sparse Row (CSR) matrix for efficient row(s) calculations.

In [None]:
# Store the pivot table's values as a SciPy CSR sparse matrix (row order follows user_anime_pivot.index)
user_anime_matrix = sc.sparse.csr_matrix(user_anime_pivot.values)

## Construct KNN Model from Matrix
Instantiating the NearestNeighbors class from the Matrix-Array with Euclidean Distance-Metric for implementing Neighbor searches.

In [None]:
# Nearest-neighbour searcher configured for brute-force search with the Euclidean metric
model_knn_euclidean = NearestNeighbors(metric='euclidean', algorithm='brute')
# Fit on the user rows of the User-Anime matrix
model_knn_euclidean.fit(user_anime_matrix)

## Extract Similar Users

In [None]:
# Anime already seen by the query user; overwritten by the get_similar_users_* functions and used to filter recommendations
seen_anime_ids = []
# Number of nearest users requested from the KNN model
num_neighbors = 20

> **Option 1**: Retrieve similar users by passing an existing **user_id**

In [None]:
def get_similar_users_by_userid(user_id):
    """Print and return the users nearest to `user_id` in the User-Anime pivot table.

    Parameters:
        user_id: row label in `user_anime_pivot`. Positive values are treated as
            real users; non-positive values (such as the synthetic row -1) leave
            `seen_anime_ids` untouched so the caller can manage it.

    Returns:
        list of `num_neighbors` user ids, ordered as returned by the KNN model.
        A user that is part of the fitted data may appear in its own result.

    Side effects:
        Overwrites the global `seen_anime_ids` for real users and prints a table
        of neighbours and distances.
    """
    global seen_anime_ids
    # Only real users have rows in the rating DF. Looking up a synthetic id would
    # reset the seen list to [] and lose what the caller prepared.
    if user_id > 0:
        seen_anime_ids = ratings[ratings.user_id == user_id]['anime_id'].tolist()
    # kneighbors expects a 2-D array, so turn the single row into shape (1, n_anime)
    user_rating_reshaped = user_anime_pivot.loc[user_id, :].values.reshape(1, -1)
    distances, indices = model_knn_euclidean.kneighbors(user_rating_reshaped, n_neighbors=num_neighbors)
    distances = distances.flatten()
    # Map positional indices of the fitted matrix back to user_id labels
    user_ids = [user_anime_pivot.index[i] for i in indices.flatten()]
    print(f'Users similar to {user_id}:\n\nSN.\tUser_ID\t\tDistance')
    for i, (neighbor_id, distance) in enumerate(zip(user_ids, distances)):
        print(f'{i}\t{neighbor_id},\t\t{distance}')
    return user_ids

In [None]:
# Pick the user_id of one randomly sampled row of the cleaned rating DF
user_id = ratings.user_id.sample().iloc[0]
# Find the users most similar to that user
similar_user_ids = get_similar_users_by_userid(user_id)

> **Option 2**: Retrieve similar users by passing a **dictionary** of <anime_name, rating> pairs that simulates one user's ratings

In [None]:
def get_similar_users_by_ratings(anime_ratings: dict):
    """Return the users nearest to a synthetic user described by `anime_ratings`.

    Parameters:
        anime_ratings: mapping of anime name -> rating. Names that are not
            columns of `user_anime_pivot` are ignored for the similarity search.

    Returns:
        list of user ids as returned by `get_similar_users_by_userid(-1)`.

    Side effects:
        Inserts or overwrites the row labelled -1 in `user_anime_pivot` and sets
        the global `seen_anime_ids` to the anime ids of the supplied names.
    """
    global seen_anime_ids
    # Build the row explicitly, aligned to the pivot columns, with unrated anime as 0.
    # Assigning an aligned Series works whether or not row -1 already exists.
    synthetic_row = pd.Series(anime_ratings, index=user_anime_pivot.columns, dtype=float).fillna(0)
    user_anime_pivot.loc[-1] = synthetic_row
    similar_users = get_similar_users_by_userid(-1)
    # Recommendations are filtered by anime_id, so translate the names to ids.
    # Set after the delegated call so the value cannot be overwritten by it.
    seen_anime_ids = animes.loc[animes['name'].isin(list(anime_ratings)), 'anime_id'].tolist()
    return similar_users

**Anime-Ratings Faker**: Random generation of anime ratings as per the num_anime argument (Note: its usage may lead to fewer recommendations)

In [None]:
def generate_anime_ratings(num_anime):
    """Generate random ratings (1-10) for `num_anime` distinct, randomly chosen anime.

    Parameters:
        num_anime: number of anime to rate. It is capped at the number of
            distinct names in `animes`; values below 0 are treated as 0.

    Returns:
        dict mapping anime name -> integer rating between 1 and 10 inclusive.

    Side effects:
        Prints the generated dictionary as indented JSON.
    """
    anime_collection = animes['name'].unique().tolist()
    # Sampling without replacement guarantees distinct names and always terminates,
    # unlike retrying random picks when more names are requested than exist.
    sample_size = max(0, min(num_anime, len(anime_collection)))
    anime_ratings = {
        anime_name: random.randint(1, 10)
        for anime_name in random.sample(anime_collection, sample_size)
    }
    print(json.dumps(['Randomly Generated Anime-Ratings', anime_ratings], indent = 4),'\n')
    return anime_ratings

In [None]:
# Generate random ratings for 21 anime
r = generate_anime_ratings(21)
# Find the users most similar to the synthetic user described by those ratings
similar_user_ids = get_similar_users_by_ratings(r)

## Generate Anime Recommendations using Similar Users

In [None]:
# Ratings given by the similar users to anime the query user has not seen yet
ratings_per_similar_user = ratings[
    ratings.user_id.isin(similar_user_ids) & ~ratings.anime_id.isin(seen_anime_ids)
]
# Average rating per anime among the similar users
anime_rating_group = (
    ratings_per_similar_user[['anime_id', 'rating']]
    .groupby('anime_id', sort=False, as_index=False)
    .mean()
)
# Best-rated anime first
anime_rating_group = anime_rating_group.sort_values(by='rating', ascending=False)
# Attach the anime details to the ranked ids
anime_recommendations = pd.merge(anime_rating_group['anime_id'], animes, on='anime_id')

In [None]:
def get_anime_recommendations(num_anime):
    """Return the first `num_anime` rows of the ranked `anime_recommendations` DF."""
    return anime_recommendations.iloc[:num_anime]

In [None]:
get_anime_recommendations(15) # Show the top 15 recommendations