
# Anime Recommender - Collaborative filtering

This project aims to build an anime recommender using collaborative filtering, 
a method that predicts user preferences by anticipating what someone with similar tastes would also enjoy. 

Collaborative filtering comes in two forms:

- **User-based:** Recommends items by finding similar users and suggesting items that they have liked or interacted with.
- **Item-based:** Recommends items by finding similar items to those that the user has shown interest in. Note that this is different from content-based filtering as the 'similarity' is based on its relationship with users, not the content.

By leveraging user interactions and item similarities, this recommender provides personalized anime recommendations based on user preferences and behaviors.


## Import required libraries

In [1]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic, accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split
from sklearn.model_selection import train_test_split as train_test_split

## Import cleaned dataset

In [2]:

anime_reviews = pd.read_csv("datasets/anime_review_cleaned.csv")

anime_data = pd.read_csv("datasets/anime_2020_clean.csv")
anime_uid_list = anime_data.uid.unique()

# Attempt to improve anime_reviews
counts = anime_reviews['profile'].value_counts()
anime_reviews_improved = anime_reviews[anime_reviews['profile'].isin(counts[counts >= 10].index)]
print(anime_reviews.shape)
print(anime_reviews_improved.shape)

anime_reviews_uids = anime_reviews_improved.anime_uid.unique()
anime_data_improved = anime_data[anime_data['uid'].isin(anime_reviews_uids)]
print(anime_data.shape)
print(anime_data_improved.shape)


(191091, 4)
(82682, 4)
(8094, 10)
(6635, 10)



# Preparation: Merging the dataset

Before fitting the model, we would first create the user-item matrix as both user-based and item-based uses the same matrix

In [3]:

# Merge data
merged_data = pd.merge(anime_data.drop('score', axis=1), anime_reviews, left_on='uid', right_on='anime_uid')
merged_data_improved = pd.merge(anime_data_improved.drop('score', axis=1), anime_reviews_improved, left_on='uid', right_on='anime_uid')

merged_data_improved.head()

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,link,profile,anime_uid,score,scores
0,1,Cowboy Bebop,"In the year 2071, humanity has colonized sever...","['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","Apr 3, 1998 to Apr 24, 1999",26.0,930311,39,https://myanimelist.net/anime/1/Cowboy_Bebop,RangFlash,1,10,"{'Overall': '10', 'Story': '8', 'Animation': '..."
1,1,Cowboy Bebop,"In the year 2071, humanity has colonized sever...","['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","Apr 3, 1998 to Apr 24, 1999",26.0,930311,39,https://myanimelist.net/anime/1/Cowboy_Bebop,reinis-jan,1,9,"{'Overall': '9', 'Story': '7', 'Animation': '9..."
2,1,Cowboy Bebop,"In the year 2071, humanity has colonized sever...","['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","Apr 3, 1998 to Apr 24, 1999",26.0,930311,39,https://myanimelist.net/anime/1/Cowboy_Bebop,Sephiroth1335,1,8,"{'Overall': '8', 'Story': '8', 'Animation': '8..."
3,1,Cowboy Bebop,"In the year 2071, humanity has colonized sever...","['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","Apr 3, 1998 to Apr 24, 1999",26.0,930311,39,https://myanimelist.net/anime/1/Cowboy_Bebop,iHitokage,1,10,"{'Overall': '10', 'Story': '10', 'Animation': ..."
4,1,Cowboy Bebop,"In the year 2071, humanity has colonized sever...","['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","Apr 3, 1998 to Apr 24, 1999",26.0,930311,39,https://myanimelist.net/anime/1/Cowboy_Bebop,GrimmChicken,1,9,"{'Overall': '9', 'Story': '9', 'Animation': '9..."



# Attempt 1: User-based collaborative filtering

### Create and fit model

In [4]:
# Define rating scale
reader = Reader(rating_scale=(1, 10))

# Load data into Surprise dataset format
data = Dataset.load_from_df(merged_data_improved[['profile', 'uid', 'score']], reader)

# Split data into train and test sets
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

# Build user-based collaborative filtering model
sim_options = {'name': 'cosine', 'user_based': True}
model_1 = KNNBasic(sim_options=sim_options)
model_1.fit(trainset)



Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x156c55110>

### Evaluating the model

In [5]:
# Make predictions
predictions = model_1.test(testset)

# Calculate RMSE and MAE
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Before filtering
# RMSE: 2.0045
# RMSE: 2.0045033467927986


RMSE: 1.9605
MAE:  1.4844


1.4844099607268595

### Testing the recommendation system

In [6]:
# Lets test to generate recommendations for a specific user
user_profile = 'skrn'

# Find animes user has watched
watched_anime_ids = merged_data[merged_data['profile'] == user_profile]['uid'].values

# Then find animes not watched
not_watched_anime_ids = [uid for uid in anime_uid_list if uid not in watched_anime_ids]

# Predict ratings for items not rated by the user
predicted_ratings = {}
for anime_id in not_watched_anime_ids:
    predicted_rating = model_1.predict(user_profile, anime_id).est
    predicted_ratings[anime_id] = predicted_rating

# Recommend top 10 unwatched animes
top_n = 10
recommended_anime_ids = sorted(predicted_ratings, key=predicted_ratings.get, reverse=True)[:top_n]

recommended_anime_titles = anime_data[anime_data['uid'].isin(recommended_anime_ids)]['title'].values
print("Recommended Anime Titles:\n")
for title in recommended_anime_titles:
    print(title)

Recommended Anime Titles:

Hunter x Hunter: Yorkshin City Kanketsu-hen
Whistle!
Harukanaru Toki no Naka de: Hachiyou Shou
Otogizoushi
Itsudatte My Santa!
Kazemakase Tsukikage Ran
Mahoromatic Summer Special
Dragon Ball Z Movie 13: Ryuuken Bakuhatsu!! Gokuu ga Yaraneba Dare ga Yaru
Dragon Ball Z Special 1: Tatta Hitori no Saishuu Kessen
Bishoujo Senshi Sailor Moon: Sailor Stars


# Attempt 2: Item-based collaborative filtering

### Create and fit model

In [11]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# Split the data into training and test sets
train_data, test_data = train_test_split(merged_data_improved, test_size=0.2, random_state=42)

# Create user-item matrix for training data
train_anime_pivot = train_data.pivot_table(index='uid', columns='profile', values='score').fillna(0)

train_anime_matrix = csr_matrix(train_anime_pivot.values)

# Fit the Nearest Neighbors model using cosine similarity on the training data
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(train_anime_matrix)


# Function to find similar items for a given item
def find_similar_animes(anime_id, k=5):
    try:
        query_index = train_anime_pivot.index.get_loc(anime_id)
    except KeyError:
        print(f"Anime ID {anime_id} not found in the index.")
        return []

    distances, indices = model_knn.kneighbors(train_anime_matrix[query_index], n_neighbors=k+1)
    similar_animes = []
    for i in range(1, len(distances.flatten())):
        similar_anime_id = train_anime_pivot.index[indices.flatten()[i]]
        similar_animes.append((similar_anime_id, distances.flatten()[i]))
    return similar_animes

# Example usage: Find 5 similar items for a given item
similar_animes = find_similar_animes(anime_id=13759, k=5)
print("Similar Animes:")
for anime_id, distance in similar_animes:
    anime_title = anime_data[anime_data['uid'] == anime_id]['title'].values[0]
    print(f"Anime ID: {anime_id}, Title: {anime_title}, Distance: {distance}")
    
print(train_anime_pivot.shape)
train_anime_pivot.head()

    
# FOR SELF: RECOMMENDATIONS WHEN USING ALL DATA INSTEAD OF TRAIN SET

# Similar Animes for nichijou:
# Anime ID: 5680, Title: K-On!, Distance: 0.7543458588629648
# Anime ID: 31430, Title: Terra Formars: Revenge, Distance: 0.7633485914705569
# Anime ID: 17549, Title: Non Non Biyori, Distance: 0.7692174000636969
# Anime ID: 1852, Title: Hidamari Sketch, Distance: 0.779888043140977
# Anime ID: 14131, Title: Girls & Panzer, Distance: 0.7826638206931877

# Similar Animes:
# Anime ID: 17549, Title: Non Non Biyori, Distance: 0.3563522929129219
# Anime ID: 13759, Title: Sakura-sou no Pet na Kanojo, Distance: 0.3804144482606372
# Anime ID: 20035, Title: Toaru Majutsu no Index-tan Movie: Endymion no Kiseki - Ga Attari Nakattari, Distance: 0.3804144482606372
# Anime ID: 13311, Title: Henshin Gattai! 5 tsu no Atsuki Tamashii, Distance: 0.3804144482606372
# Anime ID: 5014, Title: Sango Shou Densetsu: Aoi Umi no Elfie, Distance: 0.3804144482606372

#20785

Similar Animes:
Anime ID: 14749, Title: Ore no Kanojo to Osananajimi ga Shuraba Sugiru, Distance: 0.7736209365959625
Anime ID: 14813, Title: Yahari Ore no Seishun Love Comedy wa Machigatteiru., Distance: 0.7994035067618723
Anime ID: 4224, Title: Toradora!, Distance: 0.8291616878790811
Anime ID: 17895, Title: Golden Time, Distance: 0.8334209456073266
Anime ID: 4181, Title: Clannad: After Story, Distance: 0.834313416034999
(6249, 3292)


profile,--Sunclaudius,-Ereya-,-FlameHaze-,-Ghosxuto-,-Haoto-,-HippySnob-,-Lupa-,-Naami-,-Remix-,-Ryu,...,zawa113,zenmodeman,zeralul,zeru02,ziggyopolous,zillion29,zimmercj,zoddtheimmortal,zombie_pegasus,zperson5
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Evaluating the model

In [8]:
from collections import Counter

# Step 1: Get Users in the Test Set
test_users = test_data['profile'].unique()


# Step 2: Identify liked anime for Each User
threshold_score = 6  # Threshold score for it to be considered likeable
liked_anime_dict = {}

for user in test_users:
    user_ratings = test_data[(test_data['profile'] == user) & (test_data['score'] >= threshold_score)] 
    user_ratings_unique = user_ratings.drop_duplicates(subset=['uid'])
#     print("is there difference",len(user_ratings),len(user_ratings_unique))
#     if len(user_ratings)!=len(user_ratings_unique):
#         print('this user gay',user)
    if len(user_ratings_unique) >= 20: # If user has at least 20 favorite animes, add him to dict
        liked_anime_dict[user] = list(zip(user_ratings_unique['uid'].values, user_ratings_unique['score'].values))

        
# Step 3: Use Model to Get Similar Anime and Evaluate
n_closest_anime = 5  # Number of closest anime to retrieve
total_users = len(test_users)
total_hit_rate = 0

for user, liked_anime_list in liked_anime_dict.items():
    
    # Get top 10 animes
    liked_animes_sorted = sorted(liked_anime_list, key=lambda x: x[1], reverse=True)
    top_10_animes = liked_animes_sorted[:10]
    
    similar_anime_counter = Counter()
#     predicted_anime_ids = []
#     for anime_id,score in top_10_animes:
#         similar_animes = find_similar_animes(anime_id, k=n_closest_anime)
#         # Get the id of similar anime
#         predicted_anime_ids.extend([similar_anime[0] for similar_anime in similar_animes]) 

    for anime_id, score in top_10_animes:
        similar_animes = find_similar_animes(anime_id, k=n_closest_anime)
        # Update the counter with similar anime IDs
        for similar_anime_id, _ in similar_animes:
            similar_anime_counter[similar_anime_id] += 1

    # Select the top 10 most frequent similar animes
    top_10_similar_animes = similar_anime_counter.most_common(10)
    top_10_similar_ids = [anime_id for anime_id, _ in top_10_similar_animes]
    
    # Check how many of the users liked animes were hit
    correct_predictions = len(set(anime_id for anime_id, score in liked_anime_list) & set(top_10_similar_ids))
    
    hit_rate_for_user = correct_predictions/10
    total_hit_rate += hit_rate_for_user

# Step 4: Calculate Performance Metric
avg_hit_rate = total_hit_rate / (total_users)

print("Average Hit Rate:", avg_hit_rate)

# If n_closest = 15 : 0.0010688462747563661
# If n_closest = 10 : 0.001257466205595725
# if = 5: 0.00119

Anime ID 631 not found in the index.
Anime ID 32869 not found in the index.
Anime ID 19195 not found in the index.
Anime ID 2680 not found in the index.
Anime ID 6016 not found in the index.
Anime ID 6169 not found in the index.
Anime ID 2558 not found in the index.
Anime ID 668 not found in the index.
Anime ID 18753 not found in the index.
Average Hit Rate: 0.001194592895315939


### Testing by inputing your liked animes

In [12]:
liked_anime_ids = [11061, 22319, 40748, 24833, 30831, 37999, 10165]

similar_anime_counter = Counter()

for anime_id in liked_anime_ids:
    similar_animes = find_similar_animes(anime_id, k=n_closest_anime)
    # Update the counter with similar anime IDs
    for similar_anime_id, _ in similar_animes:
        similar_anime_counter[similar_anime_id] += 1

# Select the top 10 most frequent similar animes
top_10_similar_animes = similar_anime_counter.most_common(10)
top_10_similar_ids = [anime_id for anime_id, _ in top_10_similar_animes]

for anime_id in top_10_similar_ids:
    anime_title = anime_data[anime_data['uid'] == anime_id]['title'].values[0]
    print(f"Anime ID: {anime_id}, Title: {anime_title}")

# # hunter x hunter, tokyo ghoul, Ansatsu_Kyoushitsu, Kono_Subarashii_Sekai_ni_Shukufuku_wo, Kaguya-sama, Nichijou
# liked_anime_ids = [11061, 22319, 40748, 24833, 30831, 37999, 10165]

# anime_inner_id = model.trainset.to_inner_iid(37999)
# get_recommendations(liked_anime_ids, model_1)
# get_recommendations(liked_anime_ids, model_2)
#test
# anime_data[anime_data['uid']==17871]


Anime ID 11061 not found in the index.
Anime ID 40748 not found in the index.
Anime ID: 27899, Title: Tokyo Ghoul √A
Anime ID: 23281, Title: Psycho-Pass 2
Anime ID: 21881, Title: Sword Art Online II
Anime ID: 22199, Title: Akame ga Kill!
Anime ID: 11111, Title: Another
Anime ID: 30654, Title: Ansatsu Kyoushitsu 2nd Season
Anime ID: 38938, Title: Seishun Buta Yarou wa Bunny Girl Senpai no Yume wo Minai Picture Drama
Anime ID: 28825, Title: Himouto! Umaru-chan
Anime ID: 25517, Title: Magic Kaito 1412
Anime ID: 33486, Title: Boku no Hero Academia 2nd Season


### SVD Approach
# PROBABLY NOT USING THIS?

In [None]:
from surprise import SVD
from surprise.model_selection import cross_validate

# Define rating scale
reader = Reader(rating_scale=(1, 10))

# Load data into Surprise dataset format
data = Dataset.load_from_df(merged_data[['profile', 'uid', 'score']], reader)

# Split data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build SVD collaborative filtering model
model = SVD()

# Train the model
model.fit(trainset)

# Make predictions
predictions = model.test(testset)

# Evaluate model
accuracy = rmse(predictions)
print("RMSE:", accuracy)


In [None]:
import seaborn as sns

# Convert predictions to DataFrame
df_predictions = pd.DataFrame(predictions, columns=['uid', 'iid', 'actual', 'predicted', 'details'])

# Pivot predictions DataFrame to create a user-item matrix
predicted_matrix = df_predictions.pivot(index='uid', columns='iid', values='predicted')

# Create heatmap of predicted ratings
plt.figure(figsize=(10, 8))
sns.heatmap(predicted_matrix, cmap='viridis', cbar=True, linewidths=0.5)
plt.xlabel('Item ID')
plt.ylabel('User ID')
plt.title('Predicted Ratings Heatmap')
plt.show()

