In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.metrics.pairwise import cosine_similarity

### 1. Load data

In [8]:
# Loading the datasets.
# DATA_DIR is the single place to repoint the notebook at another copy of the
# dataset; every file below is read from this directory.
DATA_DIR = 'E:/UIUC/CS512 Data Mining Principles/Project_dataset/archive'

anime_data = pd.read_csv(DATA_DIR + '/anime.csv')
# animelist_data = pd.read_csv(DATA_DIR + '/animelist.csv')
# low_memory=False makes pandas parse each file in one pass instead of in chunks.
synopsis_data = pd.read_csv(DATA_DIR + '/anime_with_synopsis.csv', low_memory=False)
rating_data = pd.read_csv(DATA_DIR + '/rating_complete.csv', low_memory=False)


### 2. Data processing

In [9]:
# Keep only the anime whose Score is not the literal string "Unknown".
anime_data = anime_data.loc[anime_data["Score"].ne("Unknown")]

In [10]:
# Flag every row whose Name repeats an earlier row (the first occurrence is not flagged).
dups = anime_data.duplicated(subset=['Name'], keep='first')
# Number of repeated names. This is not the cell's last expression, so the
# notebook does not display it.
dups.sum()
# Keep only the first row seen for each Name.
anime_data = anime_data.loc[~dups]

In [11]:
# (rows, columns) of the anime, synopsis and rating frames, in that order.
(anime_data.shape, synopsis_data.shape, rating_data.shape)

((12420, 35), (16214, 5), (57633278, 3))

In [12]:
# Keep only users who have rated at least 200 anime.
users_count = rating_data['user_id'].value_counts()
rating_data = rating_data.loc[
    rating_data['user_id'].isin(users_count.loc[users_count.ge(200)].index)
]
# Keep only anime with at least 500 ratings. The counts are taken after the
# user filter, so they only include ratings from the retained users.
# Note: the user filter is not re-applied afterwards, so a retained user can
# end up with fewer than 200 rows once sparsely rated anime are removed.
anime_count = rating_data['anime_id'].value_counts()
rating_data = rating_data.loc[
    rating_data['anime_id'].isin(anime_count.loc[anime_count.ge(500)].index)
]

rating_data.head()

Unnamed: 0,user_id,anime_id,rating
189,3,25835,8
190,3,28171,8
191,3,32282,8
192,3,35788,9
193,3,9253,9


In [13]:
# Re-index users and anime with dense, zero-based integer codes, numbered in
# order of first appearance in rating_data.
user_ids = rating_data["user_id"].unique().tolist()
user2_encoded = {user_id: code for code, user_id in enumerate(user_ids)}  # raw user_id -> code
encoded2_user = dict(enumerate(user_ids))                                 # code -> raw user_id

anime_ids = rating_data['anime_id'].unique().tolist()
anime2_encoded = {anime_id: code for code, anime_id in enumerate(anime_ids)}  # raw anime_id -> code
encoded2_anime = dict(enumerate(anime_ids))                                   # code -> raw anime_id

# rating_data is the result of boolean filtering, so writing columns into it
# with rating_data[...] = ... triggers pandas' SettingWithCopyWarning.
# assign() returns a new frame with the "user" and "anime" columns appended
# (in that order) and leaves the filtered frame untouched.
rating_data = rating_data.assign(
    user=rating_data["user_id"].map(user2_encoded),
    anime=rating_data["anime_id"].map(anime2_encoded),
)

rating_data.head()

Unnamed: 0,user_id,anime_id,rating,user,anime
189,3,25835,8,0,0
190,3,28171,8,0,1
191,3,32282,8,0,2
192,3,35788,9,0,3
193,3,9253,9,0,4


### 3. Item based CF

In [15]:
# Anime x user rating matrix: one row per encoded anime, one column per
# encoded user, NaN where the user has no rating for the anime.
# pivot_table aggregates with the mean by default, which only matters if an
# (anime, user) pair occurs more than once.
matrix = pd.pivot_table(rating_data, values='rating', index='anime', columns='user')
matrix.head()

user,0,1,2,3,4,5,6,7,8,9,...,95133,95134,95135,95136,95137,95138,95139,95140,95141,95142
anime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,8.0,,,8.0,,,,9.0,8.0,,...,,,,,,,,8.0,,9.0
1,8.0,,,8.0,8.0,,,5.0,8.0,9.0,...,10.0,8.0,8.0,,,,10.0,,8.0,7.0
2,8.0,,,7.0,,,,5.0,8.0,9.0,...,10.0,7.0,8.0,,,,9.0,,8.0,7.0
3,9.0,,,,,,,,8.0,9.0,...,7.0,7.0,9.0,,,,,,,
4,9.0,,9.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,8.0,10.0,,,,,9.0,


In [16]:
# Data normalization: centre every anime's ratings on that anime's mean rating.
# A rating below the anime's mean becomes negative, a rating above it becomes
# positive; missing ratings stay NaN.
matrix_norm = matrix.sub(matrix.mean(axis='columns'), axis='index')
matrix_norm.head()

user,0,1,2,3,4,5,6,7,8,9,...,95133,95134,95135,95136,95137,95138,95139,95140,95141,95142
anime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.425802,,,-0.425802,,,,0.574198,-0.425802,,...,,,,,,,,-0.425802,,0.574198
1,-0.352946,,,-0.352946,-0.352946,,,-3.352946,-0.352946,0.647054,...,1.647054,-0.352946,-0.352946,,,,1.647054,,-0.352946,-1.352946
2,-0.206242,,,-1.206242,,,,-3.206242,-0.206242,0.793758,...,1.793758,-1.206242,-0.206242,,,,0.793758,,-0.206242,-1.206242
3,0.860715,,,,,,,,-0.139285,0.860715,...,-1.139285,-1.139285,0.860715,,,,,,,
4,-0.142981,,-0.142981,0.857019,0.857019,0.857019,0.857019,0.857019,0.857019,0.857019,...,0.857019,0.857019,-1.142981,0.857019,,,,,-0.142981,


In [17]:
# Item similarity matrix using Pearson correlation.
# DataFrame.corr() correlates columns, so the matrix is transposed to put the
# anime on the columns; the result is an anime x anime frame.
item_similarity = matrix_norm.T.corr(method='pearson')
item_similarity.head()

anime,0,1,2,3,4,5,6,7,8,9,...,6652,6653,6654,6655,6656,6657,6658,6659,6660,6661
anime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.335877,0.314267,0.282987,0.280086,0.268099,0.28045,0.271795,0.276289,0.280263,...,0.30806,0.225613,0.278603,0.134686,0.388502,0.12277,0.222959,0.368336,0.228785,0.237847
1,0.335877,1.0,0.778431,0.716349,0.303394,0.404076,0.4148,0.3378,0.417854,0.455585,...,0.398736,0.334198,0.338846,0.423553,0.339314,0.277058,0.228786,0.368472,0.228963,0.300842
2,0.314267,0.778431,1.0,0.772975,0.284273,0.39527,0.412125,0.356276,0.411382,0.470436,...,0.375268,0.414363,0.376331,0.372885,0.385228,0.265545,0.292349,0.416342,0.168749,0.236979
3,0.282987,0.716349,0.772975,1.0,0.271029,0.401138,0.420278,0.344706,0.4128,0.483674,...,0.324215,0.390787,0.352776,0.339535,0.429241,0.352082,0.211201,0.398305,0.243608,0.246915
4,0.280086,0.303394,0.284273,0.271029,1.0,0.260991,0.274191,0.291669,0.233545,0.219481,...,0.219826,0.21758,0.25031,0.183646,0.311825,0.151057,0.128153,0.326004,0.121445,0.073258


In [None]:
# Item similarity matrix using cosine similarity.
# Missing ratings are filled with 0 before the call so that cosine similarity
# can be computed; in the mean-centred matrix 0 stands for "rated exactly at
# the anime's mean".
item_similarity_cosine = cosine_similarity(X=matrix_norm.fillna(value=0))
item_similarity_cosine

In [55]:
def item_based_rec(picked_userid, number_of_similar_items, number_of_recommendations,
                   ratings=None, similarity=None):
  """Recommend unwatched anime for one user with item-based collaborative filtering.

  For every anime the user has no rating for, the predicted rating is the
  similarity-weighted average of the user's own ratings on the
  `number_of_similar_items` watched anime that are most similar to it.
  Predictions are on the same scale as `ratings` (mean-centred when the
  notebook-level `matrix_norm` is used).

  Args:
    picked_userid: Column label of the target user in `ratings`.
    number_of_similar_items: Maximum number of watched anime used as
      neighbours for each prediction.
    number_of_recommendations: Maximum number of (anime, rating) pairs returned.
    ratings: Anime x user DataFrame with NaN for missing ratings. Defaults to
      the notebook-level `matrix_norm`.
    similarity: Anime x anime similarity DataFrame. Defaults to the
      notebook-level `item_similarity`.

  Returns:
    A list of (anime, predicted_rating) tuples sorted by predicted rating,
    highest first, with at most `number_of_recommendations` entries.
    Predicted ratings are rounded to 6 decimals. Anime for which no
    prediction can be formed (no neighbour with a defined similarity, or
    neighbour similarities that sum to zero) are left out.

  Raises:
    KeyError: If `picked_userid` is not a column of `ratings`.
  """
  # Fall back to the notebook-level matrices only when none are passed in, so
  # the function can also be run on any other pair of frames.
  if ratings is None:
    ratings = matrix_norm
  if similarity is None:
    similarity = item_similarity

  # Select the user's column by its actual label; the column is no longer
  # assumed to be labelled 1.
  user_ratings = ratings[picked_userid]
  # Anime that the target user has not watched
  unwatched_anime = user_ratings.index[user_ratings.isna().to_numpy()].tolist()
  # Anime that the target user has watched, highest rating first
  watched_ratings = user_ratings.dropna().sort_values(ascending=False)

  # Dictionary to save the unwatched anime and predicted rating pair
  rating_prediction = {}

  for anime in unwatched_anime:
    # Similarity of this unwatched anime to each watched anime, keeping only
    # the most similar neighbours. Undefined (NaN) similarities are dropped so
    # they cannot turn the weighted average into NaN.
    neighbour_similarity = (similarity[anime]
                            .reindex(watched_ratings.index)
                            .dropna()
                            .nlargest(number_of_similar_items))
    # np.average raises ZeroDivisionError when the weights sum to zero (which
    # includes the "no neighbours" case); skip such anime instead of crashing.
    if neighbour_similarity.empty or neighbour_similarity.sum() == 0:
      continue
    neighbour_ratings = watched_ratings.loc[neighbour_similarity.index]
    # Weighted average of the user's ratings, weighted by similarity
    predicted_rating = round(np.average(neighbour_ratings,
                                        weights=neighbour_similarity), 6)
    rating_prediction[anime] = predicted_rating

  # Return the top recommended anime
  ranked = sorted(rating_prediction.items(), key=lambda pair: pair[1], reverse=True)
  return ranked[:number_of_recommendations]
