In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import operator
from sklearn.metrics.pairwise import cosine_similarity

### 1. Load data

In [2]:
#Loading the datasets 
anime_data = pd.read_csv('E:/UIUC/CS512 Data Mining Principles/Project_dataset/archive/anime.csv')
# animelist_data = pd.read_csv('E:/UIUC/CS512 Data Mining Principles/Project_dataset/archive/animelist.csv')
synopsis_data = pd.read_csv('E:/UIUC/CS512 Data Mining Principles/Project_dataset/archive/anime_with_synopsis.csv', low_memory=False)
rating_data = pd.read_csv('E:/UIUC/CS512 Data Mining Principles/Project_dataset/archive/rating_complete.csv', low_memory=False)


### 2. Data processing

In [3]:
# for anime, remove "Unkown" in score column
anime_data = anime_data[(anime_data["Score"] != "Unknown")]

In [4]:
# check for duplicates
dups=anime_data.duplicated(subset='Name')
dups.sum()
# remove duplicates
anime_data = anime_data.drop_duplicates(subset='Name')

In [5]:
anime_data.shape , synopsis_data.shape , rating_data.shape

((12420, 35), (16214, 5), (57633278, 3))

In [6]:
# Data is huge, only concern about anmies whose 'popularity' is in top 10% because no one will recommend anime without good popularity
animie_index_pop_top10 = anime_data['MAL_ID'][anime_data['Popularity']< (anime_data.shape[0]*0.1)]
animie_index_pop_top10

0            1
1            5
2            6
5           15
6           16
         ...  
16809    41930
16878    42203
17165    42897
17173    42923
17224    43299
Name: MAL_ID, Length: 1238, dtype: int64

In [7]:
# User should rate atleast 200 animies
users_count = rating_data['user_id'].value_counts()
rating_data = rating_data[rating_data['user_id'].isin(users_count[users_count >= 200].index)]
# anime should have >= 500 ratings
anime_count = rating_data['anime_id'].value_counts()
rating_data = rating_data[rating_data['anime_id'].isin(anime_count[anime_count >= 500].index)]
# anime with popularity top 30%
rating_data = rating_data[rating_data['anime_id'].isin(animie_index_pop_top10)]

rating_data.head()

Unnamed: 0,user_id,anime_id,rating
189,3,25835,8
190,3,28171,8
191,3,32282,8
192,3,35788,9
193,3,9253,9


In [8]:
# add anime name to rating_data
rating_data = pd.merge(rating_data, anime_data.iloc[:,:2], left_on='anime_id', right_on = 'MAL_ID', how='left')

In [9]:
# rearrange user_data and anime_data via unique user_id and anime_id

user_ids = rating_data["user_id"].unique().tolist()
user2_encoded = {x: i for i, x in enumerate(user_ids)} # numbering each user
encoded2_user = {i: x for i, x in enumerate(user_ids)} # assigning the user to the number
rating_data["user"] = rating_data["user_id"].map(user2_encoded) # substituing the user values

anime_ids = rating_data['anime_id'].unique().tolist()
anime2_encoded = {x: i for i, x in enumerate(anime_ids)}
encoded2_anime = {i: x for i, x in enumerate(anime_ids)}
rating_data["anime"] = rating_data["anime_id"].map(anime2_encoded)

rating_data.head()


Unnamed: 0,user_id,anime_id,rating,MAL_ID,Name,user,anime
0,3,25835,8,25835,Shirobako,0,0
1,3,28171,8,28171,Shokugeki no Souma,0,1
2,3,32282,8,32282,Shokugeki no Souma: Ni no Sara,0,2
3,3,35788,9,35788,Shokugeki no Souma: San no Sara,0,3
4,3,9253,9,9253,Steins;Gate,0,4


In [10]:
rating_data.head()

Unnamed: 0,user_id,anime_id,rating,MAL_ID,Name,user,anime
0,3,25835,8,25835,Shirobako,0,0
1,3,28171,8,28171,Shokugeki no Souma,0,1
2,3,32282,8,32282,Shokugeki no Souma: Ni no Sara,0,2
3,3,35788,9,35788,Shokugeki no Souma: San no Sara,0,3
4,3,9253,9,9253,Steins;Gate,0,4


### 3. Item based CF

In [11]:
matrix = rating_data.pivot_table(index='anime', columns='user', values='rating')
matrix.head()

user,0,1,2,3,4,5,6,7,8,9,...,95067,95068,95069,95070,95071,95072,95073,95074,95075,95076
anime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,8.0,,,8.0,,,,9.0,8.0,,...,,,,,,,,8.0,,9.0
1,8.0,,,8.0,8.0,,,5.0,8.0,9.0,...,10.0,8.0,8.0,,,,10.0,,8.0,7.0
2,8.0,,,7.0,,,,5.0,8.0,9.0,...,10.0,7.0,8.0,,,,9.0,,8.0,7.0
3,9.0,,,,,,,,8.0,9.0,...,7.0,7.0,9.0,,,,,,,
4,9.0,,9.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,8.0,10.0,,,,,9.0,


In [12]:
#  Data Normalization
# ratings less than the anime's average rating get a negative value
# ratings more than the anime's average rating get a positive value
matrix_norm = matrix.subtract(matrix.mean(axis=1), axis = 0)
matrix_norm.head()

user,0,1,2,3,4,5,6,7,8,9,...,95067,95068,95069,95070,95071,95072,95073,95074,95075,95076
anime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.425802,,,-0.425802,,,,0.574198,-0.425802,,...,,,,,,,,-0.425802,,0.574198
1,-0.352946,,,-0.352946,-0.352946,,,-3.352946,-0.352946,0.647054,...,1.647054,-0.352946,-0.352946,,,,1.647054,,-0.352946,-1.352946
2,-0.206242,,,-1.206242,,,,-3.206242,-0.206242,0.793758,...,1.793758,-1.206242,-0.206242,,,,0.793758,,-0.206242,-1.206242
3,0.860715,,,,,,,,-0.139285,0.860715,...,-1.139285,-1.139285,0.860715,,,,,,,
4,-0.142981,,-0.142981,0.857019,0.857019,0.857019,0.857019,0.857019,0.857019,0.857019,...,0.857019,0.857019,-1.142981,0.857019,,,,,-0.142981,


In [13]:
# Item similarity matrix using Pearson correlation
# matrix_norm
# item_similarity = matrix_norm.T.corr()
# item_similarity.head()

In [14]:
# Item similarity matrix using cosine similarity
# need to fill NaN with 0 to do cosine
# we use cosine here, rather than the Pearson correlation
item_similarity = cosine_similarity(matrix_norm.fillna(0))
item_similarity = pd.DataFrame(item_similarity)
item_similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1203,1204,1205,1206,1207,1208,1209,1210,1211,1212
0,1.000000,0.154791,0.134252,0.109896,0.132099,0.078039,0.075299,0.085632,0.107727,0.091314,...,0.033744,0.062414,0.076767,0.100610,0.034451,0.046567,0.043239,0.019245,0.046178,0.028924
1,0.154791,1.000000,0.665789,0.532843,0.193929,0.174884,0.162593,0.119502,0.242321,0.235356,...,0.125368,0.121081,0.173597,0.154397,0.113525,0.099120,0.070095,0.030118,0.038563,0.061308
2,0.134252,0.665789,1.000000,0.649585,0.163323,0.150973,0.145419,0.114266,0.222675,0.223481,...,0.137589,0.137026,0.167221,0.169299,0.108859,0.109423,0.062995,0.033867,0.042346,0.062206
3,0.109896,0.532843,0.649585,1.000000,0.137004,0.129179,0.125270,0.095894,0.201911,0.203015,...,0.150410,0.151083,0.139196,0.184992,0.094542,0.117518,0.055669,0.033100,0.042025,0.067790
4,0.132099,0.193929,0.163323,0.137004,1.000000,0.119717,0.112335,0.142475,0.116766,0.097084,...,0.035334,0.042980,0.094373,0.075991,0.067005,0.039114,0.078122,0.036739,0.027574,0.039634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1208,0.046567,0.099120,0.109423,0.117518,0.039114,0.055838,0.051500,0.036721,0.080400,0.087500,...,0.094016,0.079449,0.067671,0.102675,0.042818,1.000000,0.022610,0.037902,0.026161,0.058306
1209,0.043239,0.070095,0.062995,0.055669,0.078122,0.072211,0.064290,0.078608,0.048663,0.067074,...,0.031717,0.031371,0.065346,0.039012,0.076971,0.022610,1.000000,0.020828,0.020564,0.012439
1210,0.019245,0.030118,0.033867,0.033100,0.036739,0.010916,0.013525,0.026064,0.022063,0.019549,...,0.016416,0.009888,0.025488,0.025332,0.007788,0.037902,0.020828,1.000000,0.074548,0.094401
1211,0.046178,0.038563,0.042346,0.042025,0.027574,0.016086,0.013457,0.026730,0.018721,0.022957,...,0.019258,0.027244,0.030605,0.031386,0.016663,0.026161,0.020564,0.074548,1.000000,0.119343


In [35]:
def item_based_rec(picked_userid, number_of_similar_items, number_of_recommendations):
  # Animes that the target user has not watched
  picked_userid_unwatched = pd.DataFrame(matrix_norm[picked_userid].isna()).reset_index()
  picked_userid_unwatched_anime = picked_userid_unwatched[picked_userid_unwatched.iloc[:,1]==True]['anime'].values.tolist()
  # Animes that the target user has watched
  picked_userid_watched  = pd.DataFrame(matrix_norm[picked_userid].dropna(axis=0, how='all')\
                            .sort_values(ascending=False))\
                            .reset_index()\
                            .rename(columns={picked_userid:'rating'})
  
  # Dictionary to save the unwatched anime and predicted rating pair
  rating_prediction ={}  
        
  for i in picked_userid_unwatched_anime: 
    # Calculate the similarity score of the picked anime with other animes
    picked_anime_similarity_score = item_similarity[[i]].reset_index().rename(columns={'index':'anime',i:'similarity_score'})
    # Rank the similarities between the picked user watched anime and the picked unwatched movie.
    picked_userid_watched_similarity = pd.merge(left=picked_userid_watched, 
                                                right=picked_anime_similarity_score, 
                                                on='anime', 
                                                how='inner')\
                                        .sort_values('similarity_score', ascending=False)[:number_of_similar_items]
    # Calculate the predicted rating using weighted average of similarity scores and the ratings from user i
    predicted_rating = round(np.average(picked_userid_watched_similarity['rating'], 
                                        weights=picked_userid_watched_similarity['similarity_score']), 6)
    # Save the predicted rating in the dictionary
    rating_prediction[i] = predicted_rating
    # recommendation stored in res
    res = pd.DataFrame(sorted(rating_prediction.items(), key=operator.itemgetter(1), reverse=True)[:number_of_recommendations])
    # rename the columns
  res = res.rename(columns={0: "anime", 1: "rating"})
  # find the corresponding Name and anime_id in df of rating_data
  recommend_Name_Id = pd.merge(left = res, right = rating_data.loc[:,['anime','anime_id','Name']], on = "anime",how = "left")
  recommend_Name_Id = recommend_Name_Id.drop_duplicates()
  order = ['Name','anime','anime_id','rating']
  recommend_Name_Id = recommend_Name_Id[order]
  return recommend_Name_Id 
  

### 4. Test cases

#### Note that
#### 'Name': anime name
#### 'anime': anmie id after rearrange user_data and anime_data via unique user_id and anime_id
#### 'anime_id': the original anime id
#### 'rating': rating after normalization


In [36]:
# if we want to know user_id = 4, finding the 3 most similar items and trying to give 4 top recommendations:
item_based_rec(picked_userid = 4, number_of_similar_items = 3, number_of_recommendations = 4)

Unnamed: 0,Name,anime,anime_id,rating
0,Phantom: Requiem for the Phantom,95,5682,2.094889
17565,Darker than Black: Kuro no Keiyakusha - Sakura...,357,4182,2.077455
34359,Minami-ke,222,2963,1.944326
49994,Berserk 2nd Season,1068,34055,1.806376


In [37]:
# if we want to know user_id = 100, finding the 10 most similar items and trying to give 5 top recommendations:
item_based_rec(picked_userid = 100, number_of_similar_items = 10, number_of_recommendations = 5)

Unnamed: 0,Name,anime,anime_id,rating
0,Space☆Dandy 2nd Season,736,23327,1.941651
9632,One Piece Film: Z,952,12859,1.870215
23963,Coquelicot-zaka kara,1095,10029,1.821146
34576,Neko no Ongaeshi,294,597,1.814156
50411,Kurenai no Buta,293,416,1.810343
