## Content Based recommendation system

In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [80]:
df = pd.read_csv("./Anime/anime.csv")
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


Exploring the data

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
anime_id    12294 non-null int64
name        12294 non-null object
genre       12232 non-null object
type        12269 non-null object
episodes    12294 non-null object
rating      12064 non-null float64
members     12294 non-null int64
dtypes: float64(1), int64(2), object(4)
memory usage: 672.4+ KB


In [82]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

### For Cotent based recommender system we will be using { Name, Genre, Type} column

In [83]:
# preprocessing
# filling NaN "Type" with unknown
df.loc[(df['type'].isnull()), 'type'] = 'Unknown'

In [84]:
# filling NaN "Genre" with unknown
df.loc[(df['genre'].isnull()), 'genre'] = 'Unknown'

In [85]:
df.isnull().sum()

anime_id      0
name          0
genre         0
type          0
episodes      0
rating      230
members       0
dtype: int64

In [240]:
# Removing punctuation
df['name'] = df['name'].str.replace('[^\w\s]','')
df['name'].head()

0                      Kimi no Na wa
1    Fullmetal Alchemist Brotherhood
2                            Gintama
3                         SteinsGate
4                         Gintama039
Name: name, dtype: object

In [86]:
# Dropping unwanted columns
df.drop(['anime_id', 'rating', 'members', 'episodes'], axis=1, inplace=True)
df.head()

Unnamed: 0,name,genre,type,episodes
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51
3,Steins;Gate,"Sci-Fi, Thriller",TV,24
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51


In [87]:
# Converting to lower case
df['genre'] = df['genre'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df.head()

Unnamed: 0,name,genre,type,episodes
0,Kimi no Na wa.,"drama, romance, school, supernatural",Movie,1
1,Fullmetal Alchemist: Brotherhood,"action, adventure, drama, fantasy, magic, mili...",TV,64
2,Gintama°,"action, comedy, historical, parody, samurai, s...",TV,51
3,Steins;Gate,"sci-fi, thriller",TV,24
4,Gintama&#039;,"action, comedy, historical, parody, samurai, s...",TV,51


In [76]:
# tokenization of keywords
#from textblob import TextBlob
#df['genre'] = df['genre'].apply(lambda x: TextBlob(x).words)
#df['type'] = df['type'].apply(lambda x: TextBlob(x).words)
#df['episodes'] = df['episodes'].apply(lambda x: TextBlob(x).words)
#df.head()

Unnamed: 0,name,genre,type,episodes
0,kimi no na wa.,"[drama, romance, school, supernatural]",[Movie],[1]
1,fullmetal alchemist: brotherhood,"[action, adventure, drama, fantasy, magic, mil...",[TV],[64]
2,gintama°,"[action, comedy, historical, parody, samurai, ...",[TV],[51]
3,steins;gate,"[sci-fi, thriller]",[TV],[24]
4,gintama&#039;,"[action, comedy, historical, parody, samurai, ...",[TV],[51]


In [75]:
#df.drop(['bag_of_words'], axis=1, inplace=True)
#for index, row in df.iterrows():
#    print(row)

In [219]:
df['bag_of_words'] = ""
for index, row in df.iterrows():
    row['bag_of_words'] = row["genre"] +', '+ row['type'] 
df.head()    

Unnamed: 0,name,genre,type,episodes,bag_of_words
0,Kimi no Na wa.,"drama, romance, school, supernatural",Movie,1,"drama, romance, school, supernatural, Movie"
1,Fullmetal Alchemist: Brotherhood,"action, adventure, drama, fantasy, magic, mili...",TV,64,"action, adventure, drama, fantasy, magic, mili..."
2,Gintama°,"action, comedy, historical, parody, samurai, s...",TV,51,"action, comedy, historical, parody, samurai, s..."
3,Steins;Gate,"sci-fi, thriller",TV,24,"sci-fi, thriller, TV"
4,Gintama&#039;,"action, comedy, historical, parody, samurai, s...",TV,51,"action, comedy, historical, parody, samurai, s..."


In [220]:
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(df["bag_of_words"])

In [391]:
tfidf

<12294x53 sparse matrix of type '<class 'numpy.float64'>'
	with 53504 stored elements in Compressed Sparse Row format>

In [388]:
cosine_sim = cosine_similarity(tfidf, tfidf)

In [389]:
cosine_sim.shape

(12294, 12294)

In [390]:
# recommender function

def recommendation(name):
    recommended = np.array([])
    #index_ = df.apply(lambda x: x['name'] =='Kimi no Na wa.')
    #index_ = df.select(df.name, filter(df.name == 'Kimi no Na wa.'))
    index_ = df.loc[(df['name'] == name)].index
    score = cosine_sim[index_]
    ind = np.argpartition(score[0], -10)[-11:]
    ind = ind[np.argsort(score[0][ind])]
    print(score[0][ind])
    #for i in ind:
     #   recommended = np.append(recommended, df.loc[(df.index == i)]['name'])        
    return ind

In [281]:
indices = recommendation('Isuca')
print(indices)
df[['name', 'genre', 'type']].loc[indices]

[0.80831755 0.80831755 0.8165576  0.81672076 0.85147338 0.8535023
 0.8795682  0.8795682  0.8795682  0.93692843 1.        ]
[ 6493  1002  3788  5751  1942  2367  2696   498  4196 11096  6312]


Unnamed: 0,name,genre,type
6493,Mujaki no Rakuen,"comedy, ecchi, romance, school, seinen",OVA
1002,Prison School OVA,"comedy, ecchi, romance, school, seinen",OVA
3788,C³,"action, comedy, ecchi, school, supernatural",TV
5751,Isuca Gokuraku,"comedy, ecchi, romance, seinen, supernatural",OVA
1942,Nazo no Kanojo X,"ecchi, romance, school, seinen",TV
2367,Inari Konkon Koi Iroha,"comedy, romance, school, seinen, supernatural",TV
2696,B Gata H Kei,"comedy, ecchi, romance, school, seinen",TV
498,Prison School,"comedy, ecchi, romance, school, seinen",TV
4196,Iketeru Futari,"comedy, ecchi, romance, school, seinen",TV
11096,Tsugumomo,"action, comedy, ecchi, school, seinen, superna...",TV


## TF - IDF implementation without library

In [283]:
# Calculating TF
tf = df['bag_of_words'].apply(lambda x: pd.value_counts(x.split(","))).sum(axis = 0).reset_index()
tf.columns = ['words','tf']
tf

Unnamed: 0,words,tf
0,Movie,2348.0
1,drama,991.0
2,romance,1371.0
3,school,1170.0
4,supernatural,1023.0
5,military,405.0
6,magic,716.0
7,adventure,891.0
8,drama,1025.0
9,fantasy,1815.0


In [286]:
# Calculating IDF
N = df.shape[0]
for i,word in enumerate(tf['words']):
    tf.loc[i, 'idf'] = np.log(N/(len(df[df['bag_of_words'].str.contains(word)])))

In [287]:
tf.head()

Unnamed: 0,words,tf,idf
0,Movie,2348.0,1.655547
1,drama,991.0,1.807996
2,romance,1371.0,2.193571
3,school,1170.0,2.352108
4,supernatural,1023.0,2.486372


In [288]:
tf['tfidf'] = tf['tf'] * tf['idf']
tf

Unnamed: 0,words,tf,idf,tfidf
0,Movie,2348.0,1.655547,3887.225381
1,drama,991.0,1.807996,1791.724024
2,romance,1371.0,2.193571,3007.385756
3,school,1170.0,2.352108,2751.965880
4,supernatural,1023.0,2.486372,2543.558404
5,military,405.0,3.412980,1382.256718
6,magic,716.0,2.843186,2035.721499
7,adventure,891.0,2.624522,2338.449271
8,drama,1025.0,2.484419,2546.529194
9,fantasy,1815.0,1.913026,3472.141955


In [315]:
#tfidf_ = df['bag_of_words'].apply(lambda x: pd.value_counts(x.split(","))).sum(axis = 0).reset_index()
#
tfidf_ = pd.DataFrame(columns=tf['words'])
#tfidf_.columns = 'Names'
tfidf_['Names'] = ''
#tfidf_.drop(['words'], axis=1, inplace=True)
tfidf_.shape

(0, 91)

In [358]:
wordDictA = dict.fromkeys(tf['words'], 0)
for word in df['bag_of_words'][0].split(","):
    wordDictA[word] = tf.loc[(tf.words == word)]['tfidf'][tf.loc[(tf.words == word), 'tfidf'].index[0]]

In [381]:
tfidf_ = pd.DataFrame(columns=tf['words'])
for i in range(N):
    dict_ = dict.fromkeys(tf['words'], 0)
    for word in df['bag_of_words'][i].split(","):
        dict_[word] = tf.loc[(tf.words == word)]['tfidf'][tf.loc[(tf.words == word), 'tfidf'].index[0]]
    #print(dict_)
    
    tfidf_ = tfidf_.append(dict_, ignore_index=True)
    #print(tfidf_)
    #list_of_dicts = list_of_dicts.append([dict_])
tfidf_.head()

words,Movie,drama,romance,school,supernatural,military,magic,adventure,drama.1,fantasy,...,samurai,super power,vampire,space,Unknown,hentai,yaoi,hentai.1,yuri,yaoi.1
0,3887.225381,1791.724024,3007.385756,2751.96588,2543.558404,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1382.256718,2035.721499,2338.449271,2546.529194,3472.141955,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [385]:
cosine_sim_ = cosine_similarity(tfidf_, tfidf_)
cosine_sim_.shape

(12294, 12294)

In [386]:
# recommender function

def recommendation2(name):
    recommended = np.array([])
    #index_ = df.apply(lambda x: x['name'] =='Kimi no Na wa.')
    #index_ = df.select(df.name, filter(df.name == 'Kimi no Na wa.'))
    index_ = df.loc[(df['name'] == name)].index
    score = cosine_sim_[index_]
    ind = np.argpartition(score[0], -10)[-11:]
    ind = ind[np.argsort(score[0][ind])]
    print(score[0][ind])
    #for i in ind:
     #   recommended = np.append(recommended, df.loc[(df.index == i)]['name'])        
    return ind

### Comparing both with and with library tfidf implementation and recommending on the basis of contents

In [396]:
indices = recommendation2('C³ Special')
print(indices)
df[['name', 'genre', 'type']].loc[indices]

[0.8635707  0.8635707  0.8635707  0.8731238  0.88041855 0.88041855
 0.88728761 0.89840972 0.91120327 0.97172697 1.        ]
[4361 3765 4760 7802 2137 1777  815  478 5071  861 3721]


Unnamed: 0,name,genre,type
4361,Freezing Specials,"action, comedy, ecchi",Special
3765,Hikari to Mizu no Daphne Specials,"action, comedy, ecchi",Special
4760,Freezing Vibration Specials,"action, comedy, ecchi",Special
7802,Arcade Gamer Fubuki Extra,"action, adventure, comedy, ecchi, game, parody...",Special
2137,Beelzebub Hirotta Akachan wa Daimaou,"action, comedy, demons, school, shounen, super...",Special
1777,Beelzebub Specials,"action, comedy, demons, school, shounen, super...",Special
815,Kill la Kill Special,"action, comedy, school, super power",Special
478,Durarara Specials,"action, comedy, supernatural",Special
5071,Panty amp Stocking in Sanitarybox,"action, comedy, dementia, ecchi, parody, super...",Special
861,Angel Beats Specials,"action, comedy, school, supernatural",Special


In [395]:
indices = recommendation('C³ Special')
print(indices)
df[['name', 'genre', 'type']].loc[indices]

[0.76452961 0.77493042 0.77493042 0.77493042 0.78901576 0.82785353
 0.82785353 0.85317246 0.87723086 0.90718217 1.        ]
[11096  3765  4361  4760  4881  4253  3874   861  3788  4220  3721]


Unnamed: 0,name,genre,type
11096,Tsugumomo,"action, comedy, ecchi, school, seinen, superna...",TV
3765,Hikari to Mizu no Daphne Specials,"action, comedy, ecchi",Special
4361,Freezing Specials,"action, comedy, ecchi",Special
4760,Freezing Vibration Specials,"action, comedy, ecchi",Special
4881,Seikimatsu Occult Gakuin Specials,"comedy, school, supernatural",Special
4253,MM Specials,"comedy, ecchi, school",Special
3874,Needless Specials,"comedy, ecchi, school",Special
861,Angel Beats Specials,"action, comedy, school, supernatural",Special
3788,C³,"action, comedy, ecchi, school, supernatural",TV
4220,Seirei Tsukai no Blade Dance Specials,"action, ecchi, fantasy, school, supernatural",Special
