In [1]:
import numpy as np
import pandas as pd
import difflib  #to get the movie name resembling closest to the actual movie
from sklearn.feature_extraction.text import TfidfVectorizer  #used to convert the textual data to numerical values
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movie_data = pd.read_csv('C:\\Users\\Acer\\Documents\\anime1.csv')

In [3]:
movie_data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Index
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,1
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,2
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,3
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,4


In [4]:
movie_data.shape

(12294, 8)

In [5]:
# selecting the relavent feature for recomendation

selected_features = ['genre', 'type']
print(selected_features)

['genre', 'type']


# REPLACING NULL VALUES WITH NULL STRINGS

In [6]:
# textual data contains a lot of missing values that is null values and we need to replace them with null string.

# Here we are creating a for loop and then replacing the columns that have missing values with null string.
# .fillna('') means is not available, then fill it with null string.

for feature in selected_features:
    movie_data[feature] = movie_data[feature].fillna('')

In [7]:
# combining all the selected features in one table

combined_features = movie_data['genre']+movie_data['type']

# It is like cancatening strings...
# Thats how we join different coloumns of a specific table together.

In [8]:
print(combined_features)

0                Drama, Romance, School, SupernaturalMovie
1        Action, Adventure, Drama, Fantasy, Magic, Mili...
2        Action, Comedy, Historical, Parody, Samurai, S...
3                                       Sci-Fi, ThrillerTV
4        Action, Comedy, Historical, Parody, Samurai, S...
                               ...                        
12289                                            HentaiOVA
12290                                            HentaiOVA
12291                                            HentaiOVA
12292                                            HentaiOVA
12293                                          HentaiMovie
Length: 12294, dtype: object


# CONVERTING TEXTUAL DATA TO FEATURE VECTORS

In [9]:
# Means we will be converting the textual data to numberical values because it will be easier to find the cosine realtion.

# Here we are loading the TfidfVectorizer to an variable called vectorizer (just a name). 
vectorizer = TfidfVectorizer()

In [10]:
# we will use the vectorizer variable to fit and transform the data in combined _feature

# This .fit_transform() function will transform the textual strings to numerical values
feature_vector = vectorizer.fit_transform(combined_features)

In [11]:
print(feature_vector)

  (0, 237)	0.6841868628985522
  (0, 191)	0.46169683800506506
  (0, 177)	0.42968567661927964
  (0, 46)	0.36618927581429034
  (1, 218)	0.47854032717774675
  (1, 128)	0.48597374495885737
  (1, 115)	0.4345113360492677
  (1, 58)	0.3078298024786575
  (1, 6)	0.28970544990273556
  (1, 0)	0.2667135265176037
  (1, 46)	0.3101677499275295
  (2, 64)	0.3069762712019479
  (2, 198)	0.24692357295632059
  (2, 184)	0.5263412758421881
  (2, 151)	0.44439246347895145
  (2, 86)	0.36084787123253265
  (2, 30)	0.18753080352884044
  (2, 218)	0.3953894263063463
  (2, 0)	0.22036953261572845
  (3, 248)	0.8407551616644995
  (3, 64)	0.42187296366565474
  (3, 198)	0.3393434258424026
  (4, 64)	0.3069762712019479
  (4, 198)	0.24692357295632059
  (4, 184)	0.5263412758421881
  :	:
  (12275, 84)	1.0
  (12276, 84)	1.0
  (12277, 84)	0.8652953840320954
  (12277, 30)	0.5012623049589391
  (12278, 81)	0.5640075842055733
  (12278, 112)	0.6246469970097517
  (12278, 148)	0.3819107055878981
  (12278, 219)	0.3819107055878981
  (12279

# Cosine Similarity

In [12]:
# To find the Similarity Score
# Similarity Score is nothing but that we will get some numerical values for all the different movies.

similarity = cosine_similarity(feature_vector)

# Now the cosine similarity function will traverse through all the values of the feature_vector and find some similarity.
# And the end result is that it can find which movies are related to each other.


# So the Idea is that we are going to convert the textual data to Numerical data and feed that to the cosine_similarity function to find the similarity score values of each movies...

In [13]:
print(similarity)

[[1.         0.1135801  0.         ... 0.         0.         0.        ]
 [0.1135801  1.         0.24798532 ... 0.         0.         0.        ]
 [0.         0.24798532 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         1.         0.        ]
 [0.         0.         0.         ... 1.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [14]:
print(similarity.shape)

(12294, 12294)


# Why (12294, 12294)??? it should have been (12294,2).....¶
NOOOO!!!! Thats because, the cosine_similarity function will compare a particular movie to every movie in the data.... SO for example the Kimi no na wa movie will be compared to the rest 12294 animes and there will be a similarity score for each.... Therefore 12294 for movie name and 12294 for similarity score values....

In [15]:
# getting the movie name from the user...

movie_name = input('Enter your anime name: ')

Enter your anime name: Kimi no nawa


In [16]:
# creating a list with all the movie names given in the data

# .tolist() function will take all the values from the title coloumn and will create a list...
list_of_all_title = movie_data['name'].tolist()
print(list_of_all_title)

['Kimi no Na wa.', 'Fullmetal Alchemist: Brotherhood', 'Gintama°', 'Steins;Gate', 'Gintama&#039;', 'Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou', 'Hunter x Hunter (2011)', 'Ginga Eiyuu Densetsu', 'Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare', 'Gintama&#039;: Enchousen', 'Clannad: After Story', 'Koe no Katachi', 'Gintama', 'Code Geass: Hangyaku no Lelouch R2', 'Haikyuu!! Second Season', 'Sen to Chihiro no Kamikakushi', 'Shigatsu wa Kimi no Uso', 'Mushishi Zoku Shou 2nd Season', 'Ookami Kodomo no Ame to Yuki', 'Code Geass: Hangyaku no Lelouch', 'Hajime no Ippo', 'Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen', 'Cowboy Bebop', 'One Punch Man', 'Mononoke Hime', 'Suzumiya Haruhi no Shoushitsu', 'Monogatari Series: Second Season', 'Mushishi Zoku Shou', 'Mushishi', 'Tengen Toppa Gurren Lagann', 'Great Teacher Onizuka', 'Natsume Yuujinchou Go', 'Hajime no Ippo: New Challenger', 'Mushishi Zoku Shou: Suzu no Shizuku', 'Natsume Yuujinchou Shi', 'Howl no Ugoku Shiro',

In [17]:
# finding the closest match for the movie name given by the user....

# In order to get the best match, we need to use the difflib lirary...
find_close_match = difflib.get_close_matches(movie_name, list_of_all_title)
print(find_close_match)

['Kimi no Na wa.', 'Kimi no Na wo Yobeba', 'Kamui no Ken']


In [18]:
close_match = find_close_match[0]
print(close_match)

Kimi no Na wa.


In [19]:
# finding the index of the movie with title

index_of_the_movie = movie_data[movie_data.name == close_match]['Index'].values[0]
print(index_of_the_movie)

# The Idea is that we find the index value of the particular movie and store it...
# Later on we use that index and find the similarity score....

0


In [20]:
# getting list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, 1.0000000000000002), (1, 0.11358010372690995), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.34093436647631714), (6, 0.0), (7, 0.11568114129178005), (8, 0.0), (9, 0.0), (10, 0.24852821689162208), (11, 0.40199192405109346), (12, 0.0), (13, 0.09842193934306397), (14, 0.34093436647631714), (15, 0.7101056530306671), (16, 0.4812795572049582), (17, 0.0), (18, 0.0), (19, 0.1532273993124774), (20, 0.14769780868650592), (21, 0.20472332259423645), (22, 0.11976664295426982), (23, 0.0), (24, 0.0), (25, 0.6927839287011031), (26, 0.13822907610331686), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.3070589675791703), (31, 0.09871830830963976), (32, 0.14769780868650592), (33, 0.293972051340639), (34, 0.09871830830963976), (35, 0.12946732841966122), (36, 0.0), (37, 0.0), (38, 0.07951166598487713), (39, 0.3783300769694736), (40, 0.0), (41, 0.0), (42, 0.2541783536047771), (43, 0.34093436647631714), (44, 0.14769780868650592), (45, 0.22119385125916935), (46, 0.09871830830963976), (47, 0.0), (48, 0.0), (49, 0.0), (50, 

here the the first coordinate represents the movies index and the second coordinate represents the similarity score with iron man

In [21]:
# Now we need to sort these scores.... 

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print(sorted_similar_movies)

# Here key = lambda x:x[1] means fromt the similarity_score list, take the second coordinate and sort it out....
# reverse = True means sort it in descending order....

[(0, 1.0000000000000002), (1111, 0.9659224537681204), (1494, 0.9029785264934223), (1959, 0.8870377837364795), (894, 0.7760194901446462), (6119, 0.7760194901446462), (5796, 0.7662688248614142), (60, 0.726832508446405), (15, 0.7101056530306671), (11035, 0.696611222641955), (25, 0.6927839287011031), (6069, 0.6841868628985522), (7134, 0.6841868628985522), (7699, 0.6841868628985522), (7731, 0.6841868628985522), (7838, 0.6841868628985522), (7908, 0.6841868628985522), (9058, 0.6841868628985522), (9761, 0.6841868628985522), (9851, 0.6841868628985522), (2381, 0.6463515364461963), (3006, 0.6463515364461963), (2538, 0.6370688881336356), (6418, 0.6370688881336356), (7111, 0.6370688881336356), (7793, 0.6370688881336356), (6463, 0.6230215645574355), (1490, 0.6123259247701971), (70, 0.6119776193862698), (3468, 0.6119776193862698), (7502, 0.6119776193862698), (6147, 0.6042327171897306), (2747, 0.5829456414180555), (3621, 0.5828479573017794), (1233, 0.5815669449113083), (1932, 0.5811656450004791), (109

In [22]:
print('movies suggested are: \n')
i = 0

for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movie_data[movie_data.index == index]['name'].values[0]
    if (i<30):
        print(i, '.', title_from_index)
        i+=1

movies suggested are: 

0 . Kimi no Na wa.
1 . Aura: Maryuuin Kouga Saigo no Tatakai
2 . Harmonie
3 . Air Movie
4 . Momo e no Tegami
5 . Shisha no Sho
6 . Taifuu no Noruda
7 . Hotarubi no Mori e
8 . Sen to Chihiro no Kamikakushi
9 . Mahouka Koukou no Rettousei Movie: Hoshi wo Yobu Shoujo
10 . Suzumiya Haruhi no Shoushitsu
11 . Jiu Se Lu
12 . Yuureisen
13 . Kaidan
14 . Shoujouji no Tanuki-bayashi Ban Danemon
15 . Hinomaru Hatanosuke: Bakemonoyashiki no Maki
16 . Issunboushi no Shusse
17 . Kacchikenee!
18 . Nonki na Tou-san Ryuuguu Mairi
19 . Onbu Obake
20 . Sarusuberi: Miss Hokusai
21 . Tezuka Osamu no Buddha: Owarinaki Tabi
22 . Wasurenagumo
23 . Bannou Yasai Ninninman
24 . Ugokie Kori no Tatehiki
25 . Chagama Ondo
26 . Buddha Saitan
27 . Fuse: Teppou Musume no Torimonochou
28 . Bakemono no Ko
29 . Da Yu Hai Tang


In [23]:
movie_name = input('Enter your Anime name: ')

list_of_all_title = movie_data['name'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_title)

close_match = find_close_match[0]

index_of_the_movie = movie_data[movie_data.name == close_match]['Index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Animes suggested are: \n')
i = 0

for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movie_data[movie_data.index == index]['name'].values[0]
    if (i<30):
        print(i, '.', title_from_index)
        i+=1

Enter your Anime name: Kimi no nawa
Animes suggested are: 

0 . Kimi no Na wa.
1 . Aura: Maryuuin Kouga Saigo no Tatakai
2 . Harmonie
3 . Air Movie
4 . Momo e no Tegami
5 . Shisha no Sho
6 . Taifuu no Noruda
7 . Hotarubi no Mori e
8 . Sen to Chihiro no Kamikakushi
9 . Mahouka Koukou no Rettousei Movie: Hoshi wo Yobu Shoujo
10 . Suzumiya Haruhi no Shoushitsu
11 . Jiu Se Lu
12 . Yuureisen
13 . Kaidan
14 . Shoujouji no Tanuki-bayashi Ban Danemon
15 . Hinomaru Hatanosuke: Bakemonoyashiki no Maki
16 . Issunboushi no Shusse
17 . Kacchikenee!
18 . Nonki na Tou-san Ryuuguu Mairi
19 . Onbu Obake
20 . Sarusuberi: Miss Hokusai
21 . Tezuka Osamu no Buddha: Owarinaki Tabi
22 . Wasurenagumo
23 . Bannou Yasai Ninninman
24 . Ugokie Kori no Tatehiki
25 . Chagama Ondo
26 . Buddha Saitan
27 . Fuse: Teppou Musume no Torimonochou
28 . Bakemono no Ko
29 . Da Yu Hai Tang
