In [1]:
# import 'Pandas' 
import pandas as pd 

# import 'Numpy' 
import numpy as np

# import subpackage of Matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# import 'Seaborn' 
import seaborn as sns

# to suppress warnings 
from warnings import filterwarnings
filterwarnings('ignore')

from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
!pip install surprise
from surprise import Reader, SVD, Dataset
from surprise.model_selection import cross_validate
from sklearn.decomposition import TruncatedSVD
import re
import string



#Setting some useful Default Options <a id='set_options'></a>

In [51]:
# display all columns of the dataframe
pd.options.display.max_columns = None
# display all rows of the dataframe
pd.options.display.max_rows = None
# return an output value upto 6 decimals
pd.options.display.float_format = '{:.6f}'.format

#Reading the Data <a id='Read_Data'></a>

In [52]:
anime = pd.read_csv('C:/Users/Dell/Desktop/anime.csv')
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [53]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [54]:
rating=pd.read_csv('/content/drive/MyDrive/rating.csv',sep=',')
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [55]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [56]:
anime['name'].unique()[3:10]

array(['Steins;Gate', 'Gintama&#039;',
       'Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou',
       'Hunter x Hunter (2011)', 'Ginga Eiyuu Densetsu',
       'Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare',
       'Gintama&#039;: Enchousen'], dtype=object)

In [57]:
def text_cleaning(text):
    text = re.sub(r'&quot;', '', text)
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
    text = re.sub(r'Â°', '',text)
    
    return text

anime['name'] = anime['name'].apply(text_cleaning)

In [58]:
anime['name'] = anime['name'].apply(text_cleaning)

In [59]:
anime['name'].unique()[3:10]

array(['SteinsGate', 'Gintama039',
       'Haikyuu Karasuno Koukou VS Shiratorizawa Gakuen Koukou',
       'Hunter x Hunter 2011', 'Ginga Eiyuu Densetsu',
       'Gintama Movie Kanketsuhen  Yorozuya yo Eien Nare',
       'Gintama039 Enchousen'], dtype=object)

In [60]:
anime.episodes.unique()

array(['1', '64', '51', '24', '10', '148', '110', '13', '201', '25', '22',
       '75', '4', '26', '12', '27', '43', '74', '37', '2', '11', '99',
       'Unknown', '39', '101', '47', '50', '62', '33', '112', '23', '3',
       '94', '6', '8', '14', '7', '40', '15', '203', '77', '291', '120',
       '102', '96', '38', '79', '175', '103', '70', '153', '45', '5',
       '21', '63', '52', '28', '145', '36', '69', '60', '178', '114',
       '35', '61', '34', '109', '20', '9', '49', '366', '97', '48', '78',
       '358', '155', '104', '113', '54', '167', '161', '42', '142', '31',
       '373', '220', '46', '195', '17', '1787', '73', '147', '127', '16',
       '19', '98', '150', '76', '53', '124', '29', '115', '224', '44',
       '58', '93', '154', '92', '67', '172', '86', '30', '276', '59',
       '72', '330', '41', '105', '128', '137', '56', '55', '65', '243',
       '193', '18', '191', '180', '91', '192', '66', '182', '32', '164',
       '100', '296', '694', '95', '68', '117', '151', '130',

In [61]:
anime.episodes.replace({'Unknown':np.nan},inplace=True)

###Merging the datasets<a id='merging'></a>

In [62]:
anime_rating=pd.merge(anime,rating,on='anime_id')
anime_rating.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5
1,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10
2,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10
3,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10
4,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,278,-1


In [63]:
anime_rating.rename(columns={'rating_x':'avg_rating','rating_y':'user_rating'},inplace=True)
anime_rating.head()

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members,user_id,user_rating
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5
1,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10
2,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10
3,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10
4,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,278,-1


###Changing Data Types <a id='Data_Types'></a>

In [64]:
anime_rating.dtypes

anime_id         int64
name            object
genre           object
type            object
episodes        object
avg_rating     float64
members          int64
user_id          int64
user_rating      int64
dtype: object

There are 4 `objects`, 1 `float` and 4 `int` data types attributes.

As anime_id and user_id are unique values I need to convert it into `object` data type.

In [65]:
anime_rating['anime_id']=anime_rating.anime_id.astype('object')
anime_rating['user_id']=anime_rating.user_id.astype('object')

In [66]:
anime_rating.dtypes

anime_id        object
name            object
genre           object
type            object
episodes        object
avg_rating     float64
members          int64
user_id         object
user_rating      int64
dtype: object

Now there are 6 `objects`, 2 `int` and 1 `float` attributes.

###Fixing Missing Values <a id='Missing_Values'></a>

In [67]:
missing_value = pd.DataFrame({
    'Missing Value': anime_rating.isnull().sum(),
    'Percentage': (anime_rating.isnull().sum() / len(anime_rating))*100
})

In [68]:
missing_value.sort_values(by='Percentage', ascending=False)

Unnamed: 0,Missing Value,Percentage
genre,110,0.001408
episodes,8,0.000102
avg_rating,6,7.7e-05
type,4,5.1e-05
anime_id,0,0.0
name,0,0.0
members,0,0.0
user_id,0,0.0
user_rating,0,0.0


There are few missing values in `genre`,`episodes`,`avg_rating` and `type`.

In [69]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes    340
rating      230
members       0
dtype: int64

There are few missing values in `genre`,`episodes`,`avg_rating` and `type`.

**Missing Values treatment**

In [70]:
# I can't replace the nan values with mean or median as it's unique for each movie
# So I will drop the null values rows
anime_rating.dropna(inplace=True)
anime.dropna(inplace=True)

In [71]:
anime_rating.isnull().sum()

anime_id       0
name           0
genre          0
type           0
episodes       0
avg_rating     0
members        0
user_id        0
user_rating    0
dtype: int64

In [72]:
anime.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

Now, There are no null values in the dataset.

###RemovingDuplicate Data <a id='duplicate'></a>

In [73]:
duplicate = anime_rating.duplicated(subset=['anime_id','user_id']).sum()
print('There are {} duplicated rows in the data'.format(duplicate))

There are 7 duplicated rows in the data


In [74]:
duplicate = anime.duplicated().sum()
print('There are {} duplicated rows in the data'.format(duplicate))

There are 0 duplicated rows in the data


**Getting rid of duplicate data**

In [75]:
anime_rating.drop_duplicates(subset=['anime_id','user_id'],inplace=True)

**Checking for duplicate data after removal of duplicates**

In [76]:
duplicate = anime_rating.duplicated().sum()
print('There are {} duplicated rows in the data'.format(duplicate))

There are 0 duplicated rows in the data


###Fixing Indices <a id='index'></a>

In [77]:
anime_rating.shape

(7813600, 9)

There are `7813604` records after removing missing values and duplicates.

In [78]:
anime_rating.tail()

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members,user_id,user_rating
7813722,6133,Violence Gekiga Shin David no Hoshi Inma Densetsu,Hentai,OVA,1,4.98,175,39532,-1
7813723,6133,Violence Gekiga Shin David no Hoshi Inma Densetsu,Hentai,OVA,1,4.98,175,48766,-1
7813724,6133,Violence Gekiga Shin David no Hoshi Inma Densetsu,Hentai,OVA,1,4.98,175,60365,4
7813725,26081,Yasuji no Pornorama Yacchimae,Hentai,Movie,1,5.46,142,27364,-1
7813726,26081,Yasuji no Pornorama Yacchimae,Hentai,Movie,1,5.46,142,48766,-1


**The last 5 index values range from 7813722-7813726 but I have only 7813604 records thus the indexes need to be reset**

In [79]:
anime_rating.reset_index(drop=True,inplace=True)
anime_rating.tail()

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members,user_id,user_rating
7813595,6133,Violence Gekiga Shin David no Hoshi Inma Densetsu,Hentai,OVA,1,4.98,175,39532,-1
7813596,6133,Violence Gekiga Shin David no Hoshi Inma Densetsu,Hentai,OVA,1,4.98,175,48766,-1
7813597,6133,Violence Gekiga Shin David no Hoshi Inma Densetsu,Hentai,OVA,1,4.98,175,60365,4
7813598,26081,Yasuji no Pornorama Yacchimae,Hentai,Movie,1,5.46,142,27364,-1
7813599,26081,Yasuji no Pornorama Yacchimae,Hentai,Movie,1,5.46,142,48766,-1


###Final Dataset <a id='final_dataset'></a>

In [80]:
anime_rating.shape

(7813600, 9)

In [81]:
anime_rating.head()

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members,user_id,user_rating
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5
1,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10
2,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10
3,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10
4,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,278,-1


The final dataset has **7813604 records and 9 features with no missing and duplicate values**

##Analysing and Understanding the Dataset <a id='Data_Understanding'></a>

###Summary Statistics <a id='Summary_Statistics'></a>

**Numeric Variables**

In [82]:
anime_rating.describe()

Unnamed: 0,avg_rating,members,user_rating
count,7813600.0,7813600.0,7813600.0
mean,7.653158,178623.070445,6.144076
std,0.672974,188176.081851,3.727792
min,1.67,29.0,-1.0
25%,7.27,44030.0,6.0
50%,7.68,110470.0,7.0
75%,8.13,244268.0,9.0
max,9.5,1013917.0,10.0


From the above table , I can infer:

1. The minimum avg_rating is 1.67 and the maximum avg_rating is 9.5.

2. The number of members ranges from 29 to 1013917.

3. Users rating ranges from -1 to 10. So, I will replace -1 with NaN and drop the rows.

In [83]:
anime_rating[anime_rating.user_rating==-1].shape

(1476463, 9)

In [84]:
anime_rating.user_rating.replace({-1:np.nan},inplace=True)

In [85]:
anime_rating.isnull().sum()

anime_id             0
name                 0
genre                0
type                 0
episodes             0
avg_rating           0
members              0
user_id              0
user_rating    1476463
dtype: int64

In [86]:
anime_rating.dropna(inplace=True)

In [87]:
anime_rating.isnull().sum()

anime_id       0
name           0
genre          0
type           0
episodes       0
avg_rating     0
members        0
user_id        0
user_rating    0
dtype: int64

In [88]:
anime_rating.shape

(6337137, 9)

In [89]:
anime_rating.describe()

Unnamed: 0,avg_rating,members,user_rating
count,6337137.0,6337137.0,6337137.0
mean,7.675014,184576.391914,7.808545
std,0.669904,190952.794333,1.572436
min,2.0,33.0,1.0
25%,7.29,46803.0,7.0
50%,7.7,117091.0,8.0
75%,8.15,256325.0,9.0
max,9.37,1013917.0,10.0


**Interpretation:**

1. The minimum avg_rating is 2 and the maximum avg_rating is 9.37.

2. The number of members ranges from 33 to 1013917.

3. Users rating ranges from 1 to 10.

**Categorical Variables**

In [90]:
anime_rating.describe(include='object')

Unnamed: 0,anime_id,name,genre,type,episodes,user_id
count,6337137,6337137,6337137,6337137,6337137,6337137
unique,9890,9869,3048,6,177,69600
top,1535,Death Note,"Comedy, School, Slice of Life",TV,12,42635
freq,34226,34226,49850,4364286,1420729,3747


From the above table, I can infer:
    
1. There are `9892` unique anime_id and the most watched anime id is `1535` with frequency `34226`.

2. There are `9892` anime names and the most watched anime name is `Death Note`and `34226` users have been watched.

3. There are `3048` unique genres and `Comedy, School, Slice of Life` is the most repeated genre with frequency `49850`.

4. There are `6` unique values in type attribute and most of type is `TV` with frequency `4364293`.

5. Episodes specify the episode number of that particular anime.

6. There are `69600` users and the user with user_id `42635` is the top most user who have watched most of the animes.i.e. `3747` animes. 

#Recommendation system <a id='rec_sys'></a>

##Popularity Based recommendation system <a id='pop'></a>

In [104]:
anime_rating_data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members,user_id,user_rating,totalRatingCount
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5.0,1961
1,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10.0,1961
2,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10.0,1961
3,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10.0,1961
4,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,322,10.0,1961


In [105]:
rating1=anime_rating_data[['anime_id','name','genre','avg_rating','totalRatingCount']]
rating1.drop_duplicates(subset=['anime_id'],inplace=True)
rating1.reset_index(drop=True,inplace=True)
rating1.head()

Unnamed: 0,anime_id,name,genre,avg_rating,totalRatingCount
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",9.37,1961
1,5114,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26,21494
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.25,1188
3,9253,SteinsGate,"Sci-Fi, Thriller",9.17,17151
4,9969,Gintama039,"Action, Comedy, Historical, Parody, Samurai, S...",9.16,3115


In [106]:
def con_recc(df):
  print("min ratings-count")
  min_count= int(input())
  print("min rating")
  min_rating= int(input())
  dfc = df.copy()
  dfc = dfc.loc[(dfc['avg_rating']>=min_rating) & (dfc['totalRatingCount']>=min_count)]
  # mean rating across all the animes
  return dfc



In [107]:
rating1.shape

(9890, 5)

In [108]:
# mean rating across all the animes
C = rating1['avg_rating'].mean()
# animes having total rate count greater than 85%
m = rating1['totalRatingCount'].quantile(0.85)
print('m: ', m, 'C: ', C)

m:  987.6499999999996 C:  6.592642062689574


In [109]:
def Iighted_rating(x, m=m, C=C):
    v = x['totalRatingCount']
    R = x['avg_rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [110]:
# animes having total rate count greater than 85% is considered into lists_animes
lists_animes =rating1.copy().loc[rating1['totalRatingCount'] >= m]
lists_animes.shape

(1484, 5)

In [111]:
lists_animes.head()

Unnamed: 0,anime_id,name,genre,avg_rating,totalRatingCount
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",9.37,1961
1,5114,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26,21494
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.25,1188
3,9253,SteinsGate,"Sci-Fi, Thriller",9.17,17151
4,9969,Gintama039,"Action, Comedy, Historical, Parody, Samurai, S...",9.16,3115


In [112]:
lists_animes=con_recc(lists_animes)

min ratings-count
1200
min rating
9


In [113]:
lists_animes.head()

Unnamed: 0,anime_id,name,genre,avg_rating,totalRatingCount
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",9.37,1961
1,5114,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26,21494
3,9253,SteinsGate,"Sci-Fi, Thriller",9.17,17151
4,9969,Gintama039,"Action, Comedy, Historical, Parody, Samurai, S...",9.16,3115
6,11061,Hunter x Hunter 2011,"Action, Adventure, Shounen, Super Power",9.13,7477


In [114]:
# Define a new feature 'score' and calculate its value with `Iighted_rating()`
lists_animes['score'] = lists_animes.apply(Iighted_rating, axis=1)

In [115]:
lists_animes.head()

Unnamed: 0,anime_id,name,genre,avg_rating,totalRatingCount,score
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",9.37,1961,8.439724
1,5114,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26,21494,9.142819
3,9253,SteinsGate,"Sci-Fi, Thriller",9.17,17151,9.029663
4,9969,Gintama039,"Action, Comedy, Historical, Parody, Samurai, S...",9.16,3115,8.541948
6,11061,Hunter x Hunter 2011,"Action, Adventure, Shounen, Super Power",9.13,7477,8.833943


In [116]:
#Sort movies based on score calculated above
lists_animes = lists_animes.sort_values('score', ascending=False)
#Print the top 10 movies
lists_animes[['name', 'totalRatingCount', 'avg_rating', 'score']].head(10)

Unnamed: 0,name,totalRatingCount,avg_rating,score
1,Fullmetal Alchemist Brotherhood,21494,9.26,9.142819
3,SteinsGate,17151,9.17,9.029663
10,Clannad After Story,15518,9.06,8.91236
6,Hunter x Hunter 2011,7477,9.13,8.833943
12,Gintama,4264,9.04,8.579738
4,Gintama039,3115,9.16,8.541948
0,Kimi no Na wa,1961,9.37,8.439724
9,Gintama039 Enchousen,2126,9.11,8.311494
8,Gintama Movie Kanketsuhen Yorozuya yo Eien Nare,2147,9.1,8.309994


The above plot shows top 10 animes which can be recommended based on trend.