In [270]:
import archive
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from termcolor import colored
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

In [271]:
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Template applied by default to Plotly figures.
pio.templates.default = "simple_white"

# Named accent colours (hex).
antq_blue   = "#336699"
antq_purple = "#ac39ac"
antq_red    = "#e63900"
antq_orange = "#ff8c1a"

# Neutral greys.
faded_grey = "#e6e6e6"
light_grey = "#d9d9d9"

# Font families: one for headings, one for regular text.
heading_font = "Brush Script MT, Brush Script Std, cursive"
normal_font  = "times new roman"

# Qualitative colour sequences taken from plotly.express.
palette   = px.colors.qualitative.Set2
palette_r = px.colors.qualitative.Set2_r
set1      = px.colors.qualitative.Set1
set3      = px.colors.qualitative.Set3

DATA SUMMARY - A preview of the data that will be used in this project.

In [272]:
# Load the anime catalogue from the project's Data directory.
# The path uses a forward slash so it resolves on Windows, Linux and macOS;
# a backslash is only treated as a directory separator on Windows.
dataset = pd.read_csv("Data/anime.csv")

# Preview the first five rows to confirm the file was parsed as expected.
dataset.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [273]:
# Load the user ratings from the project's Data directory.
# The path uses a forward slash so it resolves on Windows, Linux and macOS;
# a backslash is only treated as a directory separator on Windows.
Ratings_Dataset = pd.read_csv("Data/rating.csv")

# Preview the first five rows (user_id, anime_id, rating).
# NOTE(review): the preview shows rating == -1; presumably a sentinel for
# "no score given" -- confirm against the dataset documentation.
Ratings_Dataset.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [274]:
# Select the rows whose `name` column exactly equals the given title.
dataset.loc[dataset["name"] == 'Pokemon']

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1834,527,Pokemon,"Action, Adventure, Comedy, Fantasy, Kids",TV,276,7.43,229157


In [275]:
# Print each column's dtype and non-null count to spot incomplete columns.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


The number of entries in the anime and ratings datasets is as follows:

In [276]:
# Report (rows, columns) for the anime frame and the ratings frame.
print(f'anime shape: {dataset.shape}\nrating shape: {Ratings_Dataset.shape}')

anime shape: (12294, 7)
rating shape: (7813737, 3)


Check for Missing Values

In [277]:
# Count missing (NaN/None) values per column of the ratings frame.
# Only real nulls are counted, so the -1 ratings seen in the preview are not included.
Ratings_Dataset.isna().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [278]:
# Count missing (NaN/None) values per column of the anime frame.
dataset.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

Now that we know where the missing values are (only the anime dataset has any; the ratings dataset has none), we can remove the affected rows, as they are not useful for us.

REMOVING MISSING ROWS

In [279]:
# Drop every anime row that has a missing value in any column.
# The result is rebound to `dataset` rather than using inplace=True, so the
# statement reads as an explicit transformation; the index is left as-is
# (not reset), exactly as before.
dataset = dataset.dropna(axis=0)

# Re-count nulls per column to confirm none remain.
dataset.isna().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [280]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
dataset.describe()

Unnamed: 0,anime_id,rating,members
count,12017.0,12017.0,12017.0
mean,13638.001165,6.478264,18348.88
std,11231.076675,1.023857,55372.5
min,1.0,1.67,12.0
25%,3391.0,5.89,225.0
50%,9959.0,6.57,1552.0
75%,23729.0,7.18,9588.0
max,34519.0,10.0,1013917.0


Now let's look at how episode counts are distributed across the anime.

In [281]:
# Frequency of each distinct value in the `episodes` column, most common first.
dataset["episodes"].value_counts()

episodes
1      5571
2      1075
12      810
13      571
26      514
       ... 
358       1
366       1
201       1
172       1
125       1
Name: count, Length: 187, dtype: int64

Now we check both datasets for duplicate rows and remove any that we find.

In [282]:
# duplicated() flags each row that repeats an earlier row in every column;
# summing the boolean flags gives the number of such rows.
number_of_duplicated_anime = int(dataset.duplicated().sum())
print(f"the number of duplicated anime are:{number_of_duplicated_anime}")

the number of duplicated anime are:0


In [283]:
# duplicated() flags each row that repeats an earlier row in every column;
# summing the boolean flags gives the number of such rows.
number_of_duplicated_ratings = int(Ratings_Dataset.duplicated().sum())
print(f"the number of duplicated ratings are:{number_of_duplicated_ratings}")

the number of duplicated ratings are:1


In [284]:
# Remove fully duplicated rating rows, keeping the first occurrence of each.
# The result is rebound to `Ratings_Dataset` rather than using inplace=True,
# so the statement reads as an explicit transformation.
Ratings_Dataset = Ratings_Dataset.drop_duplicates(keep='first')

# Re-count duplicates to confirm the removal worked (expected: 0).
number_of_duplicated_ratings = Ratings_Dataset[Ratings_Dataset.duplicated()].shape[0]
print(f"the number of duplicated ratings are:{number_of_duplicated_ratings}")

the number of duplicated ratings are:0


CREATING THE DATABASE 