In [102]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

## CSV Import

In [103]:
csv_file_crunchy = "crunchyroll.csv"
csv_file_mal = "myanimelist.csv"
crunchy_df = pd.read_csv(csv_file_crunchy )
mal_df = pd.read_csv(csv_file_mal)

### Drop duplicates in column title, lower de cases, and remove non alphabeticals, and normalize rate, and score

In [104]:
crunchy_df.drop_duplicates(subset ="anime", 
                     keep = "first", inplace = True) 
mal_df.drop_duplicates(subset ="title", 
                     keep ="first" , inplace = True) 

crunchy_df["anime"]=crunchy_df['anime'].str.lower()
mal_df["title"]=mal_df['title'].str.lower()

crunchy_df["anime"]=  crunchy_df['anime'].astype(str).str.replace('[^a-z,0-9," "]', '')
mal_df["title"]=mal_df['title'].astype(str).str.replace('[^a-z,0-9," "]', '')

crunchy_df['rate']= crunchy_df['rate']/5 *100
mal_df['score']=mal_df['score']/10 *100

### Cleaning Crunchyroll

In [105]:
crunchy_df.columns.values

array(['anime', 'anime_url', 'anime_img', 'episodes', 'votes', 'weight',
       'rate', 'rate_1', 'rate_2', 'rate_3', 'rate_4', 'rate_5',
       'genre_action', 'genre_adventure', 'genre_comedy', 'genre_drama',
       'genre_family', 'genre_fantasy', 'genre_food', 'genre_harem',
       'genre_historical', 'genre_horror', 'genre_idols', 'genre_isekai',
       'genre_jdrama', 'genre_magical girls', 'genre_martial arts',
       'genre_mecha', 'genre_music', 'genre_mystery',
       'genre_post-apocalyptic', 'genre_romance', 'genre_sci-fi',
       'genre_seinen', 'genre_sgdrama', 'genre_shojo', 'genre_shonen',
       'genre_slice of life', 'genre_sports', 'genre_supernatural',
       'genre_thriller'], dtype=object)

In [107]:
crunchy_df_2 = pd.melt(crunchy_df, id_vars=['anime', 'anime_url','episodes','rate'], value_vars=['genre_action', 'genre_adventure', 'genre_comedy', 'genre_drama',
       'genre_family', 'genre_fantasy', 'genre_food', 'genre_harem',
       'genre_historical', 'genre_horror', 'genre_idols', 'genre_isekai',
       'genre_jdrama', 'genre_magical girls', 'genre_martial arts',
       'genre_mecha', 'genre_music', 'genre_mystery',
       'genre_post-apocalyptic', 'genre_romance', 'genre_sci-fi',
       'genre_seinen', 'genre_sgdrama', 'genre_shojo', 'genre_shonen',
       'genre_slice of life', 'genre_sports', 'genre_supernatural',
       'genre_thriller'])

In [110]:
crunchy_df_2.head(5)


Unnamed: 0,anime,anime_url,episodes,rate,variable,value
0,naruto shippuuden,https://www.crunchyroll.com/naruto-shippuden,500,96.2,genre_action,0.0
1,shugo chara,https://www.crunchyroll.com/shugo-chara,51,97.2,genre_action,0.0
2,bleach,https://www.crunchyroll.com/bleach,366,96.6,genre_action,0.0
3,naruto,https://www.crunchyroll.com/naruto,220,96.0,genre_action,0.0
4,skip beat,https://www.crunchyroll.com/skip-beat,0,98.2,genre_action,0.0


In [45]:
crunchy_df = crunchy_df[['anime', 'anime_url', 'episodes','rate']].copy()

### Cleaning MyAnimeList

In [106]:
mal_df.columns.values

array(['uid', 'title', 'synopsis', 'genre', 'aired', 'episodes',
       'members', 'popularity', 'ranked', 'score', 'img_url', 'link'],
      dtype=object)

In [108]:
  
# rows list initialization 
title=[]
episodes = []
score =[]
link= []
genre=[]
for index, data in mal_df.iterrows():
  
   
     
    time = data['genre'] 
    time=time.replace('[', '') 
    time=time.replace(']','')
    time= time.split(',')
    
    for row in time:
        genre.append(row)
        title.append(data['title'])
        episodes.append(data['episodes'])
        score.append(data['score'])
        link.append(data['link'])
        


In [109]:
dict = {'title': title,'link':link,'episodes': episodes,'score':score, 'genre':genre} 
mal_df_2 = pd.DataFrame(dict)

In [111]:

mal_df_2.head()



Unnamed: 0,title,link,episodes,score,genre
0,haikyuu second season,https://myanimelist.net/anime/28891/Haikyuu_Se...,25.0,88.2,'Comedy'
1,haikyuu second season,https://myanimelist.net/anime/28891/Haikyuu_Se...,25.0,88.2,'Sports'
2,haikyuu second season,https://myanimelist.net/anime/28891/Haikyuu_Se...,25.0,88.2,'Drama'
3,haikyuu second season,https://myanimelist.net/anime/28891/Haikyuu_Se...,25.0,88.2,'School'
4,haikyuu second season,https://myanimelist.net/anime/28891/Haikyuu_Se...,25.0,88.2,'Shounen'


# Transformations to data

In [112]:
crunchy_df_2=crunchy_df_2.drop(['value'],axis=1)
crunchy_df_2['variable'] = crunchy_df_2['variable'].str.replace('genre_', '').astype(str)
crunchy_df_2 = crunchy_df_2.rename(columns={"anime": "title","anime_url":"url", "rate":"score","variable":"genre"})


In [113]:
crunchy_df_2.head(5)

Unnamed: 0,title,url,episodes,score,genre
0,naruto shippuuden,https://www.crunchyroll.com/naruto-shippuden,500,96.2,action
1,shugo chara,https://www.crunchyroll.com/shugo-chara,51,97.2,action
2,bleach,https://www.crunchyroll.com/bleach,366,96.6,action
3,naruto,https://www.crunchyroll.com/naruto,220,96.0,action
4,skip beat,https://www.crunchyroll.com/skip-beat,0,98.2,action


In [115]:
mal_df_2['genre'] = mal_df_2['genre'].str.replace("'", '').astype(str)
mal_df_2["genre"]=mal_df_2['genre'].str.lower()
mal_df_2 = mal_df_2[mal_df_2['episodes'].notnull()]
mal_df_2['episodes'] = mal_df_2['episodes'].astype(int)
mal_df_2 = mal_df_2.rename(columns={"link":"url"})

In [116]:
mal_df_2.head(5)

Unnamed: 0,title,url,episodes,score,genre
0,haikyuu second season,https://myanimelist.net/anime/28891/Haikyuu_Se...,25,88.2,comedy
1,haikyuu second season,https://myanimelist.net/anime/28891/Haikyuu_Se...,25,88.2,sports
2,haikyuu second season,https://myanimelist.net/anime/28891/Haikyuu_Se...,25,88.2,drama
3,haikyuu second season,https://myanimelist.net/anime/28891/Haikyuu_Se...,25,88.2,school
4,haikyuu second season,https://myanimelist.net/anime/28891/Haikyuu_Se...,25,88.2,shounen


In [117]:
crunchy_df_2.dtypes

title        object
url          object
episodes      int64
score       float64
genre        object
dtype: object

In [118]:
mal_df_2.dtypes

title        object
url          object
episodes      int64
score       float64
genre        object
dtype: object

# Append data frames, and reset index

In [119]:
merge_df =crunchy_df_2.append(mal_df_2, ignore_index=True)
merge_df.shape

(81698, 5)

In [120]:
merge_df.columns

Index(['title', 'url', 'episodes', 'score', 'genre'], dtype='object')

In [121]:
#ready to use dataframe, from here we shoul extract the relational tables for sql database
merge_df

Unnamed: 0,title,url,episodes,score,genre
0,naruto shippuuden,https://www.crunchyroll.com/naruto-shippuden,500,96.2,action
1,shugo chara,https://www.crunchyroll.com/shugo-chara,51,97.2,action
2,bleach,https://www.crunchyroll.com/bleach,366,96.6,action
3,naruto,https://www.crunchyroll.com/naruto,220,96.0,action
4,skip beat,https://www.crunchyroll.com/skip-beat,0,98.2,action
...,...,...,...,...,...
81693,chuunibyou demo koi ga shitai kirameki no slap...,https://myanimelist.net/anime/16934/Chuunibyou...,1,75.6,comedy
81694,chuunibyou demo koi ga shitai kirameki no slap...,https://myanimelist.net/anime/16934/Chuunibyou...,1,75.6,drama
81695,chuunibyou demo koi ga shitai kirameki no slap...,https://myanimelist.net/anime/16934/Chuunibyou...,1,75.6,romance
81696,chuunibyou demo koi ga shitai kirameki no slap...,https://myanimelist.net/anime/16934/Chuunibyou...,1,75.6,school


## Dataframe per table