# Import libraries

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset

In [86]:
# Read the raw anime CSV into a DataFrame (path is relative to the working directory).
raw_csv_path = "../data/raw/anime.csv"
df = pd.read_csv(raw_csv_path)

In [87]:
# Show the first row of the raw data to inspect the available fields.
df.head(n=1)

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...


# Feature Selection

In [88]:
# List every column name so the features to keep can be chosen below.
df.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='str')

In [89]:
# Keep only the columns used later in this notebook: the title, the text
# features that are cleaned below, and the image URL.
selected_columns = ["Name", "Synopsis", "Genres", "Studios", "Source", "Image URL"]

# .copy() makes the subset an independent frame. Later cells assign to its
# columns; without the copy, pandas versions that lack Copy-on-Write emit
# SettingWithCopyWarning on those assignments.
df = df[selected_columns].copy()

In [90]:
# Preview the first five rows after the column selection.
df.head(n=5)

Unnamed: 0,Name,Synopsis,Genres,Studios,Source,Image URL
0,Cowboy Bebop,"Crime is timeless. By the year 2071, humanity ...","Action, Award Winning, Sci-Fi",Sunrise,Original,https://cdn.myanimelist.net/images/anime/4/196...
1,Cowboy Bebop: Tengoku no Tobira,"Another day, another bounty—such is the life o...","Action, Sci-Fi",Bones,Original,https://cdn.myanimelist.net/images/anime/1439/...
2,Trigun,"Vash the Stampede is the man with a $$60,000,0...","Action, Adventure, Sci-Fi",Madhouse,Manga,https://cdn.myanimelist.net/images/anime/7/203...
3,Witch Hunter Robin,Robin Sena is a powerful craft user drafted in...,"Action, Drama, Mystery, Supernatural",Sunrise,Original,https://cdn.myanimelist.net/images/anime/10/19...
4,Bouken Ou Beet,It is the dark century and the people are suff...,"Adventure, Fantasy, Supernatural",Toei Animation,Manga,https://cdn.myanimelist.net/images/anime/7/215...


In [91]:
# (rows, columns) of the frame after the column selection.
df.shape

(24905, 6)

# Dataset preprocessing

## 'Genres' feature

In [92]:
def genre_convert(text):
    """Turn a comma-separated genre string into a list of clean genre names.

    The text is lower-cased, every occurrence of "unknown" is blanked out,
    hyphens become spaces, and the result is split on commas. Entries that
    are empty after trimming are dropped.

    Parameters
    ----------
    text : str, None or NaN
        Raw genre cell, e.g. "Action, Sci-Fi".

    Returns
    -------
    list of str
        E.g. ["action", "sci fi"]. An empty list is returned for the
        "unknown" placeholder and for missing values (None / NaN).

    Raises
    ------
    AttributeError
        For any other non-string input (for example an already-converted
        list), so an accidental second run of the conversion fails loudly
        instead of silently wiping the column.
    """
    # Missing cells reach this function as None or float NaN (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return []

    cleaned = text.lower().replace("unknown", " ").replace("-", " ")
    trimmed = (part.strip() for part in cleaned.split(","))
    return [part for part in trimmed if part]

In [93]:
# Replace each raw genre string with its cleaned list of genres.
df["Genres"] = df["Genres"].map(genre_convert)

In [94]:
# Preview the first five rows after converting Genres to lists.
df.head(n=5)

Unnamed: 0,Name,Synopsis,Genres,Studios,Source,Image URL
0,Cowboy Bebop,"Crime is timeless. By the year 2071, humanity ...","[action, award winning, sci fi]",Sunrise,Original,https://cdn.myanimelist.net/images/anime/4/196...
1,Cowboy Bebop: Tengoku no Tobira,"Another day, another bounty—such is the life o...","[action, sci fi]",Bones,Original,https://cdn.myanimelist.net/images/anime/1439/...
2,Trigun,"Vash the Stampede is the man with a $$60,000,0...","[action, adventure, sci fi]",Madhouse,Manga,https://cdn.myanimelist.net/images/anime/7/203...
3,Witch Hunter Robin,Robin Sena is a powerful craft user drafted in...,"[action, drama, mystery, supernatural]",Sunrise,Original,https://cdn.myanimelist.net/images/anime/10/19...
4,Bouken Ou Beet,It is the dark century and the people are suff...,"[adventure, fantasy, supernatural]",Toei Animation,Manga,https://cdn.myanimelist.net/images/anime/7/215...


In [95]:
def _remove_inner_spaces(genres):
    """Return the genre list with every space removed from each entry."""
    return [genre.replace(" ", "") for genre in genres]


# Collapse multi-word genres into single tokens, e.g. "sci fi" -> "scifi".
df["Genres"] = df["Genres"].apply(_remove_inner_spaces)

In [96]:
# Preview the first five rows after removing spaces inside genre names.
df.head(n=5)

Unnamed: 0,Name,Synopsis,Genres,Studios,Source,Image URL
0,Cowboy Bebop,"Crime is timeless. By the year 2071, humanity ...","[action, awardwinning, scifi]",Sunrise,Original,https://cdn.myanimelist.net/images/anime/4/196...
1,Cowboy Bebop: Tengoku no Tobira,"Another day, another bounty—such is the life o...","[action, scifi]",Bones,Original,https://cdn.myanimelist.net/images/anime/1439/...
2,Trigun,"Vash the Stampede is the man with a $$60,000,0...","[action, adventure, scifi]",Madhouse,Manga,https://cdn.myanimelist.net/images/anime/7/203...
3,Witch Hunter Robin,Robin Sena is a powerful craft user drafted in...,"[action, drama, mystery, supernatural]",Sunrise,Original,https://cdn.myanimelist.net/images/anime/10/19...
4,Bouken Ou Beet,It is the dark century and the people are suff...,"[adventure, fantasy, supernatural]",Toei Animation,Manga,https://cdn.myanimelist.net/images/anime/7/215...


## 'Source' feature

In [97]:
def source_convert(text):
    """Turn a source string into a list holding one clean token.

    The text is lower-cased, every occurrence of "unknown" is blanked out
    (the same placeholder handling as genre_convert and studios_convert),
    and hyphens and spaces are removed so the source becomes a single token.

    Parameters
    ----------
    text : str, None or NaN
        Raw source cell, e.g. "Light novel".

    Returns
    -------
    list of str
        A one-element list such as ["lightnovel"], or an empty list when the
        value is missing (None / NaN), empty, or only the "unknown"
        placeholder. Returning [] keeps a meaningless "unknown" (or empty)
        token out of anything built from this column.

    Raises
    ------
    AttributeError
        For any other non-string input (for example an already-converted
        list), so an accidental second run of the conversion fails loudly.
    """
    # Missing cells reach this function as None or float NaN (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return []

    token = (
        text.lower()
        .replace("unknown", " ")
        .replace("-", " ")
        .replace(" ", "")
    )
    return [token] if token else []

In [98]:
# Replace each raw source string with its single-token list.
df["Source"] = df["Source"].map(source_convert)

In [99]:
# Preview the first five rows after converting Source.
df.head(n=5)

Unnamed: 0,Name,Synopsis,Genres,Studios,Source,Image URL
0,Cowboy Bebop,"Crime is timeless. By the year 2071, humanity ...","[action, awardwinning, scifi]",Sunrise,[original],https://cdn.myanimelist.net/images/anime/4/196...
1,Cowboy Bebop: Tengoku no Tobira,"Another day, another bounty—such is the life o...","[action, scifi]",Bones,[original],https://cdn.myanimelist.net/images/anime/1439/...
2,Trigun,"Vash the Stampede is the man with a $$60,000,0...","[action, adventure, scifi]",Madhouse,[manga],https://cdn.myanimelist.net/images/anime/7/203...
3,Witch Hunter Robin,Robin Sena is a powerful craft user drafted in...,"[action, drama, mystery, supernatural]",Sunrise,[original],https://cdn.myanimelist.net/images/anime/10/19...
4,Bouken Ou Beet,It is the dark century and the people are suff...,"[adventure, fantasy, supernatural]",Toei Animation,[manga],https://cdn.myanimelist.net/images/anime/7/215...


## 'Studios' feature

In [100]:
def studios_convert(text):
    """Turn a comma-separated studio string into a list of single-token names.

    The text is lower-cased, every occurrence of "unknown" is blanked out,
    and hyphens, spaces and periods are removed, so "Toei Animation" becomes
    "toeianimation". The result is split on commas and empty entries are
    dropped.

    Parameters
    ----------
    text : str, None or NaN
        Raw studio cell, e.g. "Sunrise" or "Studio A, Studio B".

    Returns
    -------
    list of str
        One token per studio. An empty list is returned for the "unknown"
        placeholder and for missing values (None / NaN).

    Raises
    ------
    AttributeError
        For any other non-string input (for example an already-converted
        list), so an accidental second run of the conversion fails loudly.
    """
    # Missing cells reach this function as None or float NaN (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return []

    cleaned = text.lower().replace("unknown", " ")
    # Strip separators inside a name before splitting on commas.
    compact = cleaned.replace("-", " ").replace(" ", "").replace(".", "")
    trimmed = (name.strip() for name in compact.split(","))
    return [name for name in trimmed if name]

In [101]:
# Replace each raw studio string with its cleaned list of studio tokens.
df["Studios"] = df["Studios"].map(studios_convert)

In [102]:
# Preview the first five rows after converting Studios.
df.head(n=5)

Unnamed: 0,Name,Synopsis,Genres,Studios,Source,Image URL
0,Cowboy Bebop,"Crime is timeless. By the year 2071, humanity ...","[action, awardwinning, scifi]",[sunrise],[original],https://cdn.myanimelist.net/images/anime/4/196...
1,Cowboy Bebop: Tengoku no Tobira,"Another day, another bounty—such is the life o...","[action, scifi]",[bones],[original],https://cdn.myanimelist.net/images/anime/1439/...
2,Trigun,"Vash the Stampede is the man with a $$60,000,0...","[action, adventure, scifi]",[madhouse],[manga],https://cdn.myanimelist.net/images/anime/7/203...
3,Witch Hunter Robin,Robin Sena is a powerful craft user drafted in...,"[action, drama, mystery, supernatural]",[sunrise],[original],https://cdn.myanimelist.net/images/anime/10/19...
4,Bouken Ou Beet,It is the dark century and the people are suff...,"[adventure, fantasy, supernatural]",[toeianimation],[manga],https://cdn.myanimelist.net/images/anime/7/215...


## 'Synopsis' feature

In [103]:
def synopsis_convert(text):
    """Split a synopsis into a list of whitespace-separated words.

    Parameters
    ----------
    text : str, None or NaN
        Raw synopsis cell.

    Returns
    -------
    list of str
        The words of the synopsis, with case and punctuation left untouched.
        An empty list is returned for missing values (None / NaN), which
        would otherwise raise AttributeError on ``.split()``.

    Raises
    ------
    AttributeError
        For any other non-string input (for example an already-split list),
        so an accidental second run of the conversion fails loudly.
    """
    # Missing cells reach this function as None or float NaN (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return []
    return text.split()


# Replace each synopsis string with its list of words.
df["Synopsis"] = df["Synopsis"].apply(synopsis_convert)

In [104]:
# Preview the first five rows after splitting Synopsis into words.
df.head(n=5)

Unnamed: 0,Name,Synopsis,Genres,Studios,Source,Image URL
0,Cowboy Bebop,"[Crime, is, timeless., By, the, year, 2071,, h...","[action, awardwinning, scifi]",[sunrise],[original],https://cdn.myanimelist.net/images/anime/4/196...
1,Cowboy Bebop: Tengoku no Tobira,"[Another, day,, another, bounty—such, is, the,...","[action, scifi]",[bones],[original],https://cdn.myanimelist.net/images/anime/1439/...
2,Trigun,"[Vash, the, Stampede, is, the, man, with, a, $...","[action, adventure, scifi]",[madhouse],[manga],https://cdn.myanimelist.net/images/anime/7/203...
3,Witch Hunter Robin,"[Robin, Sena, is, a, powerful, craft, user, dr...","[action, drama, mystery, supernatural]",[sunrise],[original],https://cdn.myanimelist.net/images/anime/10/19...
4,Bouken Ou Beet,"[It, is, the, dark, century, and, the, people,...","[adventure, fantasy, supernatural]",[toeianimation],[manga],https://cdn.myanimelist.net/images/anime/7/215...


# New Feature: Tags

In [115]:
# Each of the four columns holds a list per row, so "+" concatenates the
# lists row by row into one combined token list.
tag_tokens = df["Synopsis"] + df["Genres"] + df["Studios"] + df["Source"]

# Join every row's tokens into a single space-separated string.
df["Tags"] = tag_tokens.apply(" ".join)

In [116]:
# Preview the first five rows with the new Tags column.
df.head(n=5)

Unnamed: 0,Name,Synopsis,Genres,Studios,Source,Image URL,Tags
0,Cowboy Bebop,"[Crime, is, timeless., By, the, year, 2071,, h...","[action, awardwinning, scifi]",[sunrise],[original],https://cdn.myanimelist.net/images/anime/4/196...,"Crime is timeless. By the year 2071, humanity ..."
1,Cowboy Bebop: Tengoku no Tobira,"[Another, day,, another, bounty—such, is, the,...","[action, scifi]",[bones],[original],https://cdn.myanimelist.net/images/anime/1439/...,"Another day, another bounty—such is the life o..."
2,Trigun,"[Vash, the, Stampede, is, the, man, with, a, $...","[action, adventure, scifi]",[madhouse],[manga],https://cdn.myanimelist.net/images/anime/7/203...,"Vash the Stampede is the man with a $$60,000,0..."
3,Witch Hunter Robin,"[Robin, Sena, is, a, powerful, craft, user, dr...","[action, drama, mystery, supernatural]",[sunrise],[original],https://cdn.myanimelist.net/images/anime/10/19...,Robin Sena is a powerful craft user drafted in...
4,Bouken Ou Beet,"[It, is, the, dark, century, and, the, people,...","[adventure, fantasy, supernatural]",[toeianimation],[manga],https://cdn.myanimelist.net/images/anime/7/215...,It is the dark century and the people are suff...


# Final Dataset

In [117]:
# Final frame: the title, its image URL and the combined Tags text.
final_columns = ["Name", "Image URL", "Tags"]
new_df = df[final_columns]

In [118]:
# Preview the first five rows of the final frame.
new_df.head(n=5)

Unnamed: 0,Name,Image URL,Tags
0,Cowboy Bebop,https://cdn.myanimelist.net/images/anime/4/196...,"Crime is timeless. By the year 2071, humanity ..."
1,Cowboy Bebop: Tengoku no Tobira,https://cdn.myanimelist.net/images/anime/1439/...,"Another day, another bounty—such is the life o..."
2,Trigun,https://cdn.myanimelist.net/images/anime/7/203...,"Vash the Stampede is the man with a $$60,000,0..."
3,Witch Hunter Robin,https://cdn.myanimelist.net/images/anime/10/19...,Robin Sena is a powerful craft user drafted in...
4,Bouken Ou Beet,https://cdn.myanimelist.net/images/anime/7/215...,It is the dark century and the people are suff...


In [119]:
# Write the final frame to CSV, leaving the index out of the file.
processed_csv_path = "../data/processed/anime_processed.csv"
new_df.to_csv(processed_csv_path, index=False)