# Week 6 - File ingestion and schema validation

### Import necessary libraries

In [6]:
# this will help in making the Python code more structured automatically (good coding practice)
%load_ext nb_black

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# apply seaborn's default styling to the plots drawn later in the notebook
sns.set()

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

# to split the data into train and test
from sklearn.model_selection import train_test_split

# to build linear regression_model
from sklearn.linear_model import LinearRegression

# to check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

<IPython.core.display.Javascript object>

In [7]:
# read the raw anime records from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="anime_data_raw.csv")

<IPython.core.display.Javascript object>

In [3]:
# report how many rows and columns were loaded
n_rows, n_cols = data.shape
print(f"There are {n_rows} rows and {n_cols} columns.")

There are 14578 rows and 18 columns.


<IPython.core.display.Javascript object>

In [4]:
# let's view a sample of the data
# (a fixed random_state makes the same 10 rows come back on every run)
data.sample(n=10, random_state=2)

Unnamed: 0,title,mediaType,eps,duration,ongoing,startYr,finishYr,sznOfRelease,description,studios,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes
13764,Spy Penguin (2013): White Christmas,Web,1.0,2min,False,2013.0,2013.0,,,['Next Media Animation'],"['Action', 'Adventure', 'Animal Protagonists',...",[],8.0,0,10,0,,
3782,A Little Snow Fairy Sugar Summer Specials,TV Special,2.0,,False,2003.0,2003.0,,"One day, when Saga finds an old princess costu...",['J.C. Staff'],"['Fantasy', 'Family Friendly', 'Magic', 'Music...",[],1056.0,24,576,16,3.449,571.0
2289,Umineko: When They Cry,TV,26.0,,False,2009.0,2009.0,Summer,"In the year 1986, eighteen members of the Ushi...",['Studio Deen'],"['Horror', 'Mystery', 'Inheritance', 'Island',...",['Explicit Violence'],10896.0,1451,8480,1236,3.787,9463.0
5081,Unbreakable Machine-Doll Specials,DVD Special,6.0,5min,False,2013.0,2014.0,,,['Lerche'],"['Ecchi', 'Fantasy', 'Magic', 'Magic School', ...",['Nudity'],1957.0,201,756,50,3.169,1312.0
9639,Hanako Oku: Hanabi,TV,1.0,6min,False,2015.0,2015.0,,,[],"['Romance', 'Slice of Life']",[],46.0,1,54,1,2.166,33.0
12608,Tamagotchi Honto no Hanashi,Movie,1.0,20min,False,1997.0,1997.0,,,[],"['Fantasy', 'Family Friendly']",[],11.0,2,18,0,,
6735,Violinist of Hamelin Movie,Movie,1.0,30min,False,1996.0,1996.0,,"While on their quest to stop the Demon King, t...",['Nippon Animation'],"['Comedy', 'Fantasy']",[],247.0,6,167,8,2.826,152.0
12846,Neko Kikaku,Movie,1.0,37min,False,2018.0,2018.0,,Nyagoya City is a trendy town where cats live....,['Speed Inc.'],"['Comedy', 'Mystery', 'Animal Protagonists', '...",[],12.0,3,102,2,,
884,Saint Young Men Movie,Movie,1.0,1hr 30min,False,2013.0,2013.0,,Jesus and Buddha are enjoying their vacation i...,['A-1 Pictures'],"['Comedy', 'Fantasy', 'Seinen', 'Slice of Life...",[],2726.0,68,2074,37,4.156,1962.0
10524,Delinquent Hamsters / papalion ft. Piso Studio,Web,1.0,2min,False,2017.0,2017.0,Fall,,['Piso Studio'],"['Comedy', 'Animal Protagonists', 'Delinquents...",[],18.0,0,18,0,1.927,10.0


<IPython.core.display.Javascript object>

* The *duration* column has values in hours and minutes.
* The *studios*, *tags*, and *contentWarn* columns have a list of values.
* There are a lot of missing values.

In [5]:
# work on an independent (deep) copy so the loaded data stays untouched
df = data.copy(deep=True)

<IPython.core.display.Javascript object>

In [6]:
# count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

<IPython.core.display.Javascript object>

- There are no duplicate rows in the data.

In [7]:
# list the column names of the working dataframe
print(df.columns)

Index(['title', 'mediaType', 'eps', 'duration', 'ongoing', 'startYr',
       'finishYr', 'sznOfRelease', 'description', 'studios', 'tags',
       'contentWarn', 'watched', 'watching', 'wantWatch', 'dropped', 'rating',
       'votes'],
      dtype='object')


<IPython.core.display.Javascript object>

In [8]:
# print each column's dtype and its number of non-null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14578 entries, 0 to 14577
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         14578 non-null  object 
 1   mediaType     14510 non-null  object 
 2   eps           14219 non-null  float64
 3   duration      9137 non-null   object 
 4   ongoing       14578 non-null  bool   
 5   startYr       14356 non-null  float64
 6   finishYr      14134 non-null  float64
 7   sznOfRelease  3767 non-null   object 
 8   description   8173 non-null   object 
 9   studios       14578 non-null  object 
 10  tags          14578 non-null  object 
 11  contentWarn   14578 non-null  object 
 12  watched       14356 non-null  float64
 13  watching      14578 non-null  int64  
 14  wantWatch     14578 non-null  int64  
 15  dropped       14578 non-null  int64  
 16  rating        12107 non-null  float64
 17  votes         12119 non-null  float64
dtypes: bool(1), float64(6), in

<IPython.core.display.Javascript object>

* There are many numeric (*float* and *int* type) and string (*object* type) columns in the data.
* Dependent variable is the rating of an anime, which is of *float* type.
* *ongoing* column is of *bool* type.

In [9]:
# number of missing values in each column
df.isna().sum()

title               0
mediaType          68
eps               359
duration         5441
ongoing             0
startYr           222
finishYr          444
sznOfRelease    10811
description      6405
studios             0
tags                0
contentWarn         0
watched           222
watching            0
wantWatch           0
dropped             0
rating           2471
votes            2459
dtype: int64

<IPython.core.display.Javascript object>

* There are missing values in many columns.

In [10]:
# summary statistics for every column, shown with one row per column
overall_summary = df.describe(include="all")
overall_summary.T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
title,14578.0,14578.0,Mitsuwano,1.0,,,,,,,
mediaType,14510.0,8.0,TV,4510.0,,,,,,,
eps,14219.0,,,,13.501231,62.262185,1.0,1.0,1.0,12.0,2527.0
duration,9137.0,147.0,4min,964.0,,,,,,,
ongoing,14578.0,2.0,False,14356.0,,,,,,,
startYr,14356.0,,,,2005.457788,14.707105,1907.0,2000.0,2010.0,2016.0,2026.0
finishYr,14134.0,,,,2005.515919,14.656509,1907.0,2000.0,2010.0,2016.0,2026.0
sznOfRelease,3767.0,4.0,Spring,1202.0,,,,,,,
description,8173.0,8108.0,The films will follow The First Summer of Love...,3.0,,,,,,,
studios,14578.0,864.0,[],4808.0,,,,,,,


<IPython.core.display.Javascript object>

* We can see that the anime ratings vary between 0.844 and 4.702, which suggests that the anime were rated on a scale of 0-5.
* *TV* is the most occurring type of media.
* For anime whose season of release is available, *Spring* is the most common season.
* The number of views for the anime in the data has a very wide range (0 to more than 160,000).

### From the data overview, we see that many columns in the data need to be preprocessed before they can be used for analysis.

## Data Preprocessing

### We will drop the missing values in *rating* column as it is the target variable.

In [11]:
# remove the rows with no rating (the target variable); df is modified in place
df.dropna(subset=["rating"], inplace=True)

<IPython.core.display.Javascript object>

In [12]:
# missing values that remain in each column
df.isna().sum()

title              0
mediaType         63
eps                0
duration        4636
ongoing            0
startYr            6
finishYr         121
sznOfRelease    8560
description     4474
studios            0
tags               0
contentWarn        0
watched          115
watching           0
wantWatch          0
dropped            0
rating             0
votes              0
dtype: int64

<IPython.core.display.Javascript object>

### Let us look at the entries with no start year.

In [13]:
# rows for which the start year is not recorded
df.loc[df["startYr"].isna()]

Unnamed: 0,title,mediaType,eps,duration,ongoing,startYr,finishYr,sznOfRelease,description,studios,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes
1405,Unbelievable Space Love,Web,10.0,1min,False,,,,,[],"['BL', 'Romance', 'Shounen-ai', 'Dream World',...",[],90.0,16,343,0,4.012,54.0
5225,Manbo-P: Irokoizata wa Subete Sakuzu de Kaiket...,Music Video,1.0,5min,False,,,,,[],"['Comedy', 'Romance', 'Vocaloid']",[],41.0,0,25,0,3.139,20.0
9821,Mameshiba: Mamerry Christmas,Other,1.0,1min,False,,,,,[],"['Christmas', 'Family Friendly', 'Shorts', 'St...",[],57.0,1,17,0,2.119,35.0
10270,Meow no Hoshi,Other,1.0,5min,False,,,,,[],"['Drama', 'Animal Protagonists', 'Cats', 'Mela...",[],40.0,0,25,0,1.999,25.0
11982,Landmark,Web,1.0,4min,False,,,,,[],"['Abstract', 'Black and White']",[],34.0,0,9,0,1.256,21.0
12089,Burutabu-chan,Other,3.0,1min,False,,,,,[],"['Comedy', 'Crude', 'Short Episodes']",[],46.0,1,10,1,1.046,33.0


<IPython.core.display.Javascript object>

* We will drop the entries with no start year as this is a difficult column to impute.
* The decision to drop these missing values or impute them by a suitable value is subject to domain knowledge, and based on the steps taken to deal with them, the model performance will vary.

In [14]:
# remove the rows with no start year; df is modified in place
df.dropna(subset=["startYr"], inplace=True)

<IPython.core.display.Javascript object>

In [15]:
# renumber the rows 0..n-1 after the drops; drop=True discards the old index
# instead of keeping it as a new column
df.reset_index(inplace=True, drop=True)

<IPython.core.display.Javascript object>

In [16]:
# missing values that remain in each column
df.isna().sum()

title              0
mediaType         63
eps                0
duration        4636
ongoing            0
startYr            0
finishYr         115
sznOfRelease    8554
description     4468
studios            0
tags               0
contentWarn        0
watched          115
watching           0
wantWatch          0
dropped            0
rating             0
votes              0
dtype: int64

<IPython.core.display.Javascript object>

### Let us look at the entries with no finish year.

In [17]:
# rows for which the finish year is not recorded
df.loc[df["finishYr"].isna()]

Unnamed: 0,title,mediaType,eps,duration,ongoing,startYr,finishYr,sznOfRelease,description,studios,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes
13,Kaguya-sama: Love Is War?,TV,10.0,,True,2020.0,,Spring,The battle between love and pride continues! N...,['A-1 Pictures'],"['Comedy', 'Drama', 'Romance', 'Seinen', 'Epis...",[],,6368,5747,96,4.617,2359.0
46,Douluo Dalu 2,Web,82.0,22min,True,2018.0,,,Second season of Douluo Dalu.,[],"['Action', 'Fantasy', 'Chinese Animation', 'Cu...",[],,1167,990,32,4.54,549.0
70,Fruits Basket 2nd Season,TV,10.0,,True,2020.0,,Spring,Second season of Fruits Basket.,"['TMS Entertainment', '8 Pan']","['Comedy', 'Drama', 'Fantasy', 'Romance', 'Sho...",[],,4160,4427,55,4.527,1194.0
111,Ascendance of a Bookworm: Part II,TV,11.0,,True,2020.0,,Spring,"With her baptism ceremony complete, Myne begin...",['Ajia-do'],"['Drama', 'Fantasy', 'Apprenticeship', 'Cheats...",[],,3183,1916,29,4.483,1139.0
115,Rakshasa Street 2nd Season,Web,5.0,,True,2019.0,,,,[],"['Action', 'Shounen', 'Chinese Animation', 'Su...",[],,47,102,0,4.482,10.0
121,Kingdom 3,TV,4.0,,True,2020.0,,Spring,Third season of Kingdom.,"['Studio Pierrot', 'St. Signpost']","['Action', 'Drama', 'Seinen', 'Ancient China',...",[],,515,740,14,4.476,202.0
239,One Piece,TV,929.0,,True,1999.0,,Fall,Long ago the infamous Gol D. Roger was the str...,['Toei Animation'],"['Action', 'Adventure', 'Comedy', 'Fantasy', '...",[],,74537,16987,12445,4.402,59737.0
262,Tower of God,TV,11.0,,True,2020.0,,Spring,Fame. Glory. Power. Anything in your wildest d...,['Telecom Animation Film'],"['Action', 'Adventure', 'Drama', 'Fantasy', 'H...","['Mature Themes', 'Suicide', 'Violence']",,9568,5085,187,4.391,3387.0
314,Wu Geng Ji 3rd Season,Web,21.0,,True,2019.0,,,Third season of Wu Geng Ji.,[],"['Action', 'Fantasy', 'Ancient China', 'Chines...",[],,50,140,1,4.366,19.0
324,A Certain Scientific Railgun T,TV,15.0,,True,2020.0,,Winter,Mikoto Misaka and her friends prepare for the ...,['J.C. Staff'],"['Action', 'Sci Fi', 'Elemental Powers', 'Psyc...",[],,1825,2939,43,4.365,638.0


<IPython.core.display.Javascript object>

In [18]:
# summary statistics restricted to the rows that have no finish year
no_finish_year = df.loc[df["finishYr"].isna()]
no_finish_year.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
title,115.0,115.0,GJ8 Man,1.0,,,,,,,
mediaType,115.0,6.0,TV,64.0,,,,,,,
eps,115.0,,,,136.521739,408.981219,1.0,4.5,10.0,22.0,2527.0
duration,50.0,18.0,1min,8.0,,,,,,,
ongoing,115.0,1.0,True,115.0,,,,,,,
startYr,115.0,,,,2016.521739,8.053928,1969.0,2018.0,2020.0,2020.0,2020.0
finishYr,0.0,,,,,,,,,,
sznOfRelease,75.0,4.0,Spring,50.0,,,,,,,
description,79.0,79.0,Fourth season of Quanzhi Fashi.,1.0,,,,,,,
studios,115.0,66.0,[],23.0,,,,,,,


<IPython.core.display.Javascript object>

* At least 75% of the entries with a missing finish year started in or after 2018 (the 25th percentile of *startYr* for these entries is 2018), and all of them are marked as ongoing.
* So, we will assume that the anime with missing values in *finishYr* are still airing, and fill the values with 2020 (the year the data was collected).
* You can experiment by dropping the entries where the finish year is missing.
* The decision to drop these missing values or impute them by a suitable value is subject to domain knowledge, and based on the steps taken to deal with them, the model performance will vary.

In [19]:
# fill the missing finish years with 2020 and assign the result back to the
# column; calling fillna(..., inplace=True) on the selected column acts on a
# temporary object and leaves df unchanged under pandas Copy-on-Write
df["finishYr"] = df["finishYr"].fillna(2020)

# checking missing values in rest of the data
df.isnull().sum()

title              0
mediaType         63
eps                0
duration        4636
ongoing            0
startYr            0
finishYr           0
sznOfRelease    8554
description     4468
studios            0
tags               0
contentWarn        0
watched          115
watching           0
wantWatch          0
dropped            0
rating             0
votes              0
dtype: int64

<IPython.core.display.Javascript object>

* The missing values in startYr and finishYr columns have been dealt with.
* **We will now create a new variable *years_running*, which will be calculated as *finishYr* minus *startYr*.**
* **We will also drop the *finishYr* and *startYr* columns.**

In [20]:
# number of years between the start year and the finish year
year_cols = ["startYr", "finishYr"]
df["years_running"] = df["finishYr"] - df["startYr"]
# the two source columns are dropped once their difference is stored
df.drop(columns=year_cols, inplace=True)
df.head()

Unnamed: 0,title,mediaType,eps,duration,ongoing,sznOfRelease,description,studios,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes,years_running
0,Fullmetal Alchemist: Brotherhood,TV,64.0,,False,Spring,The foundation of alchemy is based on the law ...,['Bones'],"['Action', 'Adventure', 'Drama', 'Fantasy', 'M...","['Animal Abuse', 'Mature Themes', 'Violence', ...",103707.0,14351,25810,2656,4.702,86547.0,1.0
1,your name.,Movie,1.0,1hr 47min,False,,Mitsuha and Taki are two total strangers livin...,['CoMix Wave Films'],"['Drama', 'Romance', 'Body Swapping', 'Gender ...",[],58831.0,1453,21733,124,4.663,43960.0,0.0
2,A Silent Voice,Movie,1.0,2hr 10min,False,,"After transferring into a new school, a deaf g...",['Kyoto Animation'],"['Drama', 'Shounen', 'Disability', 'Melancholy...","['Bullying', 'Mature Themes', 'Suicide']",45892.0,946,17148,132,4.661,33752.0,0.0
3,Haikyuu!! Karasuno High School vs Shiratorizaw...,TV,10.0,,False,Fall,"Picking up where the second season ended, the ...",['Production I.G'],"['Shounen', 'Sports', 'Animeism', 'School Club...",[],25134.0,2183,8082,167,4.66,17422.0,0.0
4,Attack on Titan 3rd Season: Part II,TV,10.0,,False,Spring,The battle to retake Wall Maria begins now! Wi...,['Wit Studio'],"['Action', 'Fantasy', 'Horror', 'Shounen', 'Da...","['Cannibalism', 'Explicit Violence']",21308.0,3217,7864,174,4.65,15789.0,0.0


<IPython.core.display.Javascript object>

### Let's convert the duration column from string to numeric.

In [21]:
# we define a function to convert the duration column to numeric


def time_to_minutes(var):
    """Convert a duration string to a number of minutes.

    Accepts values of the form "<h>hr <m>min", "<h>hr" (hours only) and
    "<m>min".

    Parameters
    ----------
    var : object
        Raw duration value; anything that is not a string (for example a
        missing value) is treated as unknown.

    Returns
    -------
    float
        The duration in minutes, or NaN when ``var`` is not a string.

    Raises
    ------
    ValueError
        If the hour or minute part of a string cannot be parsed as a number.
    """
    if not isinstance(var, str):
        return np.nan  # will return NaN if value is not string

    if "hr" in var:
        # split around the first "hr": hours on the left, optional minutes on the right
        hours_part, _, minutes_part = var.partition("hr")
        total = float(hours_part) * 60  # converting hours to minutes
        minutes_part = minutes_part.replace("min", "").strip()
        if minutes_part:  # an hours-only value such as "2hr" has no minutes part
            total += float(minutes_part)
        return total

    return float(var.replace("min", ""))  # taking numeric part of minutes

<IPython.core.display.Javascript object>

In [22]:
# convert every duration value to minutes, then overwrite the original column
duration_minutes = df["duration"].apply(time_to_minutes)
df["duration"] = duration_minutes
df.head()

Unnamed: 0,title,mediaType,eps,duration,ongoing,sznOfRelease,description,studios,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes,years_running
0,Fullmetal Alchemist: Brotherhood,TV,64.0,,False,Spring,The foundation of alchemy is based on the law ...,['Bones'],"['Action', 'Adventure', 'Drama', 'Fantasy', 'M...","['Animal Abuse', 'Mature Themes', 'Violence', ...",103707.0,14351,25810,2656,4.702,86547.0,1.0
1,your name.,Movie,1.0,107.0,False,,Mitsuha and Taki are two total strangers livin...,['CoMix Wave Films'],"['Drama', 'Romance', 'Body Swapping', 'Gender ...",[],58831.0,1453,21733,124,4.663,43960.0,0.0
2,A Silent Voice,Movie,1.0,130.0,False,,"After transferring into a new school, a deaf g...",['Kyoto Animation'],"['Drama', 'Shounen', 'Disability', 'Melancholy...","['Bullying', 'Mature Themes', 'Suicide']",45892.0,946,17148,132,4.661,33752.0,0.0
3,Haikyuu!! Karasuno High School vs Shiratorizaw...,TV,10.0,,False,Fall,"Picking up where the second season ended, the ...",['Production I.G'],"['Shounen', 'Sports', 'Animeism', 'School Club...",[],25134.0,2183,8082,167,4.66,17422.0,0.0
4,Attack on Titan 3rd Season: Part II,TV,10.0,,False,Spring,The battle to retake Wall Maria begins now! Wi...,['Wit Studio'],"['Action', 'Fantasy', 'Horror', 'Shounen', 'Da...","['Cannibalism', 'Explicit Violence']",21308.0,3217,7864,174,4.65,15789.0,0.0


<IPython.core.display.Javascript object>

In [23]:
# summary statistics of the duration column (NaN entries are not counted)
df["duration"].describe()

count    7465.000000
mean       24.230141
std        31.468171
min         1.000000
25%         4.000000
50%         8.000000
75%        30.000000
max       163.000000
Name: duration, dtype: float64

<IPython.core.display.Javascript object>

* Among the anime with a recorded duration, 50% have a runtime less than or equal to 8 minutes.


* Some anime even have a runtime of 1 minute.
    - This seems strange at first, but a Google search can reveal that there are indeed such anime.

### We will fill the missing values in the *sznOfRelease* column with '*is_missing*', which will act as a new category.

In [24]:
# mark unknown seasons with their own category; the result is assigned back
# because an in-place fillna on the selected column acts on a temporary object
# and leaves df unchanged under pandas Copy-on-Write
df["sznOfRelease"] = df["sznOfRelease"].fillna("is_missing")
df.isnull().sum()

title               0
mediaType          63
eps                 0
duration         4636
ongoing             0
sznOfRelease        0
description      4468
studios             0
tags                0
contentWarn         0
watched           115
watching            0
wantWatch           0
dropped             0
rating              0
votes               0
years_running       0
dtype: int64

<IPython.core.display.Javascript object>

**Let's check the number of unique values and the number of times they occur for the *mediaType* column.**

In [25]:
# how often each media type occurs (missing values are not counted)
df["mediaType"].value_counts()

TV             3993
Movie          1928
OVA            1770
Music Video    1290
Web            1170
DVD Special     803
Other           580
TV Special      504
Name: mediaType, dtype: int64

<IPython.core.display.Javascript object>

### We will fill the missing values in the *mediaType* column with '*Other*', as the exact values for that category are not known.

In [26]:
# put the entries with no media type into the existing 'Other' category; the
# result is assigned back because an in-place fillna on the selected column
# acts on a temporary object and leaves df unchanged under pandas Copy-on-Write
df["mediaType"] = df["mediaType"].fillna("Other")

# checking the number of unique values and the number of times they occur
df["mediaType"].value_counts()

TV             3993
Movie          1928
OVA            1770
Music Video    1290
Web            1170
DVD Special     803
Other           643
TV Special      504
Name: mediaType, dtype: int64

<IPython.core.display.Javascript object>

**We saw that *studios*, *tags*, and *contentWarn* columns have  a list of values.**


### Let us remove the leading and trailing square braces from the values in those columns.


**We will also replace the entries with blank lists in these columns with *NaN*.**

In [27]:
cols_with_list_vals = ["studios", "tags", "contentWarn"]

for col in cols_with_list_vals:
    # drop the opening and closing square brackets from each value
    without_brackets = df[col].str.lstrip("[").str.rstrip("]")
    # a blank list leaves an empty string behind, which is recorded as NaN
    df[col] = without_brackets.replace("", np.nan)

df.head()

Unnamed: 0,title,mediaType,eps,duration,ongoing,sznOfRelease,description,studios,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes,years_running
0,Fullmetal Alchemist: Brotherhood,TV,64.0,,False,Spring,The foundation of alchemy is based on the law ...,'Bones',"'Action', 'Adventure', 'Drama', 'Fantasy', 'My...","'Animal Abuse', 'Mature Themes', 'Violence', '...",103707.0,14351,25810,2656,4.702,86547.0,1.0
1,your name.,Movie,1.0,107.0,False,is_missing,Mitsuha and Taki are two total strangers livin...,'CoMix Wave Films',"'Drama', 'Romance', 'Body Swapping', 'Gender B...",,58831.0,1453,21733,124,4.663,43960.0,0.0
2,A Silent Voice,Movie,1.0,130.0,False,is_missing,"After transferring into a new school, a deaf g...",'Kyoto Animation',"'Drama', 'Shounen', 'Disability', 'Melancholy'...","'Bullying', 'Mature Themes', 'Suicide'",45892.0,946,17148,132,4.661,33752.0,0.0
3,Haikyuu!! Karasuno High School vs Shiratorizaw...,TV,10.0,,False,Fall,"Picking up where the second season ended, the ...",'Production I.G',"'Shounen', 'Sports', 'Animeism', 'School Club'...",,25134.0,2183,8082,167,4.66,17422.0,0.0
4,Attack on Titan 3rd Season: Part II,TV,10.0,,False,Spring,The battle to retake Wall Maria begins now! Wi...,'Wit Studio',"'Action', 'Fantasy', 'Horror', 'Shounen', 'Dar...","'Cannibalism', 'Explicit Violence'",21308.0,3217,7864,174,4.65,15789.0,0.0


<IPython.core.display.Javascript object>

In [28]:
# missing values that remain in each column
df.isna().sum()

title                0
mediaType            0
eps                  0
duration          4636
ongoing              0
sznOfRelease         0
description       4468
studios           3208
tags               313
contentWarn      10705
watched            115
watching             0
wantWatch            0
dropped              0
rating               0
votes                0
years_running        0
dtype: int64

<IPython.core.display.Javascript object>

### Treating the *studios* column

In [29]:
# a fixed random_state makes the same 10 rows come back on every run
df.sample(n=10, random_state=2)

Unnamed: 0,title,mediaType,eps,duration,ongoing,sznOfRelease,description,studios,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes,years_running
7002,Tales of the Rays: Mirrage Prison,Web,1.0,1.0,False,is_missing,,,"'Fantasy', 'Promotional'",,81.0,3,80,1,2.767,38.0,0.0
11871,Onikiri Shoujo,Web,1.0,1.0,False,is_missing,,,"'Action', 'Black and White', 'Hand to Hand Com...",,44.0,0,25,1,1.359,25.0,0.0
7492,Triage X,TV,10.0,,False,Spring,Mochizuki General Hospital boasts some of the ...,'XEBEC',"'Action', 'Ecchi', 'Harem', 'Shounen', 'Assass...","'Drug Use', 'Explicit Sex', 'Mature Themes', '...",4129.0,871,2867,788,2.665,3485.0,0.0
3852,Rainbow Days OVA,OVA,1.0,,False,is_missing,,'Ashi Productions',"'Romance', 'Shoujo', 'School Life', 'Based on ...",,580.0,24,987,8,3.432,291.0,0.0
4506,Endro~!,TV,12.0,,False,Winter,"In the land of Naral Island, a land of magic a...",'Studio Gokumi',"'Fantasy', 'Slice of Life', 'Magic', 'RPG', 'O...",,1033.0,372,1205,254,3.29,976.0,0.0
9863,Heybot!,TV,50.0,,False,Summer,"The story takes place on Screw Island, a screw...",'BN Pictures',"'Comedy', 'Family Friendly', 'Island', 'Non-Hu...",,33.0,14,62,36,2.107,54.0,1.0
3513,Unico,Movie,1.0,90.0,False,is_missing,Unico is a special unicorn with the ability to...,'MADHOUSE',"'Adventure', 'Fantasy', 'Animal Protagonists',...",,748.0,7,371,17,3.508,459.0,0.0
10605,Ali Baba to 40-hiki no Touzoku,Movie,1.0,56.0,False,is_missing,"Generations ago, the wily Ali Baba stole a cav...",'Toei Animation',"'Adventure', 'Family Friendly', 'Middle Easter...",,305.0,7,99,12,1.897,147.0,0.0
10270,rerulili: Girls Talk,Music Video,1.0,4.0,False,is_missing,,,,,18.0,0,5,0,1.995,13.0,0.0
2942,Majestic Prince Movie: Kakusei no Idenshiko,Movie,1.0,,False,is_missing,,"'Seven Arcs Pictures', 'Orange'","'Action', 'Drama', 'Mecha', 'Sci Fi', 'Seinen'...",,261.0,11,423,8,3.634,168.0,0.0


<IPython.core.display.Javascript object>

* We can see that row 2942 has more than one studio, which indicates a collaboration between studios.
* We will split the studios column by ', ' and take all the values in one dataframe for further analysis.

In [30]:
# split each studios string on ", " into one column per listed studio, then
# flatten the table so that every studio mention becomes its own row
studio_parts = df["studios"].str.split(", ", expand=True)
all_studio_names = studio_parts.values.flatten()
studio_df = pd.DataFrame(all_studio_names, columns=["Studios"])

# number of anime credited to each studio
val_c = studio_df["Studios"].value_counts()
val_c

'Toei Animation'                         636
'Sunrise'                                433
'J.C. Staff'                             341
'MADHOUSE'                               339
'TMS Entertainment'                      319
                                        ... 
'ASK Animation'                            1
'Rockwell Eyes'                            1
'Candy Box'                                1
'Children’s Playground Entertainment'      1
'DOGA Productions'                         1
Name: Studios, Length: 488, dtype: int64

<IPython.core.display.Javascript object>

* There are too many studios in the data, and adding them all as separate columns will make our data dimension very large.
* We will use a threshold, and keep only those studios with at least as many entries as the threshold.

In [31]:
# minimum number of anime a studio needs in order to be kept by name
threshold = 100
meets_threshold = val_c.values >= threshold
val_c[meets_threshold]

'Toei Animation'          636
'Sunrise'                 433
'J.C. Staff'              341
'MADHOUSE'                339
'TMS Entertainment'       319
'Production I.G'          279
'Studio Deen'             266
'Studio Pierrot'          223
'OLM'                     216
'A-1 Pictures'            194
'AIC'                     167
'Shin-Ei Animation'       165
'Tatsunoko Production'    146
'Nippon Animation'        145
'XEBEC'                   143
'DLE'                     134
'GONZO'                   132
'Bones'                   122
'Shaft'                   119
'Kyoto Animation'         108
Name: Studios, dtype: int64

<IPython.core.display.Javascript object>

* 100 looks to be a good threshold.
* We will keep only those studios that have created at least 100 anime, and the rest we will assign as '*Others*'.
* You can experiment by using a different threshold.

In [32]:
# names of the studios that reach the threshold
frequent_studios = val_c[val_c.values >= threshold]
studios_list = frequent_studios.index.tolist()
print("Studio names taken into consideration:", len(studios_list), studios_list)

Studio names taken into consideration: 20 ["'Toei Animation'", "'Sunrise'", "'J.C. Staff'", "'MADHOUSE'", "'TMS Entertainment'", "'Production I.G'", "'Studio Deen'", "'Studio Pierrot'", "'OLM'", "'A-1 Pictures'", "'AIC'", "'Shin-Ei Animation'", "'Tatsunoko Production'", "'Nippon Animation'", "'XEBEC'", "'DLE'", "'GONZO'", "'Bones'", "'Shaft'", "'Kyoto Animation'"]


<IPython.core.display.Javascript object>

In [33]:
# continue on an independent (deep) copy of the dataframe
df1 = df.copy(deep=True)

<IPython.core.display.Javascript object>

In [34]:
# first we will fill the missing values in the studios column with 'Others';
# the result is assigned back because an in-place fillna on the selected column
# acts on a temporary object and leaves df1 unchanged under pandas Copy-on-Write
df1["studios"] = df1["studios"].fillna("'Others'")
df1["studios"].isnull().sum()

0

<IPython.core.display.Javascript object>

* We will now assign the studio names to the entries.
* We will also create a new variable that will show if collaboration between studios was involved for creating an anime.

In [35]:
# for every row keep the first studio from studios_list that occurs in its
# studios string, falling back to 'Others' when none of them does
studio_val = [
    next((studio for studio in studios_list if studio in row_studios), "'Others'")
    for row_studios in df1.studios.values
]

# we will strip the leading and trailing ', and assign the values to a column
df1["studio_primary"] = [name.strip("'") for name in studio_val]
df1.tail()

Unnamed: 0,title,mediaType,eps,duration,ongoing,sznOfRelease,description,studios,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes,years_running,studio_primary
12096,Sore Ike! Anpanman: Kirameke! Ice no Kuni no V...,Movie,1.0,,False,is_missing,Princess Vanilla is a princess in a land of ic...,'TMS Entertainment',"'Anthropomorphic', 'Family Friendly', 'Food Pr...",,22.0,1,29,1,2.807,10.0,0.0,TMS Entertainment
12097,Hulaing Babies Petit,TV,12.0,5.0,False,Winter,,'Fukushima Gaina',"'Comedy', 'Chibi', 'Short Episodes', 'Original...",,13.0,10,77,2,2.09,10.0,0.0,Others
12098,Marco & The Galaxy Dragon,OVA,1.0,,False,is_missing,,'Others',"'Action', 'Comedy', 'Based on a Video Game'",,17.0,0,65,0,2.543,10.0,0.0,Others
12099,Xing Chen Bian 2nd Season,Web,3.0,24.0,True,is_missing,Second season of Xing Chen Bian.,'Others',"'Action', 'Chinese Animation', 'Based on a Nov...",,,31,22,0,3.941,10.0,0.0,Others
12100,Ultra B: Black Hole kara no Dokusaisha BB!!,Movie,1.0,20.0,False,is_missing,,'Shin-Ei Animation',"'Comedy', 'Sci Fi', 'Superpowers', 'Based on a...",,15.0,1,19,1,2.925,10.0,0.0,Shin-Ei Animation


<IPython.core.display.Javascript object>

In [36]:
# we will create a list defining whether there is a collaboration between studios
# an entry lists several studios exactly when its studios string contains the
# ", " separator; testing for the separator directly does not depend on how
# str.split(expand=True) pads shorter rows (None vs. NaN) or on a second split
# column existing at all; missing studios count as no collaboration
# NOTE(review): a single studio whose name itself contains ", " is flagged as
# a collaboration, as it was by the split-based check
studio_val2 = (
    df1["studios"].str.contains(", ", regex=False, na=False).astype(int).tolist()
)

df1["studios_colab"] = studio_val2
df1.tail()

Unnamed: 0,title,mediaType,eps,duration,ongoing,sznOfRelease,description,studios,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes,years_running,studio_primary,studios_colab
12096,Sore Ike! Anpanman: Kirameke! Ice no Kuni no V...,Movie,1.0,,False,is_missing,Princess Vanilla is a princess in a land of ic...,'TMS Entertainment',"'Anthropomorphic', 'Family Friendly', 'Food Pr...",,22.0,1,29,1,2.807,10.0,0.0,TMS Entertainment,0
12097,Hulaing Babies Petit,TV,12.0,5.0,False,Winter,,'Fukushima Gaina',"'Comedy', 'Chibi', 'Short Episodes', 'Original...",,13.0,10,77,2,2.09,10.0,0.0,Others,0
12098,Marco & The Galaxy Dragon,OVA,1.0,,False,is_missing,,'Others',"'Action', 'Comedy', 'Based on a Video Game'",,17.0,0,65,0,2.543,10.0,0.0,Others,0
12099,Xing Chen Bian 2nd Season,Web,3.0,24.0,True,is_missing,Second season of Xing Chen Bian.,'Others',"'Action', 'Chinese Animation', 'Based on a Nov...",,,31,22,0,3.941,10.0,0.0,Others,0
12100,Ultra B: Black Hole kara no Dokusaisha BB!!,Movie,1.0,20.0,False,is_missing,,'Shin-Ei Animation',"'Comedy', 'Sci Fi', 'Superpowers', 'Based on a...",,15.0,1,19,1,2.925,10.0,0.0,Shin-Ei Animation,0


<IPython.core.display.Javascript object>

**We will now drop the '*studios*' column.**

In [37]:
# the raw studios column is removed from df1 in place
df1.drop(columns="studios", inplace=True)

<IPython.core.display.Javascript object>

### Treating the *tags* column

In [38]:
# a fixed random_state makes the same 10 rows come back on every run
df1.sample(n=10, random_state=2)

Unnamed: 0,title,mediaType,eps,duration,ongoing,sznOfRelease,description,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes,years_running,studio_primary,studios_colab
7002,Tales of the Rays: Mirrage Prison,Web,1.0,1.0,False,is_missing,,"'Fantasy', 'Promotional'",,81.0,3,80,1,2.767,38.0,0.0,Others,0
11871,Onikiri Shoujo,Web,1.0,1.0,False,is_missing,,"'Action', 'Black and White', 'Hand to Hand Com...",,44.0,0,25,1,1.359,25.0,0.0,Others,0
7492,Triage X,TV,10.0,,False,Spring,Mochizuki General Hospital boasts some of the ...,"'Action', 'Ecchi', 'Harem', 'Shounen', 'Assass...","'Drug Use', 'Explicit Sex', 'Mature Themes', '...",4129.0,871,2867,788,2.665,3485.0,0.0,XEBEC,0
3852,Rainbow Days OVA,OVA,1.0,,False,is_missing,,"'Romance', 'Shoujo', 'School Life', 'Based on ...",,580.0,24,987,8,3.432,291.0,0.0,Others,0
4506,Endro~!,TV,12.0,,False,Winter,"In the land of Naral Island, a land of magic a...","'Fantasy', 'Slice of Life', 'Magic', 'RPG', 'O...",,1033.0,372,1205,254,3.29,976.0,0.0,Others,0
9863,Heybot!,TV,50.0,,False,Summer,"The story takes place on Screw Island, a screw...","'Comedy', 'Family Friendly', 'Island', 'Non-Hu...",,33.0,14,62,36,2.107,54.0,1.0,Others,0
3513,Unico,Movie,1.0,90.0,False,is_missing,Unico is a special unicorn with the ability to...,"'Adventure', 'Fantasy', 'Animal Protagonists',...",,748.0,7,371,17,3.508,459.0,0.0,MADHOUSE,0
10605,Ali Baba to 40-hiki no Touzoku,Movie,1.0,56.0,False,is_missing,"Generations ago, the wily Ali Baba stole a cav...","'Adventure', 'Family Friendly', 'Middle Easter...",,305.0,7,99,12,1.897,147.0,0.0,Toei Animation,0
10270,rerulili: Girls Talk,Music Video,1.0,4.0,False,is_missing,,,,18.0,0,5,0,1.995,13.0,0.0,Others,0
2942,Majestic Prince Movie: Kakusei no Idenshiko,Movie,1.0,,False,is_missing,,"'Action', 'Drama', 'Mecha', 'Sci Fi', 'Seinen'...",,261.0,11,423,8,3.634,168.0,0.0,Others,1


<IPython.core.display.Javascript object>

* We can see that most of the rows have more than one tag.
* We will split the tags column by ', ' and take all the values in one dataframe for further analysis.

In [39]:
# Split each row's tag string on ", " (one column per position), then flatten
# the whole grid into a single "Tags" column so every tag occurrence is one row.
split_tags = df1.tags.str.split(", ", expand=True)
tag_df = pd.DataFrame({"Tags": split_tags.values.flatten()})

# Number of occurrences of each distinct tag, most frequent first.
val_c = tag_df["Tags"].value_counts()
val_c

'Based on a Manga'     3519
'Comedy'               3302
'Action'               2798
'Fantasy'              2197
'Sci Fi'               2012
                       ... 
'Badminton'               1
'Sexual Content'          1
'Ballet Dancing'          1
'Explicit Violence'       1
'Billiards'               1
Name: Tags, Length: 494, dtype: int64

<IPython.core.display.Javascript object>

* There are too many tags in the data, and adding them all as separate columns will make our data dimension very large.
* We will use a threshold, and keep only those tags with at least as many entries as the threshold.

In [40]:
# Minimum number of occurrences a tag needs in order to be kept.
threshold = 500
val_c[val_c >= threshold]

'Based on a Manga'          3519
'Comedy'                    3302
'Action'                    2798
'Fantasy'                   2197
'Sci Fi'                    2012
'Shounen'                   1753
'Original Work'             1636
'Non-Human Protagonists'    1361
'Drama'                     1284
'Adventure'                 1256
'Family Friendly'           1174
'Short Episodes'            1173
'School Life'               1117
'Romance'                   1115
'Shorts'                    1085
'Slice of Life'              978
'Seinen'                     933
'Supernatural'               858
'Magic'                      778
'Animal Protagonists'        730
'Ecchi'                      695
'Mecha'                      660
'Based on a Light Novel'     646
'CG Animation'               606
'Superpowers'                540
Name: Tags, dtype: int64

<IPython.core.display.Javascript object>

* 500 looks to be a good threshold.
* We will keep only those tags that have at least 500 entries in the data; titles that carry none of these tags will be flagged as '*Others*'.
* You can experiment by using a different threshold.

In [41]:
# Names of the tags whose count reaches the threshold.
tags_list = val_c.index[val_c.values >= threshold].tolist()
print("Tags taken into consideration:", len(tags_list), tags_list)

Tags taken into consideration: 25 ["'Based on a Manga'", "'Comedy'", "'Action'", "'Fantasy'", "'Sci Fi'", "'Shounen'", "'Original Work'", "'Non-Human Protagonists'", "'Drama'", "'Adventure'", "'Family Friendly'", "'Short Episodes'", "'School Life'", "'Romance'", "'Shorts'", "'Slice of Life'", "'Seinen'", "'Supernatural'", "'Magic'", "'Animal Protagonists'", "'Ecchi'", "'Mecha'", "'Based on a Light Novel'", "'CG Animation'", "'Superpowers'"]


<IPython.core.display.Javascript object>

In [42]:
# Work on an independent copy so the following steps leave df1 untouched.
df2 = df1.copy(deep=True)

<IPython.core.display.Javascript object>

In [43]:
# Replace missing tag strings with the placeholder 'Others'.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on df2.tags is chained assignment, which is not
# guaranteed to update df2 once pandas Copy-on-Write is active.
df2["tags"] = df2["tags"].fillna("Others")

# Sanity check: number of missing tag values remaining.
df2["tags"].isnull().sum()

0

<IPython.core.display.Javascript object>

* We will now create a separate dataframe with a column for each tag in our tag list.
* If a particular tag is present in a row, that tag's column will be assigned the value 1, else 0.

In [44]:
# Start from just the title and the raw tag string of every row.
tags_df = df2[["title", "tags"]].copy()

# One zero-initialised indicator column per retained tag, followed by a final
# column that denotes tags other than the ones in the list.
indicator_names = ["tag_" + item for item in tags_list] + ["tag_Others"]
for column_name in indicator_names:
    tags_df[column_name] = 0

tags_df.head()

Unnamed: 0,title,tags,tag_'Based on a Manga',tag_'Comedy',tag_'Action',tag_'Fantasy',tag_'Sci Fi',tag_'Shounen',tag_'Original Work',tag_'Non-Human Protagonists',tag_'Drama',tag_'Adventure',tag_'Family Friendly',tag_'Short Episodes',tag_'School Life',tag_'Romance',tag_'Shorts',tag_'Slice of Life',tag_'Seinen',tag_'Supernatural',tag_'Magic',tag_'Animal Protagonists',tag_'Ecchi',tag_'Mecha',tag_'Based on a Light Novel',tag_'CG Animation',tag_'Superpowers',tag_Others
0,Fullmetal Alchemist: Brotherhood,"'Action', 'Adventure', 'Drama', 'Fantasy', 'My...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,your name.,"'Drama', 'Romance', 'Body Swapping', 'Gender B...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,A Silent Voice,"'Drama', 'Shounen', 'Disability', 'Melancholy'...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Haikyuu!! Karasuno High School vs Shiratorizaw...,"'Shounen', 'Sports', 'Animeism', 'School Club'...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Attack on Titan 3rd Season: Part II,"'Action', 'Fantasy', 'Horror', 'Shounen', 'Dar...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [45]:
tags_df.shape

(12101, 28)

<IPython.core.display.Javascript object>

In [46]:
# Fill the indicator columns: tag_<name> is 1 when the row's tag string
# contains that tag, else 0. Vectorised substring matching replaces the
# per-row .loc writes, which were slow and addressed rows by using a
# positional counter as an index label; this version works for any index.
for item in tags_list:
    tags_df["tag_" + item] = (
        tags_df["tags"].str.contains(item, regex=False, na=False).astype("int64")
    )

# A row is flagged as "Others" when none of the retained tags matched it.
matched_any = tags_df[["tag_" + item for item in tags_list]].sum(axis=1)
tags_df["tag_Others"] = matched_any.eq(0).astype("int64")

tags_df.head()

Unnamed: 0,title,tags,tag_'Based on a Manga',tag_'Comedy',tag_'Action',tag_'Fantasy',tag_'Sci Fi',tag_'Shounen',tag_'Original Work',tag_'Non-Human Protagonists',tag_'Drama',tag_'Adventure',tag_'Family Friendly',tag_'Short Episodes',tag_'School Life',tag_'Romance',tag_'Shorts',tag_'Slice of Life',tag_'Seinen',tag_'Supernatural',tag_'Magic',tag_'Animal Protagonists',tag_'Ecchi',tag_'Mecha',tag_'Based on a Light Novel',tag_'CG Animation',tag_'Superpowers',tag_Others
0,Fullmetal Alchemist: Brotherhood,"'Action', 'Adventure', 'Drama', 'Fantasy', 'My...",1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,your name.,"'Drama', 'Romance', 'Body Swapping', 'Gender B...",0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0
2,A Silent Voice,"'Drama', 'Shounen', 'Disability', 'Melancholy'...",1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Haikyuu!! Karasuno High School vs Shiratorizaw...,"'Shounen', 'Sports', 'Animeism', 'School Club'...",1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Attack on Titan 3rd Season: Part II,"'Action', 'Fantasy', 'Horror', 'Shounen', 'Dar...",1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [47]:
# Everything after the first two columns of tags_df (title, tags) is an
# indicator column; append those columns side by side to df2.
indicator_cols = tags_df.iloc[:, 2:]
df2 = pd.concat([df2, indicator_cols], axis="columns")
df2.head()

Unnamed: 0,title,mediaType,eps,duration,ongoing,sznOfRelease,description,tags,contentWarn,watched,watching,wantWatch,dropped,rating,votes,years_running,studio_primary,studios_colab,tag_'Based on a Manga',tag_'Comedy',tag_'Action',tag_'Fantasy',tag_'Sci Fi',tag_'Shounen',tag_'Original Work',tag_'Non-Human Protagonists',tag_'Drama',tag_'Adventure',tag_'Family Friendly',tag_'Short Episodes',tag_'School Life',tag_'Romance',tag_'Shorts',tag_'Slice of Life',tag_'Seinen',tag_'Supernatural',tag_'Magic',tag_'Animal Protagonists',tag_'Ecchi',tag_'Mecha',tag_'Based on a Light Novel',tag_'CG Animation',tag_'Superpowers',tag_Others
0,Fullmetal Alchemist: Brotherhood,TV,64.0,,False,Spring,The foundation of alchemy is based on the law ...,"'Action', 'Adventure', 'Drama', 'Fantasy', 'My...","'Animal Abuse', 'Mature Themes', 'Violence', '...",103707.0,14351,25810,2656,4.702,86547.0,1.0,Bones,0,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,your name.,Movie,1.0,107.0,False,is_missing,Mitsuha and Taki are two total strangers livin...,"'Drama', 'Romance', 'Body Swapping', 'Gender B...",,58831.0,1453,21733,124,4.663,43960.0,0.0,Others,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0
2,A Silent Voice,Movie,1.0,130.0,False,is_missing,"After transferring into a new school, a deaf g...","'Drama', 'Shounen', 'Disability', 'Melancholy'...","'Bullying', 'Mature Themes', 'Suicide'",45892.0,946,17148,132,4.661,33752.0,0.0,Kyoto Animation,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Haikyuu!! Karasuno High School vs Shiratorizaw...,TV,10.0,,False,Fall,"Picking up where the second season ended, the ...","'Shounen', 'Sports', 'Animeism', 'School Club'...",,25134.0,2183,8082,167,4.66,17422.0,0.0,Production I.G,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Attack on Titan 3rd Season: Part II,TV,10.0,,False,Spring,The battle to retake Wall Maria begins now! Wi...,"'Action', 'Fantasy', 'Horror', 'Shounen', 'Dar...","'Cannibalism', 'Explicit Violence'",21308.0,3217,7864,174,4.65,15789.0,0.0,Others,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

**We will now drop the *tags* column.**

In [48]:
# The raw tag string is now encoded in the tag_* columns; remove it in place.
df2.drop(columns=["tags"], inplace=True)
df2.shape

(12101, 43)

<IPython.core.display.Javascript object>

### Treating the *contentWarn* column

In [49]:
# Show 10 random rows; the fixed random_state makes the sample reproducible.
df2.sample(n=10, random_state=2)

Unnamed: 0,title,mediaType,eps,duration,ongoing,sznOfRelease,description,contentWarn,watched,watching,wantWatch,dropped,rating,votes,years_running,studio_primary,studios_colab,tag_'Based on a Manga',tag_'Comedy',tag_'Action',tag_'Fantasy',tag_'Sci Fi',tag_'Shounen',tag_'Original Work',tag_'Non-Human Protagonists',tag_'Drama',tag_'Adventure',tag_'Family Friendly',tag_'Short Episodes',tag_'School Life',tag_'Romance',tag_'Shorts',tag_'Slice of Life',tag_'Seinen',tag_'Supernatural',tag_'Magic',tag_'Animal Protagonists',tag_'Ecchi',tag_'Mecha',tag_'Based on a Light Novel',tag_'CG Animation',tag_'Superpowers',tag_Others
7002,Tales of the Rays: Mirrage Prison,Web,1.0,1.0,False,is_missing,,,81.0,3,80,1,2.767,38.0,0.0,Others,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11871,Onikiri Shoujo,Web,1.0,1.0,False,is_missing,,,44.0,0,25,1,1.359,25.0,0.0,Others,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7492,Triage X,TV,10.0,,False,Spring,Mochizuki General Hospital boasts some of the ...,"'Drug Use', 'Explicit Sex', 'Mature Themes', '...",4129.0,871,2867,788,2.665,3485.0,0.0,XEBEC,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3852,Rainbow Days OVA,OVA,1.0,,False,is_missing,,,580.0,24,987,8,3.432,291.0,0.0,Others,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
4506,Endro~!,TV,12.0,,False,Winter,"In the land of Naral Island, a land of magic a...",,1033.0,372,1205,254,3.29,976.0,0.0,Others,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
9863,Heybot!,TV,50.0,,False,Summer,"The story takes place on Screw Island, a screw...",,33.0,14,62,36,2.107,54.0,1.0,Others,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3513,Unico,Movie,1.0,90.0,False,is_missing,Unico is a special unicorn with the ability to...,,748.0,7,371,17,3.508,459.0,0.0,MADHOUSE,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
10605,Ali Baba to 40-hiki no Touzoku,Movie,1.0,56.0,False,is_missing,"Generations ago, the wily Ali Baba stole a cav...",,305.0,7,99,12,1.897,147.0,0.0,Toei Animation,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10270,rerulili: Girls Talk,Music Video,1.0,4.0,False,is_missing,,,18.0,0,5,0,1.995,13.0,0.0,Others,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2942,Majestic Prince Movie: Kakusei no Idenshiko,Movie,1.0,,False,is_missing,,,261.0,11,423,8,3.634,168.0,0.0,Others,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0


<IPython.core.display.Javascript object>

* We can see that some of the rows have no content warning, while some have multiple warnings.
* We will split the *contentWarn* column by ', ' and take all the values in one dataframe for further analysis.

In [50]:
# Split each row's content-warning string on ", " (one column per position),
# then flatten the grid into a single "CW" column of individual warnings.
split_warnings = df2.contentWarn.str.split(", ", expand=True)
cw_df = pd.DataFrame({"CW": split_warnings.values.flatten()})

# Number of occurrences of each distinct content warning.
val_c = cw_df["CW"].value_counts()
val_c

'Violence'             524
'Nudity'               397
'Mature Themes'        392
'Explicit Violence'    262
'Physical Abuse'       194
'Sexual Content'       171
'Explicit Sex'         136
'Sexual Abuse'          85
'Animal Abuse'          62
'Suicide'               60
'Emotional Abuse'       51
'Bullying'              51
'Drug Use'              34
'Domestic Abuse'        25
'Cannibalism'           23
'Incest'                21
'Self-Harm'              6
'Prostitution'           5
Name: CW, dtype: int64

<IPython.core.display.Javascript object>

* We see that most of the content warnings are 18+ restrictions, indicating that the content is not suitable for children.
* We will not create separate columns for values of this column as it will increase the dimensions of our data further.
* We will fill the missing values by 0, and the rest of the values by 1.
* You can experiment by creating different columns for the different types of content warnings.

In [51]:
# Collapse contentWarn into a binary flag: 1 when the title carries any content
# warning, 0 when the value is missing (or is already 0 from an earlier run of
# this cell, which keeps the cell safe to re-run).
# The flag is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series, which is chained
# assignment and is not guaranteed to update df2 under pandas Copy-on-Write.
has_warning = df2["contentWarn"].notna() & df2["contentWarn"].ne(0)
df2["contentWarn"] = has_warning.astype("int64")

df2["contentWarn"].value_counts()

0    10705
1     1396
Name: contentWarn, dtype: int64

<IPython.core.display.Javascript object>

### We have preprocessed the columns with a list of values. We now have the same clean data with which we started the previous session.

**The only change is that we have replaced the '*is_missing*' category in the *studio_primary* and *tags* columns by '*Others*'.**

In [2]:
import yaml

In [3]:
# Location of the YAML file that will hold the ingestion metadata.
file_path = "DataIngestion.yaml"

In [27]:
# Metadata describing the raw dataset: its column names and its shape, plus
# the path of the YAML file the metadata is stored in. The path reuses
# file_path (same value as before) so the location is defined in one place.
yamlfile = {
    "columns": data.columns.to_list(),
    "number of columns": data.shape[1],
    "number of rows": data.shape[0],
    "file path": file_path,
}

<IPython.core.display.Javascript object>

In [28]:
# Serialise the metadata dictionary into the YAML file at file_path.
with open(file_path, "w") as yaml_out:
    yaml.dump(yamlfile, stream=yaml_out)

<IPython.core.display.Javascript object>

In [29]:
# Confirm which file was written.
success_message = f"YAML file '{file_path}' created successfully."
print(success_message)

YAML file 'DataIngestion.yaml' created successfully.


<IPython.core.display.Javascript object>

In [30]:
# Parse the YAML file back into Python objects.
with open(file_path, "r") as yaml_in:
    data_list = yaml.safe_load(yaml_in)

# data_list is the metadata dictionary that was written out (one list of
# column names plus scalar entries). Turning it into a DataFrame gives one row
# per column name, with the scalar entries repeated on every row.
df_loaded = pd.DataFrame(data=data_list)

# Show what was read back.
print("Data loaded from YAML file:")
print(df_loaded)

Data loaded from YAML file:
         columns           file path  number of columns  number of rows
0          title  DataIngestion.yaml                 18           14578
1      mediaType  DataIngestion.yaml                 18           14578
2            eps  DataIngestion.yaml                 18           14578
3       duration  DataIngestion.yaml                 18           14578
4        ongoing  DataIngestion.yaml                 18           14578
5        startYr  DataIngestion.yaml                 18           14578
6       finishYr  DataIngestion.yaml                 18           14578
7   sznOfRelease  DataIngestion.yaml                 18           14578
8    description  DataIngestion.yaml                 18           14578
9        studios  DataIngestion.yaml                 18           14578
10          tags  DataIngestion.yaml                 18           14578
11   contentWarn  DataIngestion.yaml                 18           14578
12       watched  DataIngestion.yaml

<IPython.core.display.Javascript object>

# Data Validation

In [35]:
# Column names recorded in the YAML metadata.
expect_col = data_list["columns"]
expect_col

['title',
 'mediaType',
 'eps',
 'duration',
 'ongoing',
 'startYr',
 'finishYr',
 'sznOfRelease',
 'description',
 'studios',
 'tags',
 'contentWarn',
 'watched',
 'watching',
 'wantWatch',
 'dropped',
 'rating',
 'votes']

<IPython.core.display.Javascript object>

In [36]:
# The dataset's column names must match the recorded ones, in the same order.
data.columns.to_list() == expect_col

True

<IPython.core.display.Javascript object>

In [39]:
# Number of columns recorded in the YAML metadata.
columns_num = data_list["number of columns"]
columns_num

18

<IPython.core.display.Javascript object>

In [40]:
# The dataset must have exactly the recorded number of columns.
len(data.columns) == columns_num

True

<IPython.core.display.Javascript object>

In [41]:
# Number of rows recorded in the YAML metadata.
rows_num = data_list["number of rows"]
rows_num

14578

<IPython.core.display.Javascript object>

In [42]:
# The dataset must have exactly the recorded number of rows.
len(data) == rows_num

True

<IPython.core.display.Javascript object>

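The YAML file is correct and accepted: the column names, the number of columns, and the number of rows it records all match the loaded dataset.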