### Tratamento da base

In [None]:
!pip install pandera[io]

In [None]:
import pandas as pd
import numpy as np
import pandera as pa
import re, warnings
warnings.filterwarnings("ignore")

In [None]:
# Display the installed pandera version.
pa.__version__

In [None]:
# Display the installed NumPy version.
np.__version__

In [None]:
# Display the version string exposed by the standard-library `re` module.
re.__version__

In [None]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv('anime-dataset-2023.csv')

In [None]:
def tratar_colunas(df) -> pd.DataFrame:
    """Normalize the column labels of ``df`` in place and return it.

    Each label is upper-cased and every space in it is replaced by an
    underscore, e.g. ``"Scored By"`` becomes ``"SCORED_BY"``.
    """
    df.columns = [label.upper().replace(" ", "_") for label in df.columns]
    return df

In [None]:
# Upper-case the column labels and swap spaces for underscores.
df = tratar_colunas(df)

In [None]:
# Remove the columns that are left out of this analysis; the reason for each
# one is noted next to its label.
df.drop(
    columns=[
        "OTHER_NAME",    # names in other languages
        "ENGLISH_NAME",  # many UNKNOWN; presumably only set when licensed in the USA
        "SYNOPSIS",      # description text; not handled for now
        "AIRED",         # not handled for now
        "PRODUCERS",     # way too many distinct values; not handled for now
        "LICENSORS",     # way too many distinct values; not handled for now
        "IMAGE_URL",     # not necessary for the analysis
    ],
    inplace=True,
)

In [None]:
# Keep only the rows whose TYPE is neither "UNKNOWN" nor "Music".
df = df[~df["TYPE"].isin(["UNKNOWN","Music"])]

In [None]:
def tratar_duration(df) -> pd.DataFrame:
    """Replace the free-text ``DURATION`` column with total minutes.

    The first ``<n> hr`` and the first ``<n> min`` found in each value are
    extracted; a missing part counts as zero. Values that are not strings
    (for example NaN) therefore yield ``0.0`` instead of raising.

    The frame is modified in place: ``DURATION_FINAL`` (always ``float64``)
    is added and ``DURATION`` is dropped.

    Args:
        df: Frame containing a ``DURATION`` column.

    Returns:
        The same ``df`` object, for chaining.
    """
    # Casting to object keeps the .str accessor usable even when the column
    # holds no strings at all (e.g. it was read as all-NaN floats).
    duration = df["DURATION"].astype(object)

    hours = pd.to_numeric(
        duration.str.extract(r'(\d+)\s*hr', expand=False), errors="coerce"
    )
    minutes = pd.to_numeric(
        duration.str.extract(r'(\d+)\s*min', expand=False), errors="coerce"
    )

    # Force float so the resulting dtype does not depend on whether every row matched.
    df["DURATION_FINAL"] = (hours.fillna(0) * 60 + minutes.fillna(0)).astype(float)
    df.drop(["DURATION"], axis=1, inplace=True)
    return df

In [None]:
def tratar_UNKNOWN(df) -> pd.DataFrame:
    """Replace the ``"UNKNOWN"`` marker with a numeric placeholder, in place.

    ``SCORE``, ``RANK`` and ``SCORED_BY`` receive ``"0"``. ``EPISODES``
    receives ``"-1"``: titles that are not yet aired or are currently airing
    have no episode count.

    A column that is already numeric cannot contain the marker and is left
    untouched, so the function does not fail on data without any
    ``"UNKNOWN"`` entry in that column.

    Args:
        df: Frame containing the four columns listed above.

    Returns:
        The same ``df`` object, for chaining.
    """
    placeholders = {
        "SCORE": "0",
        "RANK": "0",
        "SCORED_BY": "0",
        "EPISODES": "-1",
    }
    for column, placeholder in placeholders.items():
        # The .str accessor raises on numeric dtypes; nothing to replace there anyway.
        if pd.api.types.is_numeric_dtype(df[column]):
            continue
        df[column] = df[column].str.replace("UNKNOWN", placeholder, regex=False)
    return df

In [None]:
def tratar_premiered(df) -> pd.DataFrame:
    """Split ``PREMIERED`` into ``PREMIERED_SEASON`` and ``PREMIERED_YEAR``, in place.

    The value is split on the first run of whitespace: the first token becomes
    the season and the remainder the year (still text at this point). Values
    with a single token, such as ``"UNKNOWN"``, keep that token as the season
    and get a missing year.

    Args:
        df: Frame containing a string ``PREMIERED`` column.

    Returns:
        The same ``df`` object, with ``PREMIERED`` dropped.
    """
    parts = df["PREMIERED"].str.split(n=1, expand=True)
    # When no row has a second token, split() yields a single column; reindex
    # guarantees both positions exist so the year column is simply all-missing.
    parts = parts.reindex(columns=[0, 1])

    df["PREMIERED_SEASON"] = parts[0]
    df["PREMIERED_YEAR"] = parts[1]
    df.drop(["PREMIERED"], axis=1, inplace=True)
    return df

In [None]:
def tratar_tipo(df) -> pd.DataFrame:
    """Cast the numeric-like columns of ``df`` to numbers, in place.

    ``SCORE``, ``RANK``, ``EPISODES`` and ``SCORED_BY`` are cast to float.
    ``PREMIERED_YEAR`` goes through ``pd.to_numeric`` with ``errors='coerce'``,
    so values that cannot be parsed become NaN. Returns the same ``df``.
    """
    float_columns = ["SCORE", "RANK", "EPISODES", "SCORED_BY"]

    # Convert everything first so a failing cast leaves the frame untouched.
    as_float = df[float_columns].astype(float)
    for column in float_columns:
        df[column] = as_float[column]

    year = pd.to_numeric(df['PREMIERED_YEAR'], errors='coerce')
    df["PREMIERED_YEAR"] = year
    return df

In [None]:
def tratar_genre(df) -> pd.DataFrame:
    """One-hot encode the comma-separated ``GENRES`` column.

    Spaces are stripped from ``GENRES`` (so ``"Slice of Life"`` becomes the
    column ``SliceofLife``), one 0/1 indicator column is appended per genre,
    and ``GENRES`` itself is removed. The indicator for the ``"UNKNOWN"``
    genre is removed as well when it exists.

    Args:
        df: Frame containing a string ``GENRES`` column. Its ``GENRES``
            values are rewritten without spaces.

    Returns:
        A new frame with the indicator columns appended.
    """
    df["GENRES"] = df["GENRES"].str.replace(" ", "", regex=False)
    genres_dummies = df["GENRES"].str.get_dummies(",")
    encoded = pd.concat([df, genres_dummies], axis=1)

    # "UNKNOWN" only shows up as an indicator when some row carries that genre;
    # dropping it unconditionally would raise KeyError otherwise.
    to_drop = ["GENRES"]
    if "UNKNOWN" in genres_dummies.columns:
        to_drop.append("UNKNOWN")
    return encoded.drop(columns=to_drop)

In [None]:
# Run the cleaning steps in order. tratar_tipo comes after tratar_UNKNOWN and
# tratar_premiered because it casts the columns those two steps prepare.
df = (
    df.pipe(tratar_duration)
    .pipe(tratar_UNKNOWN)
    .pipe(tratar_premiered)
    .pipe(tratar_tipo)
    .pipe(tratar_genre)
)

In [None]:
# Preview the first rows of the treated frame.
df.head()

In [None]:
# Persist the treated frame without the index column.
df.to_csv("anime-dataset-treated-2023.csv", index = False)

### Validação da base

In [None]:
import pandas as pd
import pandera as pa
from validation_schema import test, schema

In [None]:
# Load the treated dataset from its CSV file.
df = pd.read_csv("anime-dataset-treated-2023.csv")

In [None]:
# Let pandera infer a schema from the loaded data and print it.
# NOTE: this rebinds `schema`, replacing the object imported from validation_schema.
schema = pa.infer_schema(df)
print(schema)

In [None]:
# Export the inferred schema through pandera's to_script() and keep the result.
schema_script = schema.to_script()

In [None]:
# Show the rows whose ANIME_ID repeats one seen in an earlier row.
df[df.duplicated("ANIME_ID")]

In [None]:
# Show the first five rows of the first ten columns.
df.iloc[:5,0:10]

In [None]:
# List the distinct values found in PREMIERED_SEASON.
df["PREMIERED_SEASON"].unique()

In [None]:
from pandera import DataFrameSchema, Column, Check, Index, MultiIndex

categorias_type = ['TV', 'Movie', 'OVA', 'Special', 'ONA']
categorias_status = ['Finished Airing', 'Currently Airing', 'Not yet aired']
categorias_season = ['spring', 'summer', 'fall', 'winter', 'UNKNOWN']

# Genre indicator columns, listed in the order the schema expects them
# (the schema is declared with ordered=True, so this order is significant).
_GENRE_COLUMNS = [
    "Action",
    "Adventure",
    "AvantGarde",
    "AwardWinning",
    "BoysLove",
    "Comedy",
    "Drama",
    "Ecchi",
    "Erotica",
    "Fantasy",
    "GirlsLove",
    "Gourmet",
    "Hentai",
    "Horror",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "SliceofLife",
    "Sports",
    "Supernatural",
    "Suspense",
]


def _text_column(allowed_values=None) -> Column:
    """Build a required, non-nullable object column, optionally limited to ``allowed_values``."""
    membership = None if allowed_values is None else Check.isin(allowed_values)
    return Column(
        dtype="object",
        checks=membership,
        nullable=False,
        required=True,
    )


def _numeric_column(dtype, lower=None, upper=None, *, nullable=False, unique=False) -> Column:
    """Build a required numeric column with optional inclusive ``lower``/``upper`` bounds."""
    bounds = []
    if lower is not None:
        bounds.append(Check.greater_than_or_equal_to(min_value=lower))
    if upper is not None:
        bounds.append(Check.less_than_or_equal_to(max_value=upper))
    return Column(
        dtype=dtype,
        checks=bounds or None,
        nullable=nullable,
        unique=unique,
        required=True,
    )


# Schema of the treated dataset: identifying and descriptive columns first,
# then one 0/1 indicator column per genre. strict=True rejects columns that
# are not declared here.
schema = DataFrameSchema(
    columns={
        "ANIME_ID": _numeric_column("int64", unique=True),
        "NAME": _text_column(),
        "SCORE": _numeric_column("float64", 0.0, 10.0),
        "TYPE": _text_column(categorias_type),
        "EPISODES": _numeric_column("float64"),
        "STATUS": _text_column(categorias_status),
        "STUDIOS": _text_column(),
        "SOURCE": _text_column(),
        "RATING": _text_column(),
        "RANK": _numeric_column("float64", 0.0),
        "POPULARITY": _numeric_column("int64", 0.0),
        "FAVORITES": _numeric_column("int64", 0.0),
        "SCORED_BY": _numeric_column("float64", 0.0),
        "MEMBERS": _numeric_column("int64", 0.0),
        "DURATION_FINAL": _numeric_column("float64", 0.0),
        "PREMIERED_SEASON": _text_column(categorias_season),
        "PREMIERED_YEAR": _numeric_column("float64", 1950.0, 2025.0, nullable=True),
        **{genre: _numeric_column("int64", 0.0, 1.0) for genre in _GENRE_COLUMNS},
    },
    strict=True,
    ordered=True,
    unique_column_names=True,
)

In [None]:
# Validate the loaded frame against the schema. A SchemaError is caught and
# printed rather than propagated.
try:
    schema.validate(df)
except pa.errors.SchemaError as exc:
    print(exc)