# Develop notebook

In this notebook, we develop our ML project. This is the initial phase, before migrating the code into Python modules.

## ML pipeline

We want to build an ML model that classifies whether a Pokémon is legendary or not.

## Steps:

- Load data and do checks
- Preprocess data to obtain a clean dataset
- Build features for the ML model
- Make hyperparameter selection
- Train
- Predict

## 1. Imports

In [62]:
import pandas as pd
import numpy as np
from pathlib import Path

In [63]:
# Datasets live in "data/datasets" next to the current working directory's parent.
dataset_folder = Path.cwd().parent / "data" / "datasets"

## 2. Load data

In [64]:
# Read the two raw, semicolon-separated CSV files from the dataset folder.
df_raw_pokemon_index = pd.read_csv(dataset_folder / "raw_pokemon_index.csv", sep=";")
df_raw_pokemon_data = pd.read_csv(dataset_folder / "raw_pokemon_data.csv", sep=";")

### Check pokemon index

In [65]:
# Preview the first three rows of the index table.
df_raw_pokemon_index.head(3)

Unnamed: 0,pokedex_number,name
0,1,Bulbasaur
1,2,Ivysaur
2,3,Venusaur


In [66]:
# The index must provide a unique, non-null pokedex number and name on every row.
# Bracket access is used because `name` is easy to confuse with an attribute.
assert df_raw_pokemon_index["pokedex_number"].is_unique
assert df_raw_pokemon_index["pokedex_number"].notna().all()
assert df_raw_pokemon_index["name"].is_unique
assert df_raw_pokemon_index["name"].notna().all()

### Check data

In [67]:
# Number of distinct rows in the raw data.
df_raw_pokemon_data.drop_duplicates().shape[0] 

801

In [68]:
# Total number of rows, duplicates included (compare with the distinct count above).
df_raw_pokemon_data.shape[0] 

921

In [69]:
# Summary statistics of the numerical columns once duplicate rows are removed.
df_raw_pokemon_data.drop_duplicates().describe()

Unnamed: 0,pokedex_number,is_legendary,against_electric,hp,experience_growth,against_normal,attack,against_dragon,against_water,against_fight,...,against_rock,against_fire,sp_defense,against_fairy,sp_attack,height_m,against_ghost,percentage_male,weight_kg,against_ice
count,801.0,801.0,708.0,725.0,725.0,729.0,705.0,726.0,715.0,727.0,...,693.0,723.0,718.0,725.0,728.0,695.0,724.0,647.0,685.0,722.0
mean,401.0,0.087391,1.079449,68.769655,1056019.0,0.882716,77.914894,0.969697,1.067133,1.074622,...,1.258297,1.128976,70.711699,1.068621,71.142857,1.156978,0.983425,55.151777,60.775036,1.208449
std,231.373075,0.282583,0.661204,26.316091,157748.8,0.271967,32.078625,0.350041,0.6104,0.708337,...,0.704563,0.68979,27.976961,0.531824,32.182063,1.064805,0.545066,20.636648,109.2033,0.732165
min,1.0,0.0,0.0,1.0,600000.0,0.0,5.0,0.0,0.25,0.0,...,0.25,0.25,20.0,0.25,10.0,0.1,0.0,0.0,0.1,0.25
25%,201.0,0.0,0.5,50.0,1000000.0,1.0,55.0,1.0,0.5,0.5,...,1.0,0.5,50.0,1.0,45.0,0.55,1.0,50.0,9.0,0.5
50%,401.0,0.0,1.0,65.0,1000000.0,1.0,75.0,1.0,1.0,1.0,...,1.0,1.0,65.5,1.0,65.0,1.0,1.0,50.0,26.5,1.0
75%,601.0,0.0,1.0,80.0,1059860.0,1.0,98.0,1.0,1.0,2.0,...,2.0,2.0,89.0,1.0,91.0,1.5,1.0,50.0,62.0,2.0
max,801.0,1.0,4.0,255.0,1640000.0,1.0,185.0,2.0,4.0,4.0,...,4.0,4.0,230.0,4.0,194.0,14.5,4.0,100.0,999.9,4.0


In [70]:
# Count the missing values per column after removing duplicate rows.
df_raw_pokemon_data.drop_duplicates().isna().sum().to_frame(name="nr_nulls")

Unnamed: 0,nr_nulls
name,0
pokedex_number,0
is_legendary,0
against_electric,93
hp,76
experience_growth,76
against_normal,72
capture_rate,72
attack,96
against_dragon,75


In [71]:
# After de-duplication every pokedex number must appear on exactly one row.
assert df_raw_pokemon_data.drop_duplicates().pokedex_number.is_unique

## 3. Clean data

For cleaning, we do the following:

- Drop duplicates
- put all strings to lower
- remove japanese name to avoid encoding issues
- Inner join with the index
- convert abilities to lookup table
- save
    - cleaned table
    - lookup abilities

In [266]:
def remove_duplicates(dataframe: pd.DataFrame):
    """
    Return the rows of ``dataframe`` with exact duplicate rows removed.

    The first occurrence of every repeated row is kept and the original
    index labels are preserved.
    """
    is_repeated = dataframe.duplicated(keep="first")
    return dataframe[~is_repeated]


def identify_string_columns(dataframe):
    """
    List the names of all non-numerical columns.

    Every column whose dtype is not a numpy number is returned, in the
    order in which the columns appear in ``dataframe``.
    """
    return dataframe.select_dtypes(exclude=[np.number]).columns.tolist()


def lower_case(dataframe):
    """
    Return a copy of ``dataframe`` with all string columns lower-cased.

    The columns to convert are those reported by ``identify_string_columns``.
    The input frame is left untouched: writing through ``.loc`` on the
    argument would also change the caller's frame, which makes cells that
    reuse the raw tables non-idempotent.
    """
    # Work on a copy so that the caller's data is never modified.
    result = dataframe.copy()
    for col in identify_string_columns(dataframe=result):
        result[col] = result[col].str.lower()
    return result


def drop_japanese_names(dataframe):
    """
    Return ``dataframe`` without its ``japanese_name`` column.

    Raises a ``KeyError`` when the column is not present.
    """
    return dataframe.drop(columns=["japanese_name"])


def clean_classification(dataframe):
    """
    Remove the text ' pokémon' from the values of the ``classfication`` column.

    The match is a plain, case-sensitive substring (not a regular
    expression), so the values are expected to be lower-cased already.
    Missing values stay missing. A new dataframe is returned.

    Note: ``classfication`` (sic) is the column name as spelled in the data.
    """
    return dataframe.assign(
        # regex=False pins literal matching regardless of the pandas default.
        classfication=lambda df: df["classfication"].str.replace(
            " pokémon", "", regex=False
        )
    )

def filter_by_valid_pokedex_number(dataframe, pokedex):
    """
    Keep only the rows whose ``pokedex_number`` also exists in ``pokedex``.

    Only the key column of ``pokedex`` takes part in the inner join, so no
    extra columns are added. The join is validated as one-to-one: a repeated
    pokedex number on either side raises a ``MergeError``.
    """
    valid_numbers = pokedex[["pokedex_number"]]
    return pd.merge(
        dataframe,
        valid_numbers,
        how="inner",
        on="pokedex_number",
        validate="one_to_one",
    )


def extract_abilities_lookup(dataframe):
    """
    Build a lookup table with one row per (pokedex_number, ability) pair.

    The ``abilities`` column holds a list rendered as text, e.g.
    "['overgrow', 'chlorophyll']". Rows without abilities are skipped, the
    quotes and brackets are stripped, the text is split on ", " and every
    ability gets its own row. Duplicate pairs are removed.

    Returns a dataframe with the columns ``pokedex_number`` and ``abilities``.
    """
    has_abilities = dataframe["abilities"].notna()
    lookup = dataframe.loc[has_abilities, ["pokedex_number", "abilities"]]
    return (
        lookup
        .assign(
            # Raw string: "\[" and "\]" are not valid escapes in a normal literal.
            abilities=lambda df: (
                df["abilities"]
                .str.replace(r"['\[\]]", "", regex=True)
                .str.split(", ")
            )
        )
        .explode("abilities")
        .drop_duplicates()
    )


def drop_abilities(dataframe):
    """
    Return ``dataframe`` without its ``abilities`` column.

    Raises a ``KeyError`` when the column is not present.
    """
    return dataframe.drop(columns=["abilities"])

### Apply all transformations at once

In [267]:
# Run the cleaning steps in order on the raw data; each step takes and returns a dataframe.
# lower_case runs before clean_classification because the latter matches the lower-case text " pokémon".
intermediate_pokemon_data = (
    df_raw_pokemon_data
    .pipe(remove_duplicates)
    .pipe(lower_case)
    .pipe(clean_classification)
    .pipe(drop_japanese_names)
    .pipe(filter_by_valid_pokedex_number, pokedex=df_raw_pokemon_index)
)

In [268]:
# Inspect the non-numerical columns of the cleaned data.
intermediate_pokemon_data[identify_string_columns(intermediate_pokemon_data)]

Unnamed: 0,name,capture_rate,classfication,type2,abilities,type1
0,bulbasaur,,seed,poison,"['overgrow', 'chlorophyll']",grass
1,ivysaur,45,seed,poison,"['overgrow', 'chlorophyll']",grass
2,venusaur,45,seed,poison,"['overgrow', 'chlorophyll']",grass
3,charmander,45,,,"['blaze', 'solar power']",fire
4,charmeleon,45,flame,,"['blaze', 'solar power']",fire
...,...,...,...,...,...,...
796,celesteela,25,launch,,['beast boost'],
797,kartana,255,drawn sword,steel,['beast boost'],grass
798,guzzlord,15,junkivore,dragon,['beast boost'],dark
799,necrozma,3,prism,,['prism armor'],psychic


### Generate dataframe of abilities

In [269]:
# One row per (pokedex_number, ability) pair.
df_abilities = extract_abilities_lookup(intermediate_pokemon_data)

### Generate clean data

In [270]:
# The abilities now live in their own lookup table, so the column is dropped here.
df_data = drop_abilities(intermediate_pokemon_data)

### Clean index

In [271]:
# Clean a copy so that df_raw_pokemon_index keeps the values that were loaded from disk;
# passing the raw frame directly lets the cleaning step overwrite it in place.
df_index = lower_case(df_raw_pokemon_index.copy())

### Save tables

In [272]:
# Write the three staging tables as semicolon-separated CSV files, without the row index.
df_abilities.to_csv(dataset_folder / "stg__abilities.csv", sep=";", index=False)
df_data.to_csv(dataset_folder / "stg__pokemon_data.csv", sep=";", index=False)
df_index.to_csv(dataset_folder / "stg__pokedex.csv", sep=";", index=False)

## 4. Feature Engineering

For feature engineering, we do the following steps:

- Select major numerical features
    - fill nans with mean
    - standardize
- Select categorical features
    - fill nans with additional value
    - create onehot encoding

In [273]:
# Reload the staging tables from disk so this section only depends on the saved files.
df_abilities_loaded = pd.read_csv(dataset_folder / "stg__abilities.csv", sep=";")
df_data_loaded = pd.read_csv(dataset_folder / "stg__pokemon_data.csv", sep=";")
df_index_loaded = pd.read_csv(dataset_folder / "stg__pokedex.csv", sep=";")

### Numerical features

In [274]:
def fill_numerical_nan(dataframe, col):
    """
    Return a copy of ``dataframe`` in which the missing values of ``col``
    are replaced by the mean of that column.

    The mean is computed over the non-missing values of ``col``. The input
    frame is not modified, so the function is safe to call on a frame that
    other cells still use. A column without any valid value stays all-NaN.
    """
    filled = dataframe.copy()
    column_mean = filled[col].mean()
    filled[col] = filled[col].fillna(column_mean)
    return filled


def fill_numerical_features(dataframe, columns):
    """
    Apply ``fill_numerical_nan`` to every column listed in ``columns``.

    The columns are processed one after the other and the result of the
    last step is returned.
    """
    filled = dataframe
    for column_name in columns:
        filled = fill_numerical_nan(filled, column_name)
    return filled


def standardize_numerical_features(dataframe, columns):
    """
    Standardize the given columns to zero mean and unit standard deviation.

    Only the columns listed in ``columns`` are transformed; any other column
    is returned unchanged. The sample standard deviation (pandas default) is
    used. A constant column has zero standard deviation and therefore
    yields NaN. The input frame is not modified.
    """
    standardized = dataframe.copy()
    for col in columns:
        values = dataframe[col]
        standardized[col] = (values - values.mean()) / values.std()
    return standardized


def calculate_numerical_features(dataframe, features):
    """
    Build the numerical feature table.

    Selects the ``features`` columns, fills their missing values and then
    standardizes them.
    """
    selected = dataframe[features]
    filled = fill_numerical_features(selected, columns=features)
    return standardize_numerical_features(filled, columns=features)

In [275]:
# Numerical columns of the staging data that are used as model features.
features = [
    'hp',
    'experience_growth',
    'attack',
    'base_total',
    'defense',
    'generation',
    'base_egg_steps',
    'base_happiness',
    'speed',
    'sp_defense',
    'sp_attack',
    'height_m',
    'percentage_male',
    'weight_kg',
]

In [276]:
# Imputed and standardized numerical features, one row per row of df_data_loaded.
df_numerical_features = calculate_numerical_features(df_data_loaded, features=features)

### Categorical features

In [302]:
def fill_categorical_values(dataframe):
    """
    Replace every missing value with the placeholder 'missing_value'.

    A new dataframe is returned.
    """
    placeholder = "missing_value"
    return dataframe.fillna(value=placeholder)
    

def convert_categorical_to_onehot(dataframe):
    """
    One-hot encode the dataframe with ``pd.get_dummies``.

    The resulting indicator columns are cast to integers (0/1).
    """
    indicators = pd.get_dummies(dataframe)
    return indicators.astype("int")
    
    
def calculate_categorical_features(dataframe, features):
    """
    Build the categorical feature table.

    Selects the ``features`` columns, fills their missing values with a
    placeholder and one-hot encodes the result.
    """
    selected = dataframe[features]
    filled = fill_categorical_values(selected)
    return convert_categorical_to_onehot(filled)

In [296]:
# One-hot encoded primary and secondary type.
df_categorical_features = calculate_categorical_features(df_data_loaded, features=["type1", "type2"])

### Join features

In [298]:
# Column-wise join on the row index; both frames are derived from df_data_loaded.
df_features = df_numerical_features.join(df_categorical_features)

### Define target variable

In [313]:
# Double brackets keep the target as a one-column dataframe.
df_target = df_data_loaded[["is_legendary"]]

In [314]:
# Features and target are saved without the index, so only the row order keeps them aligned.
df_features.to_csv(dataset_folder / "clean__features.csv", sep=";", index=False)
df_target.to_csv(dataset_folder / "clean__target.csv", sep=";", index=False)

## 5. Tune and train

In [315]:
# Reload features and target from disk so this section only depends on the saved files.
df_features_loaded= pd.read_csv(dataset_folder / "clean__features.csv", sep=";")
df_target_loaded= pd.read_csv(dataset_folder / "clean__target.csv", sep=";")

In [359]:
# Features and target are matched by position, so they must have the same number of rows.
assert df_target_loaded.shape[0] == df_features_loaded.shape[0]

In this section we do the following:

- split train and test data
- Optimize hyperparameters
- Train 

### Split train test

In [319]:
from sklearn.model_selection import train_test_split

In [368]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
# NOTE(review): the split is not stratified although the positive class is rare
# (is_legendary has a mean of about 0.09 in the describe() output above). Consider
# stratify=df_target_loaded, and confirm first, because the recorded scores would change.
X_train, X_test, y_train, y_test = train_test_split(
    df_features_loaded,
    df_target_loaded,
    random_state=100,
    test_size=0.2
)

### Define hyperspace

In [370]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [416]:
from functools import partial

from hyperopt import fmin, hp, tpe, STATUS_OK, space_eval
from hyperopt import Trials

In [411]:
# Search space for the LogisticRegression keyword arguments; every entry is a categorical choice.
hyperspace = {
    "penalty": hp.choice("penalty", ["l1", "l2"]),
    "class_weight": hp.choice("class_weight", ["balanced", None]),
    "max_iter": hp.choice("max_iter", [100, 300, 500, 600, 1000, 1500, 2000]),
    "fit_intercept": hp.choice("fit_intercept", [True, False]),
    "solver": hp.choice("solver", ["saga", "liblinear"]),
}

In [412]:
def objective_function(
    hyperparameters, X, y,
):
    """
    Score one hyperparameter configuration for the optimizer.

    A LogisticRegression built from ``hyperparameters`` is evaluated with
    5-fold cross validation on ``X`` and ``y`` using the F1 score.

    Returns a dict with the negated mean F1 score as ``loss`` (the optimizer
    minimizes), the evaluated ``hyperparameters`` and the trial ``status``.
    """
    np.random.seed(100)

    model = LogisticRegression(**hyperparameters)

    # Flatten the target to a 1-d vector with one entry per row.
    labels = y.values.reshape(y.shape[0], )
    fold_scores = cross_val_score(model, X, labels, cv=5, scoring='f1')

    return dict(
        loss=-np.mean(fold_scores),
        hyperparameters=hyperparameters,
        status=STATUS_OK,
    )
    

In [413]:
# initialize trials and 
trials = Trials()


best_result = fmin(
    partial(objective_function, X=X_train, y=y_train),
    space=hyperspace,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
)

 60%|███████████████████████████████████▍                       | 6/10 [00:00<00:00, 23.57trial/s, best loss: -0.8923038480759621]









 90%|█████████████████████████████████████████████████████      | 9/10 [00:01<00:00,  6.12trial/s, best loss: -0.8923038480759621]




100%|██████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.57trial/s, best loss: -0.8923038480759621]


In [None]:
def train_model(hyperparameters, X, y):
    """
    Train a LogisticRegression for the given hyperparameters and data.

    ``hyperparameters`` are passed as keyword arguments to the classifier.
    ``y`` may be a one-column dataframe, a series or an array; it is
    flattened to a 1-d label vector before fitting.

    Returns the fitted classifier.
    """
    np.random.seed(100)

    # initialize model
    classifier = LogisticRegression(**hyperparameters)

    # Train on a 1-d target; a column-shaped y makes scikit-learn emit a
    # DataConversionWarning (the `column_or_1d` message).
    classifier.fit(X, np.ravel(y))

    return classifier

In [418]:
# Map the result returned by fmin back to the actual values defined in `hyperspace`.
best_hyperparameters = space_eval(hyperspace, best_result)
best_hyperparameters

{'class_weight': 'balanced',
 'fit_intercept': True,
 'max_iter': 2000,
 'penalty': 'l1',
 'solver': 'liblinear'}

In [419]:
# Fit the final model on the training split with the best hyperparameters found.
classifier = train_model(best_hyperparameters, X_train, y_train)

  y = column_or_1d(y, warn=True)


### Predict on test data

In [424]:
from sklearn.metrics import confusion_matrix, f1_score

In [425]:
# F1 score on the held-out test split (the same metric that was optimized in cross validation).
f1_score(y_test, classifier.predict(X_test))

0.7500000000000001

In [432]:
# Flatten the confusion matrix into its four counts; presumably a 2x2 matrix (binary target),
# which scikit-learn documents as unpacking in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, classifier.predict(X_test)).ravel()

In [434]:
# Report the four confusion-matrix counts, one per line.
print(
    f"True Negative: {tn}",
    f"\nTrue Positive: {tp}",
    f"\nFalse Positive: {fp}",
    f"\nFalse Negative: {fn}",
)

True Negative: 146 
True Positive: 9 
False Positive: 2 
False Negative: 4
