In [1]:
import pandas as pd

In [2]:
pokemon = pd.read_csv('./data/Pokemon.csv')
print(pokemon.shape)
print(pokemon.columns)
pokemon.head()

(800, 13)
Index(['#', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
       'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [3]:
from sklearn.neighbors import KNeighborsClassifier

In [4]:
pokemon_modeling = pokemon.drop(columns=["#", "Name"])
pokemon_modeling.head()

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Grass,Poison,318,45,49,49,65,65,45,1,False
1,Grass,Poison,405,60,62,63,80,80,60,1,False
2,Grass,Poison,525,80,82,83,100,100,80,1,False
3,Grass,Poison,625,80,100,123,122,120,80,1,False
4,Fire,,309,39,52,43,60,50,65,1,False


In [5]:
(
    pokemon_modeling
    .isnull()
    .sum()
    .rename("Freq")
    .reset_index()
    .assign(
        percentage = lambda x: (x['Freq']/pokemon_modeling.shape[0])* 100
    )
)

Unnamed: 0,index,Freq,percentage
0,Type 1,0,0.0
1,Type 2,386,48.25
2,Total,0,0.0
3,HP,0,0.0
4,Attack,0,0.0
5,Defense,0,0.0
6,Sp. Atk,0,0.0
7,Sp. Def,0,0.0
8,Speed,0,0.0
9,Generation,0,0.0


In [6]:
del pokemon_modeling['Type 2']

In [7]:
pokemon_modeling['Legendary'].value_counts() # Class Imbalance problem:

False    735
True      65
Name: Legendary, dtype: int64

In [8]:
import numpy as np
np.random.seed(1024)

In [9]:
random_indices = np.random.randint(low=0, high=pokemon_modeling.shape[0], size=335)
pokemon_modeling['Legendary'].iloc[random_indices] = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [10]:
pokemon_modeling['Legendary'].value_counts()

False    491
True     309
Name: Legendary, dtype: int64

In [11]:
pokemon_modeling

Unnamed: 0,Type 1,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Grass,318,45,49,49,65,65,45,1,False
1,Grass,405,60,62,63,80,80,60,1,False
2,Grass,525,80,82,83,100,100,80,1,False
3,Grass,625,80,100,123,122,120,80,1,False
4,Fire,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...
795,Rock,600,50,100,150,100,150,50,6,True
796,Rock,700,50,160,110,160,110,110,6,True
797,Psychic,600,80,110,60,150,130,70,6,True
798,Psychic,680,80,160,60,170,130,80,6,True


In [12]:
pokemon_modeling['Type 1'].unique()

array(['Grass', 'Fire', 'Water', 'Bug', 'Normal', 'Poison', 'Electric',
       'Ground', 'Fairy', 'Fighting', 'Psychic', 'Rock', 'Ghost', 'Ice',
       'Dragon', 'Dark', 'Steel', 'Flying'], dtype=object)

In [13]:
# 1. Label encoding
# 2. One-Hot encoding - Dummy variable creation

In [14]:
pokemon_type_dummies = pd.get_dummies(pokemon_modeling['Type 1'])

In [15]:
pokemon_type_dummies

Unnamed: 0,Bug,Dark,Dragon,Electric,Fairy,Fighting,Fire,Flying,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
797,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
798,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [16]:
pokemon_modeling.head()

Unnamed: 0,Type 1,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Grass,318,45,49,49,65,65,45,1,False
1,Grass,405,60,62,63,80,80,60,1,False
2,Grass,525,80,82,83,100,100,80,1,False
3,Grass,625,80,100,123,122,120,80,1,False
4,Fire,309,39,52,43,60,50,65,1,False


In [17]:
pokemon_modeling = pd.concat([pokemon_modeling, pokemon_type_dummies], axis=1)
pokemon_modeling.head()

Unnamed: 0,Type 1,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,Grass,318,45,49,49,65,65,45,1,False,...,0,1,0,0,0,0,0,0,0,0
1,Grass,405,60,62,63,80,80,60,1,False,...,0,1,0,0,0,0,0,0,0,0
2,Grass,525,80,82,83,100,100,80,1,False,...,0,1,0,0,0,0,0,0,0,0
3,Grass,625,80,100,123,122,120,80,1,False,...,0,1,0,0,0,0,0,0,0,0
4,Fire,309,39,52,43,60,50,65,1,False,...,0,0,0,0,0,0,0,0,0,0


In [18]:
del pokemon_modeling['Type 1']

In [19]:
pokemon_modeling.head()

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Bug,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,318,45,49,49,65,65,45,1,False,0,...,0,1,0,0,0,0,0,0,0,0
1,405,60,62,63,80,80,60,1,False,0,...,0,1,0,0,0,0,0,0,0,0
2,525,80,82,83,100,100,80,1,False,0,...,0,1,0,0,0,0,0,0,0,0
3,625,80,100,123,122,120,80,1,False,0,...,0,1,0,0,0,0,0,0,0,0
4,309,39,52,43,60,50,65,1,False,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
len(pokemon['Type 1'].unique())

18

In [21]:
len([col_name for col_name in pokemon_modeling.columns if col_name in set(pokemon['Type 1'])])

18

In [22]:
del pokemon_modeling['Steel']

In [23]:
len([col_name for col_name in pokemon_modeling.columns if col_name in set(pokemon['Type 1'])])

17

In [24]:
pokemon_modeling['Generation'].value_counts()

1    166
5    165
3    160
4    121
2    106
6     82
Name: Generation, dtype: int64

In [32]:
pokemon_modeling = pd.get_dummies(
    data=pokemon_modeling,
    prefix='Generation',
    columns=['Generation'],
    drop_first=True
)

In [33]:
pokemon_modeling.columns

Index(['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed',
       'Legendary', 'Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting',
       'Fire', 'Flying', 'Ghost', 'Grass', 'Ground', 'Ice', 'Normal', 'Poison',
       'Psychic', 'Rock', 'Water', 'Generation_2', 'Generation_3',
       'Generation_4', 'Generation_5', 'Generation_6'],
      dtype='object')

In [34]:
pokemon_modeling.shape

(800, 30)

In [35]:
pokemon_modeling['Legendary'] = pokemon_modeling['Legendary'].astype(int) 

In [36]:
pokemon_modeling['Legendary'].value_counts()

0    491
1    309
Name: Legendary, dtype: int64

In [37]:
from sklearn.model_selection import train_test_split

In [42]:
x = pokemon_modeling[[colname for colname in pokemon_modeling.columns if colname != 'Legendary']]
print(x.columns)
x.head()

Index(['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed',
       'Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire',
       'Flying', 'Ghost', 'Grass', 'Ground', 'Ice', 'Normal', 'Poison',
       'Psychic', 'Rock', 'Water', 'Generation_2', 'Generation_3',
       'Generation_4', 'Generation_5', 'Generation_6'],
      dtype='object')


Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Bug,Dark,Dragon,...,Normal,Poison,Psychic,Rock,Water,Generation_2,Generation_3,Generation_4,Generation_5,Generation_6
0,318,45,49,49,65,65,45,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,405,60,62,63,80,80,60,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,525,80,82,83,100,100,80,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,625,80,100,123,122,120,80,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,309,39,52,43,60,50,65,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
y = pokemon_modeling['Legendary']

In [75]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, 
    test_size=0.3,
    random_state=1024,
    stratify=y
)

In [76]:
x_train.shape, x_test.shape

((560, 29), (240, 29))

In [77]:
from sklearn.neighbors import KNeighborsClassifier

In [78]:
k = 10
model = KNeighborsClassifier(n_neighbors=k)

In [79]:
trained_model = model.fit(x_train, y_train)

In [80]:
from sklearn.metrics import classification_report

In [81]:
y_train_predicted = trained_model.predict(x_train)

In [82]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_train, y_pred=y_train_predicted)

array([[320,  24],
       [149,  67]], dtype=int64)

In [83]:
training_metrics = pd.DataFrame({
    "Actuals": y_train.values,
    "Predicted": y_train_predicted
})

In [84]:
training_metrics.to_excel("pokemon_training_metrics.xlsx")

In [85]:
print(classification_report(y_true=y_train, y_pred=y_train_predicted))

              precision    recall  f1-score   support

           0       0.68      0.93      0.79       344
           1       0.74      0.31      0.44       216

    accuracy                           0.69       560
   macro avg       0.71      0.62      0.61       560
weighted avg       0.70      0.69      0.65       560



In [86]:
y_test_predicted = trained_model.predict(x_test)

In [87]:
confusion_matrix(y_true=y_test, y_pred=y_test_predicted)

array([[135,  12],
       [ 70,  23]], dtype=int64)

In [88]:
print(classification_report(y_true=y_test, y_pred=y_test_predicted))

              precision    recall  f1-score   support

           0       0.66      0.92      0.77       147
           1       0.66      0.25      0.36        93

    accuracy                           0.66       240
   macro avg       0.66      0.58      0.56       240
weighted avg       0.66      0.66      0.61       240

