In [1]:
# Author: Michael Munje

In [2]:
import sqlite3
conn = sqlite3.connect('final.db')
cur = conn.cursor()

In [3]:
def fetch_table_names(cur: sqlite3.Cursor) -> [str]:
    '''
    Retrieves all tables in the database
    '''
    
    cur.execute('SELECT name FROM sqlite_master WHERE type = \'table\';')
    return [x[0] for x in cur.fetchall()]

def fetch_column_names(cur: sqlite3.Cursor, table: str) -> [str]:
    '''
    Retrieves all columns in a table
    '''
    
    cur.execute('PRAGMA table_info(' + table + ');')
    return [x[1] for x in cur.fetchall()]

def fetch_col_values(cur: sqlite3.Cursor, table: str, col: str) -> []:
    '''
    Returns all values for a specific column
    '''
    
    cur.execute('SELECT ' + col + ' FROM ' + table + ';')
    return [x[0] for x in cur.fetchall()]

In [4]:
# Review:
# Database contains tables
# You can think of these tables as a database version of a dataframe
# Therefore, tables contain columns and rows (each entry)

In [5]:
# Retrieves all possible tables in the database
fetch_table_names(cur)

['SCHOOL_TRAIN', 'SCHOOL_TEST', 'POKEMON_TRAIN', 'POKEMON_TEST']

In [6]:
# Retrieves all possible columns in a table
fetch_column_names(cur, 'POKEMON_TRAIN')

['rowid',
 'unique_id',
 'types',
 'abilities',
 'base_happiness',
 'height_m',
 'weight_kg',
 'poke_stats',
 'is_legendary']

In [7]:
fetch_column_names(cur, 'POKEMON_TEST')

['rowid',
 'unique_id',
 'types',
 'abilities',
 'base_happiness',
 'height_m',
 'weight_kg',
 'poke_stats']

In [8]:
# Retrieves all values in a column
fetch_col_values(cur, 'POKEMON_TRAIN', 'poke_stats')[:1]

['nan, 80.0, 55.0, 25.0, 35.0, 35.0']

In [9]:
# Note: You can fetch all the values at once with SELECT * FROM TABLE_NAME;
# but this is NOT a good idea if you have a ton of data
# And since I have 1500 columns, I will not do such

In [10]:
import pandas as pd
train = pd.DataFrame()

# You now know how to easily query (fetch) column data from a database
# And also how to find all possible column names
# How can you use this knowledge to construct a dataframe?
# Good luck

In [11]:
train = pd.read_sql_query(
'''select
rowid,
unique_id,
types,
abilities,
base_happiness,
height_m,
weight_kg,
poke_stats,
is_legendary
from POKEMON_TRAIN''', conn)
test = pd.read_sql_query(
'''select
rowid,
unique_id,
types,
abilities,
base_happiness,
height_m,
weight_kg,
poke_stats
from POKEMON_TEST''', conn)

In [12]:
train.head()

Unnamed: 0,rowid,unique_id,types,abilities,base_happiness,height_m,weight_kg,poke_stats,is_legendary
0,1,665,"fighting, nan","['Guts', 'Sheer Force', 'Iron Fist']",70.0,0.6,12.5,"nan, 80.0, 55.0, 25.0, 35.0, 35.0",0
1,2,288,"dragon, nan","['Bulletproof', 'Soundproof', 'Overcoat']",70.0,0.6,29.7,"nan, nan, 65.0, 45.0, 45.0, 45.0",0
2,3,349,"poison, fire","['Corrosion', 'Oblivious']",70.0,1.2,22.2,"nan, 64.0, 60.0, 111.0, 60.0, 117.0",0
3,4,260,"ground, nan","['Rock Head', 'Lightningrod', 'Battle Armor']",70.0,0.4,6.5,"nan, 50.0, 95.0, 40.0, 50.0, 35.0",0
4,5,536,"water, nan","['Torrent', 'Sheer Force']",70.0,0.6,9.5,"nan, 65.0, 64.0, 44.0, 48.0, 43.0",0


In [13]:
test.head()

Unnamed: 0,rowid,unique_id,types,abilities,base_happiness,height_m,weight_kg,poke_stats
0,1,602,"grass, nan","['Overgrow', 'Contrary']",70.0,0.6,8.1,"nan, nan, 55.0, 45.0, nan, 63.0"
1,2,756,"fairy, flying","['Hustle', 'Serene Grace', 'Super Luck']",70.0,1.5,38.0,"nan, nan, 95.0, 120.0, 115.0, 80.0"
2,3,191,"electric, nan","['Static', 'Plus']",70.0,1.4,61.5,"nan, 95.0, 105.0, 165.0, nan, 45.0"
3,4,16,"psychic, nan",['Levitate'],,0.2,0.6,"nan, 30.0, 50.0, 65.0, nan, 45.0"
4,5,718,"fighting, psychic","['Pure Power', 'Telepathy']",,1.3,31.5,"nan, 100.0, 85.0, 80.0, 85.0, 100.0"


In [14]:
print(len(train),len(test))

520 281


What to do: 

separate abilities

plot things out-happiness v. legendary, weight v. legendary, height v. legendary, percentage of types legendary, percentage of abiliies legendary

In [15]:
types=train['types']
what=types[4].split(',')
what[1]

' nan'

How to go about it

In [16]:
train['type1']=train['types'].apply(lambda x: x.split(',')[0])
train['type2']=train['types'].apply(lambda x: x.split(',')[1])
train['hp']=train['poke_stats'].apply(lambda x: x.split(',')[0])
train['attack']=train['poke_stats'].apply(lambda x: x.split(',')[1])
train['defense']=train['poke_stats'].apply(lambda x: x.split(',')[2])
train['sp_attack']=train['poke_stats'].apply(lambda x: x.split(',')[3])
train['sp_defense']=train['poke_stats'].apply(lambda x: x.split(',')[4])
train['speed']=train['poke_stats'].apply(lambda x: x.split(',')[5])
test['type1']=test['types'].apply(lambda x: x.split(',')[0])
test['type2']=test['types'].apply(lambda x: x.split(',')[1])
test['hp']=test['poke_stats'].apply(lambda x: x.split(',')[0])
test['attack']=test['poke_stats'].apply(lambda x: x.split(',')[1])
test['defense']=test['poke_stats'].apply(lambda x: x.split(',')[2])
test['sp_attack']=test['poke_stats'].apply(lambda x: x.split(',')[3])
test['sp_defense']=test['poke_stats'].apply(lambda x: x.split(',')[4])
test['speed']=test['poke_stats'].apply(lambda x: x.split(',')[5])

In [17]:
train=train.drop(columns=['types','poke_stats'])
test=test.drop(columns=['types','poke_stats'])

In [18]:
#test.isna().any() checking

In [19]:
train.hp.unique()

array(['nan', '109', '74', '43', '144', '65', '46', '60', '50', '70',
       '100', '106', '35', '58', '75', '45', '48', '79', '80', '66', '55',
       '83', '64', '40', '85', '137', '42', '110', '38', '115', '30',
       '51'], dtype=object)

In [20]:
train.head()

Unnamed: 0,rowid,unique_id,abilities,base_happiness,height_m,weight_kg,is_legendary,type1,type2,hp,attack,defense,sp_attack,sp_defense,speed
0,1,665,"['Guts', 'Sheer Force', 'Iron Fist']",70.0,0.6,12.5,0,fighting,,,80.0,55.0,25.0,35.0,35.0
1,2,288,"['Bulletproof', 'Soundproof', 'Overcoat']",70.0,0.6,29.7,0,dragon,,,,65.0,45.0,45.0,45.0
2,3,349,"['Corrosion', 'Oblivious']",70.0,1.2,22.2,0,poison,fire,,64.0,60.0,111.0,60.0,117.0
3,4,260,"['Rock Head', 'Lightningrod', 'Battle Armor']",70.0,0.4,6.5,0,ground,,,50.0,95.0,40.0,50.0,35.0
4,5,536,"['Torrent', 'Sheer Force']",70.0,0.6,9.5,0,water,,,65.0,64.0,44.0,48.0,43.0


In [21]:
type1_count=train.groupby('type1')['rowid'].nunique()
type2_count=train.groupby('type2')['rowid'].nunique()
print(type1_count)
print(type2_count)

type1
bug         39
dark        20
dragon      20
electric    28
fairy       13
fighting    15
fire        38
flying       1
ghost       17
grass       50
ground      18
ice         16
normal      65
poison      24
psychic     36
rock        29
steel       15
water       76
Name: rowid, dtype: int64
type2
 bug           4
 dark         17
 dragon       11
 electric      5
 fairy        17
 fighting     18
 fire          7
 flying       60
 ghost        11
 grass        13
 ground       26
 ice           7
 nan         256
 normal        2
 poison       26
 psychic      13
 rock          7
 steel        12
 water         8
Name: rowid, dtype: int64


In [22]:
legendary=train[train['is_legendary']=='1']

In [23]:
legendary

Unnamed: 0,rowid,unique_id,abilities,base_happiness,height_m,weight_kg,is_legendary,type1,type2,hp,attack,defense,sp_attack,sp_defense,speed
5,6,293,['Turboblaze'],0.0,3.2,330.0,1,dragon,fire,,120.0,100.0,150.0,120.0,90.0
9,10,140,['Beast Boost'],0.0,1.2,55.5,1,rock,poison,109.0,53.0,47.0,127.0,131.0,103.0
38,39,236,['Natural Cure'],100.0,0.6,5.0,1,psychic,grass,,,100.0,100.0,,100.0
46,47,640,['Bad Dreams'],0.0,1.5,50.5,1,dark,,,90.0,90.0,135.0,90.0,125.0
51,52,563,['Beast Boost'],0.0,3.8,100.0,1,electric,,,,71.0,,71.0,83.0
61,62,66,"['Clear Body', 'Ice Body']",,1.8,175.0,1,ice,,,,,100.0,200.0,50.0
69,70,634,['Drizzle'],0.0,4.5,352.0,1,water,,,150.0,90.0,180.0,160.0,90.0
82,83,102,['Serene Grace'],100.0,0.3,1.1,1,steel,psychic,100.0,100.0,100.0,100.0,100.0,100.0
101,102,42,"['Pressure', 'Regenerator']",,3.8,199.0,1,fire,flying,106.0,,90.0,110.0,,90.0
109,110,458,['Prism Armor'],0.0,2.4,230.0,1,psychic,,,,101.0,127.0,,79.0


In [24]:
len(legendary)

48

In [25]:
for i in range(15):
    tnull=train[train.columns[i]].isnull().sum()
    if tnull>0:
        print(train.columns[i],":",tnull)

base_happiness : 47
height_m : 14
weight_kg : 14


In [26]:
train.base_happiness.unique()

array(['70.0', '0.0', '35.0', '100.0', None, '140.0', '90.0'],
      dtype=object)

In [27]:
sadmon=train[train['base_happiness']=='0.0']
#sadmon is all but 2 legendary

In [28]:
len(sadmon)

24

In [29]:
unknown=train[train['base_happiness'].isnull()]
#unknown is not the same as zero

In [30]:
categorical_features=train.select_dtypes(include=["object"]).columns
numerical_features=train.select_dtypes(exclude=["object"]).columns
print("cat features:",len(categorical_features))
print(categorical_features)
print("num features:",len(numerical_features))
print(numerical_features)

cat features: 14
Index(['unique_id', 'abilities', 'base_happiness', 'height_m', 'weight_kg',
       'is_legendary', 'type1', 'type2', 'hp', 'attack', 'defense',
       'sp_attack', 'sp_defense', 'speed'],
      dtype='object')
num features: 1
Index(['rowid'], dtype='object')


In [31]:
train=train.replace({'hp' : {'nan': None},
                     'attack' : {' nan': None},
                     'defense' : {' nan': None},
                     'sp_attack' : {' nan': None},
                     'sp_defense' : {' nan': None},
                     'speed' : {' nan': None}})
test=test.replace({'base_happiness' : {"michael was here" : None},
                   'height_m' : {' ' : None},
                   'weight_kg' : {' ' : None},
                   'hp' : {'nan': None},
                   'attack' : {' nan': None},
                   'defense' : {' nan': None},
                   'sp_attack' : {' nan': None},
                   'sp_defense' : {' nan': None},
                   'speed' : {' nan': None}})

In [32]:
train.base_happiness=pd.to_numeric(train.base_happiness)
train.height_m=pd.to_numeric(train.height_m)
train.weight_kg=pd.to_numeric(train.weight_kg)
train.is_legendary=pd.to_numeric(train.is_legendary)
train.hp=pd.to_numeric(train.hp)
train.attack=pd.to_numeric(train.attack)
train.defense=pd.to_numeric(train.defense)
train.sp_attack=pd.to_numeric(train.sp_attack)
train.sp_defense=pd.to_numeric(train.sp_defense)
train.speed=pd.to_numeric(train.speed)
test.base_happiness=pd.to_numeric(test.base_happiness)
test.height_m=pd.to_numeric(test.height_m)
test.weight_kg=pd.to_numeric(test.weight_kg)
test.hp=pd.to_numeric(test.hp)
test.attack=pd.to_numeric(test.attack)
test.defense=pd.to_numeric(test.defense)
test.sp_attack=pd.to_numeric(test.sp_attack)
test.sp_defense=pd.to_numeric(test.sp_defense)
test.speed=pd.to_numeric(test.speed)

In [33]:
categorical_features=train.select_dtypes(include=["object"]).columns
numerical_features=train.select_dtypes(exclude=["object"]).columns
print("cat features:",len(categorical_features))
print(categorical_features)
print("num features:",len(numerical_features))
print(numerical_features)

cat features: 4
Index(['unique_id', 'abilities', 'type1', 'type2'], dtype='object')
num features: 11
Index(['rowid', 'base_happiness', 'height_m', 'weight_kg', 'is_legendary',
       'hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed'],
      dtype='object')


In [34]:
categorical_features=test.select_dtypes(include=["object"]).columns
numerical_features=test.select_dtypes(exclude=["object"]).columns
print("cat features:",len(categorical_features))
print(categorical_features)
print("num features:",len(numerical_features))
print(numerical_features)

cat features: 4
Index(['unique_id', 'abilities', 'type1', 'type2'], dtype='object')
num features: 10
Index(['rowid', 'base_happiness', 'height_m', 'weight_kg', 'hp', 'attack',
       'defense', 'sp_attack', 'sp_defense', 'speed'],
      dtype='object')


In [35]:
#train.isna().any() checked to see how many were null

In [36]:
train2=train.drop(columns='is_legendary')
test2=test
combo=pd.concat([train2,test2])
print(combo.mean())
print(combo.median())

rowid             218.578027
unique_id                inf
base_happiness     65.411437
height_m            1.163462
weight_kg          61.423462
hp                 63.915663
attack             79.227941
defense            73.301887
sp_attack          71.211405
sp_defense         70.286810
speed              66.192893
dtype: float64
rowid             201.00
unique_id         400.00
base_happiness     70.00
height_m            1.00
weight_kg          27.65
hp                 60.00
attack             75.00
defense            70.00
sp_attack          65.00
sp_defense         65.00
speed              65.00
dtype: float64


In [37]:
train=train.fillna(combo.mean())
test=test.fillna(combo.mean())
train.head()

Unnamed: 0,rowid,unique_id,abilities,base_happiness,height_m,weight_kg,is_legendary,type1,type2,hp,attack,defense,sp_attack,sp_defense,speed
0,1,665,"['Guts', 'Sheer Force', 'Iron Fist']",70.0,0.6,12.5,0,fighting,,63.915663,80.0,55.0,25.0,35.0,35.0
1,2,288,"['Bulletproof', 'Soundproof', 'Overcoat']",70.0,0.6,29.7,0,dragon,,63.915663,79.227941,65.0,45.0,45.0,45.0
2,3,349,"['Corrosion', 'Oblivious']",70.0,1.2,22.2,0,poison,fire,63.915663,64.0,60.0,111.0,60.0,117.0
3,4,260,"['Rock Head', 'Lightningrod', 'Battle Armor']",70.0,0.4,6.5,0,ground,,63.915663,50.0,95.0,40.0,50.0,35.0
4,5,536,"['Torrent', 'Sheer Force']",70.0,0.6,9.5,0,water,,63.915663,65.0,64.0,44.0,48.0,43.0


In [38]:
corrmat=train.corr()
corrmat

Unnamed: 0,rowid,base_happiness,height_m,weight_kg,is_legendary,hp,attack,defense,sp_attack,sp_defense,speed
rowid,1.0,-0.002708,0.018949,0.046309,-0.011286,-0.083683,-0.008515,-0.041466,0.024439,0.00146,0.059613
base_happiness,-0.002708,1.0,-0.295233,-0.375495,-0.431916,-0.066646,-0.196162,-0.206992,-0.191422,-0.22246,-0.203218
height_m,0.018949,-0.295233,1.0,0.556165,0.300195,0.139841,0.346853,0.31232,0.319995,0.306693,0.256357
weight_kg,0.046309,-0.375495,0.556165,1.0,0.359598,0.069421,0.29786,0.425359,0.207102,0.326738,0.079207
is_legendary,-0.011286,-0.431916,0.300195,0.359598,1.0,0.140133,0.227431,0.244562,0.384133,0.369264,0.355673
hp,-0.083683,-0.066646,0.139841,0.069421,0.140133,1.0,0.106769,0.064412,0.132199,0.078734,0.093937
attack,-0.008515,-0.196162,0.346853,0.29786,0.227431,0.106769,1.0,0.348627,0.309095,0.223341,0.309811
defense,-0.041466,-0.206992,0.31232,0.425359,0.244562,0.064412,0.348627,1.0,0.232295,0.487194,0.04555
sp_attack,0.024439,-0.191422,0.319995,0.207102,0.384133,0.132199,0.309095,0.232295,1.0,0.434077,0.426542
sp_defense,0.00146,-0.22246,0.306693,0.326738,0.369264,0.078734,0.223341,0.487194,0.434077,1.0,0.240783


In [39]:
train['bin_bh']=train['base_happiness']
train['special']=train['sp_attack']+train['sp_defense']
#train['physical']=train['attack']+train['defense']
train['gstats']=train['sp_attack']+train['sp_defense']+train['speed']
#train['superstat']=train['hp']+train['attack']+train['defense']+train['sp_attack']+train['sp_defense']+train['speed']
train['comboAtk']=train['sp_attack']+train['attack']
train['comboDef']=train['defense']+train['sp_defense']
train['bin_bh'] = train['bin_bh'].mask(train['bin_bh'] > 1, 1)

In [40]:
corrmat=train.corr()
corrmat

Unnamed: 0,rowid,base_happiness,height_m,weight_kg,is_legendary,hp,attack,defense,sp_attack,sp_defense,speed,bin_bh,special,gstats,comboAtk,comboDef
rowid,1.0,-0.002708,0.018949,0.046309,-0.011286,-0.083683,-0.008515,-0.041466,0.024439,0.00146,0.059613,0.034315,0.016498,0.038223,0.011712,-0.02428
base_happiness,-0.002708,1.0,-0.295233,-0.375495,-0.431916,-0.066646,-0.196162,-0.206992,-0.191422,-0.22246,-0.203218,0.716773,-0.242328,-0.268196,-0.238715,-0.248534
height_m,0.018949,-0.295233,1.0,0.556165,0.300195,0.139841,0.346853,0.31232,0.319995,0.306693,0.256357,-0.345876,0.370153,0.386092,0.40964,0.358947
weight_kg,0.046309,-0.375495,0.556165,1.0,0.359598,0.069421,0.29786,0.425359,0.207102,0.326738,0.079207,-0.419548,0.308303,0.263003,0.306151,0.438451
is_legendary,-0.011286,-0.431916,0.300195,0.359598,1.0,0.140133,0.227431,0.244562,0.384133,0.369264,0.355673,-0.626459,0.444933,0.484819,0.386093,0.352629
hp,-0.083683,-0.066646,0.139841,0.069421,0.140133,1.0,0.106769,0.064412,0.132199,0.078734,0.093937,-0.10752,0.127209,0.135231,0.148809,0.082609
attack,-0.008515,-0.196162,0.346853,0.29786,0.227431,0.106769,1.0,0.348627,0.309095,0.223341,0.309811,-0.169496,0.318467,0.371103,0.767519,0.334718
defense,-0.041466,-0.206992,0.31232,0.425359,0.244562,0.064412,0.348627,1.0,0.232295,0.487194,0.04555,-0.142665,0.410506,0.324059,0.351519,0.875065
sp_attack,0.024439,-0.191422,0.319995,0.207102,0.384133,0.132199,0.309095,0.232295,1.0,0.434077,0.426542,-0.236919,0.875696,0.83493,0.846872,0.381128
sp_defense,0.00146,-0.22246,0.306693,0.326738,0.369264,0.078734,0.223341,0.487194,0.434077,1.0,0.240783,-0.240547,0.81512,0.70903,0.41747,0.849005


In [41]:
test['bin_bh']=test['base_happiness']
test['special']=test['sp_attack']+test['sp_defense']
#test['physical']=test['attack']+test['defense']
test['gstats']=test['sp_attack']+test['sp_defense']+test['speed']
#test['superstat']=test['hp']+test['attack']+test['defense']+test['sp_attack']+test['sp_defense']+test['speed']
test['comboAtk']=test['sp_attack']+test['attack']
test['comboDef']=test['defense']+test['sp_defense']
test['bin_bh'] = test['bin_bh'].mask(test['bin_bh'] > 1, 1)

In [42]:
train.columns

Index(['rowid', 'unique_id', 'abilities', 'base_happiness', 'height_m',
       'weight_kg', 'is_legendary', 'type1', 'type2', 'hp', 'attack',
       'defense', 'sp_attack', 'sp_defense', 'speed', 'bin_bh', 'special',
       'gstats', 'comboAtk', 'comboDef'],
      dtype='object')

In [43]:
I_am_legend=train['is_legendary']

In [44]:
train=train.drop(columns=['abilities','base_happiness','is_legendary','type1','type2','hp','attack','defense',
                          'sp_attack','sp_defense','speed'])
test=test.drop(columns=['abilities','base_happiness','type1','type2','hp','attack','defense',
                        'sp_attack','sp_defense','speed'])

possible issues that may arrise, the combined stat categories might overfit the data so comboAtk/Def and superstat might overfit, abilities and type may play a role

In [45]:
train.columns

Index(['rowid', 'unique_id', 'height_m', 'weight_kg', 'bin_bh', 'special',
       'gstats', 'comboAtk', 'comboDef'],
      dtype='object')

In [46]:
%store train
%store test
%store I_am_legend

Stored 'train' (DataFrame)
Stored 'test' (DataFrame)
Stored 'I_am_legend' (Series)
