In [1]:
import pandas as pd

In [2]:
# Load the Pokemon dataset, using the '#' column as the row index.
df = pd.read_csv("Pokemon.csv", index_col='#')
# A bare `df` as the last expression renders the frame (long frames are shown truncated).
df

Unnamed: 0_level_0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...
719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


### astype()
Python and pandas have many data types, and values can often be converted from one type to another.
To convert a DataFrame column to another data type, use the `astype()` method.

* Note: Missing values (`NaN`) can't be converted to an integer type.
* The conversion still needs to make sense: converting the string 'text' to a float won't work.

In [3]:
# Let's check the data type of every column in the DataFrame.
df.dtypes

Name          object
Type 1        object
Type 2        object
Total          int64
HP             int64
Attack         int64
Defense        int64
Sp. Atk        int64
Sp. Def        int64
Speed          int64
Generation     int64
Legendary       bool
dtype: object

In [4]:
# Another way to inspect the dtypes is the info() method.
df.info()

# Notice in the output below:
# dtypes: bool(1), int64(8), object(3)
# i.e. one boolean column, 8 integer columns and 3 object (string) columns.

# int64 stores every value in 8 bytes, whereas int8 needs only 1 byte,
# so int64 is wasteful for columns that only hold small numbers.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 1 to 721
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        800 non-null    object
 1   Type 1      800 non-null    object
 2   Type 2      414 non-null    object
 3   Total       800 non-null    int64 
 4   HP          800 non-null    int64 
 5   Attack      800 non-null    int64 
 6   Defense     800 non-null    int64 
 7   Sp. Atk     800 non-null    int64 
 8   Sp. Def     800 non-null    int64 
 9   Speed       800 non-null    int64 
 10  Generation  800 non-null    int64 
 11  Legendary   800 non-null    bool  
dtypes: bool(1), int64(8), object(3)
memory usage: 75.8+ KB


In [5]:
# The Generation column does not seem to contain any large numbers.
# Tally how often each distinct value occurs to confirm that.
generation_counts = df['Generation'].value_counts()
generation_counts

# The column only holds the values 1 to 6:
# 1 occurs 166 times, 5 occurs 165 times, and so on.

1    166
5    165
3    160
4    121
2    106
6     82
Name: Generation, dtype: int64

In [7]:
# Generation only holds the values 1 to 6 (see the value counts above), so the
# 8-byte int64 column can be stored as a 1-byte int8 instead.
# astype() does not range-check an integer-to-integer cast: a value outside
# the int8 range (-128..127) would silently wrap around. Cast into a
# temporary first and verify the round trip before overwriting the column.
generation_int8 = df['Generation'].astype('int8')
assert (generation_int8 == df['Generation']).all(), "Generation values do not fit in int8"
df['Generation'] = generation_int8
df.info()

# In the output, Generation is now int8 and the memory usage dropped
# to 70.3+ KB (it was 75.8+ KB before the conversion).

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 1 to 721
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        800 non-null    object
 1   Type 1      800 non-null    object
 2   Type 2      414 non-null    object
 3   Total       800 non-null    int64 
 4   HP          800 non-null    int64 
 5   Attack      800 non-null    int64 
 6   Defense     800 non-null    int64 
 7   Sp. Atk     800 non-null    int64 
 8   Sp. Def     800 non-null    int64 
 9   Speed       800 non-null    int64 
 10  Generation  800 non-null    int8  
 11  Legendary   800 non-null    bool  
dtypes: bool(1), int64(7), int8(1), object(3)
memory usage: 70.3+ KB


In [8]:
# Shrink the Total column as well. Its values need more than one byte
# (625 and 700 both appear in the table preview), so the target is
# int16 (-32768..32767) rather than int8.
# astype() would silently wrap values that do not fit, so cast into a
# temporary first and verify the round trip before overwriting the column.
total_int16 = df['Total'].astype('int16')
assert (total_int16 == df['Total']).all(), "Total values do not fit in int16"
df['Total'] = total_int16
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 1 to 721
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        800 non-null    object
 1   Type 1      800 non-null    object
 2   Type 2      414 non-null    object
 3   Total       800 non-null    int16 
 4   HP          800 non-null    int64 
 5   Attack      800 non-null    int64 
 6   Defense     800 non-null    int64 
 7   Sp. Atk     800 non-null    int64 
 8   Sp. Def     800 non-null    int64 
 9   Speed       800 non-null    int64 
 10  Generation  800 non-null    int8  
 11  Legendary   800 non-null    bool  
dtypes: bool(1), int16(1), int64(6), int8(1), object(3)
memory usage: 65.6+ KB


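We can change a Boolean column into an integer column: True becomes 1 and False becomes 0.
Converting Boolean to `int8` does not change the memory usage, because a Boolean column already stores each value in 1 byte, the same as `int8` (a wider type such as `int64` would use more memory).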

In [9]:
# Cast the boolean flags to integers: True becomes 1 and False becomes 0.
legendary_flags = df['Legendary'].astype('int8')
df['Legendary'] = legendary_flags
df['Legendary'].head()

#
1    0
2    0
3    0
3    0
4    0
Name: Legendary, dtype: int8

In [10]:
# Legendary is now int8 instead of bool, while the reported memory usage
# is unchanged (65.6+ KB, the same as before the conversion).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 1 to 721
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        800 non-null    object
 1   Type 1      800 non-null    object
 2   Type 2      414 non-null    object
 3   Total       800 non-null    int16 
 4   HP          800 non-null    int64 
 5   Attack      800 non-null    int64 
 6   Defense     800 non-null    int64 
 7   Sp. Atk     800 non-null    int64 
 8   Sp. Def     800 non-null    int64 
 9   Speed       800 non-null    int64 
 10  Generation  800 non-null    int8  
 11  Legendary   800 non-null    int8  
dtypes: int16(1), int64(6), int8(2), object(3)
memory usage: 65.6+ KB
