# Change data types with `.astype()` 

### Import Pandas

In [1]:
import pandas as pd

### Read CSV

In [21]:
# Read the CSV and drop only the rows in which every column is missing (how="all").
# NOTE: dropna() itself does not change dtypes. The salary columns are presumably
# parsed as float64 by read_csv because the file contains missing values (NaN
# cannot be stored in a NumPy integer column) -- verify against the raw CSV.
salaries = pd.read_csv("mls_salaries.csv").dropna(how="all")

### Head

In [22]:
# call head() to preview the first 10 rows and see what we're working with
salaries.head(10)

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.0
1,ATL,Ambrose,Mikey,D,65625.0,65625.0
2,ATL,Asad,Yamil,M,150000.0,150000.0
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.0
5,ATL,Carmona,Carlos,M,675000.0,725000.0
6,ATL,Garza,Greg,D,150000.0,150000.0
7,ATL,Gonzalez Pirez,Leandro,D,250008.0,285008.0
8,ATL,Goslin,Chris,M,70000.0,74000.0
9,ATL,Gressel,Julian,,75000.0,93750.0


### Replace null values in the position column

In [23]:
# Replace missing positions with the placeholder string "none".
# Assign the result back to the column instead of chaining inplace=True:
# the chained form mutates a temporary Series, raises a FutureWarning in
# pandas 2.x, and leaves the DataFrame unchanged once Copy-on-Write is
# active (the default from pandas 3.0).
salaries["position"] = salaries["position"].fillna("none")

In [24]:
# preview again: the previously missing position in row 9 should now read "none"
salaries.head(10)

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.0
1,ATL,Ambrose,Mikey,D,65625.0,65625.0
2,ATL,Asad,Yamil,M,150000.0,150000.0
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.0
5,ATL,Carmona,Carlos,M,675000.0,725000.0
6,ATL,Garza,Greg,D,150000.0,150000.0
7,ATL,Gonzalez Pirez,Leandro,D,250008.0,285008.0
8,ATL,Goslin,Chris,M,70000.0,74000.0
9,ATL,Gressel,Julian,none,75000.0,93750.0


### Check the current data types with the `dtypes` attribute

In [25]:
# inspect the current dtype of every column before converting anything
salaries.dtypes

club                        object
last_name                   object
first_name                  object
position                    object
base_salary                float64
guaranteed_compensation    float64
dtype: object

### Get info with the `info()` method

In [26]:
# info() summarises the index range, per-column non-null counts and dtypes, and memory usage
salaries.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 614
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   club                     614 non-null    object 
 1   last_name                614 non-null    object 
 2   first_name               610 non-null    object 
 3   position                 614 non-null    object 
 4   base_salary              614 non-null    float64
 5   guaranteed_compensation  614 non-null    float64
dtypes: float64(2), object(4)
memory usage: 33.6+ KB


### Change data types using the `astype()` method

In [8]:
# astype() returns a new Series (it has no inplace option), so assign the
# result back to the column to make the conversion stick.
# An explicit "int64" keeps the dtype identical on every platform; plain "int"
# can resolve to int32 on Windows with NumPy < 2.
salaries["base_salary"] = salaries["base_salary"].astype("int64")
salaries["base_salary"]

0      1912500
1        65625
2       150000
3        99225
4        65000
        ...   
609     352000
610     126500
611      80000
612     350000
614     175000
Name: base_salary, Length: 614, dtype: int64

In [9]:
# Convert guaranteed_compensation from float to a 64-bit integer and assign it back.
# Casting float -> int truncates the fractional part (e.g. 106573.89 becomes
# 106573); call .round() before .astype() if nearest-unit rounding is wanted.
# The explicit "int64" avoids the platform-dependent width of plain "int".
salaries["guaranteed_compensation"] = salaries["guaranteed_compensation"].astype("int64")
salaries["guaranteed_compensation"]

0      2297000
1        65625
2       150000
3       106573
4        77400
        ...   
609     377000
610     194000
611      80000
612     368125
614     184000
Name: guaranteed_compensation, Length: 614, dtype: int64

In [10]:
# confirm that both salary columns are now reported with an integer dtype
salaries.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 614
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   club                     614 non-null    object
 1   last_name                614 non-null    object
 2   first_name               610 non-null    object
 3   position                 614 non-null    object
 4   base_salary              614 non-null    int64 
 5   guaranteed_compensation  614 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 33.6+ KB


### Pandas categories
If a column holds only a small number of distinct values repeated across many rows (for example male/female, or the 12 month names in a million rows), converting it to the pandas `category` dtype can save memory.

In [11]:
# count the distinct values in "position"; a small count relative to the
# number of rows makes the column a good candidate for the category dtype
salaries["position"].nunique()

11

In [12]:
# convert the "position" column to the category dtype; astype() returns a new
# Series, so assign it back to the same column
salaries["position"] = salaries["position"].astype("category")

In [13]:
# count the distinct values in "club" to see whether it is also a category candidate
salaries["club"].nunique()

23

In [14]:
# convert the "club" column to the category dtype and assign it back to the same column
salaries["club"] = salaries["club"].astype("category")

In [15]:
# the displayed values look the same, but "club" and "position" now use the category dtype
salaries # compare the memory usage reported by .info() (it's lower!)

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500,2297000
1,ATL,Ambrose,Mikey,D,65625,65625
2,ATL,Asad,Yamil,M,150000,150000
3,ATL,Bloom,Mark,D,99225,106573
4,ATL,Carleton,Andrew,F,65000,77400
...,...,...,...,...,...,...
609,VAN,Techera,Cristian,M,352000,377000
610,VAN,Teibert,Russell,M,126500,194000
611,VAN,Tornaghi,Paolo,GK,80000,80000
612,VAN,Waston,Kendall,D,350000,368125


### Get info with the `info()` method

In [16]:
# "club" and "position" now show as category, and the reported memory usage is lower than before
salaries.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 614
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   club                     614 non-null    category
 1   last_name                614 non-null    object  
 2   first_name               610 non-null    object  
 3   position                 614 non-null    category
 4   base_salary              614 non-null    int64   
 5   guaranteed_compensation  614 non-null    int64   
dtypes: category(2), int64(2), object(2)
memory usage: 26.3+ KB
