# Optimizing DataFrames

### Import Pandas

In [2]:
import pandas as pd

### Employee data


In [3]:
# Load the employee data with default parsing; start_date comes in as plain
# strings here. Passing parse_dates=["start_date"] (done in a later cell)
# converts it to datetimes at load time instead.
df = pd.read_csv("employee.csv")

### Head

In [4]:
# Call head() to preview the first rows and see what we're working with
df.head()

Unnamed: 0,first_name,last_name,salary,start_date,gender,remote,team
0,Haleigh,Calderhead,334473,5/9/2020,,True,management
1,Coretta,McEvon,637457,3/20/2020,Male,False,engineering
2,Clarette,Tarbett,977749,11/22/2020,Agender,True,engineering
3,Jaime,Gianneschi,253523,9/2/2020,Bigender,False,marketing
4,Ediva,Skelton,325185,2/4/2020,Female,True,marketing


### Get Info

In [5]:
# First, get some basic info about the data: start_date and gender both
# contain nulls (976 and 966 non-null values out of 1000 rows).
# Note that start_date is stored as a string (object dtype), not a date.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   first_name  1000 non-null   object
 1   last_name   1000 non-null   object
 2   salary      1000 non-null   int64 
 3   start_date  976 non-null    object
 4   gender      966 non-null    object
 5   remote      1000 non-null   bool  
 6   team        1000 non-null   object
dtypes: bool(1), int64(1), object(5)
memory usage: 48.0+ KB


### Convert String Dates using the `to_datetime()` function

In [6]:
# Convert the start_date strings to datetimes. The result is only displayed;
# nothing is assigned back, so df itself is left untouched.
pd.to_datetime(df["start_date"])

0     2020-05-09
1     2020-03-20
2     2020-11-22
3     2020-09-02
4     2020-02-04
         ...    
995   2020-10-18
996   2020-12-28
997   2020-10-26
998   2020-09-08
999   2020-03-29
Name: start_date, Length: 1000, dtype: datetime64[ns]

In [7]:
# The conversion above was never assigned back to df, so start_date is still
# a string (object dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   first_name  1000 non-null   object
 1   last_name   1000 non-null   object
 2   salary      1000 non-null   int64 
 3   start_date  976 non-null    object
 4   gender      966 non-null    object
 5   remote      1000 non-null   bool  
 6   team        1000 non-null   object
dtypes: bool(1), int64(1), object(5)
memory usage: 48.0+ KB


### Overwrite the original DataFrame with `read_csv()`

In [6]:
# Re-read the CSV, letting pandas parse start_date into datetimes at load
# time, then display the column to confirm the datetime64[ns] dtype.
df = pd.read_csv("employee.csv", parse_dates=["start_date"])
df["start_date"]

0     2020-05-09
1     2020-03-20
2     2020-11-22
3     2020-09-02
4     2020-02-04
         ...    
995   2020-10-18
996   2020-12-28
997   2020-10-26
998   2020-09-08
999   2020-03-29
Name: start_date, Length: 1000, dtype: datetime64[ns]

In [7]:
# Preview the reloaded frame; start_date now displays as YYYY-MM-DD dates
df.head()

Unnamed: 0,first_name,last_name,salary,start_date,gender,remote,team
0,Haleigh,Calderhead,334473,2020-05-09,,True,management
1,Coretta,McEvon,637457,2020-03-20,Male,False,engineering
2,Clarette,Tarbett,977749,2020-11-22,Agender,True,engineering
3,Jaime,Gianneschi,253523,2020-09-02,Bigender,False,marketing
4,Ediva,Skelton,325185,2020-02-04,Female,True,marketing


### Optimization

In [24]:
# Goal: store gender as a category instead of an object (string).
# Check the current dtypes and memory usage first as a baseline.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   first_name  1000 non-null   object        
 1   last_name   1000 non-null   object        
 2   salary      1000 non-null   int64         
 3   start_date  976 non-null    datetime64[ns]
 4   gender      966 non-null    object        
 5   remote      1000 non-null   bool          
 6   team        1000 non-null   object        
dtypes: bool(1), datetime64[ns](1), int64(1), object(4)
memory usage: 48.0+ KB


In [25]:
# Preview the conversion to category; the result is only displayed, so the
# gender column in df is not changed yet
df["gender"].astype("category")

0              NaN
1             Male
2          Agender
3         Bigender
4           Female
          ...     
995     Polygender
996    Genderfluid
997    Genderqueer
998         Female
999     Polygender
Name: gender, Length: 1000, dtype: category
Categories (8, object): [Agender, Bigender, Female, Genderfluid, Genderqueer, Male, Non-binary, Polygender]

In [26]:
# Reassign the converted column back to df so the change is kept
df["gender"] = df["gender"].astype("category")

In [27]:
# Check out the optimization savings: gender is now a category and the
# reported memory usage drops from 48.0+ KB to 41.5+ KB.
# NOTE(review): the "+" suggests object columns are not fully measured;
# df.info(memory_usage="deep") should give a more exact figure -- confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   first_name  1000 non-null   object        
 1   last_name   1000 non-null   object        
 2   salary      1000 non-null   int64         
 3   start_date  976 non-null    datetime64[ns]
 4   gender      966 non-null    category      
 5   remote      1000 non-null   bool          
 6   team        1000 non-null   object        
dtypes: bool(1), category(1), datetime64[ns](1), int64(1), object(3)
memory usage: 41.5+ KB
