In [1]:
import os
import datetime

import pandas as pd
import polars as pl
import dask.dataframe as dd

In [2]:
# Root data directory, given relative to the notebook's working directory.
DATA_DIR_PATH = '../data'

In [3]:
# Directory containing the small (100K) batch CSV files used throughout this notebook.
SM_BATCH_DIR_PATH = os.path.join(DATA_DIR_PATH, 'batch-100K')

In [4]:
def _batch_index(filename):
    """Return the integer N from a batch file name shaped like ``batch_N.csv``."""
    stem, _extension = os.path.splitext(filename)
    return int(stem.split('_')[-1])


# os.listdir() returns every directory entry, so restrict the listing to CSV
# files; otherwise a stray entry (checkpoint folder, hidden OS file, ...) would
# make the int() conversion in the sort key raise ValueError.
# The key is numeric so that batch_10 sorts after batch_9 rather than after batch_1.
filenames = sorted(
    (name for name in os.listdir(SM_BATCH_DIR_PATH) if name.endswith('.csv')),
    key=_batch_index,
)

In [5]:
# Sanity check: first file name after sorting by batch index.
filenames[0]

'batch_0.csv'

In [6]:
# Naive local timestamp captured once; every library section below computes
# ages against this same reference instant.
today_datetime = datetime.datetime.now()

### Benchmark Exercise

1. Read & Write File
2. Date Operation
3. Math computation
4. Groupby and Count
5. String operation (e.g. replace)
6. Sorting

### Pandas

Load Data

In [None]:
# Read the first batch file into a pandas DataFrame.
d_sm_batch_pd = pd.read_csv(
    os.path.join(SM_BATCH_DIR_PATH, filenames[0]),
)

In [None]:
# Preview the first rows only, matching the .head() previews used in the
# Polars and Dask sections instead of displaying the whole frame.
d_sm_batch_pd.head()

In [None]:
# %timeit -r 10 -n 1 d_sm_batch_pd = pd.read_csv(os.path.join(SM_BATCH_DIR_PATH, filenames[0]))

In [None]:
### Save the timeit result to a variable
# timedur = %timeit -r 2 -n 5 -o pd.read_csv(os.path.join(SM_BATCH_DIR_PATH, filenames[0]))

Datetime operation

In [None]:
# Parse the birthdate column into pandas datetimes.
d_sm_batch_pd['birthdate'] = pd.to_datetime(d_sm_batch_pd['birthdate'])

In [None]:
# Age in whole days: reference timestamp minus birthdate, keeping only the day component.
d_sm_batch_pd['age_in_days'] = (
    today_datetime - d_sm_batch_pd['birthdate']
).dt.days

Math operation

In [None]:
# Total of all ages (in days) across the batch.
d_sm_batch_pd['age_in_days'].sum()

Groupby and count

In [None]:
# Count usernames within each value of the sex column.
d_sm_batch_pd.groupby('sex')['username'].count()

String operation

In [None]:
# Split each address on '@', take the second piece (column 1 of the expanded
# frame) and tally how often each value occurs.
d_sm_batch_pd['mail'].str.split('@', expand=True)[1].value_counts()

In [None]:
%timeit d_sm_batch_pd.mail.str.replace('.com', '.id')

Sorting

In [None]:
# Order rows by age in days, ascending (the sort_values default).
d_sm_batch_pd.sort_values(by='age_in_days')

### Polars

In [None]:
# Read the same first batch file, this time into a Polars DataFrame.
d_sm_batch_pl = pl.read_csv(
    os.path.join(SM_BATCH_DIR_PATH, filenames[0]),
)

In [None]:
# Preview the first five rows (the head() default, spelled out).
d_sm_batch_pl.head(5)

Datetime operation

In [None]:
# Replace the string birthdate column with its parsed datetime equivalent.
d_sm_batch_pl = d_sm_batch_pl.with_columns(
    pl.col('birthdate').str.to_datetime(),
)

In [None]:
# Elapsed time between the reference timestamp and each birthdate.
age_delta = today_datetime - d_sm_batch_pl['birthdate']
# Newer Polars releases name the whole-day accessor `dt.total_days()` and no
# longer ship `dt.days()`; older releases only have `dt.days()`. Use whichever
# accessor the installed version provides so the cell runs on both.
whole_days = getattr(age_delta.dt, 'total_days', None) or age_delta.dt.days
d_sm_batch_pl = d_sm_batch_pl.with_columns(whole_days().alias('age_in_days'))

Math operation

In [None]:
# Total of all ages (in days) across the batch.
d_sm_batch_pl.get_column('age_in_days').sum()

Groupby and count

In [None]:
# Count usernames within each value of the sex column; the result column is
# named `total_users`.
d_sm_batch_pl.group_by('sex').agg(
    pl.col('username').count().alias('total_users'),
)

String operation

In [None]:
# Polars' str.replace() interprets the pattern as a regex (so '.' would match
# any character) and substitutes only the first match. replace_all() with
# literal=True performs a plain-text replacement of every occurrence, which is
# what the pandas and Dask sections do.
d_sm_batch_pl = d_sm_batch_pl.with_columns(
    d_sm_batch_pl['mail'].str.replace_all('.com', '.co.id', literal=True)
)

Sorting

In [None]:
# Order rows by age in days, ascending (the sort default).
d_sm_batch_pl.sort(by='age_in_days')

### Dask

In [8]:
# Point a Dask DataFrame at the same first batch file.
d_sm_batch_dd = dd.read_csv(
    os.path.join(SM_BATCH_DIR_PATH, filenames[0]),
)

In [9]:
# Preview the first five rows (the head() default, spelled out); no explicit
# .compute() is needed to get concrete rows here.
d_sm_batch_dd.head(5)

Unnamed: 0,job,company,ssn,blood_group,username,sex,mail,birthdate
0,Emergency planning/management officer,"Washington, Torres and Conner",036-51-3780,B-,samanthafisher,F,jaredlogan@yahoo.com,1925-05-15
1,English as a second language teacher,Hudson-Woods,197-91-9713,A+,fallen,F,chelsea15@yahoo.com,1987-11-13
2,English as a foreign language teacher,Moore-Pratt,226-31-0368,O-,whiteheadchristopher,M,zclayton@yahoo.com,1979-09-19
3,Dealer,Smith Group,041-74-2846,A+,wstewart,M,christophertorres@yahoo.com,1915-03-11
4,Minerals surveyor,"Simmons, Horton and Martin",474-61-9160,O+,valerie23,M,dennis58@gmail.com,1992-12-18


In [10]:
# Inspect column dtypes as loaded: every column, including birthdate, is `object` at this point.
d_sm_batch_dd.dtypes

job            object
company        object
ssn            object
blood_group    object
username       object
sex            object
mail           object
birthdate      object
dtype: object

Datetime operation

In [11]:
# Parse the birthdate column into datetimes.
d_sm_batch_dd['birthdate'] = dd.to_datetime(d_sm_batch_dd['birthdate'])

In [12]:
# Re-check dtypes: birthdate should now report datetime64[ns] instead of object.
d_sm_batch_dd.dtypes

job                    object
company                object
ssn                    object
blood_group            object
username               object
sex                    object
mail                   object
birthdate      datetime64[ns]
dtype: object

In [13]:
# Age in whole days: reference timestamp minus birthdate, keeping only the day component.
d_sm_batch_dd['age_in_days'] = (
    today_datetime - d_sm_batch_dd['birthdate']
).dt.days

In [14]:
# Preview the first five rows to confirm the new age_in_days column is present.
d_sm_batch_dd.head(5)

Unnamed: 0,job,company,ssn,blood_group,username,sex,mail,birthdate,age_in_days
0,Emergency planning/management officer,"Washington, Torres and Conner",036-51-3780,B-,samanthafisher,F,jaredlogan@yahoo.com,1925-05-15,35970
1,English as a second language teacher,Hudson-Woods,197-91-9713,A+,fallen,F,chelsea15@yahoo.com,1987-11-13,13143
2,English as a foreign language teacher,Moore-Pratt,226-31-0368,O-,whiteheadchristopher,M,zclayton@yahoo.com,1979-09-19,16120
3,Dealer,Smith Group,041-74-2846,A+,wstewart,M,christophertorres@yahoo.com,1915-03-11,39688
4,Minerals surveyor,"Simmons, Horton and Martin",474-61-9160,O+,valerie23,M,dennis58@gmail.com,1992-12-18,11281


Math operation

In [16]:
# Total of all ages (in days); .compute() materialises the lazy result.
d_sm_batch_dd['age_in_days'].sum().compute()

2123019047

Groupby and count

In [20]:
# Count usernames within each value of the sex column, then materialise the result.
d_sm_batch_dd.groupby('sex')['username'].count().compute()

sex
F    49766
M    50234
Name: username, dtype: int64

String operation

In [23]:
# regex=False makes '.com' a literal substring rather than a pattern in which
# '.' matches any character, independent of the installed pandas default.
d_sm_batch_dd['mail'].str.replace('.com', '.co.id', regex=False).compute()

0               jaredlogan@yahoo.co.id
1                chelsea15@yahoo.co.id
2                 zclayton@yahoo.co.id
3        christophertorres@yahoo.co.id
4                 dennis58@gmail.co.id
                     ...              
99995          lmatthews@hotmail.co.id
99996           janice54@hotmail.co.id
99997          anthonyryan@yahoo.co.id
99998           daniel30@hotmail.co.id
99999           jennifer20@yahoo.co.id
Name: mail, Length: 100000, dtype: object

Sorting

In [25]:
# Order rows by age in days, ascending, then materialise the sorted frame.
d_sm_batch_dd.sort_values(by='age_in_days').compute()

Unnamed: 0,job,company,ssn,blood_group,username,sex,mail,birthdate,age_in_days
27402,"Psychologist, sport and exercise",Thomas PLC,442-06-7688,AB-,williamsann,M,howepaul@hotmail.com,2023-11-02,5
7798,Careers adviser,Scott-Garrison,453-03-3169,O+,deannasmith,F,debra41@hotmail.com,2023-11-02,5
96738,Materials engineer,Mcdonald-Nelson,503-04-2774,O-,mcknightjean,F,stephenssarah@yahoo.com,2023-11-02,5
49842,Geophysicist/field seismologist,Jackson-Bowman,750-50-5473,AB-,tmoreno,F,michael69@gmail.com,2023-11-01,6
68178,Catering manager,Hughes-Cohen,567-99-9756,AB-,nicholasowens,M,teresa44@hotmail.com,2023-11-01,6
...,...,...,...,...,...,...,...,...,...
11153,Engineering geologist,Smith Inc,298-34-8288,AB+,fwest,F,reneetaylor@gmail.com,1907-11-06,42370
21380,"Journalist, newspaper","Mckenzie, Hicks and Nelson",324-21-1696,A-,jonesjose,F,timothycolon@yahoo.com,1907-11-05,42371
69204,"Engineer, water","Porter, Campbell and Davis",337-29-8801,A+,gloriaking,F,deanfrederick@yahoo.com,1907-11-05,42371
81151,Electrical engineer,"Sutton, Barrera and Olson",615-15-4279,AB-,cobbkristen,F,wjohnson@yahoo.com,1907-11-05,42371
