#17: Resampling pada data deret waktu (time series data)

In [1]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

2.3.3
2.3.4


Persiapan Data Frame

In [2]:
n_rows = 365 * 24 
n_cols = 2         
cols = ['col1', 'col2'] 

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), columns=cols)

df.index = pd.date_range(start='2023-01-01', periods=n_rows, freq='h')

df.head() 

Unnamed: 0,col1,col2
2023-01-01 00:00:00,18,14
2023-01-01 01:00:00,1,4
2023-01-01 02:00:00,1,5
2023-01-01 03:00:00,1,10
2023-01-01 04:00:00,9,16


Resampling data dengan interval monthly

In [3]:
df.resample('M')['col1'].sum().to_frame()

  df.resample('M')['col1'].sum().to_frame()


Unnamed: 0,col1
2023-01-31,7480
2023-02-28,6653
2023-03-31,7257
2023-04-30,6844
2023-05-31,7473
2023-06-30,7316
2023-07-31,7439
2023-08-31,7642
2023-09-30,6971
2023-10-31,7611


Resampling data dengan interval daily

In [4]:
df.resample('D')['col1'].sum().to_frame()

Unnamed: 0,col1
2023-01-01,188
2023-01-02,257
2023-01-03,201
2023-01-04,203
2023-01-05,251
...,...
2023-12-27,300
2023-12-28,228
2023-12-29,229
2023-12-30,261


#18: Membentuk dummy Data Frame

In [5]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

2.3.3
2.3.4


Membentuk Data Frame dari Dictionary

In [6]:
pd.DataFrame({'col1':[1, 2, 3, 4], 
              'col2':[5, 6, 7, 8]})

Unnamed: 0,col1,col2
0,1,5
1,2,6
2,3,7
3,4,8


Membentuk Data Frame dari Numpy Array

In [7]:
n_rows = 5
n_cols = 3

arr = np.random.randint(1, 20, size=(n_rows, n_cols))
arr

array([[12, 11, 10],
       [ 3, 19, 17],
       [17, 11, 16],
       [ 5,  7, 15],
       [ 1,  6, 10]], dtype=int32)

In [8]:
pd.DataFrame(arr, columns=tuple('ABC'))

Unnamed: 0,A,B,C
0,12,11,10
1,3,19,17
2,17,11,16
3,5,7,15
4,1,6,10


Membentuk Data Frame dengan memanfaatkan pandas.util.testing

In [9]:
n_rows = 30 
n_cols = 4 
columns = list('ABCD')

data = np.random.randn(n_rows, n_cols)

df = pd.DataFrame(data, columns=columns)

print(df.head())

          A         B         C         D
0 -0.734230  1.070015  0.714601  0.884365
1  1.674984 -1.045617  1.822500 -0.199478
2  2.647135 -0.320775  0.373274 -0.311422
3 -0.075223 -0.007951  1.341287  0.318822
4  0.027745 -1.388964  0.391562  0.110875


In [10]:
n_rows = 5 

data = {
    'A': np.random.randn(n_rows),                 
    'B': np.arange(1, n_rows + 1),                
    'C': pd.date_range('2024-01-01', periods=n_rows), 
    'D': ['foo', 'bar', 'baz', 'qux', 'quux']    
}

df = pd.DataFrame(data)

print(df.head())

          A  B          C     D
0  0.626124  1 2024-01-01   foo
1  0.732556  2 2024-01-02   bar
2  0.133398  3 2024-01-03   baz
3  0.932431  4 2024-01-04   qux
4  0.097393  5 2024-01-05  quux


In [11]:
n_rows = 30  
n_cols = 4   
columns = list('ABCD')  

time_index = pd.date_range(start='2024-01-01', periods=n_rows, freq='D')  

df = pd.DataFrame(np.random.randn(n_rows, n_cols), index=time_index, columns=columns)

print(df.head())

                   A         B         C         D
2024-01-01 -0.531071 -0.243738  1.441691 -0.500639
2024-01-02 -0.284303 -0.267422 -0.226213 -0.323171
2024-01-03  0.737388  0.882070  1.622891 -1.476931
2024-01-04  1.543930  0.361289 -0.327607  1.111963
2024-01-05 -0.382189  0.132574 -1.045349  0.951541


In [12]:
n_rows = 5
n_cols = 4

data = np.random.randn(n_rows, n_cols)

df = pd.DataFrame(data, columns=list('ABCD'))

df.iloc[0, 1] = np.nan
df.iloc[2, 3] = np.nan
df.iloc[4, 0] = np.nan

print(df.head())

          A         B         C         D
0  0.239076       NaN  1.850433 -0.284641
1 -0.308749  1.098825  0.248920  0.298586
2  0.648895  0.281712 -0.081629       NaN
3  0.025357  1.304513 -1.683149  0.046446
4       NaN  0.103152 -0.115258  0.334129


#19: Formatting tampilan Data Frame

In [13]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

2.3.3
2.3.4


Persiapan Data Frame

In [14]:
n_rows = 5
n_cols = 2
cols = ['omset', 'operasional']

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,omset,operasional
0,18,3
1,4,2
2,1,13
3,16,18
4,19,5


In [15]:
df['omset'] = df['omset'] * 100_000
df['operasional'] = df['operasional'] * 10_000
df

Unnamed: 0,omset,operasional
0,1800000,30000
1,400000,20000
2,100000,130000
3,1600000,180000
4,1900000,50000


In [37]:
n_rows = 365  

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, 2)), columns=['col1', 'col2'])

df.index = pd.date_range(start='2024-01-01', periods=n_rows, freq='D')

df = df.reset_index().rename(columns={'index': 'tanggal'})

print(df)

       tanggal  col1  col2
0   2024-01-01    11    15
1   2024-01-02    19    11
2   2024-01-03     2     1
3   2024-01-04     9     6
4   2024-01-05    11     4
..         ...   ...   ...
360 2024-12-26    11    17
361 2024-12-27     2     8
362 2024-12-28    17     1
363 2024-12-29    16     1
364 2024-12-30     3    17

[365 rows x 3 columns]


Melakukan formatting tampilan Data Frame

In [54]:
formatku = {
    'tanggal': '{:%d/%m/%y}',
    'col1': 'Rp {:.2f}',
    'col2': 'Rp {:.2f}'
}



#20: Menggabungkan (merge) dua Data Frame secara berdampingan

In [49]:
import pandas as pd

print(pd.__version__)

2.3.3


Persiapan Data Frame

In [50]:
d1 = {'col1':[1, 2, 3], 
      'col2':[10, 20, 30]}
df1 = pd.DataFrame(d1)
df1

Unnamed: 0,col1,col2
0,1,10
1,2,20
2,3,30


In [51]:
d2 = {'col3':[4, 5, 6], 
      'col4':[40, 50, 60]}
df2 = pd.DataFrame(d2)
df2

Unnamed: 0,col3,col4
0,4,40
1,5,50
2,6,60


Menggabungkan (merge) dua Data Frame secara berdampingan

In [52]:
df = pd.merge(df1, df2, left_index=True, right_index=True)
df

Unnamed: 0,col1,col2,col3,col4
0,1,10,4,40
1,2,20,5,50
2,3,30,6,60
