![](./image/pandas-logo.png)

# Python Pandas: Tips & Tricks

Oleh Channel YouTube [Indonesia Belajar](https://www.youtube.com/IndonesiaBelajarKomputer)

## \#01: Menyertakan Prefix dan Suffix pada seluruh Kolom Data Frame

### Import Modules

In [1]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [2]:
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 10, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,4,4,3,1,7
1,2,4,9,4,3
2,5,7,2,2,4
3,3,9,2,7,1
4,1,6,7,4,6


In [3]:
tuple('ABCDE')

('A', 'B', 'C', 'D', 'E')

### Menyertakan Prefix Kolom

In [4]:
df.add_prefix('kolom_')

Unnamed: 0,kolom_A,kolom_B,kolom_C,kolom_D,kolom_E
0,4,4,3,1,7
1,2,4,9,4,3
2,5,7,2,2,4
3,3,9,2,7,1
4,1,6,7,4,6


### Menyertakan Suffix Kolom

In [5]:
df.add_suffix('_field')

Unnamed: 0,A_field,B_field,C_field,D_field,E_field
0,4,4,3,1,7
1,2,4,9,4,3
2,5,7,2,2,4
3,3,9,2,7,1
4,1,6,7,4,6


## \#02: Pemilihan baris (rows selection) pada Data Frame

### Import Modules

In [6]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [7]:
n_rows = 10
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 5, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,4,3,4,2,2
1,2,4,4,1,3
2,4,2,4,4,4
3,2,3,3,3,2
4,4,1,3,2,4
5,1,3,2,3,3
6,3,4,2,1,1
7,3,3,1,1,2
8,3,4,4,2,1
9,4,2,3,1,4


### Selection dengan operator logika `|`

In [8]:
df[(df['A'] == 1) | (df['A'] == 3)]

Unnamed: 0,A,B,C,D,E
5,1,3,2,3,3
6,3,4,2,1,1
7,3,3,1,1,2
8,3,4,4,2,1


### Selection dengan fungsi `isin()`

In [9]:
df[df['A'].isin([1, 3])]

Unnamed: 0,A,B,C,D,E
5,1,3,2,3,3
6,3,4,2,1,1
7,3,3,1,1,2
8,3,4,4,2,1


### Mengenal operator negasi `~`

In [10]:
df[~df['A'].isin([1, 3])]

Unnamed: 0,A,B,C,D,E
0,4,3,4,2,2
1,2,4,4,1,3
2,4,2,4,4,4
3,2,3,3,3,2
4,4,1,3,2,4
9,4,2,3,1,4


## \#03: Konversi tipe data String ke Numerik pada kolom Data Frame

### Import Modules

In [11]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [12]:
data = {'col1':['1', '2', '3', 'teks'], 
        'col2':['1', '2', '3', '4']}

df = pd.DataFrame(data)
df

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,teks,4


In [13]:
df.dtypes

col1    object
col2    object
dtype: object

### Konversi tipe data dengan fungsi `astype()`

In [14]:
df_x = df.astype({'col2':'int'})
df_x

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,teks,4


In [15]:
df_x.dtypes

col1    object
col2     int64
dtype: object

### Konversi tipe data numerik dengan fungsi `to_numeric()`

In [16]:
df.apply(pd.to_numeric, errors='coerce')

Unnamed: 0,col1,col2
0,1.0,1
1,2.0,2
2,3.0,3
3,,4


## \#04: Pemilihan kolom (columns selection) pada Data Frame berdasarkan tipe data

### Import Modules

In [17]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [18]:
n_rows = 5
n_cols = 2
cols = ['bil_pecahan', 'bil_bulat']

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)
df['bil_pecahan'] = df['bil_pecahan'].astype('float')

df.index = pd.util.testing.makeDateIndex(n_rows, freq='H')
df = df.reset_index()

df['teks'] = list('ABCDE')

df

  import pandas.util.testing


Unnamed: 0,index,bil_pecahan,bil_bulat,teks
0,2000-01-01 00:00:00,13.0,10,A
1,2000-01-01 01:00:00,9.0,12,B
2,2000-01-01 02:00:00,2.0,3,C
3,2000-01-01 03:00:00,13.0,14,D
4,2000-01-01 04:00:00,5.0,10,E


In [19]:
df.dtypes

index          datetime64[ns]
bil_pecahan           float64
bil_bulat               int64
teks                   object
dtype: object

### Memilih kolom bertipe data numerik

In [20]:
df.select_dtypes(include='number')

Unnamed: 0,bil_pecahan,bil_bulat
0,13.0,10
1,9.0,12
2,2.0,3
3,13.0,14
4,5.0,10


In [21]:
df.select_dtypes(include='float')

Unnamed: 0,bil_pecahan
0,13.0
1,9.0
2,2.0
3,13.0
4,5.0


In [22]:
df.select_dtypes(include='int')

Unnamed: 0,bil_bulat
0,10
1,12
2,3
3,14
4,10


### Memilih kolom bertipe data string atau `object`

In [23]:
df.select_dtypes(include='object')

Unnamed: 0,teks
0,A
1,B
2,C
3,D
4,E


### Memilih kolom bertipe data `datetime`

In [24]:
df.select_dtypes(include='datetime')

Unnamed: 0,index
0,2000-01-01 00:00:00
1,2000-01-01 01:00:00
2,2000-01-01 02:00:00
3,2000-01-01 03:00:00
4,2000-01-01 04:00:00


### Memilih kolom dengan kombinasi tipe data

In [25]:
df.select_dtypes(include=['number', 'object'])

Unnamed: 0,bil_pecahan,bil_bulat,teks
0,13.0,10,A
1,9.0,12,B
2,2.0,3,C
3,13.0,14,D
4,5.0,10,E


## \#05: Membalik urutan baris dan kolom pada Data Frame

### Import Modules

In [26]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [27]:
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 10, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,3,6,8,9,9
1,2,8,7,4,5
2,3,8,1,7,2
3,8,2,5,9,1
4,5,3,4,9,5


### Membalik urutan kolom

In [28]:
df.loc[:, ::-1]

Unnamed: 0,E,D,C,B,A
0,9,9,8,6,3
1,5,4,7,8,2
2,2,7,1,8,3
3,1,9,5,2,8
4,5,9,4,3,5


### Membalik urutan baris

In [29]:
df.loc[::-1]

Unnamed: 0,A,B,C,D,E
4,5,3,4,9,5
3,8,2,5,9,1
2,3,8,1,7,2
1,2,8,7,4,5
0,3,6,8,9,9


### Membalik urutan baris dan melakukan penyesuaian ulang `index`

In [30]:
df.loc[::-1].reset_index(drop=True)

Unnamed: 0,A,B,C,D,E
0,5,3,4,9,5
1,8,2,5,9,1
2,3,8,1,7,2
3,2,8,7,4,5
4,3,6,8,9,9


## \#06: Mengganti nama (label) kolom pada Data Frame

### Import Modules

In [31]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [32]:
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 10, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,6,6,4,2,8
1,6,2,8,9,1
2,7,7,2,6,5
3,9,7,7,9,4
4,8,8,1,7,7


### Mengganti nama (label) untuk sebuah kolom pada Data Frame

In [33]:
df.rename(columns={'C':'Hobi'})

Unnamed: 0,A,B,Hobi,D,E
0,6,6,4,2,8
1,6,2,8,9,1
2,7,7,2,6,5
3,9,7,7,9,4
4,8,8,1,7,7


### Mengganti nama (label) untuk banyak kolom pada Data Frame

In [34]:
df.rename(columns={'A':'Nama', 'B':'Alamat', 'D':'Kota'})

Unnamed: 0,Nama,Alamat,C,Kota,E
0,6,6,4,2,8
1,6,2,8,9,1
2,7,7,2,6,5
3,9,7,7,9,4
4,8,8,1,7,7


## \#07: Menghapus (drop) missing values (`NaN`)

### Import Modules

In [35]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [36]:
df = pd.util.testing.makeMissingDataframe().reset_index() 
df.head()

Unnamed: 0,index,A,B,C,D
0,VTq0dQ7qiR,-1.240871,-0.881183,1.669762,
1,MkuU9OD0f2,-1.449407,0.734637,-0.31541,-0.985389
2,5B9epXlY0j,-0.80312,-0.33192,-0.280107,
3,iOEpqIeygO,-1.05107,0.564272,1.369696,1.281964
4,XSGUiHAsJC,0.639375,1.218602,-0.521624,0.263385


In [37]:
df = df.rename(columns={'index':'Z'})
df.head()

Unnamed: 0,Z,A,B,C,D
0,VTq0dQ7qiR,-1.240871,-0.881183,1.669762,
1,MkuU9OD0f2,-1.449407,0.734637,-0.31541,-0.985389
2,5B9epXlY0j,-0.80312,-0.33192,-0.280107,
3,iOEpqIeygO,-1.05107,0.564272,1.369696,1.281964
4,XSGUiHAsJC,0.639375,1.218602,-0.521624,0.263385


In [38]:
df_backup = df.copy(deep=True)

### Menghapus (drop) setiap kolom yang mengandung missing values

In [39]:
df = df.dropna(axis='columns') 
df.head()

Unnamed: 0,Z
0,VTq0dQ7qiR
1,MkuU9OD0f2
2,5B9epXlY0j
3,iOEpqIeygO
4,XSGUiHAsJC


### Menghapus (drop) setiap baris yang mengandung missing values

In [40]:
df = df_backup.copy(deep=True)
df = df.dropna(axis='rows')
df.head()

Unnamed: 0,Z,A,B,C,D
1,MkuU9OD0f2,-1.449407,0.734637,-0.31541,-0.985389
3,iOEpqIeygO,-1.05107,0.564272,1.369696,1.281964
4,XSGUiHAsJC,0.639375,1.218602,-0.521624,0.263385
5,1fPaY3Z9W0,-1.317631,-0.294588,-0.112121,-0.05763
6,9yYusq7osY,-0.501213,-1.532468,-1.067878,-0.686923


### Persentase missing values untuk tiap kolom

In [41]:
df = df_backup.copy(deep=True)
df.isna().mean()

Z    0.000000
A    0.066667
B    0.033333
C    0.133333
D    0.166667
dtype: float64

### Menghapus (drop) setiap kolom yang mengandung missing values berdasarkan threshold

In [42]:
treshold = len(df) * 0.9
df = df.dropna(thresh=treshold, axis='columns')
df.head()

Unnamed: 0,Z,A,B
0,VTq0dQ7qiR,-1.240871,-0.881183
1,MkuU9OD0f2,-1.449407,0.734637
2,5B9epXlY0j,-0.80312,-0.33192
3,iOEpqIeygO,-1.05107,0.564272
4,XSGUiHAsJC,0.639375,1.218602


## \#08: Memeriksa kesamaan antar dua buah kolom (Series) pada Data Frame

### Import Modules

In [43]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [44]:
data = {'A':[15, 15, 18, np.nan, 12], 
        'B':[15, 15, 18, np.nan, 12]}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,15.0,15.0
1,15.0,15.0
2,18.0,18.0
3,,
4,12.0,12.0


### Mengenal Pandas Series

In [45]:
df['A']

0    15.0
1    15.0
2    18.0
3     NaN
4    12.0
Name: A, dtype: float64

In [46]:
type(df['A'])

pandas.core.series.Series

In [47]:
type(df)

pandas.core.frame.DataFrame

### Memeriksa kesamaan dengan operator `==`

In [48]:
df['A'] == df['B']

0     True
1     True
2     True
3    False
4     True
dtype: bool

### Memeriksa kesamaan dengan method `equals()`

In [49]:
df['A'].equals(df['B'])

True

### Memeriksa kesamaan antar dua Data Frame

In [50]:
df1 = df.copy(deep=True)

df.equals(df1)

True

In [51]:
df == df1

Unnamed: 0,A,B
0,True,True
1,True,True
2,True,True
3,False,False
4,True,True


## \#09: Membagi Data Frame menjadi dua secara acak

### Import Modules

In [52]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [53]:
n_rows = 10
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,13,5,16,15,16
1,3,12,19,8,4
2,6,3,9,14,2
3,11,15,14,10,9
4,19,7,1,2,15
5,18,16,17,2,15
6,11,7,2,14,11
7,7,7,4,12,16
8,4,7,1,12,17
9,18,16,15,1,1


### Membagi Data Frame menjadi dua secara acak berdasarkan proporsi tertentu

In [54]:
df.shape

(10, 5)

In [55]:
proporsi = 0.7
df_1 = df.sample(frac=proporsi)
df_2 = df.drop(df_1.index)

print(f'df_1 shape: {df_1.shape}')
print(f'df_2 shape: {df_2.shape}')

df_1 shape: (7, 5)
df_2 shape: (3, 5)


In [56]:
df_1

Unnamed: 0,A,B,C,D,E
8,4,7,1,12,17
7,7,7,4,12,16
4,19,7,1,2,15
9,18,16,15,1,1
3,11,15,14,10,9
2,6,3,9,14,2
6,11,7,2,14,11


In [57]:
df_2

Unnamed: 0,A,B,C,D,E
0,13,5,16,15,16
1,3,12,19,8,4
5,18,16,17,2,15


## \#10: Mengganti nama (label) kolom pada Data Frame berdasarkan pola

### Import Modules

In [58]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [59]:
df = pd.read_csv('./data/titanicfull.csv')
df.columns = ['Pclass', 'Survival status', 'full Name', 'Sex  ', '  Age', 
              'Sib SP', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
df_backup = df.copy(deep=True)

df.head()

Unnamed: 0,Pclass,Survival status,full Name,Sex,Age,Sib SP,Parch,Ticket,Fare,Cabin,Embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


### Menggunakan lowercase untuk nama kolom dan mengganti spasi dengan `_`

In [60]:
df.columns = df.columns.str.replace(' ', '_').str.lower()
df.head()

Unnamed: 0,pclass,survival_status,full_name,sex__,__age,sib_sp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


### Memangkas kelebihan spasi pada nama kolom

In [61]:
df = df_backup.copy(deep=True)

df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
df.head()

Unnamed: 0,pclass,survival_status,full_name,sex,age,sib_sp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


## \#11: Melakukan seleksi kolom dan baris pada Data Frame menggunakan `loc`

### Import Modules

In [62]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [63]:
n_rows = 10
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,5,14,17,13,6
1,3,7,3,11,17
2,14,10,9,13,3
3,19,3,14,10,5
4,9,18,18,11,3
5,12,4,3,4,8
6,7,13,9,18,3
7,7,11,19,8,13
8,18,13,11,13,16
9,11,15,5,10,14


### Seleksi kolom dan baris menggunakan `loc`

In [64]:
df.loc[[0,3,4], ['B','E']]

Unnamed: 0,B,E
0,14,6
3,3,5
4,18,3


### Seleksi baris dengan kondisi

In [65]:
df.loc[df['B']>10, ['B','D','E']]

Unnamed: 0,B,D,E
0,14,13,6
4,18,11,3
6,13,18,3
7,11,8,13
8,13,13,16
9,15,10,14


### Slicing Data Frame dengan `loc`

In [66]:
df.loc[0:4, 'B':'D']

Unnamed: 0,B,C,D
0,14,17,13
1,7,3,11
2,10,9,13
3,3,14,10
4,18,18,11


## \#12: Membentuk kolom bertipe `datetime` dari sejumlah kolom lain pada Data Frame

### Import Modules

In [67]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [68]:
data = {'day':[1, 2, 10 ,25, 12], 
        'month':[1, 2, 4, 5, 6], 
        'year':[2000, 2001, 2010, 2015, 2020]}

df = pd.DataFrame(data)
df

Unnamed: 0,day,month,year
0,1,1,2000
1,2,2,2001
2,10,4,2010
3,25,5,2015
4,12,6,2020


### Membentuk kolom bertipe `datetime`

In [69]:
df['penaggalan'] = pd.to_datetime(df[['day', 'month', 'year']])
df

Unnamed: 0,day,month,year,penaggalan
0,1,1,2000,2000-01-01
1,2,2,2001,2001-02-02
2,10,4,2010,2010-04-10
3,25,5,2015,2015-05-25
4,12,6,2020,2020-06-12


In [70]:
df.dtypes

day                    int64
month                  int64
year                   int64
penaggalan    datetime64[ns]
dtype: object

## \#13: Konversi nilai numerik ke dalam sejumlah kategori

### Import Modules

In [71]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [72]:
n_rows = 10
n_cols = 1
cols = ('usia',)

df = pd.DataFrame(np.random.randint(1, 99, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,usia
0,38
1,19
2,35
3,77
4,97
5,63
6,59
7,24
8,27
9,80


### Pengelompokkan nilai numerik ke dalam beberapa kategori menggunakan `cut()`

In [73]:
df['kelompok_usia'] = pd.cut(df['usia'], 
                             bins=[0, 18, 65, 99], 
                             labels=['anak', 'dewasa', 'manula'])
df

Unnamed: 0,usia,kelompok_usia
0,38,dewasa
1,19,dewasa
2,35,dewasa
3,77,manula
4,97,manula
5,63,dewasa
6,59,dewasa
7,24,dewasa
8,27,dewasa
9,80,manula


## \#14: Menggabungkan (merge) dua Data Frame

### Import Modules

In [74]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [75]:
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,17,12,7,2,14
1,16,4,6,4,16
2,10,9,14,10,11
3,6,1,16,6,17
4,11,2,18,16,9


In [76]:
df1 = df.copy(deep=True)
df1 = df1.drop([1, 4])
df1

Unnamed: 0,A,B,C,D,E
0,17,12,7,2,14
2,10,9,14,10,11
3,6,1,16,6,17


In [77]:
df2 = df.copy(deep=True)
df2 = df2.drop([0, 3])
df2

Unnamed: 0,A,B,C,D,E
1,16,4,6,4,16
2,10,9,14,10,11
4,11,2,18,16,9


### Menggabungkan dua Data Frame

In [78]:
df_inner = pd.merge(df1, df2, how='inner')
df_inner

Unnamed: 0,A,B,C,D,E
0,10,9,14,10,11


In [79]:
df_outer = pd.merge(df1, df2, how='outer')
df_outer

Unnamed: 0,A,B,C,D,E
0,17,12,7,2,14
1,10,9,14,10,11
2,6,1,16,6,17
3,16,4,6,4,16
4,11,2,18,16,9


## \#15: Memecah nilai string dari suatu kolom ke dalam beberapa kolom baru

### Import Modules

In [80]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [81]:
data = {'nama':['Didi Kempot', 'Glenn Fredly', 'Mbah Surip'], 
        'tempat_kelahiran':['Surakarta, Jawa Tengah', 'Jakarta, DKI Jakarta', 'Mojokerto, Jawa Timur']}
df = pd.DataFrame(data)
df

Unnamed: 0,nama,tempat_kelahiran
0,Didi Kempot,"Surakarta, Jawa Tengah"
1,Glenn Fredly,"Jakarta, DKI Jakarta"
2,Mbah Surip,"Mojokerto, Jawa Timur"


### Memecah nama depan dan nama belakang

In [82]:
df[['nama_depan', 'nama_belakang']] = df['nama'].str.split(' ', expand=True)
df

Unnamed: 0,nama,tempat_kelahiran,nama_depan,nama_belakang
0,Didi Kempot,"Surakarta, Jawa Tengah",Didi,Kempot
1,Glenn Fredly,"Jakarta, DKI Jakarta",Glenn,Fredly
2,Mbah Surip,"Mojokerto, Jawa Timur",Mbah,Surip


### Memecah nama kota dan propinsi

In [83]:
df[['kota', 'propinsi']] = df['tempat_kelahiran'].str.split(',', expand=True)
df

Unnamed: 0,nama,tempat_kelahiran,nama_depan,nama_belakang,kota,propinsi
0,Didi Kempot,"Surakarta, Jawa Tengah",Didi,Kempot,Surakarta,Jawa Tengah
1,Glenn Fredly,"Jakarta, DKI Jakarta",Glenn,Fredly,Jakarta,DKI Jakarta
2,Mbah Surip,"Mojokerto, Jawa Timur",Mbah,Surip,Mojokerto,Jawa Timur


## \#16: Menata ulang Data Frame dengan mutiple indexes menggunakan `unstack()`

### Import Modules

In [84]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [85]:
df = pd.read_csv('./data/titanicfull.csv')
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


### Data Frame dengan multiple indexes dari hasil groupping

In [86]:
df.groupby(['sex', 'pclass'])['survived'].mean().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,pclass,Unnamed: 2_level_1
female,1,0.965278
female,2,0.886792
female,3,0.490741
male,1,0.340782
male,2,0.146199
male,3,0.15213


### Menata ulang Data Frame dengan mutiple indexes

In [87]:
df.groupby(['sex', 'pclass'])['survived'].mean().unstack()

pclass,1,2,3
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.965278,0.886792,0.490741
male,0.340782,0.146199,0.15213


## \#17: Resampling pada data deret waktu (time series data)

### Import Modules

In [88]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [89]:
n_rows = 365 * 24
n_cols = 2
cols = ['col1', 'col2']

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)

df.index = pd.util.testing.makeDateIndex(n_rows, freq='H')
df

Unnamed: 0,col1,col2
2000-01-01 00:00:00,10,4
2000-01-01 01:00:00,1,11
2000-01-01 02:00:00,16,10
2000-01-01 03:00:00,16,12
2000-01-01 04:00:00,15,15
...,...,...
2000-12-30 19:00:00,7,18
2000-12-30 20:00:00,1,5
2000-12-30 21:00:00,8,1
2000-12-30 22:00:00,10,18


### Resampling data dengan interval monthly

In [90]:
df.resample('M')['col1'].sum().to_frame()

Unnamed: 0,col1
2000-01-31,7469
2000-02-29,6861
2000-03-31,7563
2000-04-30,7284
2000-05-31,7758
2000-06-30,7393
2000-07-31,7437
2000-08-31,7250
2000-09-30,7005
2000-10-31,7527


### Resampling data dengan interval daily

In [91]:
df.resample('D')['col1'].sum().to_frame()

Unnamed: 0,col1
2000-01-01,191
2000-01-02,201
2000-01-03,248
2000-01-04,282
2000-01-05,231
...,...
2000-12-26,196
2000-12-27,267
2000-12-28,208
2000-12-29,228


## \#18: Membentuk dummy Data Frame

### Import Modules

In [92]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Membentuk Data Frame dari Dictionary

In [93]:
pd.DataFrame({'col1':[1, 2, 3, 4], 
              'col2':[5, 6, 7, 8]})

Unnamed: 0,col1,col2
0,1,5
1,2,6
2,3,7
3,4,8


### Membentuk Data Frame dari Numpy Array

In [94]:
n_rows = 5
n_cols = 3

arr = np.random.randint(1, 20, size=(n_rows, n_cols))
arr

array([[ 1,  6,  4],
       [16, 17, 14],
       [ 3,  7, 18],
       [10,  3,  5],
       [17,  4, 14]])

In [95]:
pd.DataFrame(arr, columns=tuple('ABC'))

Unnamed: 0,A,B,C
0,1,6,4
1,16,17,14
2,3,7,18
3,10,3,5
4,17,4,14


### Membentuk Data Frame dengan memanfaatkan  `pandas.util.testing`

In [96]:
pd.util.testing.makeDataFrame().head()

Unnamed: 0,A,B,C,D
PbwA8Tz4BI,0.034395,-0.40993,-1.069936,0.489168
nR1NBPSYbC,0.037107,-1.427583,-0.657249,0.898125
NX7L6KiLWI,0.219804,-1.278369,0.736833,0.948884
P7K5agJR7b,0.351991,-1.909514,0.397154,0.654226
jCOi9IUIgS,-1.089228,-0.28546,0.062171,0.52957


In [97]:
pd.util.testing.makeMixedDataFrame().head()

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [98]:
pd.util.testing.makeTimeDataFrame().head()

Unnamed: 0,A,B,C,D
2000-01-03,0.173314,0.31177,0.425143,-0.316535
2000-01-04,0.278252,1.295201,-0.677114,-0.304508
2000-01-05,0.566637,-0.406458,-0.140704,0.646388
2000-01-06,1.900556,0.64302,-0.087379,-1.783364
2000-01-07,-1.553719,2.195953,-0.164221,-0.341381


In [99]:
pd.util.testing.makeMissingDataframe().head()

Unnamed: 0,A,B,C,D
tnCnebCoXb,-0.144193,1.271653,0.159716,-1.411894
Xa3c0EBPHc,0.696776,-0.036567,-1.732487,0.610717
LhoauLh3al,,-0.491734,-0.739623,-2.557186
vc7QiQVcqD,-1.244218,0.247974,0.864106,-0.252261
VzcKvs46tG,0.523092,-0.422686,1.140614,1.592837


## \#19: Formatting tampilan Data Frame

### Import Modules

In [100]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [101]:
n_rows = 5
n_cols = 2
cols = ['omset', 'operasional']

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,omset,operasional
0,1,7
1,14,15
2,11,1
3,16,9
4,17,12


In [102]:
df['omset'] = df['omset'] * 100_000
df['operasional'] = df['operasional'] * 10_000
df

Unnamed: 0,omset,operasional
0,100000,70000
1,1400000,150000
2,1100000,10000
3,1600000,90000
4,1700000,120000


In [103]:
df.index = pd.util.testing.makeDateIndex(n_rows, freq='D')
df = df.reset_index()
df = df.rename(columns={'index':'tanggal'})
df

Unnamed: 0,tanggal,omset,operasional
0,2000-01-01,100000,70000
1,2000-01-02,1400000,150000
2,2000-01-03,1100000,10000
3,2000-01-04,1600000,90000
4,2000-01-05,1700000,120000


### Melakukan formatting tampilan Data Frame

In [104]:
formatku = {'tanggal':'{:%d/%m/%y}', 
            'operasional':'Rp {:.2f}',
            'omset':'Rp {:.2f}'}

laporan = df.style.format(formatku)
laporan

Unnamed: 0,tanggal,omset,operasional
0,01/01/00,Rp 100000.00,Rp 70000.00
1,02/01/00,Rp 1400000.00,Rp 150000.00
2,03/01/00,Rp 1100000.00,Rp 10000.00
3,04/01/00,Rp 1600000.00,Rp 90000.00
4,05/01/00,Rp 1700000.00,Rp 120000.00


In [105]:
type(laporan)

pandas.io.formats.style.Styler

In [106]:
laporan.hide_index()

tanggal,omset,operasional
01/01/00,Rp 100000.00,Rp 70000.00
02/01/00,Rp 1400000.00,Rp 150000.00
03/01/00,Rp 1100000.00,Rp 10000.00
04/01/00,Rp 1600000.00,Rp 90000.00
05/01/00,Rp 1700000.00,Rp 120000.00


In [107]:
laporan.set_caption('Data Omset dan Operasional')

tanggal,omset,operasional
01/01/00,Rp 100000.00,Rp 70000.00
02/01/00,Rp 1400000.00,Rp 150000.00
03/01/00,Rp 1100000.00,Rp 10000.00
04/01/00,Rp 1600000.00,Rp 90000.00
05/01/00,Rp 1700000.00,Rp 120000.00


In [108]:
laporan.highlight_min('omset', color='pink')
laporan.highlight_max('omset', color='lightgreen')

laporan.highlight_min('operasional', color='lightblue')
laporan.highlight_max('operasional', color='grey')

tanggal,omset,operasional
01/01/00,Rp 100000.00,Rp 70000.00
02/01/00,Rp 1400000.00,Rp 150000.00
03/01/00,Rp 1100000.00,Rp 10000.00
04/01/00,Rp 1600000.00,Rp 90000.00
05/01/00,Rp 1700000.00,Rp 120000.00


## \#20: Menggabungkan (merge) dua Data Frame secara berdampingan

### Import Modules

In [109]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [110]:
d1 = {'col1':[1, 2, 3], 
      'col2':[10, 20, 30]}
df1 = pd.DataFrame(d1)
df1

Unnamed: 0,col1,col2
0,1,10
1,2,20
2,3,30


In [111]:
d2 = {'col3':[4, 5, 6], 
      'col4':[40, 50, 60]}
df2 = pd.DataFrame(d2)
df2

Unnamed: 0,col3,col4
0,4,40
1,5,50
2,6,60


### Menggabungkan (merge) dua Data Frame secara berdampingan

In [112]:
df = pd.merge(df1, df2, left_index=True, right_index=True)
df

Unnamed: 0,col1,col2,col3,col4
0,1,10,4,40
1,2,20,5,50
2,3,30,6,60


## \#21: Melakukan agregasi menggunakan `agg()`

### Import Modules

In [113]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [114]:
df = pd.read_csv('./data/Iris.csv')
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


### Mengenal `groupby()` dan fungsi agregasi

In [115]:
df.groupby('Species')['PetalLengthCm'].count().to_frame()

Unnamed: 0_level_0,PetalLengthCm
Species,Unnamed: 1_level_1
Iris-setosa,50
Iris-versicolor,50
Iris-virginica,50


In [116]:
df.groupby('Species')['PetalLengthCm'].mean().to_frame()

Unnamed: 0_level_0,PetalLengthCm
Species,Unnamed: 1_level_1
Iris-setosa,1.464
Iris-versicolor,4.26
Iris-virginica,5.552


In [117]:
df.groupby('Species')['PetalLengthCm'].median().to_frame()

Unnamed: 0_level_0,PetalLengthCm
Species,Unnamed: 1_level_1
Iris-setosa,1.5
Iris-versicolor,4.35
Iris-virginica,5.55


### Agregasi dengan `agg()`

In [118]:
df.groupby('Species')['PetalLengthCm'].agg(['count', 'mean', 'median'])

Unnamed: 0_level_0,count,mean,median
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,50,1.464,1.5
Iris-versicolor,50,4.26,4.35
Iris-virginica,50,5.552,5.55


### Agregasi dengan `describe()`

In [119]:
df.groupby('Species')['PetalLengthCm'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Iris-setosa,50.0,1.464,0.173511,1.0,1.4,1.5,1.575,1.9
Iris-versicolor,50.0,4.26,0.469911,3.0,4.0,4.35,4.6,5.1
Iris-virginica,50.0,5.552,0.551895,4.5,5.1,5.55,5.875,6.9


## \#22: Memantau penggunaan memory dari suatu Data Frame

### Import Modules

In [120]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [121]:
df_titanic = pd.read_csv('./data/titanicfull.csv')
df_titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


In [122]:
df_iris = pd.read_csv('./data/Iris.csv')
df_iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


### Memantau penggunaan memory dari suatu Data Frame

In [123]:
df_titanic.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   name      1309 non-null   object 
 3   sex       1309 non-null   object 
 4   age       1046 non-null   float64
 5   sibsp     1309 non-null   int64  
 6   parch     1309 non-null   int64  
 7   ticket    1309 non-null   object 
 8   fare      1308 non-null   float64
 9   cabin     295 non-null    object 
 10  embarked  1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 463.0 KB


In [124]:
df_iris.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 16.3 KB


### Memantau penggunaan memory untuk setiap kolom dari suatu Data Frame

In [125]:
df_titanic.memory_usage(deep=True)

Index          128
pclass       10472
survived     10472
name        110127
sex          80781
age          10472
sibsp        10472
parch        10472
ticket       83502
fare         10472
cabin        50414
embarked     86326
dtype: int64

In [126]:
df_iris.memory_usage(deep=True)

Index              128
Id                1200
SepalLengthCm     1200
SepalWidthCm      1200
PetalLengthCm     1200
PetalWidthCm      1200
Species          10550
dtype: int64

## \#23: Seleksi baris pada Data Frame dengan `query()`

### Import Modules

In [127]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [128]:
d = {'kolom_satu':[1, 2, 3, 4, 5], 
     'kolom dua':[10, 20, 30, 40, 50]}
df = pd.DataFrame(d)
df

Unnamed: 0,kolom_satu,kolom dua
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


### Seleksi baris dengan `query()`

In [129]:
df.query('kolom_satu > 2')

Unnamed: 0,kolom_satu,kolom dua
2,3,30
3,4,40
4,5,50


In [130]:
df.query('`kolom dua` > 30')

Unnamed: 0,kolom_satu,kolom dua
3,4,40
4,5,50
