![](./image/pandas-logo.png)

# Python Pandas: Tips & Tricks

Oleh Channel YouTube [Indonesia Belajar](https://www.youtube.com/IndonesiaBelajarKomputer)

## \#01: Menyertakan Prefix dan Suffix pada seluruh Kolom Data Frame

### Import Modules

In [1]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [2]:
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 10, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,7,3,6,8,7
1,6,2,7,5,7
2,5,8,5,8,3
3,3,9,5,7,9
4,3,9,9,4,2


In [3]:
tuple('ABCDE')

('A', 'B', 'C', 'D', 'E')

### Menyertakan Prefix Kolom

In [4]:
df.add_prefix('kolom_')

Unnamed: 0,kolom_A,kolom_B,kolom_C,kolom_D,kolom_E
0,7,3,6,8,7
1,6,2,7,5,7
2,5,8,5,8,3
3,3,9,5,7,9
4,3,9,9,4,2


### Menyertakan Suffix Kolom

In [5]:
df.add_suffix('_field')

Unnamed: 0,A_field,B_field,C_field,D_field,E_field
0,7,3,6,8,7
1,6,2,7,5,7
2,5,8,5,8,3
3,3,9,5,7,9
4,3,9,9,4,2


## \#02: Pemilihan baris (rows selection) pada Data Frame

### Import Modules

In [6]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [7]:
n_rows = 10
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 5, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,1,4,2,2,2
1,2,4,3,4,4
2,3,2,1,4,4
3,2,4,2,1,4
4,4,1,3,3,3
5,4,4,4,1,1
6,3,2,2,3,4
7,1,4,3,4,3
8,3,2,2,2,3
9,3,4,1,3,1


### Selection dengan operator logika `|`

In [8]:
df[(df['A'] == 1) | (df['A'] == 3)]

Unnamed: 0,A,B,C,D,E
0,1,4,2,2,2
2,3,2,1,4,4
6,3,2,2,3,4
7,1,4,3,4,3
8,3,2,2,2,3
9,3,4,1,3,1


### Selection dengan fungsi `isin()`

In [9]:
df[df['A'].isin([1, 3])]

Unnamed: 0,A,B,C,D,E
0,1,4,2,2,2
2,3,2,1,4,4
6,3,2,2,3,4
7,1,4,3,4,3
8,3,2,2,2,3
9,3,4,1,3,1


### Mengenal operator negasi `~`

In [10]:
df[~df['A'].isin([1, 3])]

Unnamed: 0,A,B,C,D,E
1,2,4,3,4,4
3,2,4,2,1,4
4,4,1,3,3,3
5,4,4,4,1,1


## \#03: Konversi tipe data String ke Numerik pada kolom Data Frame

### Import Modules

In [11]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [12]:
data = {'col1':['1', '2', '3', 'teks'], 
        'col2':['1', '2', '3', '4']}

df = pd.DataFrame(data)
df

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,teks,4


In [13]:
df.dtypes

col1    object
col2    object
dtype: object

### Konversi tipe data dengan fungsi `astype()`

In [14]:
df_x = df.astype({'col2':'int'})
df_x

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,teks,4


In [15]:
df_x.dtypes

col1    object
col2     int64
dtype: object

### Konversi tipe data numerik dengan fungsi `to_numeric()`

In [16]:
df.apply(pd.to_numeric, errors='coerce')

Unnamed: 0,col1,col2
0,1.0,1
1,2.0,2
2,3.0,3
3,,4


## \#04: Pemilihan kolom (columns selection) pada Data Frame berdasarkan tipe data

### Import Modules

In [17]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [18]:
n_rows = 5
n_cols = 2
cols = ['bil_pecahan', 'bil_bulat']

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)
df['bil_pecahan'] = df['bil_pecahan'].astype('float')

df.index = pd.util.testing.makeDateIndex(n_rows, freq='H')
df = df.reset_index()

df['teks'] = list('ABCDE')

df

  import pandas.util.testing


Unnamed: 0,index,bil_pecahan,bil_bulat,teks
0,2000-01-01 00:00:00,8.0,8,A
1,2000-01-01 01:00:00,16.0,1,B
2,2000-01-01 02:00:00,4.0,6,C
3,2000-01-01 03:00:00,1.0,8,D
4,2000-01-01 04:00:00,16.0,4,E


In [19]:
df.dtypes

index          datetime64[ns]
bil_pecahan           float64
bil_bulat               int64
teks                   object
dtype: object

### Memilih kolom bertipe data numerik

In [20]:
df.select_dtypes(include='number')

Unnamed: 0,bil_pecahan,bil_bulat
0,8.0,8
1,16.0,1
2,4.0,6
3,1.0,8
4,16.0,4


In [21]:
df.select_dtypes(include='float')

Unnamed: 0,bil_pecahan
0,8.0
1,16.0
2,4.0
3,1.0
4,16.0


In [22]:
df.select_dtypes(include='int')

Unnamed: 0,bil_bulat
0,8
1,1
2,6
3,8
4,4


### Memilih kolom bertipe data string atau `object`

In [23]:
df.select_dtypes(include='object')

Unnamed: 0,teks
0,A
1,B
2,C
3,D
4,E


### Memilih kolom bertipe data `datetime`

In [24]:
df.select_dtypes(include='datetime')

Unnamed: 0,index
0,2000-01-01 00:00:00
1,2000-01-01 01:00:00
2,2000-01-01 02:00:00
3,2000-01-01 03:00:00
4,2000-01-01 04:00:00


### Memilih kolom dengan kombinasi tipe data

In [25]:
df.select_dtypes(include=['number', 'object'])

Unnamed: 0,bil_pecahan,bil_bulat,teks
0,8.0,8,A
1,16.0,1,B
2,4.0,6,C
3,1.0,8,D
4,16.0,4,E


## \#05: Membalik urutan baris dan kolom pada Data Frame

### Import Modules

In [26]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [27]:
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 10, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,7,6,3,2,1
1,8,1,9,9,8
2,4,8,8,1,9
3,7,4,9,6,2
4,8,6,7,8,1


### Membalik urutan kolom

In [28]:
df.loc[:, ::-1]

Unnamed: 0,E,D,C,B,A
0,1,2,3,6,7
1,8,9,9,1,8
2,9,1,8,8,4
3,2,6,9,4,7
4,1,8,7,6,8


### Membalik urutan baris

In [29]:
df.loc[::-1]

Unnamed: 0,A,B,C,D,E
4,8,6,7,8,1
3,7,4,9,6,2
2,4,8,8,1,9
1,8,1,9,9,8
0,7,6,3,2,1


### Membalik urutan baris dan melakukan penyesuaian ulang `index`

In [30]:
df.loc[::-1].reset_index(drop=True)

Unnamed: 0,A,B,C,D,E
0,8,6,7,8,1
1,7,4,9,6,2
2,4,8,8,1,9
3,8,1,9,9,8
4,7,6,3,2,1


## \#06: Mengganti nama (label) kolom pada Data Frame

### Import Modules

In [31]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [32]:
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 10, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,3,7,3,1,3
1,5,8,3,4,6
2,9,5,7,3,1
3,1,6,5,9,5
4,5,9,7,3,1


### Mengganti nama (label) untuk sebuah kolom pada Data Frame

In [33]:
df.rename(columns={'C':'Hobi'})

Unnamed: 0,A,B,Hobi,D,E
0,3,7,3,1,3
1,5,8,3,4,6
2,9,5,7,3,1
3,1,6,5,9,5
4,5,9,7,3,1


### Mengganti nama (label) untuk banyak kolom pada Data Frame

In [34]:
df.rename(columns={'A':'Nama', 'B':'Alamat', 'D':'Kota'})

Unnamed: 0,Nama,Alamat,C,Kota,E
0,3,7,3,1,3
1,5,8,3,4,6
2,9,5,7,3,1
3,1,6,5,9,5
4,5,9,7,3,1


## \#07: Menghapus (drop) missing values (`NaN`)

### Import Modules

In [35]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [36]:
df = pd.util.testing.makeMissingDataframe().reset_index() 
df.head()

Unnamed: 0,index,A,B,C,D
0,900QXWwMcv,1.079881,-0.034375,0.885965,
1,9CbqXzuwXX,-0.983803,-1.028873,0.189076,-0.913614
2,BFiW2SP7yL,1.087102,-0.664897,1.253896,
3,wGILDW3bH5,-0.028798,0.081001,0.06655,-2.121976
4,GG9qvoxTNf,0.904447,0.587859,0.277012,0.659995


In [37]:
df = df.rename(columns={'index':'Z'})
df.head()

Unnamed: 0,Z,A,B,C,D
0,900QXWwMcv,1.079881,-0.034375,0.885965,
1,9CbqXzuwXX,-0.983803,-1.028873,0.189076,-0.913614
2,BFiW2SP7yL,1.087102,-0.664897,1.253896,
3,wGILDW3bH5,-0.028798,0.081001,0.06655,-2.121976
4,GG9qvoxTNf,0.904447,0.587859,0.277012,0.659995


In [38]:
df_backup = df.copy(deep=True)

### Menghapus (drop) setiap kolom yang mengandung missing values

In [39]:
df = df.dropna(axis='columns') 
df.head()

Unnamed: 0,Z,C
0,900QXWwMcv,0.885965
1,9CbqXzuwXX,0.189076
2,BFiW2SP7yL,1.253896
3,wGILDW3bH5,0.06655
4,GG9qvoxTNf,0.277012


### Menghapus (drop) setiap baris yang mengandung missing values

In [40]:
df = df_backup.copy(deep=True)
df = df.dropna(axis='rows')
df.head()

Unnamed: 0,Z,A,B,C,D
1,9CbqXzuwXX,-0.983803,-1.028873,0.189076,-0.913614
3,wGILDW3bH5,-0.028798,0.081001,0.06655,-2.121976
4,GG9qvoxTNf,0.904447,0.587859,0.277012,0.659995
6,05tbVnrJIg,0.535004,0.8415,-0.635333,2.959191
7,suyzeJZ9FN,-1.395371,0.184789,-0.840233,0.19624


### Persentase missing values untuk tiap kolom

In [41]:
df = df_backup.copy(deep=True)
df.isna().mean()

Z    0.000000
A    0.100000
B    0.066667
C    0.000000
D    0.233333
dtype: float64

### Menghapus (drop) setiap kolom yang mengandung missing values berdasarkan threshold

In [42]:
treshold = len(df) * 0.9
df = df.dropna(thresh=treshold, axis='columns')
df.head()

Unnamed: 0,Z,A,B,C
0,900QXWwMcv,1.079881,-0.034375,0.885965
1,9CbqXzuwXX,-0.983803,-1.028873,0.189076
2,BFiW2SP7yL,1.087102,-0.664897,1.253896
3,wGILDW3bH5,-0.028798,0.081001,0.06655
4,GG9qvoxTNf,0.904447,0.587859,0.277012


## \#08: Memeriksa kesamaan antar dua buah kolom (Series) pada Data Frame

### Import Modules

In [43]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [44]:
data = {'A':[15, 15, 18, np.nan, 12], 
        'B':[15, 15, 18, np.nan, 12]}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,15.0,15.0
1,15.0,15.0
2,18.0,18.0
3,,
4,12.0,12.0


### Mengenal Pandas Series

In [45]:
df['A']

0    15.0
1    15.0
2    18.0
3     NaN
4    12.0
Name: A, dtype: float64

In [46]:
type(df['A'])

pandas.core.series.Series

In [47]:
type(df)

pandas.core.frame.DataFrame

### Memeriksa kesamaan dengan operator `==`

In [48]:
df['A'] == df['B']

0     True
1     True
2     True
3    False
4     True
dtype: bool

### Memeriksa kesamaan dengan method `equals()`

In [49]:
df['A'].equals(df['B'])

True

### Memeriksa kesamaan antar dua Data Frame

In [50]:
df1 = df.copy(deep=True)

df.equals(df1)

True

In [51]:
df == df1

Unnamed: 0,A,B
0,True,True
1,True,True
2,True,True
3,False,False
4,True,True


## \#09: Membagi Data Frame menjadi dua secara acak

### Import Modules

In [52]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [53]:
n_rows = 10
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,14,16,16,1,2
1,9,18,3,16,19
2,8,11,15,13,15
3,13,18,18,13,1
4,6,2,9,9,6
5,13,4,2,3,8
6,13,3,13,10,11
7,6,14,1,9,17
8,6,7,2,5,4
9,18,19,7,14,6


### Membagi Data Frame menjadi dua secara acak berdasarkan proporsi tertentu

In [54]:
df.shape

(10, 5)

In [55]:
proporsi = 0.7
df_1 = df.sample(frac=proporsi)
df_2 = df.drop(df_1.index)

print(f'df_1 shape: {df_1.shape}')
print(f'df_2 shape: {df_2.shape}')

df_1 shape: (7, 5)
df_2 shape: (3, 5)


In [56]:
df_1

Unnamed: 0,A,B,C,D,E
9,18,19,7,14,6
5,13,4,2,3,8
8,6,7,2,5,4
7,6,14,1,9,17
6,13,3,13,10,11
4,6,2,9,9,6
1,9,18,3,16,19


In [57]:
df_2

Unnamed: 0,A,B,C,D,E
0,14,16,16,1,2
2,8,11,15,13,15
3,13,18,18,13,1


## \#10: Mengganti nama (label) kolom pada Data Frame berdasarkan pola

### Import Modules

In [58]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [59]:
df = pd.read_csv('./data/titanicfull.csv')
df.columns = ['Pclass', 'Survival status', 'full Name', 'Sex  ', '  Age', 
              'Sib SP', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
df_backup = df.copy(deep=True)

df.head()

Unnamed: 0,Pclass,Survival status,full Name,Sex,Age,Sib SP,Parch,Ticket,Fare,Cabin,Embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


### Menggunakan lowercase untuk nama kolom dan mengganti spasi dengan `_`

In [60]:
df.columns = df.columns.str.replace(' ', '_').str.lower()
df.head()

Unnamed: 0,pclass,survival_status,full_name,sex__,__age,sib_sp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


### Memangkas kelebihan spasi pada nama kolom

In [61]:
df = df_backup.copy(deep=True)

df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
df.head()

Unnamed: 0,pclass,survival_status,full_name,sex,age,sib_sp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


## \#11: Melakukan seleksi kolom dan baris pada Data Frame menggunakan `loc`

### Import Modules

In [62]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [63]:
n_rows = 10
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,14,2,5,10,3
1,14,12,4,3,7
2,13,2,18,10,9
3,13,6,3,6,10
4,4,7,3,13,19
5,6,19,11,7,2
6,5,11,10,4,12
7,4,15,15,9,16
8,19,17,7,19,9
9,17,2,14,17,17


### Seleksi kolom dan baris menggunakan `loc`

In [64]:
df.loc[[0,3,4], ['B','E']]

Unnamed: 0,B,E
0,2,3
3,6,10
4,7,19


### Seleksi baris dengan kondisi

In [65]:
df.loc[df['B']>10, ['B','D','E']]

Unnamed: 0,B,D,E
1,12,3,7
5,19,7,2
6,11,4,12
7,15,9,16
8,17,19,9


### Slicing Data Frame dengan `loc`

In [66]:
df.loc[0:4, 'B':'D']

Unnamed: 0,B,C,D
0,2,5,10
1,12,4,3
2,2,18,10
3,6,3,6
4,7,3,13


## \#12: Membentuk kolom bertipe `datetime` dari sejumlah kolom lain pada Data Frame

### Import Modules

In [67]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [68]:
data = {'day':[1, 2, 10 ,25, 12], 
        'month':[1, 2, 4, 5, 6], 
        'year':[2000, 2001, 2010, 2015, 2020]}

df = pd.DataFrame(data)
df

Unnamed: 0,day,month,year
0,1,1,2000
1,2,2,2001
2,10,4,2010
3,25,5,2015
4,12,6,2020


### Membentuk kolom bertipe `datetime`

In [69]:
df['penaggalan'] = pd.to_datetime(df[['day', 'month', 'year']])
df

Unnamed: 0,day,month,year,penaggalan
0,1,1,2000,2000-01-01
1,2,2,2001,2001-02-02
2,10,4,2010,2010-04-10
3,25,5,2015,2015-05-25
4,12,6,2020,2020-06-12


In [70]:
df.dtypes

day                    int64
month                  int64
year                   int64
penaggalan    datetime64[ns]
dtype: object

## \#13: Konversi nilai numerik ke dalam sejumlah kategori

### Import Modules

In [71]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [72]:
n_rows = 10
n_cols = 1
cols = ('usia',)

df = pd.DataFrame(np.random.randint(1, 99, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,usia
0,25
1,88
2,42
3,23
4,29
5,85
6,73
7,59
8,80
9,21


### Pengelompokkan nilai numerik ke dalam beberapa kategori menggunakan `cut()`

In [73]:
df['kelompok_usia'] = pd.cut(df['usia'], 
                             bins=[0, 18, 65, 99], 
                             labels=['anak', 'dewasa', 'manula'])
df

Unnamed: 0,usia,kelompok_usia
0,25,dewasa
1,88,manula
2,42,dewasa
3,23,dewasa
4,29,dewasa
5,85,manula
6,73,manula
7,59,dewasa
8,80,manula
9,21,dewasa


## \#14: Menggabungkan (merge) dua Data Frame

### Import Modules

In [74]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [75]:
n_rows = 5
n_cols = 5
cols = tuple('ABCDE')

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)
df

Unnamed: 0,A,B,C,D,E
0,7,6,7,8,15
1,12,15,18,12,15
2,13,11,5,14,5
3,10,1,11,1,17
4,12,19,4,12,1


In [76]:
df1 = df.copy(deep=True)
df1 = df1.drop([1, 4])
df1

Unnamed: 0,A,B,C,D,E
0,7,6,7,8,15
2,13,11,5,14,5
3,10,1,11,1,17


In [77]:
df2 = df.copy(deep=True)
df2 = df2.drop([0, 3])
df2

Unnamed: 0,A,B,C,D,E
1,12,15,18,12,15
2,13,11,5,14,5
4,12,19,4,12,1


### Menggabungkan dua Data Frame

In [78]:
df_inner = pd.merge(df1, df2, how='inner')
df_inner

Unnamed: 0,A,B,C,D,E
0,13,11,5,14,5


In [79]:
df_outer = pd.merge(df1, df2, how='outer')
df_outer

Unnamed: 0,A,B,C,D,E
0,7,6,7,8,15
1,13,11,5,14,5
2,10,1,11,1,17
3,12,15,18,12,15
4,12,19,4,12,1


## \#15: Memecah nilai string dari suatu kolom ke dalam beberapa kolom baru

### Import Modules

In [80]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [81]:
data = {'nama':['Didi Kempot', 'Glenn Fredly', 'Mbah Surip'], 
        'tempat_kelahiran':['Surakarta, Jawa Tengah', 'Jakarta, DKI Jakarta', 'Mojokerto, Jawa Timur']}
df = pd.DataFrame(data)
df

Unnamed: 0,nama,tempat_kelahiran
0,Didi Kempot,"Surakarta, Jawa Tengah"
1,Glenn Fredly,"Jakarta, DKI Jakarta"
2,Mbah Surip,"Mojokerto, Jawa Timur"


### Memecah nama depan dan nama belakang

In [82]:
df[['nama_depan', 'nama_belakang']] = df['nama'].str.split(' ', expand=True)
df

Unnamed: 0,nama,tempat_kelahiran,nama_depan,nama_belakang
0,Didi Kempot,"Surakarta, Jawa Tengah",Didi,Kempot
1,Glenn Fredly,"Jakarta, DKI Jakarta",Glenn,Fredly
2,Mbah Surip,"Mojokerto, Jawa Timur",Mbah,Surip


### Memecah nama kota dan propinsi

In [83]:
df[['kota', 'propinsi']] = df['tempat_kelahiran'].str.split(',', expand=True)
df

Unnamed: 0,nama,tempat_kelahiran,nama_depan,nama_belakang,kota,propinsi
0,Didi Kempot,"Surakarta, Jawa Tengah",Didi,Kempot,Surakarta,Jawa Tengah
1,Glenn Fredly,"Jakarta, DKI Jakarta",Glenn,Fredly,Jakarta,DKI Jakarta
2,Mbah Surip,"Mojokerto, Jawa Timur",Mbah,Surip,Mojokerto,Jawa Timur


## \#16: Menata ulang Data Frame dengan mutiple indexes menggunakan `unstack()`

### Import Modules

In [84]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [85]:
df = pd.read_csv('./data/titanicfull.csv')
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


### Data Frame dengan multiple indexes dari hasil groupping

In [86]:
df.groupby(['sex', 'pclass'])['survived'].mean().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,pclass,Unnamed: 2_level_1
female,1,0.965278
female,2,0.886792
female,3,0.490741
male,1,0.340782
male,2,0.146199
male,3,0.15213


### Menata ulang Data Frame dengan mutiple indexes

In [87]:
df.groupby(['sex', 'pclass'])['survived'].mean().unstack()

pclass,1,2,3
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.965278,0.886792,0.490741
male,0.340782,0.146199,0.15213


## \#17: Resampling pada data deret waktu (time series data)

### Import Modules

In [88]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [89]:
n_rows = 365 * 24
n_cols = 2
cols = ['col1', 'col2']

df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)), 
                  columns=cols)

df.index = pd.util.testing.makeDateIndex(n_rows, freq='H')
df

Unnamed: 0,col1,col2
2000-01-01 00:00:00,1,7
2000-01-01 01:00:00,2,6
2000-01-01 02:00:00,11,3
2000-01-01 03:00:00,8,9
2000-01-01 04:00:00,11,17
...,...,...
2000-12-30 19:00:00,17,17
2000-12-30 20:00:00,16,10
2000-12-30 21:00:00,10,15
2000-12-30 22:00:00,3,17


### Resampling data dengan interval monthly

In [90]:
df.resample('M')['col1'].sum().to_frame()

Unnamed: 0,col1
2000-01-31,7418
2000-02-29,6888
2000-03-31,7906
2000-04-30,7023
2000-05-31,7265
2000-06-30,7072
2000-07-31,7244
2000-08-31,7452
2000-09-30,7123
2000-10-31,7365


### Resampling data dengan interval daily

In [91]:
df.resample('D')['col1'].sum().to_frame()

Unnamed: 0,col1
2000-01-01,213
2000-01-02,238
2000-01-03,242
2000-01-04,203
2000-01-05,243
...,...
2000-12-26,260
2000-12-27,241
2000-12-28,215
2000-12-29,269


## \#18: Membentuk dummy Data Frame

### Import Modules

In [92]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Membentuk Data Frame dari Dictionary

In [93]:
pd.DataFrame({'col1':[1, 2, 3, 4], 
              'col2':[5, 6, 7, 8]})

Unnamed: 0,col1,col2
0,1,5
1,2,6
2,3,7
3,4,8


### Membentuk Data Frame dari Numpy Array

In [94]:
n_rows = 5
n_cols = 3

arr = np.random.randint(1, 20, size=(n_rows, n_cols))
arr

array([[15,  5, 13],
       [ 5, 17,  9],
       [13, 16, 17],
       [ 8,  9, 15],
       [13,  3,  4]])

In [95]:
pd.DataFrame(arr, columns=tuple('ABC'))

Unnamed: 0,A,B,C
0,15,5,13
1,5,17,9
2,13,16,17
3,8,9,15
4,13,3,4


### Membentuk Data Frame dengan memanfaatkan  `pandas.util.testing`

In [96]:
pd.util.testing.makeDataFrame().head()

Unnamed: 0,A,B,C,D
znzNn4Jeas,-1.112513,-1.119364,-0.82674,-0.277271
qha1MbniZ6,-1.114907,-0.596305,-0.15557,-0.371433
ayTAkGdzIk,-0.475589,0.028404,0.716995,0.033821
mZW5g8qSPC,-0.595141,-0.156466,0.368502,0.284515
rky51M1deZ,-0.27602,0.991026,-0.615189,-1.705068


In [97]:
pd.util.testing.makeMixedDataFrame().head()

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [98]:
pd.util.testing.makeTimeDataFrame().head()

Unnamed: 0,A,B,C,D
2000-01-03,-0.023092,-1.96119,-1.285398,-0.252391
2000-01-04,-0.524673,-0.224623,1.492113,0.246267
2000-01-05,0.885415,0.958625,-2.392396,-1.634856
2000-01-06,-0.56675,0.138019,-0.051015,0.562098
2000-01-07,0.721107,0.871064,-0.154296,-0.214893


In [99]:
pd.util.testing.makeMissingDataframe().head()

Unnamed: 0,A,B,C,D
9jo4giYGrC,-0.337615,0.840105,,-0.159069
HsLJkDDKEU,1.247511,0.702684,1.460156,0.966604
s39dfAFBKx,0.971425,1.113199,0.482289,0.214438
Zv5bA6xdB7,,-0.285285,-1.928174,0.895978
4MtbiVCB8S,1.116907,-0.717136,0.943138,1.233834
