![](./image/pandas-logo.png)

# Python Pandas: Tips & Tricks

Oleh Channel YouTube [Indonesia Belajar](https://www.youtube.com/IndonesiaBelajarKomputer)

## \#01: Menyertakan Prefix dan Suffix pada seluruh Kolom Data Frame

### Import Modules

In [None]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

### Persiapan Data Frame

In [None]:
# Shape and column labels of the demo frame.
n_rows, n_cols = 5, 5
cols = tuple('ABCDE')

# Fill the frame with random integers from 1 to 9 (randint's `high` is exclusive).
df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(n_rows, n_cols)),
    columns=cols,
)
df

In [None]:
# A string is iterable character by character, so tuple() turns 'ABCDE' into
# one element per letter; this is the expression used to build `cols`.
tuple('ABCDE')

### Menyertakan Prefix Kolom

In [None]:
# Show df with 'kolom_' prepended to every column label.
# The result is only displayed; it is not assigned back to df.
df.add_prefix('kolom_')

### Menyertakan Suffix Kolom

In [None]:
# Show df with '_field' appended to every column label.
# The result is only displayed; it is not assigned back to df.
df.add_suffix('_field')

## \#02: Pemilihan baris (rows selection) pada Data Frame

### Import Modules

In [None]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

### Persiapan Data Frame

In [None]:
# Shape and column labels of the demo frame.
n_rows, n_cols = 10, 5
cols = tuple('ABCDE')

# Values are drawn from 1 to 4 (randint's `high` is exclusive), so each value
# repeats often enough for the row filters below to match something.
df = pd.DataFrame(
    data=np.random.randint(low=1, high=5, size=(n_rows, n_cols)),
    columns=cols,
)
df

### Selection dengan operator logika `|`

In [None]:
# Keep the rows where column A equals 1 or 3.
# Each comparison is parenthesised because `|` binds more tightly than `==`.
df[(df['A'] == 1) | (df['A'] == 3)]

### Selection dengan fungsi `isin()`

In [None]:
# Same filter as the `|` version, written as a membership test against a list
# of accepted values.
df[df['A'].isin([1, 3])]

### Mengenal operator negasi `~`

In [None]:
# `~` inverts the boolean mask, so this keeps the rows where A is neither 1 nor 3.
df[~df['A'].isin([1, 3])]

## \#03: Konversi tipe data String ke Numerik pada kolom Data Frame

### Import Modules

In [None]:
import pandas as pd

print(pd.__version__)

### Persiapan Data Frame

In [None]:
# Both columns hold strings; col1 deliberately contains one value ('teks')
# that is not a number.
data = dict(
    col1=['1', '2', '3', 'teks'],
    col2=['1', '2', '3', '4'],
)

df = pd.DataFrame(data=data)
df

In [None]:
df.dtypes

### Konversi tipe data dengan fungsi `astype()`

In [None]:
# Convert only col2 to an integer dtype via a {column: dtype} mapping.
# col1 is left out of the mapping; it contains the non-numeric string 'teks'.
df_x = df.astype({'col2':'int'})
df_x

In [None]:
df_x.dtypes

### Konversi tipe data numerik dengan fungsi `to_numeric()`

In [None]:
# Run pd.to_numeric on each column; the extra keyword errors='coerce' is passed
# through to pd.to_numeric. Per the pandas documentation, 'coerce' turns values
# that cannot be parsed (here: 'teks') into NaN instead of raising.
df.apply(pd.to_numeric, errors='coerce')

## \#04: Pemilihan kolom (columns selection) pada Data Frame berdasarkan tipe data

### Import Modules

In [None]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

### Persiapan Data Frame

In [None]:
# Shape and column labels of the demo frame.
n_rows = 5
n_cols = 2
cols = ['bil_pecahan', 'bil_bulat']

# Two columns of random integers from 1 to 19; the first one is then cast to
# float so that the frame contains both a float and an integer column.
df = pd.DataFrame(np.random.randint(1, 20, size=(n_rows, n_cols)),
                  columns=cols)
df['bil_pecahan'] = df['bil_pecahan'].astype('float')

# Hourly timestamps starting at 2000-01-01. pd.date_range is the public API for
# this; the former pd.util.testing.makeDateIndex helper has been deprecated
# since pandas 1.0 and is no longer available in pandas 2.x.
df.index = pd.date_range(start='2000-01-01', periods=n_rows, freq='h')
# reset_index() moves the timestamps into a regular column named 'index'.
df = df.reset_index()

# One string column, so that every dtype selected below is present.
df['teks'] = list('ABCDE')

df

In [None]:
df.dtypes

### Memilih kolom bertipe data numerik

In [None]:
# 'number' is the generic numeric selector; it should match both the float
# column `bil_pecahan` and the integer column `bil_bulat`.
df.select_dtypes(include='number')

In [None]:
# Narrow the selection to floating-point columns (`bil_pecahan` was cast to
# float when the frame was prepared).
df.select_dtypes(include='float')

In [None]:
# Narrow the selection to integer columns; this should match `bil_bulat`,
# which keeps the integer dtype produced by np.random.randint.
df.select_dtypes(include='int')

### Memilih kolom bertipe data string atau `object`

In [None]:
# Select columns of dtype `object`. Presumably this matches `teks`, the column
# of single-character strings -- confirm against `df.dtypes` for the pandas
# version in use.
df.select_dtypes(include='object')

### Memilih kolom bertipe data `datetime`

In [None]:
# Select datetime columns; in this frame that is the `index` column that
# reset_index() created from the hourly date index.
df.select_dtypes(include='datetime')

### Memilih kolom dengan kombinasi tipe data

In [None]:
# Pass a list to combine several dtype selectors in a single call.
df.select_dtypes(include=['number', 'object'])

## \#05: Membalik urutan baris dan kolom pada Data Frame

### Import Modules

In [None]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

### Persiapan Data Frame

In [None]:
# Shape and column labels of the demo frame.
n_rows, n_cols = 5, 5
cols = tuple('ABCDE')

# Fill the frame with random integers from 1 to 9 (randint's `high` is exclusive).
df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(n_rows, n_cols)),
    columns=cols,
)
df

### Membalik urutan kolom

In [None]:
# Take all rows (`:`) and step through the columns backwards (`::-1`).
df.loc[:, ::-1]

### Membalik urutan baris

In [None]:
# Step through the rows backwards; no index adjustment is applied here, that is
# done separately in the next example.
df.loc[::-1]

### Membalik urutan baris dan melakukan penyesuaian ulang `index`

In [None]:
# Reverse the rows, then rebuild the index. Per the pandas documentation,
# drop=True discards the old labels instead of inserting them as a column.
df.loc[::-1].reset_index(drop=True)

## \#06: Mengganti nama (label) kolom pada Data Frame

### Import Modules

In [1]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [2]:
# Shape and column labels of the demo frame.
n_rows, n_cols = 5, 5
cols = tuple('ABCDE')

# Fill the frame with random integers from 1 to 9 (randint's `high` is exclusive).
df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(n_rows, n_cols)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,4,1,3,3,2
1,2,7,9,7,5
2,8,3,4,8,4
3,2,7,2,8,4
4,2,5,1,5,4


### Mengganti nama (label) untuk sebuah kolom pada Data Frame

In [3]:
# Rename only column C to 'Hobi' via an {old: new} mapping.
# The result is displayed but not assigned, so df itself keeps its label C
# (the following example still shows a column named C).
df.rename(columns={'C':'Hobi'})

Unnamed: 0,A,B,Hobi,D,E
0,4,1,3,3,2
1,2,7,9,7,5
2,8,3,4,8,4
3,2,7,2,8,4
4,2,5,1,5,4


### Mengganti nama (label) untuk banyak kolom pada Data Frame

In [4]:
# Several labels can be renamed in one call by listing more old -> new pairs;
# columns that are not in the mapping (C and E) keep their labels.
df.rename(columns={'A':'Nama', 'B':'Alamat', 'D':'Kota'})

Unnamed: 0,Nama,Alamat,C,Kota,E
0,4,1,3,3,2
1,2,7,9,7,5
2,8,3,4,8,4
3,2,7,2,8,4
4,2,5,1,5,4


## \#07: Menghapus (drop) missing values (`NaN`)

### Import Modules

In [1]:
import pandas as pd

print(pd.__version__)

1.0.3


### Persiapan Data Frame

In [9]:
# Imported locally so that this section runs on its own: its import cell only
# brings in pandas.
import string

import numpy as np

# Build the demo frame by hand. The former pd.util.testing.makeMissingDataframe
# helper has been deprecated since pandas 1.0 and is no longer available in
# pandas 2.x. The shape mirrors what that helper produced: 30 rows, four float
# columns A-D, and a random 10-character string label per row.
n_rows = 30
cols = list('ABCD')

values = np.random.randn(n_rows, len(cols))
# Blank out roughly 10% of the cells so there is something for dropna to remove.
values[np.random.rand(*values.shape) > 0.9] = np.nan

alphabet = list(string.ascii_letters + string.digits)
labels = [''.join(np.random.choice(alphabet, size=10)) for _ in range(n_rows)]

# reset_index() moves the string labels into a regular column named 'index'.
df = pd.DataFrame(values, index=labels, columns=cols).reset_index()
df.head()

Unnamed: 0,index,A,B,C,D
0,DVsnREfwqA,1.04355,-0.382004,-0.017651,1.290818
1,eD2HAwfH2G,,0.698998,1.387639,
2,ZAr0UQrZCQ,-0.03948,-0.598128,-1.128054,1.658062
3,q0nhFGrHzq,-0.638876,-0.958693,-0.104764,0.854686
4,YME0TBmNCd,0.289891,1.592017,-1.212892,0.899887


In [10]:
# reset_index() stored the former row labels in a column named 'index';
# give that column the short label 'Z'.
df = df.rename(columns={'index':'Z'})
df.head()

Unnamed: 0,Z,A,B,C,D
0,DVsnREfwqA,1.04355,-0.382004,-0.017651,1.290818
1,eD2HAwfH2G,,0.698998,1.387639,
2,ZAr0UQrZCQ,-0.03948,-0.598128,-1.128054,1.658062
3,q0nhFGrHzq,-0.638876,-0.958693,-0.104764,0.854686
4,YME0TBmNCd,0.289891,1.592017,-1.212892,0.899887


In [11]:
# Keep an independent copy of the prepared frame: the dropna examples below
# overwrite `df`, and each of them restores it from this backup.
df_backup = df.copy(deep=True)

### Menghapus (drop) setiap kolom yang mengandung missing values

In [12]:
# Drop every column that holds at least one missing value (how='any' is the
# default, spelled out here). In the recorded run only the label column Z
# survives.
df = df.dropna(axis='columns', how='any')
df.head()

Unnamed: 0,Z
0,DVsnREfwqA
1,eD2HAwfH2G
2,ZAr0UQrZCQ
3,q0nhFGrHzq
4,YME0TBmNCd


### Menghapus (drop) setiap baris yang mengandung missing values

In [13]:
# Restart from the untouched backup and drop every row that contains a missing
# value, in a single chained expression.
df = df_backup.copy(deep=True).dropna(axis='rows')
df.head()

Unnamed: 0,Z,A,B,C,D
0,DVsnREfwqA,1.04355,-0.382004,-0.017651,1.290818
2,ZAr0UQrZCQ,-0.03948,-0.598128,-1.128054,1.658062
3,q0nhFGrHzq,-0.638876,-0.958693,-0.104764,0.854686
4,YME0TBmNCd,0.289891,1.592017,-1.212892,0.899887
6,82xl4dRYhC,-0.027386,-1.085311,-1.401576,0.306863


### Proporsi missing values untuk tiap kolom (kalikan 100 untuk persentase)

In [14]:
# Restore the full frame from the backup first.
df = df_backup.copy(deep=True)
# isna() flags missing cells as True; the column-wise mean of those flags is the
# share of missing values per column, expressed as a fraction between 0 and 1
# (multiply by 100 for a percentage).
df.isna().mean()

Z    0.000000
A    0.066667
B    0.133333
C    0.066667
D    0.133333
dtype: float64

### Menghapus (drop) setiap kolom yang mengandung missing values berdasarkan threshold

In [15]:
# Minimum number of non-missing values a column must have to be kept:
# 90% of the row count.
# NOTE: `treshold` is a misspelling of "threshold"; the name is left unchanged.
treshold = len(df) * 0.9
# Per the pandas documentation, `thresh` is the required count of non-NA values.
# In the recorded run, B and D (about 13% missing) fall short and are dropped,
# while A and C (about 7% missing) are kept.
df = df.dropna(thresh=treshold, axis='columns')
df.head()

Unnamed: 0,Z,A,C
0,DVsnREfwqA,1.04355,-0.017651
1,eD2HAwfH2G,,1.387639
2,ZAr0UQrZCQ,-0.03948,-1.128054
3,q0nhFGrHzq,-0.638876,-0.104764
4,YME0TBmNCd,0.289891,-1.212892


## \#08: Memeriksa kesamaan antar dua buah kolom (Series) pada Data Frame

### Import Modules

In [16]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [20]:
# Two columns with identical content, including a NaN in the same position
# (row 3), to contrast `==` with equals() below.
data = {label: [15, 15, 18, np.nan, 12] for label in ('A', 'B')}

df = pd.DataFrame(data=data)
df

Unnamed: 0,A,B
0,15.0,15.0
1,15.0,15.0
2,18.0,18.0
3,,
4,12.0,12.0


### Mengenal Pandas Series

In [22]:
df['A']

0    15.0
1    15.0
2    18.0
3     NaN
4    12.0
Name: A, dtype: float64

In [23]:
type(df['A'])

pandas.core.series.Series

In [24]:
type(df)

pandas.core.frame.DataFrame

### Memeriksa kesamaan dengan operator `==`

In [32]:
# Element-wise comparison: one boolean per row.
# Row 3 comes out False even though both columns hold NaN there, because NaN
# never compares equal to NaN.
df['A'] == df['B']

0     True
1     True
2     True
3    False
4     True
dtype: bool

### Memeriksa kesamaan dengan method `equals()`

In [27]:
# equals() compares the two Series as a whole and returns a single bool.
# Unlike `==`, it counts NaN in the same position as equal, so the result is True.
df['A'].equals(df['B'])

True

### Memeriksa kesamaan antar dua Data Frame

In [30]:
# Make an independent copy of the frame to compare against.
df1 = df.copy(deep=True)

# equals() also works on whole DataFrames and returns a single bool.
df.equals(df1)

True

In [31]:
# `==` on two DataFrames compares cell by cell and returns a boolean frame;
# the NaN cells in row 3 compare as False, as in the Series example.
df == df1

Unnamed: 0,A,B
0,True,True
1,True,True
2,True,True
3,False,False
4,True,True


## \#09: Membagi Data Frame menjadi dua secara acak

### Import Modules

In [37]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

1.0.3
1.18.1


### Persiapan Data Frame

In [36]:
# Shape and column labels of the demo frame.
n_rows, n_cols = 10, 5
cols = tuple('ABCDE')

# Fill the frame with random integers from 1 to 19 (randint's `high` is exclusive).
df = pd.DataFrame(
    data=np.random.randint(low=1, high=20, size=(n_rows, n_cols)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,14,14,14,3,9
1,2,5,19,9,11
2,18,5,17,3,7
3,9,7,11,7,10
4,13,14,16,1,14
5,14,8,4,11,2
6,11,14,3,16,7
7,13,6,10,5,4
8,14,1,17,17,7
9,3,18,9,17,18


### Membagi Data Frame menjadi dua secara acak berdasarkan proporsi tertentu

In [38]:
df.shape

(10, 5)

In [47]:
# Share of rows that goes into the first part.
proporsi = 0.7

# Draw a random 70% of the rows, then remove exactly those row labels from the
# original frame to obtain the remaining 30%.
df_1 = df.sample(frac=proporsi)
df_2 = df.drop(index=df_1.index)

print(f'df_1 shape: {df_1.shape}')
print(f'df_2 shape: {df_2.shape}')

df_1 shape: (7, 5)
df_2 shape: (3, 5)


In [48]:
df_1

Unnamed: 0,A,B,C,D,E
1,2,5,19,9,11
2,18,5,17,3,7
7,13,6,10,5,4
3,9,7,11,7,10
6,11,14,3,16,7
0,14,14,14,3,9
4,13,14,16,1,14


In [49]:
df_2

Unnamed: 0,A,B,C,D,E
5,14,8,4,11,2
8,14,1,17,17,7
9,3,18,9,17,18
