# Tips & Tricks Python Pandas

## 1. Prefix dan Suffix pada Kolom Data Frame

### Import Module

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Column labels: the five single letters 'A' through 'E'.
cols = tuple('ABCDE')

# 5x5 frame of random integers; randint's upper bound is exclusive,
# so the values fall in 1..9.
df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(5, 5)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,8,7,6,7,3
1,9,6,9,5,8
2,8,2,5,1,8
3,2,4,2,7,4
4,4,6,5,1,4


### Menyertakan Prefix

In [8]:
df.add_prefix('kolom_')

Unnamed: 0,kolom_A,kolom_B,kolom_C,kolom_D,kolom_E
0,8,7,6,7,3
1,9,6,9,5,8
2,8,2,5,1,8
3,2,4,2,7,4
4,4,6,5,1,4


### Menyertakan Suffix

In [12]:
df.add_suffix('_field')

Unnamed: 0,A_field,B_field,C_field,D_field,E_field
0,8,7,6,7,3
1,9,6,9,5,8
2,8,2,5,1,8
3,2,4,2,7,4
4,4,6,5,1,4


## 2. Pemilihan baris (row selection) pada Data Frame

### Import Module

In [13]:
import pandas as pd
import numpy as np

In [14]:
# Column labels: the five single letters 'A' through 'E'.
cols = tuple('ABCDE')

# 10x5 frame of random integers; the narrow 1..4 range (upper bound 5 is
# exclusive) makes repeated values likely, which suits row filtering.
df = pd.DataFrame(
    data=np.random.randint(low=1, high=5, size=(10, 5)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,4,4,4,3,2
1,2,2,3,2,3
2,2,2,1,2,4
3,4,2,4,2,4
4,4,4,3,4,4
5,4,2,1,1,3
6,4,4,4,3,4
7,1,3,4,1,1
8,2,3,2,4,1
9,2,1,1,3,3


### Selection dengan operator logika |

In [15]:
df[(df['A'] == 1) | (df['A'] == 3)]

Unnamed: 0,A,B,C,D,E
7,1,3,4,1,1


### Selection dengan fungsi .isin()

In [16]:
df[(df['A'].isin([1, 3]))]

Unnamed: 0,A,B,C,D,E
7,1,3,4,1,1


### Mengenal Operator negasi ~

In [17]:
df[~df['A'].isin([1,3])]

Unnamed: 0,A,B,C,D,E
0,4,4,4,3,2
1,2,2,3,2,3
2,2,2,1,2,4
3,4,2,4,2,4
4,4,4,3,4,4
5,4,2,1,1,3
6,4,4,4,3,4
8,2,3,2,4,1
9,2,1,1,3,3


## 3. Konversi tipe data String ke Numerik pada kolom Data Frame

### Import Module

In [18]:
import pandas as pd

### Persiapan Data Frame

In [19]:
# Every value is a string, digits included; 'teks' in col1 is the one entry
# that cannot be parsed as a number.
data = {
    'col1': ['1', '2', '3', 'teks'],
    'cols2': ['1', '2', '3', '4'],
}

df = pd.DataFrame(data=data)
df

Unnamed: 0,col1,cols2
0,1,1
1,2,2
2,3,3
3,teks,4


In [21]:
df.dtypes

col1     object
cols2    object
dtype: object

### Konversi tipe data dengan fungsi .astype()

In [24]:
df_x = df.astype({'cols2':'int'})
df_x

Unnamed: 0,col1,cols2
0,1,1
1,2,2
2,3,3
3,teks,4


In [25]:
df_x.dtypes

col1     object
cols2     int32
dtype: object

### Konversi tipe data numerik dengan fungsi to_numeric()

In [26]:
df.apply(pd.to_numeric, errors='coerce')

Unnamed: 0,col1,cols2
0,1.0,1
1,2.0,2
2,3.0,3
3,,4


## 4. Pemilihan kolom (Column Selection) pada Data Frame berdasarkan tipe data

### Import Module

In [27]:
import pandas as pd
import numpy as np

### Persiapan Data Frame

In [28]:
cols = ['bil_pecahan', 'bil_bulat']

# Two columns of random integers in 1..19 (upper bound 20 is exclusive).
df = pd.DataFrame(np.random.randint(1, 20, size=(5, 2)),
                  columns=cols)
# Cast the first column to float so the frame holds a float and an int column.
df['bil_pecahan'] = df['bil_pecahan'].astype('float')

# Five hourly timestamps starting at 2000-01-01 00:00, built with the public
# pd.date_range API. The previously used pd.util.testing.makeDateIndex was
# deprecated in pandas 1.0 and removed in pandas 2.0. The lowercase 'h' alias
# is used because the uppercase 'H' alias is deprecated since pandas 2.2.
df.index = pd.date_range('2000-01-01', periods=5, freq='h')
# reset_index() turns the unnamed datetime index into a column called 'index'.
df = df.reset_index()

# Add a string (object) column so four different dtypes are present.
df['teks'] = list('ABCDE')

df

  import pandas.util.testing


Unnamed: 0,index,bil_pecahan,bil_bulat,teks
0,2000-01-01 00:00:00,11.0,4,A
1,2000-01-01 01:00:00,11.0,11,B
2,2000-01-01 02:00:00,5.0,15,C
3,2000-01-01 03:00:00,10.0,16,D
4,2000-01-01 04:00:00,14.0,5,E


In [29]:
df.dtypes

index          datetime64[ns]
bil_pecahan           float64
bil_bulat               int32
teks                   object
dtype: object

### Memilih kolom bertipe data numerik

In [30]:
df.select_dtypes(include='number')

Unnamed: 0,bil_pecahan,bil_bulat
0,11.0,4
1,11.0,11
2,5.0,15
3,10.0,16
4,14.0,5


In [31]:
df.select_dtypes(include='float')

Unnamed: 0,bil_pecahan
0,11.0
1,11.0
2,5.0
3,10.0
4,14.0


In [32]:
df.select_dtypes(include='int')

Unnamed: 0,bil_bulat
0,4
1,11
2,15
3,16
4,5


### Memilih kolom bertipe data string atau object

In [33]:
df.select_dtypes(include='object')

Unnamed: 0,teks
0,A
1,B
2,C
3,D
4,E


### Memilih kolom bertipe data datetime

In [34]:
df.select_dtypes(include='datetime')

Unnamed: 0,index
0,2000-01-01 00:00:00
1,2000-01-01 01:00:00
2,2000-01-01 02:00:00
3,2000-01-01 03:00:00
4,2000-01-01 04:00:00


### Memilih kolom dengan kombinasi tipe data

In [35]:
df.select_dtypes(include=['number', 'object'])

Unnamed: 0,bil_pecahan,bil_bulat,teks
0,11.0,4,A
1,11.0,11,B
2,5.0,15,C
3,10.0,16,D
4,14.0,5,E


## 5. Membalik urutan baris dan kolom pada Data Frame

### Import Module

In [36]:
import pandas as pd
import numpy as np

### Persiapan Data Frame

In [38]:
# Column labels: the five single letters 'A' through 'E'.
cols = tuple('ABCDE')

# 5x5 frame of random integers in 1..9 (upper bound 10 is exclusive).
df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(5, 5)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,6,2,2,1,7
1,2,4,5,7,1
2,8,2,1,7,3
3,2,9,7,5,5
4,8,6,6,2,5


### Membalik urutan kolom

In [39]:
df.loc[:, ::-1]

Unnamed: 0,E,D,C,B,A
0,7,1,2,2,6
1,1,7,5,4,2
2,3,7,1,2,8
3,5,5,7,9,2
4,5,2,6,6,8


### Membalik urutan baris

In [40]:
df.loc[::-1]

Unnamed: 0,A,B,C,D,E
4,8,6,6,2,5
3,2,9,7,5,5
2,8,2,1,7,3
1,2,4,5,7,1
0,6,2,2,1,7


### Membalik urutan baris dan melakukan penyesuaian ulang index

In [41]:
df.loc[::-1].reset_index(drop=True)

Unnamed: 0,A,B,C,D,E
0,8,6,6,2,5
1,2,9,7,5,5
2,8,2,1,7,3
3,2,4,5,7,1
4,6,2,2,1,7


## 6. Mengganti nama (label) kolom pada Data Frame

### Import Module

In [42]:
import pandas as pd
import numpy as np

### Persiapan Data Frame

In [43]:
# Column labels: the five single letters 'A' through 'E'.
cols = tuple('ABCDE')

# 5x5 frame of random integers in 1..9 (upper bound 10 is exclusive).
df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(5, 5)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,1,8,3,7,7
1,9,8,9,5,2
2,3,7,4,2,2
3,4,6,2,1,6
4,9,5,2,7,5


### Mengganti nama (label) untuk sebuah kolom pada Data Frame

In [44]:
df.rename(columns={'C':'Hobi'})

Unnamed: 0,A,B,Hobi,D,E
0,1,8,3,7,7
1,9,8,9,5,2
2,3,7,4,2,2
3,4,6,2,1,6
4,9,5,2,7,5


### Mengganti nama (label) untuk banyak kolom pada Data Frame

In [45]:
df.rename(columns={'A':'Nama', 'B':'Alamat', 'D':'Kota'})

Unnamed: 0,Nama,Alamat,C,Kota,E
0,1,8,3,7,7
1,9,8,9,5,2
2,3,7,4,2,2
3,4,6,2,1,6
4,9,5,2,7,5


## 7. Menghapus Missing Value pada Data Frame

### Import Module

In [46]:
import pandas as pd

### Persiapan Data Frame

In [48]:
import string

import numpy as np

# pd.util.testing.makeMissingDataframe() was deprecated in pandas 1.0 and
# removed in pandas 2.0, so a comparable frame is built by hand: 30 rows of
# standard-normal values in columns A-D, random 10-character row labels, and
# roughly 10% of the cells replaced by NaN.
n_baris = 30
kolom = list('ABCD')
karakter = list(string.ascii_letters + string.digits)

nilai = np.random.randn(n_baris, len(kolom))
# Each cell independently becomes missing with probability of about 0.1.
nilai[np.random.rand(n_baris, len(kolom)) > 0.9] = np.nan
label = [''.join(np.random.choice(karakter, 10)) for _ in range(n_baris)]

# reset_index() moves the unnamed row labels into a column called 'index'.
df = pd.DataFrame(nilai, index=label, columns=kolom).reset_index()
df.head()

Unnamed: 0,index,A,B,C,D
0,7Xu3zMS06S,-0.186821,-0.170065,0.053049,0.562921
1,qN58PlNUMm,0.973759,,-2.170181,1.650703
2,tmj7QlJhXs,-0.437723,-1.870445,-0.640038,0.123894
3,pyCLrsQzAV,,-0.34237,-1.126662,0.267317
4,HEJrXyXwyD,0.390308,-1.157776,-0.634828,0.266476


In [49]:
df = df.rename(columns={'index':'Z'})
df.head()

Unnamed: 0,Z,A,B,C,D
0,7Xu3zMS06S,-0.186821,-0.170065,0.053049,0.562921
1,qN58PlNUMm,0.973759,,-2.170181,1.650703
2,tmj7QlJhXs,-0.437723,-1.870445,-0.640038,0.123894
3,pyCLrsQzAV,,-0.34237,-1.126662,0.267317
4,HEJrXyXwyD,0.390308,-1.157776,-0.634828,0.266476


In [50]:
df_backup = df.copy(deep=True)

### Menghapus (drop) setiap kolom yang mengandung Missing Value

In [51]:
df = df.dropna(axis='columns')
df.head()

Unnamed: 0,Z
0,7Xu3zMS06S
1,qN58PlNUMm
2,tmj7QlJhXs
3,pyCLrsQzAV
4,HEJrXyXwyD


### Menghapus (drop) setiap baris yang mengandung Missing Value

In [52]:
df = df_backup.copy(deep=True)
df = df.dropna(axis='rows')
df.head()

Unnamed: 0,Z,A,B,C,D
0,7Xu3zMS06S,-0.186821,-0.170065,0.053049,0.562921
2,tmj7QlJhXs,-0.437723,-1.870445,-0.640038,0.123894
4,HEJrXyXwyD,0.390308,-1.157776,-0.634828,0.266476
5,9zANmI6zzQ,1.47045,-0.01647,-0.108545,0.615694
6,A8PxO4Puga,-0.643421,1.607102,0.015956,-3.455305


### Persentase Missing Value untuk setiap kolom

In [53]:
df = df_backup.copy(deep=True)
df.isna().mean()

Z    0.000000
A    0.066667
B    0.066667
C    0.066667
D    0.200000
dtype: float64

### Menghapus (drop) setiap kolom yang mengandung Missing Value berdasarkan Threshold

In [55]:
# Minimum number of non-missing values a column needs in order to be kept:
# 90% of the row count.
treshold = 0.9 * len(df)

# Drop every column whose count of non-missing values is below that minimum.
df = df.dropna(axis='columns', thresh=treshold)
df.head()

Unnamed: 0,Z,A,B,C
0,7Xu3zMS06S,-0.186821,-0.170065,0.053049
1,qN58PlNUMm,0.973759,,-2.170181
2,tmj7QlJhXs,-0.437723,-1.870445,-0.640038
3,pyCLrsQzAV,,-0.34237,-1.126662
4,HEJrXyXwyD,0.390308,-1.157776,-0.634828


## 8. Memeriksa Kesamaan antar dua kolom (Series) pada Data Frame

### Import Module

In [56]:
import pandas as pd
import numpy as np

### Persiapan Data Frame

In [57]:
# Columns 'A' and 'B' hold the same values, including a NaN in the fourth
# position; each key gets its own list object.
data = {label: [15, 15, 18, np.nan, 12] for label in 'AB'}

df = pd.DataFrame(data=data)
df

Unnamed: 0,A,B
0,15.0,15.0
1,15.0,15.0
2,18.0,18.0
3,,
4,12.0,12.0


### Mengenal Pandas Series

In [58]:
df['A']

0    15.0
1    15.0
2    18.0
3     NaN
4    12.0
Name: A, dtype: float64

In [59]:
type(df['A'])

pandas.core.series.Series

In [60]:
type(df)

pandas.core.frame.DataFrame

### Memeriksa kesamaan dengan operator ==

In [61]:
df['A'] == df['B']

0     True
1     True
2     True
3    False
4     True
dtype: bool

### Memeriksa kesamaan dengan method .equals()

In [62]:
df['A'].equals(df['B'])

True

### Memeriksa kesamaan antar dua Data Frame

In [63]:
df1 = df.copy(deep=True)
df.equals(df1)

True

In [64]:
df == df1

Unnamed: 0,A,B
0,True,True
1,True,True
2,True,True
3,False,False
4,True,True


## 9. Membagi Data Frame menjadi dua secara acak

### Import Module

In [65]:
import pandas as pd
import numpy as np

### Persiapan Data Frame

In [67]:
# Column labels: the five single letters 'A' through 'E'.
cols = tuple('ABCDE')

# 10x5 frame of random integers in 1..19 (upper bound 20 is exclusive).
df = pd.DataFrame(
    data=np.random.randint(low=1, high=20, size=(10, 5)),
    columns=cols,
)
df

Unnamed: 0,A,B,C,D,E
0,5,5,12,6,9
1,7,7,11,11,17
2,19,11,4,10,7
3,2,1,1,13,13
4,14,10,11,15,7
5,18,11,19,7,1
6,18,10,15,12,18
7,17,14,6,9,15
8,19,4,3,15,9
9,10,13,12,11,10


### Membagi Data Frame menjadi dua secara acak berdasarkan proporsi tertentu

In [68]:
df.shape

(10, 5)

In [70]:
# Fraction of the rows that goes into the first part.
proporsi = 0.7

# Draw that fraction of rows at random (no seed is set, so the selection
# differs between runs).
df_1 = df.sample(frac=proporsi)
# The second part is whatever remains: drop the sampled row labels from df.
df_2 = df.drop(index=df_1.index)

print(f'df_1 shape: {df_1.shape}')
print(f'df_2 shape: {df_2.shape}')

df_1 shape: (7, 5)
df_2 shape: (3, 5)


In [71]:
df_1

Unnamed: 0,A,B,C,D,E
7,17,14,6,9,15
2,19,11,4,10,7
6,18,10,15,12,18
9,10,13,12,11,10
4,14,10,11,15,7
8,19,4,3,15,9
3,2,1,1,13,13


In [72]:
df_2

Unnamed: 0,A,B,C,D,E
0,5,5,12,6,9
1,7,7,11,11,17
5,18,11,19,7,1


## 10. Mengganti nama (label) kolom pada Data Frame berdasarkan pola

### Import Module

In [73]:
import pandas as pd

### Persiapan Data Frame

In [75]:
# Load the dataset from the working directory.
df = pd.read_csv('titanic.csv')

# Replace all eleven labels with deliberately inconsistent ones: mixed case,
# embedded spaces, and stray whitespace in 'Sex ' and ' Age'.
df.columns = [
    'Pclass',
    'Survival status',
    'full name',
    'Sex ',
    ' Age',
    'Sib SP',
    'Parch',
    'Ticket',
    'Fare',
    'Cabin',
    'Embarked',
]

# Keep an independent copy so the untidy labels can be restored later.
df_backup = df.copy(deep=True)

df.head()

Unnamed: 0,Pclass,Survival status,full name,Sex,Age,Sib SP,Parch,Ticket,Fare,Cabin,Embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


### Menggunakan lowercase untuk nama kolom dan mengganti spasi dengan _

In [76]:
df.columns = df.columns.str.replace(' ','_').str.lower()
df.head()

Unnamed: 0,pclass,survival_status,full_name,sex_,_age,sib_sp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


### Memangkas kelebihan spasi pada nama kolom

In [77]:
# Start again from the untouched copy of the frame.
df = df_backup.copy(deep=True)

# Clean the labels in three steps: lowercase them, strip leading/trailing
# whitespace, then turn the remaining inner spaces into underscores.
# Stripping before replacing keeps stray edge spaces from becoming underscores.
nama_kolom = df.columns.str.lower()
nama_kolom = nama_kolom.str.strip()
df.columns = nama_kolom.str.replace(' ', '_')
df.head()

Unnamed: 0,pclass,survival_status,full_name,sex,age,sib_sp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S
