`November 17, 2022`

### **Pandas Manipulation**

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Seed NumPy's global RNG so this sample frame holds the same numbers on every
# run (Restart Kernel -> Run All reproduces the displayed values).
np.random.seed(0)

# 5x4 frame of random integers in [10, 100) with columns A, B, C, D.
df = pd.DataFrame(data=np.random.randint(10, 100, (5, 4)),
                  columns='A B C D'.split()
)

df

Unnamed: 0,A,B,C,D
0,77,21,55,77
1,54,93,66,97
2,43,45,85,35
3,65,16,93,11
4,36,60,76,94


**Adding Data**

    Membuat baris baru

In [None]:
# When a new row is added: if the existing data is int and the new values are
# float, the affected columns are converted to float.

# Conversely, if the existing data is float and the new values are int, the
# new values are converted to float.

# Demonstrate on a copy: the following cells still need the original 5-row
# integer `df` (they add a 5-value column 'E' and drop row label 4), so
# enlarging `df` itself would make them fail on Restart Kernel -> Run All.
df_new_row = df.copy()
df_new_row.loc['new'] = [1.2, 3.3, 45.6, 10.2]
df_new_row

Unnamed: 0,A,B,C,D
0,47.0,31.0,71.0,66.0
1,85.0,56.0,12.0,74.0
2,56.0,51.0,71.0,71.0
3,82.0,74.0,61.0,43.0
4,39.0,33.0,57.0,59.0
new,1.2,3.3,45.6,10.2


    Membuat kolom baru

In [None]:
# Adding a new column does not change the dtype of the existing columns:
# the integer columns stay integer even though the new column holds floats.

df['E'] = [1.2, 3.3, 45.6, 10.2, 98.7]
df

Unnamed: 0,A,B,C,D,E
0,72,33,34,74,1.2
1,70,98,45,14,3.3
2,29,59,60,96,45.6
3,69,12,37,81,10.2
4,64,13,86,31,98.7


In [None]:
# Insert a new column at a specific position with .insert()
# (position 0 here, i.e. as the first column). This modifies df directly.
df.insert(0, 'New Col', [10, 20, 30, 40, 50])
df

Unnamed: 0,New Col,A,B,C,D,E
0,10,72,33,34,74,1.2
1,20,70,98,45,14,3.3
2,30,29,59,60,96,45.6
3,40,69,12,37,81,10.2
4,50,64,13,86,31,98.7


**Removing Data**

    Menghapus kolom

In [None]:
# Drop a column for this cell's output only (not permanent):
# .drop() is not in-place by default, so df itself is left unchanged.
# Columns --> axis = 1

df.drop('New Col', axis=1)

Unnamed: 0,A,B,C,D,E
0,72,33,34,74,1.2
1,70,98,45,14,3.3
2,29,59,60,96,45.6
3,69,12,37,81,10.2
4,64,13,86,31,98.7


In [None]:
# To make the drop permanent, add the parameter inplace=True
df.drop('New Col', axis=1, inplace=True)
df

Unnamed: 0,A,B,C,D,E
0,72,33,34,74,1.2
1,70,98,45,14,3.3
2,29,59,60,96,45.6
3,69,12,37,81,10.2
4,64,13,86,31,98.7


In [None]:
# Alternative to inplace=True: assign the result back to the same variable
df = df.drop('E', axis=1)
df

Unnamed: 0,A,B,C,D
0,72,33,34,74
1,70,98,45,14
2,29,59,60,96
3,69,12,37,81
4,64,13,86,31


    Menghapus baris

In [None]:
# Drop a row without making it permanent (df itself is left unchanged)
# Rows --> axis = 0

df.drop(4, axis=0)

Unnamed: 0,A,B,C,D
0,72,33,34,74
1,70,98,45,14
2,29,59,60,96
3,69,12,37,81


In [None]:
# To make the drop permanent, add the parameter inplace=True
    # or assign the result back to the same variable

df.drop(4, axis=0, inplace=True)
df

Unnamed: 0,A,B,C,D
0,77,21,55,77
1,54,93,66,97
2,43,45,85,35
3,65,16,93,11


#### **Index & Multi-Index**

In [None]:
# Turn a column into the index with set_index()
# (done on a copy so that df keeps its default index)
df_new = df.copy()
df_new.set_index('A', inplace=True)
df_new

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
77,21,55,77
54,93,66,97
43,45,85,35
65,16,93,11


In [None]:
# Restore the default integer index with reset_index();
# the former index 'A' becomes a regular column again
df_new.reset_index(inplace=True)
df_new

Unnamed: 0,A,B,C,D
0,77,21,55,77
1,54,93,66,97
2,43,45,85,35
3,65,16,93,11


In [None]:
# Rename a column (mapping of old name -> new name)
df_new.rename(columns={'A':'X'}, inplace=True)
df_new

Unnamed: 0,X,B,C,D
0,77,21,55,77
1,54,93,66,97
2,43,45,85,35
3,65,16,93,11


In [None]:
# To rename several columns at once, put every old -> new pair in the mapping
df_new.rename(columns={'B':'Y', 'C':'Z'}, inplace=True)
df_new

Unnamed: 0,X,Y,Z,D
0,77,21,55,77
1,54,93,66,97
2,43,45,85,35
3,65,16,93,11


    Multi-Index

In [None]:
# Outer level: city name, repeated once per location.
outside = ['Jakarta'] * 3 + ['Bandung'] * 3
# Inner level: location number within each city.
inside = [1, 2, 3] * 2

# Pair the two levels element-wise into (city, location) tuples.
hier_index = [(city, location) for city, location in zip(outside, inside)]
hier_index

[('Jakarta', 1),
 ('Jakarta', 2),
 ('Jakarta', 3),
 ('Bandung', 1),
 ('Bandung', 2),
 ('Bandung', 3)]

In [None]:
# Check the type of hier_index before it is converted to a MultiIndex
type(hier_index)

list

In [None]:
# Convert the list of tuples into a MultiIndex
# (note: this rebinds hier_index from a list to a pandas MultiIndex)
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

MultiIndex([('Jakarta', 1),
            ('Jakarta', 2),
            ('Jakarta', 3),
            ('Bandung', 1),
            ('Bandung', 2),
            ('Bandung', 3)],
           )

In [None]:
# Check the type of hier_index again after the conversion
type(hier_index)

pandas.core.indexes.multi.MultiIndex

In [None]:
# Fix the seed so the random numbers stay the same when the cell is re-run;
# any integer can be used as the seed.
np.random.seed(1)

# 6x2 array of random integers in [30, 100) used to fill the DataFrame.
values = np.random.randint(low=30, high=100, size=(6, 2))
values

array([[67, 42],
       [39, 35],
       [94, 46],
       [31, 36],
       [55, 80],
       [50, 48]])

In [None]:
# Build a DataFrame whose row index is the two-level MultiIndex
df_multi = pd.DataFrame(data=values, 
                        index=hier_index, 
                        columns=['Restaurant A', 'Restaurant B'])

df_multi

Unnamed: 0,Unnamed: 1,Restaurant A,Restaurant B
Jakarta,1,67,42
Jakarta,2,39,35
Jakarta,3,94,46
Bandung,1,31,36
Bandung,2,55,80
Bandung,3,50,48


In [None]:
# Column indexing (a list of names returns a DataFrame)
df_multi[['Restaurant A']]

Unnamed: 0,Unnamed: 1,Restaurant A
Jakarta,1,67
Jakarta,2,39
Jakarta,3,94
Bandung,1,31
Bandung,2,55
Bandung,3,50


In [None]:
# Same selection using .loc: all rows, only column 'Restaurant A'
df_multi.loc[:, ['Restaurant A']]

Unnamed: 0,Unnamed: 1,Restaurant A
Jakarta,1,67
Jakarta,2,39
Jakarta,3,94
Bandung,1,31
Bandung,2,55
Bandung,3,50


In [None]:
# Row indexing: all rows under the outer index label 'Bandung'
df_multi.loc['Bandung']

Unnamed: 0,Restaurant A,Restaurant B
1,31,36
2,55,80
3,50,48


In [None]:
# Positional row slicing: rows at positions 0 through 3
df_multi.iloc[0:4]

Unnamed: 0,Unnamed: 1,Restaurant A,Restaurant B
Jakarta,1,67,42
Jakarta,2,39,35
Jakarta,3,94,46
Bandung,1,31,36


In [None]:
# Rows under 'Bandung', restricted to column 'Restaurant A'
df_multi.loc['Bandung', ['Restaurant A']]

Unnamed: 0,Restaurant A
1,31
2,55
3,50


In [None]:
# Indexing with .xs (cross-section)
# A convenient way to access data in a DataFrame that has a multi-index;
# here the full (outer, inner) key selects a single row.

df_multi.xs(('Bandung', 2))

Restaurant A    55
Restaurant B    80
Name: (Bandung, 2), dtype: int32

In [None]:
# To access a single value, address it by label: the full (city, location)
# row key plus the column name.
# Example: Restaurant A, location 3 in Jakarta.
# (The chained form df_multi['Restaurant A'][2] only worked because an integer
# key fell back to positional lookup, which pandas has deprecated.)
df_multi.loc[('Jakarta', 3), 'Restaurant A']

94

In [None]:
# Show the names of the row-index levels (None until they are set)
df_multi.index.names

FrozenList([None, None])

In [None]:
# Give each index level a name
df_multi.index.names = ['City', 'Location']
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,Restaurant A,Restaurant B
City,Location,Unnamed: 2_level_1,Unnamed: 3_level_1
Jakarta,1,67,42
Jakarta,2,39,35
Jakarta,3,94,46
Bandung,1,31,36
Bandung,2,55,80
Bandung,3,50,48


In [None]:
# Confirm the index level names after assigning them
df_multi.index.names

FrozenList(['City', 'Location'])

In [None]:
# Select location 3 for every city (cross-section on the 'Location' level)
df_multi.xs(3, level='Location')

Unnamed: 0_level_0,Restaurant A,Restaurant B
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Jakarta,94,46
Bandung,50,48


In [None]:
# Same selection, passing the index level's position (0-based) instead of
# its name to the level parameter
df_multi.xs(3, level=1)

Unnamed: 0_level_0,Restaurant A,Restaurant B
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Jakarta,94,46
Bandung,50,48


#### **Sorting**

In [None]:
# Move the index levels back into regular columns before sorting.
# inplace=True changes df_multi itself (permanent); the sort_values/sort_index
# calls in the following cells are the non-permanent operations.
df_multi.reset_index(inplace=True)
df_multi

Unnamed: 0,City,Location,Restaurant A,Restaurant B
0,Jakarta,1,67,42
1,Jakarta,2,39,35
2,Jakarta,3,94,46
3,Bandung,1,31,36
4,Bandung,2,55,80
5,Bandung,3,50,48


In [None]:
# Use 'City' as the row index; inplace=True modifies df_multi directly
df_multi.set_index('City', inplace=True)
df_multi

Unnamed: 0_level_0,Location,Restaurant A,Restaurant B
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jakarta,1,67,42
Jakarta,2,39,35
Jakarta,3,94,46
Bandung,1,31,36
Bandung,2,55,80
Bandung,3,50,48


In [None]:
# Sort rows by 'Restaurant A'; by default ascending=True (smallest first)
df_multi.sort_values(by=['Restaurant A'])

Unnamed: 0_level_0,Location,Restaurant A,Restaurant B
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bandung,1,31,36
Jakarta,2,39,35
Bandung,3,50,48
Bandung,2,55,80
Jakarta,1,67,42
Jakarta,3,94,46


In [None]:
# Sort from largest to smallest
df_multi.sort_values(by=['Restaurant A'], ascending=False)

Unnamed: 0_level_0,Location,Restaurant A,Restaurant B
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jakarta,3,94,46
Jakarta,1,67,42
Bandung,2,55,80
Bandung,3,50,48
Jakarta,2,39,35
Bandung,1,31,36


In [None]:
# Sort by more than one column:
# 'Restaurant A' first (ascending), then 'Restaurant B' (descending) for ties
df_multi.sort_values(by=['Restaurant A', 'Restaurant B'], ascending=[True, False])

Unnamed: 0_level_0,Location,Restaurant A,Restaurant B
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bandung,1,31,36
Jakarta,2,39,35
Bandung,3,50,48
Bandung,2,55,80
Jakarta,1,67,42
Jakarta,3,94,46


In [None]:
# Sort by the index (here the City labels)
df_multi.sort_index()

Unnamed: 0_level_0,Location,Restaurant A,Restaurant B
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bandung,1,31,36
Bandung,2,55,80
Bandung,3,50,48
Jakarta,1,67,42
Jakarta,2,39,35
Jakarta,3,94,46


## **Pandas Functionality**

Explore DataFrame
- .head()
- .tail()
- .info()
- .shape
- .columns
- .dtypes

Descriptive Statistics
- .describe()
- .min() .max() .mean() .median() .mode() .std()
- .unique() .nunique() .value_counts()

In [None]:
# Load a dataset from a .csv file

# If the file lives in a different folder than this notebook, pass its full path.
    # The r prefix makes the path a raw string, so the backslashes of a Windows
    # path are not interpreted as escape sequences.
# NOTE(review): this absolute path only works where D:\Salaries.csv exists;
    # the next cell loads the same file via a relative path.

df = pd.read_csv(r'D:\Salaries.csv')
df.head()

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011,,San Francisco,


In [None]:
# If the file is in the same folder as this notebook,
    # it can be loaded directly by file name as shown below.
df = pd.read_csv('Salaries.csv')
df.head()

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011,,San Francisco,


### **Basic Functionality**

In [None]:
# Show the first row of the dataset
# .head() & .tail() return 5 rows by default,
    # but the number of rows to show can be passed as an argument
df.head(1)

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,


In [None]:
# Check the last row of the dataset
df.tail(1)

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
148653,148654,Joe Lopez,"Counselor, Log Cabin Ranch",0.0,0.0,-618.13,0.0,-618.13,-618.13,2014,,San Francisco,


In [None]:
# Show a summary of every column (non-null count and dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148654 entries, 0 to 148653
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Id                148654 non-null  int64  
 1   EmployeeName      148654 non-null  object 
 2   JobTitle          148654 non-null  object 
 3   BasePay           148045 non-null  float64
 4   OvertimePay       148650 non-null  float64
 5   OtherPay          148650 non-null  float64
 6   Benefits          112491 non-null  float64
 7   TotalPay          148654 non-null  float64
 8   TotalPayBenefits  148654 non-null  float64
 9   Year              148654 non-null  int64  
 10  Notes             0 non-null       float64
 11  Agency            148654 non-null  object 
 12  Status            0 non-null       float64
dtypes: float64(8), int64(2), object(3)
memory usage: 14.7+ MB


In [None]:
# Count the missing values in each column
# Either .isna() or .isnull() can be used
df.isna().sum()

Id                       0
EmployeeName             0
JobTitle                 0
BasePay                609
OvertimePay              4
OtherPay                 4
Benefits             36163
TotalPay                 0
TotalPayBenefits         0
Year                     0
Notes               148654
Agency                   0
Status              148654
dtype: int64

In [None]:
# Alternative spelling: .isnull() gives the same counts as .isna()
df.isnull().sum()

Id                       0
EmployeeName             0
JobTitle                 0
BasePay                609
OvertimePay              4
OtherPay                 4
Benefits             36163
TotalPay                 0
TotalPayBenefits         0
Year                     0
Notes               148654
Agency                   0
Status              148654
dtype: int64

In [None]:
# Show the size of the DataFrame as (rows, columns)
df.shape

(148654, 13)

In [None]:
# Check the column names
df.columns

Index(['Id', 'EmployeeName', 'JobTitle', 'BasePay', 'OvertimePay', 'OtherPay',
       'Benefits', 'TotalPay', 'TotalPayBenefits', 'Year', 'Notes', 'Agency',
       'Status'],
      dtype='object')

In [None]:
# Check the data type of each column
df.dtypes

Id                    int64
EmployeeName         object
JobTitle             object
BasePay             float64
OvertimePay         float64
OtherPay            float64
Benefits            float64
TotalPay            float64
TotalPayBenefits    float64
Year                  int64
Notes               float64
Agency               object
Status              float64
dtype: object

### **Descriptive Statistics**

In [None]:
# Show a quick summary of descriptive statistics
# By default only the numerical columns are included
df.describe()

Unnamed: 0,Id,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Status
count,148654.0,148045.0,148650.0,148650.0,112491.0,148654.0,148654.0,148654.0,0.0,0.0
mean,74327.5,66325.448841,5066.059886,3648.767297,25007.893151,74768.321972,93692.554811,2012.522643,,
std,42912.857795,42764.635495,11454.380559,8056.601866,15402.215858,50517.005274,62793.533483,1.117538,,
min,1.0,-166.01,-0.01,-7058.59,-33.89,-618.13,-618.13,2011.0,,
25%,37164.25,33588.2,0.0,0.0,11535.395,36168.995,44065.65,2012.0,,
50%,74327.5,65007.45,0.0,811.27,28628.62,71426.61,92404.09,2013.0,,
75%,111490.75,94691.05,4658.175,4236.065,35566.855,105839.135,132876.45,2014.0,,
max,148654.0,319275.01,245131.88,400184.25,96570.66,567595.43,567595.43,2014.0,,


In [None]:
# To get descriptive statistics for the categorical columns,
    # add the parameter include = 'object'

df.describe(include='object')

# freq = number of times the top value occurs in the column

Unnamed: 0,EmployeeName,JobTitle,Agency
count,148654,148654,148654
unique,110811,2159,1
top,Kevin Lee,Transit Operator,San Francisco
freq,13,7036,148654


In [None]:
# To show every kind of column at once
df.describe(include='all')

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
count,148654.0,148654,148654,148045.0,148650.0,148650.0,112491.0,148654.0,148654.0,148654.0,0.0,148654,0.0
unique,,110811,2159,,,,,,,,,1,
top,,Kevin Lee,Transit Operator,,,,,,,,,San Francisco,
freq,,13,7036,,,,,,,,,148654,
mean,74327.5,,,66325.448841,5066.059886,3648.767297,25007.893151,74768.321972,93692.554811,2012.522643,,,
std,42912.857795,,,42764.635495,11454.380559,8056.601866,15402.215858,50517.005274,62793.533483,1.117538,,,
min,1.0,,,-166.01,-0.01,-7058.59,-33.89,-618.13,-618.13,2011.0,,,
25%,37164.25,,,33588.2,0.0,0.0,11535.395,36168.995,44065.65,2012.0,,,
50%,74327.5,,,65007.45,0.0,811.27,28628.62,71426.61,92404.09,2013.0,,,
75%,111490.75,,,94691.05,4658.175,4236.065,35566.855,105839.135,132876.45,2014.0,,,


#### **Kolom Numerikal**

In [None]:
# Mean (average) of a column
df['BasePay'].mean()

66325.44884050643

In [None]:
# Median (middle value) of a column
df['BasePay'].median()

65007.45

In [None]:
# Mode (the most frequent value in a column); the result is a Series
df['Year'].mode()

0    2014
dtype: int64

In [None]:
# Minimum value
df['BasePay'].min()

-166.01

In [None]:
# Maximum value
df['BasePay'].max()

319275.01

In [None]:
# Standard deviation
df['BasePay'].std()

42764.63549525958

In [None]:
# Count of non-missing values in the column
df['BasePay'].count()

148045

In [None]:
# Count how often each unique value occurs in a column
df['Year'].value_counts()

2014    38123
2013    37606
2012    36766
2011    36159
Name: Year, dtype: int64

In [None]:
# Frequency of each unique value in a column, expressed as a percentage:
# normalize=True yields fractions, which are scaled to percent and
# rounded to 2 decimal places.
df['Year'].value_counts(normalize=True).mul(100).round(2)

2014    25.65
2013    25.30
2012    24.73
2011    24.32
Name: Year, dtype: float64

In [None]:
# Another example
# Check the top 10 job titles (value_counts sorts by frequency, descending)
df['JobTitle'].value_counts().head(10)

Transit Operator                7036
Special Nurse                   4389
Registered Nurse                3736
Public Svc Aide-Public Works    2518
Police Officer 3                2421
Custodian                       2418
TRANSIT OPERATOR                2388
Firefighter                     2359
Recreation Leader               1971
Patient Care Assistant          1945
Name: JobTitle, dtype: int64

In [None]:
# Top 10 job titles as percentages of all rows,
# rounded to 2 decimal places.
df['JobTitle'].value_counts(normalize=True).mul(100).round(2).head(10)

Transit Operator                4.73
Special Nurse                   2.95
Registered Nurse                2.51
Public Svc Aide-Public Works    1.69
Police Officer 3                1.63
Custodian                       1.63
TRANSIT OPERATOR                1.61
Firefighter                     1.59
Recreation Leader               1.33
Patient Care Assistant          1.31
Name: JobTitle, dtype: float64

#### **Kolom Kategorikal**

In [None]:
# Most frequent value (returned as a Series)
df['JobTitle'].mode()

0    Transit Operator
dtype: object

In [None]:
# Most frequent value, extracted as a plain string:
# take the first entry of the Series returned by .mode().
df['JobTitle'].mode().iloc[0]

'Transit Operator'

In [None]:
# Show the unique values of a column
df['JobTitle'].unique()

array(['GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY',
       'CAPTAIN III (POLICE DEPARTMENT)',
       'WIRE ROPE CABLE MAINTENANCE MECHANIC', ..., 'Conversion',
       'Cashier 3', 'Not provided'], dtype=object)

In [None]:
# Show the number of unique values (categories) in a column
df['JobTitle'].nunique()

2159

### **Missing Value**

In [None]:
# Small numeric frame with missing values: A has one NaN, B has two, C has none.
df_num = pd.DataFrame(
    data=dict(
        A=[1, 2, np.nan, 10],
        B=[23, np.nan, np.nan, 4],
        C=[10, 20, 30, 40],
    )
)

df_num

Unnamed: 0,A,B,C
0,1.0,23.0,10
1,2.0,,20
2,,,30
3,10.0,4.0,40


In [None]:
# Count missing values per column (.sum() defaults to axis=0)
df_num.isna().sum()

A    1
B    2
C    0
dtype: int64

In [None]:
# Count the missing values in each row
df_num.isna().sum(axis=1)

0    0
1    1
2    2
3    0
dtype: int64

In [None]:
# Remove missing values with .dropna()
# By default it drops rows (axis=0) that contain any NaN
# The operation is not permanent (df_num itself is unchanged)
# To make it permanent, add inplace=True or assign the result to a variable
df_num.dropna()

Unnamed: 0,A,B,C
0,1.0,23.0,10
3,10.0,4.0,40


In [None]:
# Drop every column (axis=1) that contains NaN, permanently;
# done on a copy so that df_num keeps its missing values for later cells
df_drop = df_num.copy()
df_drop.dropna(axis=1, inplace=True)
df_drop

Unnamed: 0,C
0,10
1,20
2,30
3,40


In [None]:
# Show df_num again: the in-place dropna above was applied to the copy df_drop
df_num

Unnamed: 0,A,B,C
0,1.0,23.0,10
1,2.0,,20
2,,,30
3,10.0,4.0,40


In [None]:
# thresh is the minimum number of non-missing values required to be kept.
# By default it is applied along axis=0 (rows).
# E.g. thresh=3: only rows with at least 3 non-NaN data points
    # are kept; the others are dropped
df_num.dropna(thresh=3)

Unnamed: 0,A,B,C
0,1.0,23.0,10
3,10.0,4.0,40


In [None]:
# Drop the columns that have fewer than 3 non-NaN data points
df_num.dropna(axis=1, thresh=3)

Unnamed: 0,A,C
0,1.0,10
1,2.0,20
2,,30
3,10.0,40


In [None]:
# Fill missing values with a given value
# Not permanent by default (df_num itself is unchanged)
df_num.fillna(value='Unknown')

Unnamed: 0,A,B,C
0,1.0,23.0,10
1,2.0,Unknown,20
2,Unknown,Unknown,30
3,10.0,4.0,40


In [None]:
# To fill the missing values of a single column only
# (here with that column's mean); not permanent
df_num['A'].fillna(df_num['A'].mean())

0     1.000000
1     2.000000
2     4.333333
3    10.000000
Name: A, dtype: float64

In [None]:
# Permanently fill the missing values of column A with the mean of column A.
# Assign the filled column back instead of calling fillna(inplace=True) on
# df_num['A']: that chained in-place form acts on an intermediate Series and
# does not update df_num when pandas Copy-on-Write is active.
df_num['A'] = df_num['A'].fillna(df_num['A'].mean())
df_num

Unnamed: 0,A,B,C
0,1.0,23.0,10
1,2.0,,20
2,4.333333,,30
3,10.0,4.0,40


### **Data Aggregation**

In [None]:
# Number of rows per year, sorted from the least to the most frequent
df['Year'].value_counts(ascending=True)

2011    36159
2012    36766
2013    37606
2014    38123
Name: Year, dtype: int64

In [None]:
# Group the data by the Year column
# Without an aggregation function attached, the
    # return value is just a group-by object
df.groupby('Year')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000201F06187F0>

In [None]:
# Mean of every numerical column for each category of the Year column.
# numeric_only=True skips the text columns (EmployeeName, JobTitle, Agency);
# without it, recent pandas versions raise a TypeError when averaging them.
df.groupby('Year').mean(numeric_only=True)

Unnamed: 0_level_0,Id,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Notes,Status
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2011,18080.0,63595.956517,4531.065429,3617.081926,,71744.103871,71744.103871,,
2012,54542.5,65436.406857,5023.417824,3653.437583,26439.966967,74113.262265,100553.229232,,
2013,91728.5,69630.030216,5281.64198,3819.969007,23829.076572,77611.443142,101440.519714,,
2014,129593.0,66564.421924,5401.993737,3505.421251,24789.601756,75463.91814,100250.918884,,


In [None]:
# Average BasePay and OvertimePay for each year.
# The columns are selected before aggregating, so only these two are averaged
# and the text columns never reach .mean() (which would raise a TypeError in
# recent pandas versions).
df.groupby('Year')[['BasePay', 'OvertimePay']].mean()

Unnamed: 0_level_0,BasePay,OvertimePay
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2011,63595.956517,4531.065429
2012,65436.406857,5023.417824
2013,69630.030216,5281.64198
2014,66564.421924,5401.993737


In [None]:
# Grouping by more than one column (the result has a Year/JobTitle MultiIndex).
# The columns are selected before aggregating so that the remaining text
# columns are not passed to .mean(), which would raise a TypeError in recent
# pandas versions.
df.groupby(['Year', 'JobTitle'])[['BasePay', 'OvertimePay']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,BasePay,OvertimePay
Year,JobTitle,Unnamed: 2_level_1,Unnamed: 3_level_1
2011,ACCOUNT CLERK,43300.806506,373.200843
2011,ACCOUNTANT,46643.172000,0.000000
2011,ACCOUNTANT INTERN,28732.663958,24.430625
2011,ACUPUNCTURIST,66374.400000,0.000000
2011,ADMINISTRATIVE ANALYST,63435.628602,0.000000
...,...,...,...
2014,Wire Rope Cable Maint Sprv,93904.720000,79173.850000
2014,Worker's Comp Supervisor 1,54505.333333,0.000000
2014,Worker's Compensation Adjuster,69915.985385,0.000000
2014,X-Ray Laboratory Aide,45490.340286,4530.769714


In [None]:
# To see more rows/columns in displayed output, raise pandas' display limits
pd.set_option('display.max_rows', 5000) # max 5000 rows
pd.set_option('display.max_columns', 100) # max 100 columns

In [None]:
# Average TotalPay per year, as a one-column DataFrame.
# Selecting the column before .mean() keeps the text columns out of the
# aggregation (averaging them raises a TypeError in recent pandas versions).
df.groupby('Year')[['TotalPay']].mean()

Unnamed: 0_level_0,TotalPay
Year,Unnamed: 1_level_1
2011,71744.103871
2012,74113.262265
2013,77611.443142
2014,75463.91814


In [None]:
# Access the average TotalPay of 2012 by index label with .loc.
# The column is selected before .mean() so the text columns are not
# aggregated (that raises a TypeError in recent pandas versions).
df.groupby('Year')[['TotalPay']].mean().loc[2012]

TotalPay    74113.262265
Name: 2012, dtype: float64

In [None]:
# Access the average TotalPay of 2012 by position with .iloc
# (2012 is the second group, i.e. position 1).
# The column is selected before .mean() so the text columns are not
# aggregated (that raises a TypeError in recent pandas versions).
df.groupby('Year')[['TotalPay']].mean().iloc[1]

TotalPay    74113.262265
Name: 2012, dtype: float64

In [None]:
# Create a new column from an arithmetic operation on existing columns
# NOTE(review): the column is named 'TotalBenefits' but is computed as
# BasePay + OtherPay -- confirm the intended name/formula.
df['TotalBenefits'] = df['BasePay'] + df['OtherPay']
df.head(3)

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status,TotalBenefits
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,,567595.43
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,,293777.4
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,,229191.73


In [None]:
# Transpose the describe() result: one row per column, one column per statistic
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,148654.0,74327.5,42912.857795,1.0,37164.25,74327.5,111490.75,148654.0
BasePay,148045.0,66325.448841,42764.635495,-166.01,33588.2,65007.45,94691.05,319275.01
OvertimePay,148650.0,5066.059886,11454.380559,-0.01,0.0,0.0,4658.175,245131.88
OtherPay,148650.0,3648.767297,8056.601866,-7058.59,0.0,811.27,4236.065,400184.25
Benefits,112491.0,25007.893151,15402.215858,-33.89,11535.395,28628.62,35566.855,96570.66
TotalPay,148654.0,74768.321972,50517.005274,-618.13,36168.995,71426.61,105839.135,567595.43
TotalPayBenefits,148654.0,93692.554811,62793.533483,-618.13,44065.65,92404.09,132876.45,567595.43
Year,148654.0,2012.522643,1.117538,2011.0,2012.0,2013.0,2014.0,2014.0
Notes,0.0,,,,,,,
Status,0.0,,,,,,,


In [None]:
# Show the descriptive statistics of the Benefits column only
df.describe().T.loc['Benefits']

count    112491.000000
mean      25007.893151
std       15402.215858
min         -33.890000
25%       11535.395000
50%       28628.620000
75%       35566.855000
max       96570.660000
Name: Benefits, dtype: float64