# Preprocess Notebook

## A. Import Libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping

## B. Data Loading

In [None]:
# NOTE(review): this whole cell is commented out. It appears to be the recipe
# that produced the synthetic 'raw-data.csv' read further below, presumably
# kept for reference only. No random seed is set in this cell, so re-running
# it would draw different values than the saved file -- TODO confirm.
# # determine n_samples
# n_samples = 5000

# # raw data
# raw_data = {
#     'provinsi_id': np.random.randint(1, 35, n_samples),  # 34 provinces
#     'kabupaten_id': np.random.randint(1, 515, n_samples),  # ~514 regencies/cities
#     'kepadatan_penduduk': np.random.lognormal(5, 1.5, n_samples),
#     'tingkat_pendidikan': np.random.normal(8.5, 2.5, n_samples),  # mean years of schooling
#     'tingkat_pengangguran': np.random.gamma(2, 3, n_samples),  # %
#     'akses_air_bersih': np.random.beta(7, 2, n_samples) * 100,  # %
#     'akses_listrik': np.random.beta(8, 1.5, n_samples) * 100,  # %
#     'fasilitas_kesehatan': np.random.poisson(15, n_samples),  # per 10k residents
#     'jalan_aspal': np.random.beta(5, 3, n_samples) * 100,  # %
#     'luas_sawah': np.random.exponential(5000, n_samples),  # hectares
#     'pendapatan_perkapita': np.random.lognormal(14.5, 0.8, n_samples),  # rupiah/month
#     'jenis_wilayah': np.random.choice(['urban', 'rural'], n_samples, p=[0.6, 0.4])
# }


# data = pd.DataFrame(raw_data)

In [None]:
# NOTE(review): commented out, so nothing is written when this cell runs.
# 'raw-data.csv' is the same file name the load cell reads.
# # save data to csv
# data.to_csv('raw-data.csv', index=False)

In [10]:
# read the raw dataset from disk, then show it as the cell output
df = pd.read_csv(filepath_or_buffer='raw-data.csv')
df

Unnamed: 0,provinsi_id,kabupaten_id,kepadatan_penduduk,tingkat_pendidikan,tingkat_pengangguran,akses_air_bersih,akses_listrik,fasilitas_kesehatan,jalan_aspal,luas_sawah,pendapatan_perkapita,jenis_wilayah
0,31,305,19.573242,13.014117,3.717460,74.059660,96.231669,12,86.002510,2101.762960,2.705982e+06,urban
1,1,428,6.142656,7.509753,3.052641,73.016464,88.131296,15,74.238698,5859.394052,2.016188e+06,rural
2,2,331,217.066382,7.205496,5.269829,82.949877,90.192485,18,85.095123,8691.697751,6.756116e+05,rural
3,17,31,47.495194,9.362478,5.581640,89.559780,90.009789,17,79.565273,1405.103426,4.099418e+06,urban
4,9,288,57.104310,2.637093,10.150517,92.599718,66.534906,18,86.256018,7987.044780,9.161901e+05,urban
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,9,87,72.498442,3.994595,13.229348,87.149249,83.033783,22,53.322981,1210.913433,1.396884e+06,urban
4996,9,447,5770.988638,9.531927,6.121407,91.675396,90.875724,10,69.195536,4313.769797,3.128132e+06,rural
4997,2,196,106.592341,14.157095,10.343863,64.412794,84.003185,19,73.756957,1116.632631,1.301272e+07,urban
4998,4,437,35.635563,7.107328,5.505117,96.316164,91.537295,12,77.031394,258.129089,5.135350e+06,urban


In [11]:
# check data information: df.info() prints the index range, each column's
# non-null count and dtype, and the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   provinsi_id           5000 non-null   int64  
 1   kabupaten_id          5000 non-null   int64  
 2   kepadatan_penduduk    5000 non-null   float64
 3   tingkat_pendidikan    5000 non-null   float64
 4   tingkat_pengangguran  5000 non-null   float64
 5   akses_air_bersih      5000 non-null   float64
 6   akses_listrik         5000 non-null   float64
 7   fasilitas_kesehatan   5000 non-null   int64  
 8   jalan_aspal           5000 non-null   float64
 9   luas_sawah            5000 non-null   float64
 10  pendapatan_perkapita  5000 non-null   float64
 11  jenis_wilayah         5000 non-null   object 
dtypes: float64(8), int64(3), object(1)
memory usage: 468.9+ KB


Action:

- Round the floating-point columns to two decimal places.

## C. Data Cleaning

In [18]:
# Round the eight float-valued columns to two decimal places.
# dict.fromkeys maps every listed column to the same precision (2).
df = df.round(dict.fromkeys(
    [
        'kepadatan_penduduk',
        'tingkat_pendidikan',
        'tingkat_pengangguran',
        'akses_air_bersih',
        'akses_listrik',
        'jalan_aspal',
        'luas_sawah',
        'pendapatan_perkapita',
    ],
    2,
))

df

Unnamed: 0,provinsi_id,kabupaten_id,kepadatan_penduduk,tingkat_pendidikan,tingkat_pengangguran,akses_air_bersih,akses_listrik,fasilitas_kesehatan,jalan_aspal,luas_sawah,pendapatan_perkapita,jenis_wilayah
0,31,305,19.57,13.01,3.72,74.06,96.23,12,86.00,2101.76,2705981.87,urban
1,1,428,6.14,7.51,3.05,73.02,88.13,15,74.24,5859.39,2016187.82,rural
2,2,331,217.07,7.21,5.27,82.95,90.19,18,85.10,8691.70,675611.60,rural
3,17,31,47.50,9.36,5.58,89.56,90.01,17,79.57,1405.10,4099417.78,urban
4,9,288,57.10,2.64,10.15,92.60,66.53,18,86.26,7987.04,916190.12,urban
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,9,87,72.50,3.99,13.23,87.15,83.03,22,53.32,1210.91,1396883.87,urban
4996,9,447,5770.99,9.53,6.12,91.68,90.88,10,69.20,4313.77,3128132.00,rural
4997,2,196,106.59,14.16,10.34,64.41,84.00,19,73.76,1116.63,13012722.68,urban
4998,4,437,35.64,7.11,5.51,96.32,91.54,12,77.03,258.13,5135349.80,urban


In [19]:
# list the column labels of the current frame
df.columns

Index(['provinsi_id', 'kabupaten_id', 'kepadatan_penduduk',
       'tingkat_pendidikan', 'tingkat_pengangguran', 'akses_air_bersih',
       'akses_listrik', 'fasilitas_kesehatan', 'jalan_aspal', 'luas_sawah',
       'pendapatan_perkapita', 'jenis_wilayah'],
      dtype='object')

In [31]:
# labeling data
def label_poverty(frame, income_threshold=1_800_000, water_threshold=80,
                  electricity_threshold=85, education_threshold=8,
                  min_indicators=2):
    """Return a 0/1 poverty label for every row of ``frame``.

    A row is labeled 1 (poverty) only when BOTH conditions hold:

    1. ``pendapatan_perkapita`` (income per capita) is strictly below
       ``income_threshold`` (default Rp1.800.000), and
    2. at least ``min_indicators`` (default 2) of these hold:
       - ``akses_air_bersih`` (access to clean water, %) < ``water_threshold``
       - ``akses_listrik`` (access to electricity, %) < ``electricity_threshold``
       - ``tingkat_pendidikan`` (years of education) < ``education_threshold``

    Every other row is labeled 0. Comparisons against missing values are
    False, so a NaN never counts towards the poverty label.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the four columns named above.
    income_threshold, water_threshold, electricity_threshold,
    education_threshold : numbers
        Strict upper bounds ("below") for the respective columns.
    min_indicators : int
        Minimum number of fulfilled indicators (out of 3).

    Returns
    -------
    pandas.Series
        Integer Series of 0/1 labels sharing ``frame``'s index.
    """
    low_income = frame['pendapatan_perkapita'] < income_threshold

    # count how many of the three deprivation indicators each row fulfils
    deprivation_count = (
        (frame['akses_air_bersih'] < water_threshold).astype(int)
        + (frame['akses_listrik'] < electricity_threshold).astype(int)
        + (frame['tingkat_pendidikan'] < education_threshold).astype(int)
    )

    # vectorized replacement for the row-by-row iterrows() loop
    return (low_income & (deprivation_count >= min_indicators)).astype(int)


# plain list of Python ints, in row order (same name and contents as before)
poverty_list = label_poverty(df).tolist()

# add new column to the data
df['kemiskinan'] = poverty_list

In [32]:
# check data: display the frame again, which now includes the 'kemiskinan'
# label column
df

Unnamed: 0,provinsi_id,kabupaten_id,kepadatan_penduduk,tingkat_pendidikan,tingkat_pengangguran,akses_air_bersih,akses_listrik,fasilitas_kesehatan,jalan_aspal,luas_sawah,pendapatan_perkapita,jenis_wilayah,kemiskinan
0,31,305,19.57,13.01,3.72,74.06,96.23,12,86.00,2101.76,2705981.87,urban,0
1,1,428,6.14,7.51,3.05,73.02,88.13,15,74.24,5859.39,2016187.82,rural,0
2,2,331,217.07,7.21,5.27,82.95,90.19,18,85.10,8691.70,675611.60,rural,0
3,17,31,47.50,9.36,5.58,89.56,90.01,17,79.57,1405.10,4099417.78,urban,0
4,9,288,57.10,2.64,10.15,92.60,66.53,18,86.26,7987.04,916190.12,urban,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,9,87,72.50,3.99,13.23,87.15,83.03,22,53.32,1210.91,1396883.87,urban,1
4996,9,447,5770.99,9.53,6.12,91.68,90.88,10,69.20,4313.77,3128132.00,rural,0
4997,2,196,106.59,14.16,10.34,64.41,84.00,19,73.76,1116.63,13012722.68,urban,0
4998,4,437,35.64,7.11,5.51,96.32,91.54,12,77.03,258.13,5135349.80,urban,0


In [34]:
# persist the cleaned frame to CSV; index=False leaves the row index out of the file
df.to_csv(path_or_buf='clean-data.csv', index=False)