# BNPB Data Manipulation

---

## Import packages

`%pip freeze > requirements.txt`

In [2]:
# Data manipulation
import pandas as pd

# JSON manipulation
import json

# Regular expression
import re

## Load JSON data

### Initial data

In [3]:
# Open the raw JSON file; the context manager closes the handle even if parsing fails
with open('../data/raw/new-bnpb-data-20220125075854.json') as initial:
    # Parse the JSON document into Python objects (here: a list of dictionaries)
    data_initial = json.load(initial)

In [4]:
# Preview only the first few records; echoing the whole list floods the notebook output
data_initial[:3]

[{'No': '1',
  'ID': '3512999202201251',
  'Area': 'Situbondo, East Java',
  'Disasters': 'Others',
  'Victims': {'Died': 0,
   'Missing': 0,
   'Injured': 0,
   'Suffered': 0,
   'Refugees': 0},
  'Property Damages': {'House': 0,
   'School': 0,
   'Health Facility': 0,
   'Places of Worship': 0,
   'Public Facility': 0,
   'Office Building': 0,
   'Bridge': 0,
   'Factory Building': 0,
   'Store': 0},
  'URL': 'https://dibi.bnpb.go.id/xdibi/read/55969//////2//1'},
 {'No': '2',
  'ID': '3325999202201231',
  'Area': 'Batang, Central Java',
  'Disasters': 'Others',
  'Victims': {'Died': 0,
   'Missing': 0,
   'Injured': 0,
   'Suffered': 0,
   'Refugees': 0},
  'Property Damages': {'House': 0,
   'School': 0,
   'Health Facility': 0,
   'Places of Worship': 0,
   'Public Facility': 0,
   'Office Building': 0,
   'Bridge': 0,
   'Factory Building': 0,
   'Store': 0},
  'URL': 'https://dibi.bnpb.go.id/xdibi/read/55935//////2//2'},
 {'No': '3',
  'ID': '3325105202201231',
  'Area': 'Batang

In [15]:
# JSON restructuring: flatten every record so that it becomes one flat row

# Keys whose values are nested dictionaries that get lifted to the top level
nested_keys = ('Victims', 'Property Damages')

# Create a list
data_list = []

for record in data_initial:
    # Flattened version of the current record
    dict_data = {}

    for key, value in record.items():
        if key in nested_keys:
            # Merge the nested entries directly into the flat record
            dict_data.update(value)
        else:
            # Plain field: copy it unchanged
            dict_data[key] = value

    # Append to list of dictionary
    data_list.append(dict_data)

In [16]:
# Preview only the first few flattened records instead of echoing the whole list
data_list[:3]

[{'No': '1',
  'ID': '3512999202201251',
  'Area': 'Situbondo, East Java',
  'Disasters': 'Others',
  'Died': 0,
  'Missing': 0,
  'Injured': 0,
  'Suffered': 0,
  'Refugees': 0,
  'House': 0,
  'School': 0,
  'Health Facility': 0,
  'Places of Worship': 0,
  'Public Facility': 0,
  'Office Building': 0,
  'Bridge': 0,
  'Factory Building': 0,
  'Store': 0,
  'URL': 'https://dibi.bnpb.go.id/xdibi/read/55969//////2//1'},
 {'No': '2',
  'ID': '3325999202201231',
  'Area': 'Batang, Central Java',
  'Disasters': 'Others',
  'Died': 0,
  'Missing': 0,
  'Injured': 0,
  'Suffered': 0,
  'Refugees': 0,
  'House': 0,
  'School': 0,
  'Health Facility': 0,
  'Places of Worship': 0,
  'Public Facility': 0,
  'Office Building': 0,
  'Bridge': 0,
  'Factory Building': 0,
  'Store': 0,
  'URL': 'https://dibi.bnpb.go.id/xdibi/read/55935//////2//2'},
 {'No': '3',
  'ID': '3325105202201231',
  'Area': 'Batang, Central Java',
  'Disasters': 'Tornado',
  'Died': 0,
  'Missing': 0,
  'Injured': 0,
  'Sch

In [18]:
# Build a data frame in which every flattened record becomes one row
df_initial = pd.DataFrame(data = data_list)

In [19]:
# Report the size of the frame, then preview its first rows
print('Dimension of data: {} rows and {} columns'.format(*df_initial.shape))
df_initial.head()

Dimension of data: 39489 rows and 19 columns


Unnamed: 0,No,ID,Area,Disasters,Died,Missing,Injured,Suffered,Refugees,House,School,Health Facility,Places of Worship,Public Facility,Office Building,Bridge,Factory Building,Store,URL
0,1,3512999202201251,"Situbondo, East Java",Others,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55969//////...
1,2,3325999202201231,"Batang, Central Java",Others,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55935//////...
2,3,3325105202201231,"Batang, Central Java",Tornado,0.0,0.0,0.0,,,,1.0,0.0,1.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55936//////...
3,4,3301105202201231,"Cilacap, Central Java",Tornado,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55965//////...
4,5,3301105202201232,"Cilacap, Central Java",Tornado,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55966//////...


In [24]:
# Count the missing values in each column
df_initial.isna().sum(axis = 0)

No                      0
ID                      0
Area                    0
Disasters               0
Died                  325
Missing                62
Injured               509
Suffered             6985
Refugees             3422
House                6550
School                390
Health Facility        96
Places of Worship     305
Public Facility         3
Office Building       113
Bridge                131
Factory Building        0
Store                 101
URL                     0
dtype: int64

In [25]:
# Handle missing values: replace every NaN with 0.
# The result is assigned back instead of using inplace = True, so the cell
# does not mutate a frame that earlier cells have already displayed.
df_initial = df_initial.fillna(value = 0)

### Detailed data

In [27]:
# Open the detailed JSON file; the context manager closes the handle even if parsing fails
with open('../data/raw/new-bnpb-data-detailed-20220125154512.json') as detailed:
    # Parse the JSON document into Python objects (here: a list of dictionaries)
    data_detailed = json.load(detailed)

In [28]:
# Preview only the first few records; echoing the whole list floods the notebook output
data_detailed[:3]

[{'No': '1',
  'ID': '3512999202201251',
  'Date': '2022-01-25',
  'Latitude': '-7.711043',
  'Longitude': '114.012157',
  'Area Code': '3512',
  'Province': 'East Java',
  'District': 'Situbondo',
  'Disasters': 'Others',
  'URL': 'https://dibi.bnpb.go.id/xdibi/read/55969//////2//1'},
 {'No': '2',
  'ID': '3325999202201231',
  'Date': '2022-01-23',
  'Latitude': '-6.94789783665541',
  'Longitude': '109.80251186766762',
  'Area Code': '3325',
  'Province': 'Central Java',
  'District': 'Batang',
  'Disasters': 'Others',
  'URL': 'https://dibi.bnpb.go.id/xdibi/read/55935//////2//2'},
 {'No': '3',
  'ID': '3325105202201231',
  'Date': '2022-01-23',
  'Latitude': '-6.960166533552673',
  'Longitude': '109.83359860768684',
  'Area Code': '3325',
  'Province': 'Central Java',
  'District': 'Batang',
  'Disasters': 'Tornado',
  'URL': 'https://dibi.bnpb.go.id/xdibi/read/55936//////2//3'},
 {'No': '4',
  'ID': '3301105202201231',
  'Date': '2022-01-23',
  'Latitude': '-7.26602',
  'Longitude':

In [29]:
# Build a data frame in which every detailed record becomes one row
df_detailed = pd.DataFrame(data = data_detailed)

In [30]:
# Report the size of the frame, then preview its first rows
print('Dimension of data: {} rows and {} columns'.format(*df_detailed.shape))
df_detailed.head()

Dimension of data: 39489 rows and 10 columns


Unnamed: 0,No,ID,Date,Latitude,Longitude,Area Code,Province,District,Disasters,URL
0,1,3512999202201251,2022-01-25,-7.711043,114.012157,3512,East Java,Situbondo,Others,https://dibi.bnpb.go.id/xdibi/read/55969//////...
1,2,3325999202201231,2022-01-23,-6.94789783665541,109.80251186766762,3325,Central Java,Batang,Others,https://dibi.bnpb.go.id/xdibi/read/55935//////...
2,3,3325105202201231,2022-01-23,-6.960166533552673,109.83359860768684,3325,Central Java,Batang,Tornado,https://dibi.bnpb.go.id/xdibi/read/55936//////...
3,4,3301105202201231,2022-01-23,-7.26602,108.751081,3301,Central Java,Cilacap,Tornado,https://dibi.bnpb.go.id/xdibi/read/55965//////...
4,5,3301105202201232,2022-01-23,-7.247742,108.775541,3301,Central Java,Cilacap,Tornado,https://dibi.bnpb.go.id/xdibi/read/55966//////...


In [31]:
# Count the missing values in each column
df_detailed.isna().sum(axis = 0)

No           0
ID           0
Date         1
Latitude     1
Longitude    1
Area Code    1
Province     1
District     1
Disasters    0
URL          0
dtype: int64

## Merge the data and perform preprocessing

In [131]:
# Columns taken from the detailed data; 'URL' is the join key
detail_columns = ['Latitude', 'Longitude', 'Area Code', 'Province', 'District', 'URL', 'Date']

# Left join: keep every row of the initial data and attach the matching details
df_final = df_initial.merge(
    right = df_detailed[detail_columns],
    how = 'left',
    on = 'URL'
)

In [133]:
# Reorder columns: identifiers and location first, then the victim counts,
# then the property-damage counts, and the source URL last
cols = (
    ['No', 'ID', 'Date', 'Area Code', 'Area', 'Province', 'District', 'Latitude', 'Longitude', 'Disasters']
    + ['Died', 'Missing', 'Injured', 'Suffered', 'Refugees']
    + ['House', 'School', 'Health Facility', 'Places of Worship', 'Public Facility',
       'Office Building', 'Bridge', 'Factory Building', 'Store']
    + ['URL']
)
df_final = df_final[cols]

In [134]:
# Preview the first five rows of the merged frame
df_final.head(n = 5)

Unnamed: 0,No,ID,Date,Area Code,Area,Province,District,Latitude,Longitude,Disasters,...,House,School,Health Facility,Places of Worship,Public Facility,Office Building,Bridge,Factory Building,Store,URL
0,1,3512999202201251,2022-01-25,3512,"Situbondo, East Java",East Java,Situbondo,-7.711043,114.012157,Others,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55969//////...
1,2,3325999202201231,2022-01-23,3325,"Batang, Central Java",Central Java,Batang,-6.94789783665541,109.80251186766762,Others,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55935//////...
2,3,3325105202201231,2022-01-23,3325,"Batang, Central Java",Central Java,Batang,-6.960166533552673,109.83359860768684,Tornado,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55936//////...
3,4,3301105202201231,2022-01-23,3301,"Cilacap, Central Java",Central Java,Cilacap,-7.26602,108.751081,Tornado,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55965//////...
4,5,3301105202201232,2022-01-23,3301,"Cilacap, Central Java",Central Java,Cilacap,-7.247742,108.775541,Tornado,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55966//////...


In [135]:
# Drop duplicated records: keep the first row for each 'ID' and renumber the index.
# The result is assigned back instead of using inplace = True, because df_final
# was produced by a column selection and in-place edits on such a frame can
# trigger pandas' SettingWithCopyWarning.
df_final = df_final.drop_duplicates(
    subset = ['ID'],
    keep = 'first',
    ignore_index = True
)

In [137]:
# Report the size of the de-duplicated frame, then preview its first rows
print('Dimension of data: {} rows and {} columns'.format(*df_final.shape))
df_final.head()

Dimension of data: 39480 rows and 25 columns


Unnamed: 0,No,ID,Date,Area Code,Area,Province,District,Latitude,Longitude,Disasters,...,House,School,Health Facility,Places of Worship,Public Facility,Office Building,Bridge,Factory Building,Store,URL
0,1,3512999202201251,2022-01-25,3512,"Situbondo, East Java",East Java,Situbondo,-7.711043,114.012157,Others,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55969//////...
1,2,3325999202201231,2022-01-23,3325,"Batang, Central Java",Central Java,Batang,-6.94789783665541,109.80251186766762,Others,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55935//////...
2,3,3325105202201231,2022-01-23,3325,"Batang, Central Java",Central Java,Batang,-6.960166533552673,109.83359860768684,Tornado,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55936//////...
3,4,3301105202201231,2022-01-23,3301,"Cilacap, Central Java",Central Java,Cilacap,-7.26602,108.751081,Tornado,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55965//////...
4,5,3301105202201232,2022-01-23,3301,"Cilacap, Central Java",Central Java,Cilacap,-7.247742,108.775541,Tornado,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,https://dibi.bnpb.go.id/xdibi/read/55966//////...


In [144]:
# Columns that are cast to integers
count_columns = [
    'Died', 'Missing', 'Injured', 'Suffered', 'Refugees',
    'House', 'School', 'Health Facility', 'Places of Worship',
    'Public Facility', 'Office Building', 'Bridge', 'Store'
]

# Target data type for each column that is converted
change_data_type = {
    'No': int,
    'Latitude': float,
    'Longitude': float,
    **{column: int for column in count_columns}
}

# Apply the conversions; columns not listed keep their current type
df_final = df_final.astype(change_data_type)

## Store the data in JSON format

In [146]:
# Write the merged frame to disk as a JSON array with one object per row
df_final.to_json(
    '../data/raw/new-bnpb-data-full-20220125154512.json',
    orient = 'records'
)