In [1]:
import json
from pathlib import Path

import pandas as pd
import requests as r

# Data Ingestion

The steps consist of:
- Fetch json data from API
- Save json data to local
- Transform from json to parquet
- Save parquet data to local
- Upload json and parquet data to data lake

## Fetch Data

| Column name | Description | Data type |
| --- | --- | --- |
| time | The time of the earthquake. | Datetime |
| latitude | The latitude of the earthquake. | Float |
| longitude | The longitude of the earthquake. | Float |
| depth | The depth of the earthquake. | Float |
| mag | The magnitude of the earthquake. | Float |
| magType | The type of magnitude measurement used. | String |
| nst | The number of seismic stations used to determine the earthquake location. | Integer |
| gap | The largest azimuthal gap between azimuthally adjacent stations, in degrees. | Float |
| dmin | The distance to the nearest station. | Float |
| rms | The root-mean-square travel time residual. | Float |
| net | The network detected. | String |
| id | The unique identifier of the event. | String |
| updated | The time the earthquake was last updated. | Datetime |
| place | The location of the earthquake. | String |
| type | The type of seismic event (e.g. earthquake). | String |
| horizontalError | The horizontal error of the earthquake. | Float |
| depthError | The depth error of the earthquake. | Float |
| magError | The magnitude error of the earthquake. | Float |
| magNst | The number of seismic stations used to calculate the magnitude. | Integer |
| status | The status of the earthquake. | String |
| locationSource | The source of the location of the earthquake. | String |
| magSource | The network that originally authored the reported magnitude. | String |

In [2]:
# Query parameters for the USGS FDSN event web service.
# `data_format` is used instead of `format` so the built-in format() is not shadowed.
data_format = "geojson"
starttime = "2023-01-01"
endtime = "2023-01-02"

# Let requests build and URL-encode the query string, bound the wait with a
# timeout so the cell cannot hang indefinitely, and fail loudly on a non-2xx
# status instead of trying to parse an error page in the next cell.
response = r.get(
    "https://earthquake.usgs.gov/fdsnws/event/1/query",
    params={"format": data_format, "starttime": starttime, "endtime": endtime},
    timeout=30,
)
response.raise_for_status()

In [3]:
# Decode the response body as JSON into plain Python objects.
json_data = response.json()

I want to know which date range is actually returned for the parameters `starttime=2023-01-01` and `endtime=2023-01-02`.

In [4]:
import datetime

In [5]:
# Convert every event's origin time (epoch milliseconds) to a UTC datetime.
# fromtimestamp() without tz= uses the machine's local timezone, which would
# make the observed range depend on where the notebook runs. The tzinfo is
# dropped afterwards so the values print as plain (naive) UTC timestamps.
time_ls = [
    datetime.datetime.fromtimestamp(
        feature["properties"]["time"] / 1000.0, tz=datetime.timezone.utc
    ).replace(tzinfo=None)
    for feature in json_data["features"]
]

# max()/min() raise ValueError on an empty list, so guard the no-events case.
if time_ls:
    print(max(time_ls), min(time_ls))
else:
    print("No events returned for the requested range")

2023-01-01 23:59:29.195000 2023-01-01 00:06:14.840000


The retrieved data covers only the `starttime` date (2023-01-01); events from the `endtime` date (2023-01-02) are not included.

## Save json data to local

In [6]:
# Write the unmodified API response to the raw zone, one file per start date.
json_path = Path(f"data/raw/json/data_{starttime}.json")
# Opening a file for writing fails with FileNotFoundError when the folder is
# missing (e.g. on a fresh checkout), so create it first.
json_path.parent.mkdir(parents=True, exist_ok=True)
# An explicit encoding keeps the written file identical across platforms.
with json_path.open("w", encoding="utf-8") as f:
    json.dump(json_data, f)

## Convert from json to parquet

In [7]:
# Top-level keys of the response.
json_data.keys()

dict_keys(['type', 'metadata', 'features', 'bbox'])

In [8]:
# Object type declared at the top level of the response.
json_data['type']

'FeatureCollection'

In [9]:
# Fields available in the response metadata.
json_data['metadata'].keys()

dict_keys(['generated', 'url', 'title', 'status', 'api', 'count'])

In [10]:
# metadata of the retrieved result (generation time, request URL, record count, ...)
json_data['metadata']

{'generated': 1700308166000,
 'url': 'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2023-01-01&endtime=2023-01-02',
 'title': 'USGS Earthquakes',
 'status': 200,
 'api': '1.14.0',
 'count': 349}

In [11]:
# bbox layout: [min longitude, min latitude, min depth, max longitude, max latitude, max depth]
json_data['bbox']

[-178.81666666667, -60.8195, -3.05, 179.599, 67.2198, 595.724]

In [12]:
# keys of the first record (each element of `features` is one record)
json_data['features'][0].keys()

dict_keys(['type', 'properties', 'geometry', 'id'])

In [13]:
# Object type of a single record.
json_data['features'][0]['type']

'Feature'

In [14]:
# 26 property fields per record (not to be confused with the `features` list itself)
json_data['features'][0]['properties'].keys()

dict_keys(['mag', 'place', 'time', 'updated', 'tz', 'url', 'detail', 'felt', 'cdi', 'mmi', 'alert', 'status', 'tsunami', 'sig', 'net', 'code', 'ids', 'sources', 'types', 'nst', 'dmin', 'rms', 'gap', 'magType', 'type', 'title'])

In [15]:
# Property values of the first record.
json_data['features'][0]['properties']

{'mag': 4.2,
 'place': '5 km NNE of Jayapura, Indonesia',
 'time': 1672617569195,
 'updated': 1678575106040,
 'tz': None,
 'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000j3yb',
 'detail': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=us7000j3yb&format=geojson',
 'felt': None,
 'cdi': None,
 'mmi': None,
 'alert': None,
 'status': 'reviewed',
 'tsunami': 0,
 'sig': 271,
 'net': 'us',
 'code': '7000j3yb',
 'ids': ',us7000j3yb,',
 'sources': ',us,',
 'types': ',origin,phase-data,',
 'nst': 19,
 'dmin': 16.495,
 'rms': 0.45,
 'gap': 99,
 'magType': 'mb',
 'type': 'earthquake',
 'title': 'M 4.2 - 5 km NNE of Jayapura, Indonesia'}

In [16]:
# coordinates: [longitude, latitude, depth] -- GeoJSON puts longitude first
json_data['features'][0]['geometry']

{'type': 'Point', 'coordinates': [140.7385, -2.493, 10]}

In [17]:
# Identifier of the first record.
json_data['features'][0]['id']

'us7000j3yb'

### Flatten and preprocessing

Challenges:
- Flatten nested json data
- Fill null values
- Convert data type

In [35]:
# One row per element of `features`. Nested keys are joined with "_"
# (e.g. properties_mag), and the top-level `metadata` dict is repeated on
# every row as a single `metadata` column.
df_json = pd.json_normalize(
    data=json_data,
    record_path="features",
    meta=["metadata"],
    sep="_",
)

df_json.head()

Unnamed: 0,type,id,properties_mag,properties_place,properties_time,properties_updated,properties_tz,properties_url,properties_detail,properties_felt,...,properties_nst,properties_dmin,properties_rms,properties_gap,properties_magType,properties_type,properties_title,geometry_type,geometry_coordinates,metadata
0,Feature,us7000j3yb,4.2,"5 km NNE of Jayapura, Indonesia",1672617569195,1678575106040,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,19.0,16.495,0.45,99.0,mb,earthquake,"M 4.2 - 5 km NNE of Jayapura, Indonesia",Point,"[140.7385, -2.493, 10]","{'generated': 1700308166000, 'url': 'https://e..."
1,Feature,pr2023001004,3.62,"103 km N of Suárez, Puerto Rico",1672617413930,1672619526212,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,14.0,1.1104,0.44,270.0,md,earthquake,"M 3.6 - 103 km N of Suárez, Puerto Rico",Point,"[-65.7256, 19.3601, 34]","{'generated': 1700308166000, 'url': 'https://e..."
2,Feature,av91082783,-0.76,"85 km NNW of Karluk, Alaska",1672617277550,1672860382700,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,6.0,,0.06,217.0,ml,earthquake,"M -0.8 - 85 km NNW of Karluk, Alaska",Point,"[-155.180333333333, 58.2275, 3.08]","{'generated': 1700308166000, 'url': 'https://e..."
3,Feature,nc73827681,0.57,"10km NW of The Geysers, CA",1672617223560,1673410751822,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,21.0,0.007542,0.02,96.0,md,earthquake,"M 0.6 - 10km NW of The Geysers, CA",Point,"[-122.8415, 38.8445, 2.06]","{'generated': 1700308166000, 'url': 'https://e..."
4,Feature,pr71390293,2.28,,1672617148580,1672618140720,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,5.0,0.09231,0.08,242.0,md,earthquake,M 2.3 -,Point,"[-66.8565, 17.8848333333333, 11.19]","{'generated': 1700308166000, 'url': 'https://e..."


In [36]:
# Column dtypes and non-null counts right after flattening.
df_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  349 non-null    object 
 1   id                    349 non-null    object 
 2   properties_mag        348 non-null    float64
 3   properties_place      327 non-null    object 
 4   properties_time       349 non-null    int64  
 5   properties_updated    349 non-null    int64  
 6   properties_tz         0 non-null      object 
 7   properties_url        349 non-null    object 
 8   properties_detail     349 non-null    object 
 9   properties_felt       31 non-null     float64
 10  properties_cdi        31 non-null     float64
 11  properties_mmi        7 non-null      float64
 12  properties_alert      5 non-null      object 
 13  properties_status     349 non-null    object 
 14  properties_tsunami    349 non-null    int64  
 15  properties_sig        3

Flatten `geometry_coordinates` feature

In [37]:
# Split geometry_coordinates into one column per component.
# GeoJSON positions are ordered [longitude, latitude, depth], so longitude
# must be listed first; naming latitude first would swap the two columns.
# The first record is [140.7385, -2.493, 10] (Jayapura, Indonesia), and
# 140.7385 cannot be a latitude.
df_json[
    [
        "geometry_coordinates_longitude",
        "geometry_coordinates_latitude",
        "geometry_coordinates_depth",
    ]
] = df_json["geometry_coordinates"].tolist()

In [39]:
# The list column has been split into separate columns, so remove it.
df_json.drop(columns=["geometry_coordinates"], inplace=True)

Flatten `metadata` feature

In [40]:
# Look at the metadata dict stored on the first row.
df_json["metadata"].iloc[0]

{'generated': 1700308166000,
 'url': 'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2023-01-01&endtime=2023-01-02',
 'title': 'USGS Earthquakes',
 'status': 200,
 'api': '1.14.0',
 'count': 349}

In [41]:
# Expand each row's metadata dict into its own metadata_* columns and
# append them to the right of the existing columns.
metadata_flat = pd.json_normalize(df_json["metadata"]).add_prefix("metadata_")
df_json = pd.concat([df_json, metadata_flat], axis=1)

In [43]:
# The dict column has been expanded into metadata_* columns, so remove it.
df_json.drop(columns=["metadata"], inplace=True)

Fill null values

In [58]:
# Inspect properties_cdi, one of the sparsely populated columns.
df_json["properties_cdi"]

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
344   NaN
345   NaN
346   NaN
347   NaN
348   NaN
Name: properties_cdi, Length: 349, dtype: float64

In [51]:
# info is a method: without the call parentheses the cell only shows the
# bound-method repr instead of the dtype / non-null summary.
df_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   type                            349 non-null    object 
 1   id                              349 non-null    object 
 2   properties_mag                  348 non-null    float64
 3   properties_place                327 non-null    object 
 4   properties_time                 349 non-null    int64  
 5   properties_updated              349 non-null    int64  
 6   properties_tz                   0 non-null      object 
 7   properties_url                  349 non-null    object 
 8   properties_detail               349 non-null    object 
 9   properties_felt                 31 non-null     float64
 10  properties_cdi                  31 non-null     float64
 11  properties_mmi                  7 non-null      float64
 12  properties_alert                5 no

In [71]:
# Interpret metadata_generated as milliseconds and store it as a datetime in
# a new column; the original integer column is kept.
df_json["metadata_generated_datetime"] = pd.to_datetime(df_json["metadata_generated"], unit="ms")

In [72]:
# Interpret properties_time as milliseconds and store it as a datetime in a
# new column; the original integer column is kept.
df_json["properties_time_datetime"] = pd.to_datetime(df_json["properties_time"], unit="ms")

In [73]:
# Interpret properties_updated as milliseconds and store it as a datetime in
# a new column; the original integer column is kept.
df_json["properties_updated_datetime"] = pd.to_datetime(df_json["properties_updated"], unit="ms")

In [81]:
# Cast the two count columns to pandas' nullable Int64 so missing values
# stay <NA> instead of forcing the whole column to float.
nullable_int_cols = ["properties_felt", "properties_nst"]
df_json[nullable_int_cols] = df_json[nullable_int_cols].astype("Int64")

In [82]:
# Re-check dtypes after the datetime and Int64 conversions.
df_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 41 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   type                            349 non-null    object        
 1   id                              349 non-null    object        
 2   properties_mag                  348 non-null    float64       
 3   properties_place                327 non-null    object        
 4   properties_time                 349 non-null    int64         
 5   properties_updated              349 non-null    int64         
 6   properties_tz                   0 non-null      object        
 7   properties_url                  349 non-null    object        
 8   properties_detail               349 non-null    object        
 9   properties_felt                 31 non-null     Int64         
 10  properties_cdi                  31 non-null     float64       
 11  proper

In [83]:
# Persist the flattened frame as gzip-compressed parquet, one file per start date.
parquet_path = Path(f"data/raw/parquet/data_{starttime}.parquet")
# Writing fails when the target folder is missing (e.g. on a fresh checkout),
# so create it first.
parquet_path.parent.mkdir(parents=True, exist_ok=True)
df_json.to_parquet(parquet_path, compression="gzip")

## Save parquet data to local

In [129]:
# Read back the file written for `starttime` rather than a hard-coded date,
# so this round-trip check keeps working when the query window changes.
df_pq = pd.read_parquet(f"data/raw/parquet/data_{starttime}.parquet")

df_pq.head()

Unnamed: 0,type,id,properties_mag,properties_place,properties_time,properties_updated,properties_tz,properties_url,properties_detail,properties_felt,...,properties_nst,properties_dmin,properties_rms,properties_gap,properties_magType,properties_type,properties_title,geometry_type,geometry_coordinates,metadata
0,Feature,us7000j3yb,4.2,"5 km NNE of Jayapura, Indonesia",1672617569195,1678575106040,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,19.0,16.495,0.45,99.0,mb,earthquake,"M 4.2 - 5 km NNE of Jayapura, Indonesia",Point,"[140.7385, -2.493, 10.0]","{'api': '1.14.0', 'count': 349, 'generated': 1..."
1,Feature,pr2023001004,3.62,"103 km N of Suárez, Puerto Rico",1672617413930,1672619526212,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,14.0,1.1104,0.44,270.0,md,earthquake,"M 3.6 - 103 km N of Suárez, Puerto Rico",Point,"[-65.7256, 19.3601, 34.0]","{'api': '1.14.0', 'count': 349, 'generated': 1..."
2,Feature,av91082783,-0.76,"85 km NNW of Karluk, Alaska",1672617277550,1672860382700,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,6.0,,0.06,217.0,ml,earthquake,"M -0.8 - 85 km NNW of Karluk, Alaska",Point,"[-155.180333333333, 58.2275, 3.08]","{'api': '1.14.0', 'count': 349, 'generated': 1..."
3,Feature,nc73827681,0.57,"10km NW of The Geysers, CA",1672617223560,1673410751822,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,21.0,0.007542,0.02,96.0,md,earthquake,"M 0.6 - 10km NW of The Geysers, CA",Point,"[-122.8415, 38.8445, 2.06]","{'api': '1.14.0', 'count': 349, 'generated': 1..."
4,Feature,pr71390293,2.28,,1672617148580,1672618140720,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,5.0,0.09231,0.08,242.0,md,earthquake,M 2.3 -,Point,"[-66.8565, 17.8848333333333, 11.19]","{'api': '1.14.0', 'count': 349, 'generated': 1..."


## Upload json and parquet data to data lake

# Transform with Spark 

Based on the exploration above, the attributes that can be used to build the dataset are:
- properties
- geometry
- id
- metadata generated