### Load data from raw file

In [1]:
# Read the raw earthquake JSON file whose name is prefixed with `current_date`
# (defined outside this cell). The "multiline" option allows a single JSON
# record to span several lines of the file.
raw_json_path = f'Files/{current_date}_earthquake_data.json'
df = spark.read.format("json").option("multiline", "true").load(raw_json_path)

StatementMeta(, f5d2bd18-e48e-4b77-9822-d2a0af8967e1, 3, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 25523258-c179-4801-9c26-d7ca5df3ba7e)

### Select only the required columns

In [2]:
from pyspark.sql.functions import col

# Flatten the nested `properties` and `geometry` columns into a set of
# top-level, renamed columns. The output column order is the list order below.
properties = col('properties')
coordinates = col('geometry')['coordinates']

selected_columns = [
    properties['status'].alias('status'),
    properties['place'].alias('place'),
    properties['time'].alias('time'),
    col('id'),
    properties['url'].alias('url'),
    # Element 0 of `coordinates` is exposed as longitude, element 1 as latitude.
    coordinates[0].alias('longitude'),
    coordinates[1].alias('latitude'),
    properties['sig'].alias('significance'),
    properties['net'].alias('region'),
    properties['sources'].alias('sources'),
    properties['types'].alias('types'),
]

filtered_df = df.select(*selected_columns)

StatementMeta(, f5d2bd18-e48e-4b77-9822-d2a0af8967e1, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, fad23f54-4980-47b4-ac28-90a7ebbab283)

### Quality checks

Check for duplicate rows: the number of distinct rows should equal the total number of rows.

In [4]:
# Count the rows left after removing exact duplicates, then report that number
# next to the total row count; equal numbers mean no duplicated rows.
unique_rows = filtered_df.dropDuplicates().count()
total_rows = filtered_df.count()
print(f"Number of unique rows: {unique_rows} (out of {total_rows})")

StatementMeta(, f5d2bd18-e48e-4b77-9822-d2a0af8967e1, 6, Finished, Available, Finished)

Number of unique rows: 277 (out of 277)


Check that every `id` is unique: the number of distinct `id` values (via `distinct()`) should equal the total number of rows.

In [6]:
# Count the distinct values of the `id` column and report that number next to
# the total row count; equal numbers mean every row has its own id.
id_values = filtered_df.select('id')
distinct_check = id_values.distinct().count()
row_total = filtered_df.count()
print(f"Number of distinct id's: {distinct_check} (out of {row_total})")

StatementMeta(, f5d2bd18-e48e-4b77-9822-d2a0af8967e1, 8, Finished, Available, Finished)

Number of distinct id's: 277 (out of 277)


### Write as parquet file

In [8]:
# Save the selected columns as Parquet under Files/Silver, in a location named
# after `current_date` (defined outside this cell). "overwrite" mode replaces
# any data already present at that path.
silver_path = f"Files/Silver/{current_date}_earthquake_data"
filtered_df.write.mode("overwrite").parquet(silver_path)

StatementMeta(, f5d2bd18-e48e-4b77-9822-d2a0af8967e1, 10, Finished, Available, Finished)