In [2]:
import os
import math

import altair as alt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import ShortType

In [1]:
# Location of the USGS earthquake CSV that is loaded further down.
# The EARTHQUAKES_CSV environment variable, when set, overrides the
# machine-specific default so the notebook can run on other machines
# without editing this cell.
df_path = os.environ.get(
    "EARTHQUAKES_CSV",
    r"F:\Datasets\CSV datasets\earthquakes_usgs.csv",
)

In [3]:
# Environment variables describing the local Spark / Python setup.
# They are written to the process environment here, ahead of the cell
# below that creates the SparkSession.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.5-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [6]:
# Build (or reuse) the SparkSession used by the rest of the notebook.
# NOTE(review): getOrCreate() hands back an already-running session if one
# exists; presumably JVM-level settings such as spark.driver.memory only take
# effect when the driver JVM is first launched, so restart the kernel after
# changing them — confirm against the Spark configuration docs.
spark = (
    SparkSession.builder
    # Application name shown in the Spark UI and logs; it now matches the
    # dataset this notebook loads (USGS earthquakes) rather than a leftover
    # name from another project.
    .appName('USGS earthquakes - Optimized Local')
    .master('local[*]')
    # Driver-side memory limits.
    .config("spark.driver.memory", "60g")
    .config("spark.driver.maxResultSize", "4g")
    # Adaptive query execution and shuffle partitioning.
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '128mb')
    .config('spark.sql.shuffle.partitions', '100')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.sql.autoBroadcastJoinThreshold', '256mb')
    .getOrCreate()
)

# Echo the configured driver memory as a quick sanity check. This reads the
# session's configuration value, not the JVM's measured heap size.
print(f"SparkSession configured with Driver Memory: {spark.conf.get('spark.driver.memory')}")

SparkSession configured with Driver Memory: 60g


In [7]:
# Load the earthquake CSV into a Spark DataFrame: the first row supplies the
# column names and column types are inferred from the data.
df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(df_path)
)

In [11]:
# Print the inferred schema (column name, type and nullability) of the
# loaded DataFrame so the column descriptions below can be checked against it.
df.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- depth: double (nullable = true)
 |-- mag: double (nullable = true)
 |-- magType: string (nullable = true)
 |-- nst: double (nullable = true)
 |-- gap: double (nullable = true)
 |-- dmin: double (nullable = true)
 |-- rms: double (nullable = true)
 |-- net: string (nullable = true)
 |-- id: string (nullable = true)
 |-- updated: timestamp (nullable = true)
 |-- place: string (nullable = true)
 |-- type: string (nullable = true)
 |-- horizontalError: double (nullable = true)
 |-- depthError: double (nullable = true)
 |-- magError: double (nullable = true)
 |-- magNst: double (nullable = true)
 |-- status: string (nullable = true)
 |-- locationSource: string (nullable = true)
 |-- magSource: string (nullable = true)



## Explanation of Earthquake Data Schema Columns

This section describes the meaning of each column in the earthquake dataset schema printed above.

---

-   **`time` (timestamp)**: The precise date and time when the earthquake occurred (the origin time) at the source. Usually in UTC.

-   **`latitude` (double)**: The geographic latitude of the earthquake's epicenter (point on surface above origin). Decimal degrees (N+, S-).

-   **`longitude` (double)**: The geographic longitude of the earthquake's epicenter. Decimal degrees (E+, W-).

-   **`depth` (double)**: The depth of the earthquake's hypocenter (focus) below the Earth's surface. Usually in kilometers (km).

-   **`mag` (double)**: The magnitude (size/strength) of the earthquake. Scale depends on `magType`.

-   **`magType` (string)**: Method/scale used for magnitude (e.g., Mw, Ml, Mb, Ms, Md).

-   **`nst` (double)**: Number of Seismic Stations used to determine location. Higher numbers often mean more reliability.

-   **`gap` (double)**: Azimuthal Gap (degrees). Largest angle between adjacent stations. Smaller gaps (< 180°) mean better coverage.

-   **`dmin` (double)**: Minimum Distance (in degrees; 1° ≈ 111.2 km) from the epicenter to the nearest station. Smaller values generally make the calculated depth more reliable.

-   **`rms` (double)**: Root Mean Square (seconds) of travel-time residuals. Lower values indicate a better fit of the location/time model to data.

-   **`net` (string)**: Network Identifier of the primary source/contributor (e.g., 'us', 'ci', 'ak').

-   **`id` (string)**: Unique Event ID assigned by the source network (`net`).

-   **`updated` (timestamp)**: Timestamp when this event information was last modified in the source database.

-   **`place` (string)**: Human-readable description of the approximate location (e.g., "9km ENE of Pāhala, Hawaii").

-   **`type` (string)**: Type of seismic event (e.g., 'earthquake', 'quarry blast', 'explosion').

-   **`horizontalError` (double)**: Uncertainty/error estimate for the epicenter location (latitude/longitude), usually in km.

-   **`depthError` (double)**: Uncertainty/error estimate for the hypocenter depth, usually in km.

-   **`magError` (double)**: Uncertainty/error estimate for the magnitude (`mag`).

-   **`magNst` (double)**: Number of Stations used specifically to calculate the magnitude (`mag`). Higher numbers often mean more reliability.

-   **`status` (string)**: Review status (e.g., 'automatic', 'reviewed', 'deleted').

-   **`locationSource` (string)**: Network/agency providing the authoritative location solution.

-   **`magSource` (string)**: Network/agency providing the authoritative magnitude solution.

---

> _**Note:** All columns are potentially nullable, meaning they might contain missing (NULL) values for a given event record._