In [1]:
from pyspark.sql import SparkSession

In [2]:
from dotenv import load_dotenv
import os
# Pull variables from a local .env file (if any) into the process environment,
# then read the path of the Google service-account key used for GCS access.
load_dotenv()
key_filepath = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

# Fail fast: without this path the Spark session would be configured with an
# unusable key file and only break later, when the first GCS read is attempted.
if not key_filepath:
    raise RuntimeError(
        "GOOGLE_APPLICATION_CREDENTIALS is not set; define it in the environment "
        "or in a .env file so Spark can authenticate against GCS."
    )


In [3]:
# Create (or reuse) a local Spark session using the "local[8]" master, with the
# GCS Hadoop filesystem registered for gs:// paths and service-account
# authentication pointed at the key file read from the environment.
spark = (
    SparkSession.builder
    .master("local[8]")
    .appName("US Accidents")
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", key_filepath)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/04/30 12:16:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:

# Read Parquet files from GCS: one file per Start_Year partition, 2016-2023.
# The paths differ only by year, so they are generated instead of copy-pasted.
parquet_files = [
    f"gs://us-accidents-bucket/us_accidents_data/Start_Year={year}/933c14c388864f19a17c514e311a69b1-0.parquet"
    for year in range(2016, 2024)
]

# Parquet files embed their own schema, so the CSV-oriented "header" and
# "inferSchema" reader options are not passed here.
df_list = [spark.read.parquet(file) for file in parquet_files]

# Stack the yearly frames into a single DataFrame. `union` is the same operation
# as `unionAll`: columns are matched by position and duplicate rows are kept.
merged_df = df_list[0]
for df in df_list[1:]:
    merged_df = merged_df.union(df)



                                                                                

In [6]:
# Preview the merged DataFrame to sanity-check the columns that were loaded.
merged_df.show()

24/04/30 11:35:00 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 8:>                                                          (0 + 1) / 1]

+----+-------+--------+-------------------+-------------------+-----------------+------------------+------------+--------------------+--------------------+------------+----------+-----+-------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+--------+-------+-------+---------------+--------------+--------------+
|  ID| Source|Severity|         Start_Time|           End_Time|        Start_Lat|         Start_Lng|Distance(mi)|         Description|              Street|        City|    County|State|Country|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Weather_Condition|Crossing|Railway|Station|Traffic_Calming|Traffic_Signal|Sunrise_Sunset|
+----+-------+--------+-------------------+-------------------+-----------------+------------------+------------+--------------------+--------------------+------------+----------+-----+-------+--------------+-------------+-----------+

                                                                                

# Cleaning the data

### Steps:
> Removing unimportant columns

> Renaming the columns

> Converting column values into easier-to-handle values

In [5]:
# Report how many distinct values each column holds, to spot columns that carry
# no information (a single value) or that behave like identifiers.
for column_name in merged_df.columns:
    # Count inside Spark instead of collecting the distinct rows to the driver:
    # columns such as ID have millions of distinct values, and only the number
    # of them is needed here.
    distinct_count = merged_df.select(column_name).distinct().count()

    # Print the column name and its number of distinct values
    print(f"Column: {column_name}, has {distinct_count} distinct values")

                                                                                

Column: ID, has 7728394 distinct values


                                                                                

Column: Source, has 3 distinct values


                                                                                

Column: Severity, has 4 distinct values


                                                                                

Column: Start_Time, has 6131796 distinct values


                                                                                

Column: End_Time, has 6705355 distinct values


                                                                                

Column: Start_Lat, has 2428358 distinct values


                                                                                

Column: Start_Lng, has 2482533 distinct values


                                                                                

Column: Distance(mi), has 22382 distinct values


                                                                                

Column: Description, has 3761579 distinct values


                                                                                

Column: Street, has 336307 distinct values


                                                                                

Column: City, has 13679 distinct values


                                                                                

Column: County, has 1871 distinct values


                                                                                

Column: State, has 49 distinct values


                                                                                

Column: Country, has 1 distinct values


                                                                                

Column: Temperature(F), has 861 distinct values


                                                                                

Column: Wind_Chill(F), has 1002 distinct values


                                                                                

Column: Humidity(%), has 101 distinct values


                                                                                

Column: Pressure(in), has 1145 distinct values


                                                                                

Column: Visibility(mi), has 93 distinct values


                                                                                

Column: Wind_Direction, has 25 distinct values


                                                                                

Column: Wind_Speed(mph), has 185 distinct values


                                                                                

Column: Weather_Condition, has 145 distinct values


                                                                                

Column: Crossing, has 2 distinct values


                                                                                

Column: Railway, has 2 distinct values


                                                                                

Column: Station, has 2 distinct values


                                                                                

Column: Traffic_Calming, has 2 distinct values


                                                                                

Column: Traffic_Signal, has 2 distinct values




Column: Sunrise_Sunset, has 3 distinct values


                                                                                

In [6]:
# Columns judged unimportant for the analysis. "Country" reported a single
# distinct value in the cardinality check, so it adds no information.
columns_to_drop = ["Country"]

# Build the working frame without those columns; merged_df is left as loaded.
cleaned_df = merged_df.drop(*columns_to_drop)


In [7]:
from pyspark.sql.functions import to_timestamp

# Parse the accident start/end columns into timestamps using the
# "yyyy-MM-dd HH:mm:ss" pattern, replacing each column in place.
# NOTE(review): assumes both columns match this pattern — confirm no values
# carry fractional seconds.
cleaned_df = (
    cleaned_df
    .withColumn("Start_Time", to_timestamp(cleaned_df["Start_Time"], "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_Time", to_timestamp(cleaned_df["End_Time"], "yyyy-MM-dd HH:mm:ss"))
)


In [8]:
from pyspark.sql.functions import col

# Add "Distance(m)" as the mile-based distance multiplied by 1609.34
# (meters per mile), then remove "Distance(mi)" so only one distance column
# remains.
cleaned_df = (
    cleaned_df
    .withColumn("Distance(m)", col("Distance(mi)") * 1609.34)
    .drop("Distance(mi)")
)


In [9]:
from pyspark.sql.types import IntegerType

# Replace "Severity" with an integer-typed version of itself.
cleaned_df = cleaned_df.withColumn(
    "Severity",
    cleaned_df["Severity"].cast(IntegerType()),
)


In [10]:
# Preview the cleaned DataFrame to verify the dropped, converted and added columns.
cleaned_df.show()

24/04/30 12:35:44 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 92:>                                                         (0 + 1) / 1]

+----+-------+--------+-------------------+-------------------+-----------------+------------------+--------------------+--------------------+------------+----------+-----+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+--------+-------+-------+---------------+--------------+--------------+-----------+
|  ID| Source|Severity|         Start_Time|           End_Time|        Start_Lat|         Start_Lng|         Description|              Street|        City|    County|State|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Weather_Condition|Crossing|Railway|Station|Traffic_Calming|Traffic_Signal|Sunrise_Sunset|Distance(m)|
+----+-------+--------+-------------------+-------------------+-----------------+------------------+--------------------+--------------------+------------+----------+-----+--------------+-------------+-----------+------------+--------------+-----------

                                                                                