In [12]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark==3.5.0 -q
!pip install gcsfs -q

In [13]:
!wget https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.7/gcs-connector-hadoop3-2.2.7-shaded.jar -P /usr/local/lib/

--2025-01-24 16:14:55--  https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.7/gcs-connector-hadoop3-2.2.7-shaded.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33831577 (32M) [application/java-archive]
Saving to: ‘/usr/local/lib/gcs-connector-hadoop3-2.2.7-shaded.jar.5’


2025-01-24 16:14:58 (14.0 MB/s) - ‘/usr/local/lib/gcs-connector-hadoop3-2.2.7-shaded.jar.5’ saved [33831577/33831577]



In [26]:
!rm -rf /content/output/*

In [15]:
!gcloud auth application-default login


The environment variable [GOOGLE_APPLICATION_CREDENTIALS] is set to:
  [/content/.config/application_default_credentials.json]
Credentials will still be generated to the default location:
  [/content/.config/application_default_credentials.json]
To use these credentials, unset this environment variable before
running your application.

Do you want to continue (Y/n)?  Y

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=ruV1R4kNl9heplGz8c8iulxc8RC43s&prompt=consent&token_usage=remote&access_type=offline&code_challenge=w42DSLjhmhvlanV1yyFC2gOsM_D

In [24]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, LongType
from pyspark.sql.functions import col, count, sum, lit, window, first, sqrt, pow, last, when
from pyspark.sql.window import Window
import time

#Create the Spark session with the GCS connector
spark = SparkSession.builder \
    .appName("GCSReadVehicles") \
    .config("spark.jars", "/usr/local/lib/gcs-connector-hadoop3-2.2.7-shaded.jar") \
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
    .getOrCreate()

# Path to the credentials file generated by gcloud
credential_path = "/content/.config/application_default_credentials.json"

# Set up the environment for credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credential_path

# Configure PySpark to access GCS
spark._jsc.hadoopConfiguration().set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
spark._jsc.hadoopConfiguration().set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
spark._jsc.hadoopConfiguration().set("fs.gs.auth.service.account.enable", "true")
spark._jsc.hadoopConfiguration().set("google.cloud.auth.service.account.json.keyfile", credential_path)

# GCS bucket path
bucket_path = "gs://edit-de-project-streaming-data/carris-vehicles/"

# Define schema
schema = StructType([
    StructField("bearing", LongType(), True),
    StructField("block_id", StringType(), True),
    StructField("current_status", StringType(), True),
    StructField("id", StringType(), True),
    StructField("lat", DoubleType(), True),
    StructField("line_id", StringType(), True),
    StructField("lon", DoubleType(), True),
    StructField("pattern_id", StringType(), True),
    StructField("route_id", StringType(), True),
    StructField("schedule_relationship", StringType(), True),
    StructField("shift_id", StringType(), True),
    StructField("speed", DoubleType(), True),
    StructField("stop_id", StringType(), True),
    StructField("timestamp", LongType(), True),
    StructField("trip_id", StringType(), True),
])

# 1st Query: Read data from GCS and write to the output/bronze directory
df = spark.readStream \
    .format("json") \
    .schema(schema) \
    .option("maxFilesPerTrigger", 5) \
    .load(bucket_path)

# Path to the folder where data will be written
bronze_output_path = "output/bronze"
#To save data in bucket
#bronze_output_path = "gs://edit-data-eng-project-group2/vehicles_data/"

# Ensure the timestamp column is in the correct format
df = df.withColumn("timestamp", col("timestamp").cast("timestamp"))

# Define a 2-minute window for aggregations
agg_distance = df.withWatermark("timestamp", "2 minutes") \
    .groupBy("id", window(col("timestamp"), "2 minutes")) \
    .agg(
        first("lat").alias("start_lat"),
        first("lon").alias("start_lon"),
        last("lat").alias("end_lat"),
        last("lon").alias("end_lon"),
        count("*").alias("data_points")
    )

# Calculate distance using the Euclidean distance formula between the first and last point in the window
agg_distance = agg_distance.withColumn(
    "distance",
    sqrt(
        pow(col("end_lat") - col("start_lat"), 2) + pow(col("end_lon") - col("start_lon"), 2)
    ) * lit(111)  # Simplified approximation (in km)
)

# Calculate the average speed (km/h) based on distance and time
agg_speed = agg_distance.withColumn(
    "average_speed_kmh",
    (col("distance") / lit(2 / 60))  # 2-minute time converted to hours
)

# Add logic to define current_stop or next_stop based on current_status
df = df.withColumn(
    "stop",
    when(col("current_status") == "stopped_at", col("stop_id"))
    .otherwise(lit("-1"))
)

df = df.withColumn(
    "next_stop",
    when(col("current_status") != "stopped_at", col("stop_id"))
    .otherwise(lit("-1"))
)

# Aggregate vehicle attributes in the 2-minute window
windowed_df = df.withWatermark("timestamp", "2 minutes") \
    .groupBy("id", window(col("timestamp"), "2 minutes")) \
    .agg(
        first("line_id").alias("line"),
        first("route_id").alias("route"),
        first("bearing").alias("direction"),
        first("stop").alias("current_stop"),
        first("next_stop").alias("next_stop"),
        count("*").alias("data_points")
    )

windowed_df = windowed_df.drop("data_points")

# Join speed and distance data with vehicle attributes
final_result = agg_speed.join(windowed_df, ["id", "window"], "inner")

final_result = final_result.drop("data_points")
final_result = final_result.drop("start_lat")
final_result = final_result.drop("start_lon")
final_result = final_result.drop("end_lat")
final_result = final_result.drop("end_lon")

# Write to the output/bronze directory
query_bronze = final_result.writeStream \
    .outputMode("append") \
    .format("parquet") \
    .option("path", bronze_output_path) \
    .option("checkpointLocation", f"{bronze_output_path}/_checkpoint") \
    .start()

#To save data in bucket
#def write_to_gcs(batch_df, batch_id):
#    batch_df.write \
#        .mode("append") \
#        .format("parquet") \
#        .option("path", "gs://edit-data-eng-project-group2/vehicles_data/") \
#        .save()

#query_bronze = final_result.writeStream \
#    .outputMode("append") \
#    .foreachBatch(write_to_gcs) \
#    .option("checkpointLocation", "gs://edit-data-eng-project-group2/vehicles_data/_checkpoint") \
#    .trigger(processingTime="30 seconds") \
#    .start()


time.sleep(480)
query_bronze.stop()
# Wait until the streaming ends
#query_bronze.awaitTermination()


In [28]:
output_df = spark.read.format("parquet").load("/content/output/bronze")
output_df.show()

+---+------+--------+-----------------+----+-----+---------+---------+
| id|window|distance|average_speed_kmh|line|route|direction|next_stop|
+---+------+--------+-----------------+----+-----+---------+---------+
+---+------+--------+-----------------+----+-----+---------+---------+

