In [1]:
from pyspark.sql import SparkSession

In [2]:
# Configure a Spark session that runs in-process ("local[*]" = one worker
# thread per available core) under the application name shown in the Spark UI.
_session_builder = SparkSession.builder.appName("WeatherDataProcessing").master("local[*]")

# "spark.hadoop.*" settings are forwarded to the Hadoop configuration.
# NOTE(review): this key presumably disables Hadoop's Windows native IO layer
# so the notebook runs without winutils/hadoop.dll -- confirm it is honoured.
spark = _session_builder.config(
    "spark.hadoop.io.nativeio.use.windows.nativeio", "false"
).getOrCreate()


In [3]:
# Smoke test: build a tiny DataFrame with ids 0..4 and print it, which
# confirms the session can actually schedule and run a job.
spark.range(0, 5).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [None]:
import boto3

In [9]:
from urllib.parse import quote
import os

# Relative local path where the weather sample CSV is stored and read from.
input_path = os.path.join("data", "weather_sample.csv")

# S3 bucket name and object key are supplied through environment variables;
# a missing variable raises KeyError here rather than failing later.
bucket, key = os.environ["WEATHER_BUCKET"], os.environ["WEATHER_KEY"]

In [None]:
# Download the weather sample object from S3 to `input_path`.
#
# download_file does not create missing parent directories, so make sure
# the destination folder exists first; otherwise a fresh checkout (no
# "data" directory yet) fails before any data is read. The `or "."` guard
# keeps this valid if `input_path` is ever a bare file name.
os.makedirs(os.path.dirname(input_path) or ".", exist_ok=True)

# No credentials are passed explicitly; boto3 resolves them itself.
s3 = boto3.client("s3", region_name='eu-north-1')
s3.download_file(bucket, key, input_path)

In [14]:
# Load the downloaded CSV into a Spark DataFrame.
#   header=True      -> the first line provides the column names
#   inferSchema=True -> Spark scans the data to pick column types instead of
#                       reading every column as a string
df = spark.read.csv(input_path, header=True, inferSchema=True)

In [15]:
# Preview the first five rows, then print the inferred column names and
# types to check that the CSV was parsed as expected.
df.show(n=5)
df.printSchema()

+--------------------+-------------------+-------------------+-----------------+------------------+--------------------+-------------------+-----------------+----------------+----------------+----------------+--------------+-----------------+---------------+
|                name|          time_nano|          time_date|location_latitude|location_longitude|       location_name|weather_temperature|weather_feelsLike|weather_pressure|weather_humidity|weather_dewPoint|weather_clouds|weather_windSpeed|weather_windDeg|
+--------------------+-------------------+-------------------+-----------------+------------------+--------------------+-------------------+-----------------+----------------+----------------+----------------+--------------+-----------------+---------------+
|1248 - Tătărași S...|1648771200000000000|2022-04-01 02:00:00|           47.154|            27.614|Tătărași Sud, Iaș...|              16.54|            15.61|             995|              52|            6.66|             0

In [16]:
import pyspark.sql.functions as F

In [17]:
# Parse `time_date` using the "yyyy-MM-dd HH:mm:ss" pattern ...
_parsed_time = F.to_timestamp(F.col("time_date"), "yyyy-MM-dd HH:mm:ss")

# ... and re-render it as a string with a literal 'T' between the date and
# time parts, stored in the new `datetime_iso` column.
df = df.withColumn(
    "datetime_iso",
    F.date_format(_parsed_time, "yyyy-MM-dd'T'HH:mm:ss"),
)

In [18]:
# Add `wind_speed_kmh` as the wind speed (cast to double) multiplied by 3.6.
# 3.6 is the m/s -> km/h factor (1 m/s == 3.6 km/h).
# NOTE(review): assumes `weather_windSpeed` is recorded in m/s -- confirm
# against the data source.
df = df.withColumn("wind_speed_kmh", F.col("weather_windSpeed").cast("double") * 3.6)

In [19]:
# Keep only the columns needed for the export, in output order, renaming
# `weather_temperature` to `temperature_c` on the way.
result = df.select(
    [
        F.col("name"),
        F.col("datetime_iso"),
        F.col("weather_temperature").alias("temperature_c"),
        F.col("wind_speed_kmh"),
    ]
)

In [20]:
# Show up to five result rows with full (untruncated) cell contents so the
# complete station name and timestamp are visible.
result.show(n=5, truncate=False)

+----------------------------------+-------------------+-------------+--------------+
|name                              |datetime_iso       |temperature_c|wind_speed_kmh|
+----------------------------------+-------------------+-------------+--------------+
|1248 - Tătărași Sud, Iași, Romania|2022-04-01T02:00:00|16.54        |11.268        |
+----------------------------------+-------------------+-------------+--------------+



In [22]:
import os
# Target of the export; its parent folder is created on demand so the write
# below does not fail on a fresh checkout.
output_file = "output/weather_1248_2022-04-01.csv"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# toPandas() collects the entire Spark result onto the driver as a pandas
# DataFrame, which is then written as a semicolon-separated CSV without the
# pandas index column.
pdf = result.toPandas()
pdf.to_csv(output_file, index=False, sep=";")

# Display the written path as the cell's output.
output_file

'output/weather_1248_2022-04-01.csv'