In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a Spark session that runs locally with master "local[*]".
builder = SparkSession.builder.appName("WeatherDataProcessing").master("local[*]")
# NOTE(review): this option is meant to switch off Hadoop's Windows native IO,
# but the traceback recorded further down still shows an UnsatisfiedLinkError in
# NativeIO$Windows.access0 -- confirm that Hadoop actually recognises this key.
builder = builder.config("spark.hadoop.io.nativeio.use.windows.nativeio", "false")
spark = builder.getOrCreate()


In [3]:
# Smoke test: a five-row frame (ids 0..4) proves the session can run a job.
spark.range(0, 5).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [4]:
# Relative path of the raw weather dump read by the CSV loader below.
input_path = (
    "data/"
    "levi9-hack9-weather-firehose-2-2022-04-01-00-20-43-"
    "f2bad984-6327-36c6-997f-4ce326fd1df7"
)

In [5]:
# Load the dump as CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(input_path, header=True, inferSchema=True)

In [6]:
# Peek at the first five rows, then print the inferred column types.
df.show(n=5)
df.printSchema()

+--------------------+-------------------+-------------------+-----------------+------------------+--------------------+-------------------+-----------------+----------------+----------------+----------------+--------------+-----------------+---------------+
|                name|          time_nano|          time_date|location_latitude|location_longitude|       location_name|weather_temperature|weather_feelsLike|weather_pressure|weather_humidity|weather_dewPoint|weather_clouds|weather_windSpeed|weather_windDeg|
+--------------------+-------------------+-------------------+-----------------+------------------+--------------------+-------------------+-----------------+----------------+----------------+----------------+--------------+-----------------+---------------+
|1248 - Tătărași S...|1648771200000000000|2022-04-01 02:00:00|           47.154|            27.614|Tătărași Sud, Iaș...|              16.54|            15.61|             995|              52|            6.66|             0

In [7]:
import pyspark.sql.functions as F

In [8]:
# Parse `time_date` with the "yyyy-MM-dd HH:mm:ss" pattern, then render it again
# with a literal 'T' between the date and time parts (no zone offset is added).
parsed_time = F.to_timestamp(F.col("time_date"), "yyyy-MM-dd HH:mm:ss")
df = df.withColumn("datetime_iso", F.date_format(parsed_time, "yyyy-MM-dd'T'HH:mm:ss"))

In [9]:
# Scale the wind speed by 3.6 after casting it to double.
# NOTE(review): presumably a m/s -> km/h conversion; confirm the source unit.
wind_kmh = F.col("weather_windSpeed").cast("double") * 3.6
df = df.withColumn("wind_speed_kmh", wind_kmh)

In [10]:
# Keep only the columns needed for the export; the temperature column is
# renamed to `temperature_c` on the way out.
export_columns = [
    F.col("name"),
    F.col("datetime_iso"),
    F.col("weather_temperature").alias("temperature_c"),
    F.col("wind_speed_kmh"),
]
result = df.select(*export_columns)

In [11]:
# Show up to five rows without truncating long cell values.
result.show(n=5, truncate=False)

+----------------------------------+-------------------+-------------+--------------+
|name                              |datetime_iso       |temperature_c|wind_speed_kmh|
+----------------------------------+-------------------+-------------+--------------+
|1248 - Tătărași Sud, Iași, Romania|2022-04-01T02:00:00|16.54        |11.268        |
+----------------------------------+-------------------+-------------+--------------+



In [13]:
pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Downloading numpy-2.0.2-cp39-cp39-win_amd64.whl.metadata (59 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.3.3-cp39-cp39-win_amd64.whl (11.4 MB)
   ---------------------------------------- 0.0/11.4 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.4 MB 5.6 MB/s eta 0:00:02
   ------- -------------------------------- 2.1/11.4 MB 5.1 MB/s eta 0:00:02
   ----------- ---------------------------- 3.1/11.4 MB 5.0 MB/s eta 0:00:02
   -------------- ------------------------- 4.2/11.4 MB 4.9 MB/s eta 0:00:02
   ------------------ --------------------- 5.2/11.4 MB 4.9 MB/s eta 0:00:02
   --------------------- ------------------ 6.0/11.4 MB 4.9 MB/s eta 0:00:02
   ------------------------ --------------- 7.1/11.4 MB 4.8 MB/s eta 0:00:01
   ---------------------------- -----

In [12]:
# Write `result` as semicolon-delimited CSV with a header row, replacing any
# previous output at this location. Spark writes a directory of part files.
# NOTE: in the recorded run on Windows this call failed at job commit with
# java.lang.UnsatisfiedLinkError in NativeIO$Windows.access0 (see the traceback
# below); the pandas-based export later in the notebook is the workaround.
output_path = "output/weather_1248_2022-04-01"

result.write.csv(output_path, mode="overwrite", header=True, sep=";")


Py4JJavaError: An error occurred while calling o57.csv.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:651)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:288)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:851)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1218)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1423)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:334)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:404)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$26(FileFormatWriter.scala:277)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:642)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:277)
	... 42 more


In [14]:
import os

# Fallback export that bypasses Spark's file writer: collect `result` onto the
# driver as a pandas frame and write a single semicolon-delimited CSV file.
output_file = "output/weather_1248_2022-04-01.csv"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

pdf = result.toPandas()
pdf.to_csv(output_file, index=False, sep=";")

# Display the path of the file that was written.
output_file

'output/weather_1248_2022-04-01.csv'