In [44]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as func
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    FloatType,
)

In [45]:
spark = SparkSession.builder.appName("MinTemperatures").getOrCreate()

schema = StructType(
    [
        StructField("stationID", StringType(), True),
        StructField("date", IntegerType(), True),
        StructField("measure_type", StringType(), True),
        StructField("temperature", FloatType(), True),
    ]
)

In [46]:
df = spark.read.schema(schema).csv("ml-32m/1800.csv")
df.printSchema()

root
 |-- stationID: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: float (nullable = true)



In [47]:
df.show(5)

+-----------+--------+------------+-----------+
|  stationID|    date|measure_type|temperature|
+-----------+--------+------------+-----------+
|ITE00100554|18000101|        TMAX|      -75.0|
|ITE00100554|18000101|        TMIN|     -148.0|
|GM000010962|18000101|        PRCP|        0.0|
|EZE00100082|18000101|        TMAX|      -86.0|
|EZE00100082|18000101|        TMIN|     -135.0|
+-----------+--------+------------+-----------+
only showing top 5 rows



In [48]:
minTemps = df.filter(df["measure_type"] == "TMIN")
stationTemps = minTemps.select("stationID", "temperature")
stationTemps.show(5)

+-----------+-----------+
|  stationID|temperature|
+-----------+-----------+
|ITE00100554|     -148.0|
|EZE00100082|     -135.0|
|ITE00100554|     -125.0|
|EZE00100082|     -130.0|
|ITE00100554|      -46.0|
+-----------+-----------+
only showing top 5 rows



In [49]:
minTempsByStation = (
    stationTemps.groupBy("stationID")
    .min("temperature")
)
minTempsByStation.show(5)

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+



In [50]:
for result in minTempsByStation.collect():
  print(f"{result[0]}: {result[1]:.2f}F")

ITE00100554: -148.00F
EZE00100082: -135.00F
