In [1]:
from os import environ
from pyspark.sql import SparkSession, functions as f

In [2]:
from pyspark.sql.types import (StructField, StructType, 
                               IntegerType, StringType,
                              FloatType)

In [3]:
# Directory holding the input data, taken from the DATA_LAKE environment
# variable (a missing variable raises KeyError here).
data_lake_dir = environ['DATA_LAKE']

# Prefix with the file:// scheme to form the base location used for reads.
file_path = "file:///" + data_lake_dir

In [4]:
# Column layout for the explicit schema: (name, Spark type, nullable).
column_specs = [
    ("station_id", StringType(), False),
    ("entitiy_id", IntegerType(), False),
    ("temp_type", StringType(), True),
    ("temp", FloatType(), True),
    ("col_5", StringType(), True),
    ("col_6", StringType(), True),
]

# One StructField per column, in the order listed above.
fields = [
    StructField(name=col_name, dataType=col_type, nullable=is_nullable)
    for col_name, col_type, is_nullable in column_specs
]

In [5]:
# Wrap the list of column definitions in a StructType so it can be handed
# to the DataFrame reader as a schema.
schema_stuct = StructType(fields=fields)

In [6]:
# Configure the builder with the application name "min_max_temp".
session_builder = SparkSession.builder.appName("min_max_temp")

# getOrCreate() returns an already-active session if one exists, otherwise
# it starts a new one from the builder's settings.
spark = session_builder.getOrCreate()

In [7]:
# Join the base location and the file name with exactly one separator.
# Plain concatenation produced ".../lake1800.csv" whenever DATA_LAKE was set
# without a trailing slash; a base that already ends in "/" or "\" is left
# exactly as it was.
path_separator = "" if file_path.endswith(("/", "\\")) else "/"
csv_path = file_path + path_separator + "1800.csv"

# Read the CSV with the explicit schema rather than inferred column types.
temp_df = spark.read.schema(schema_stuct).format("csv").load(csv_path)

In [8]:
# Split the readings by the measurement kind stored in temp_type.
temp_type_col = temp_df["temp_type"]
min_temps = temp_df.where(temp_type_col == "TMIN")
max_temps = temp_df.where(temp_type_col == "TMAX")

In [9]:
# Smallest "temp" value per station among the TMIN rows; the aggregate's
# default column name "min(temp)" is renamed to "min_temp".
min_temp_station = (
    min_temps
    .select("station_id", "temp")
    .groupBy("station_id")
    .min("temp")
    .withColumnRenamed("min(temp)", "min_temp")
)

In [10]:
# Scale the per-station minimum by 0.1, apply x * 9/5 + 32, and round to two
# decimal places.
# NOTE(review): the 0.1 factor suggests the raw values are tenths of a degree
# Celsius being converted to Fahrenheit — confirm against the data source.
fahrenheit = f.round(f.col("min_temp") * 0.1 * (9.0 / 5.0) + 32.0, 2)

# Keep only the station and converted value, order ascending by temperature,
# and collect() the rows to the driver as a list of Row objects.
min_temp_per_station = (
    min_temp_station
    .withColumn("temperature", fahrenheit)
    .select("station_id", "temperature")
    .orderBy("temperature")
    .collect()
)

In [11]:
# One output line per station: the id, a tab, then the temperature formatted
# with two decimals and an "F" suffix.
for station, temperature in min_temp_per_station:
    print(station + "\t{:.2f}F".format(temperature))

ITE00100554	5.36F
EZE00100082	7.70F


In [12]:
# Shut down the Spark session; the results were already collected into a
# Python list above, so nothing further needs the session.
spark.stop()