# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


####  Run this cell to set up and start your interactive session.


In [None]:
# Interactive-session configuration magics. They are placed ahead of any Python
# code in this cell so the settings are in effect when the session is created.
# Idle timeout for the session (per the Glue magics docs the unit is minutes,
# i.e. 2880 = 48 hours — confirm against the kernel version in use).
%idle_timeout 2880
# Glue runtime version requested for the session.
%glue_version 4.0
# Worker type and worker count; the session start-up log echoes these values.
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Obtain the SparkContext for this session (an existing one is reused if present).
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext, the entry point for Glue-specific APIs.
glueContext = GlueContext(sc)
# SparkSession used by the later cells (e.g. spark.read.csv) to load data.
spark = glueContext.spark_session
# Job handle bound to the GlueContext; it is not referenced again in the cells shown here.
job = Job(glueContext)

In [1]:
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
# Location of the CSV input; it is passed as the path to spark.read.csv in the
# load cell. "BUCKET_NAME" is a placeholder — presumably it should be replaced
# with the real data location (e.g. an S3 URI) before running; confirm.
bucket_name = "BUCKET_NAME"

# Explicit schema for the headerless CSV, so Spark does not have to infer
# column names or types. All four columns are nullable.
#   station_id       - station identifier (string)
#   date             - integer; sample rows look like yyyymmdd (e.g. 18000101)
#   measurement_type - string code; TMAX, TMIN and PRCP appear in the data
#   temperature      - float reading; later cells divide it by 10 to get Celsius
schema = (
    StructType()
    .add("station_id", StringType(), True)
    .add("date", IntegerType(), True)
    .add("measurement_type", StringType(), True)
    .add("temperature", FloatType(), True)
)



Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.4 
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Session ID: 84dac13e-d023-4fb3-bb4f-f7c267907fa7
Applying the following default arguments:
--glue_kernel_version 1.0.4
--enable-glue-datacatalog true
Waiting for session 84dac13e-d023-4fb3-bb4f-f7c267907fa7 to get into ready status...
Session 84dac13e-d023-4fb3-bb4f-f7c267907fa7 has been created.



In [2]:
# Load the CSV data at `bucket_name` into a DataFrame, applying the explicit
# schema defined above instead of inferring one, then preview the first rows.
measures = spark.read.schema(schema).csv(bucket_name)
measures.show()

+-----------+--------+----------------+-----------+
| station_id|    date|measurement_type|temperature|
+-----------+--------+----------------+-----------+
|ITE00100554|18000101|            TMAX|      -75.0|
|ITE00100554|18000101|            TMIN|     -148.0|
|GM000010962|18000101|            PRCP|        0.0|
|EZE00100082|18000101|            TMAX|      -86.0|
|EZE00100082|18000101|            TMIN|     -135.0|
|ITE00100554|18000102|            TMAX|      -60.0|
|ITE00100554|18000102|            TMIN|     -125.0|
|GM000010962|18000102|            PRCP|        0.0|
|EZE00100082|18000102|            TMAX|      -44.0|
|EZE00100082|18000102|            TMIN|     -130.0|
|ITE00100554|18000103|            TMAX|      -23.0|
|ITE00100554|18000103|            TMIN|      -46.0|
|GM000010962|18000103|            PRCP|        4.0|
|EZE00100082|18000103|            TMAX|      -10.0|
|EZE00100082|18000103|            TMIN|      -73.0|
|ITE00100554|18000104|            TMAX|        0.0|
|ITE00100554

In [18]:
# Keep only the rows whose measurement_type is "TMAX" and take the largest
# reading per station. The aggregate column is named "max(temperature)",
# which the following cell looks up by that exact name.
# NOTE: results_df is reassigned further down for the TMIN aggregate, so the
# cells must be run in notebook order.
results_df = (
    measures
    .where(func.col("measurement_type") == "TMAX")
    .groupBy("station_id")
    .max("temperature")
)
results_df.show()

+-----------+----------------+
| station_id|max(temperature)|
+-----------+----------------+
|EZE00100082|           323.0|
|ITE00100554|           323.0|
+-----------+----------------+


In [20]:
# Add a column with the per-station maximum divided by 10 and rounded to two
# decimals (the raw readings appear to be tenths of a degree Celsius), then
# display the result. results_df itself is not modified.
(
    results_df
    .withColumn(
        "temperature_in_celcius",
        func.round(func.col("max(temperature)") / 10, 2),
    )
    .show()
)

+-----------+----------------+----------------------+
| station_id|max(temperature)|temperature_in_celcius|
+-----------+----------------+----------------------+
|EZE00100082|           323.0|                  32.3|
|ITE00100554|           323.0|                  32.3|
+-----------+----------------+----------------------+


## We can do the same for the minimum temperature too.

In [21]:
# Keep only the rows whose measurement_type is "TMIN" and take the smallest
# reading per station. The aggregate column is named "min(temperature)",
# which the following cell looks up by that exact name.
# NOTE: this replaces the results_df produced earlier for the TMAX aggregate.
results_df = (
    measures
    .where(func.col("measurement_type") == "TMIN")
    .groupBy("station_id")
    .min("temperature")
)
results_df.show()

+-----------+----------------+
| station_id|min(temperature)|
+-----------+----------------+
|EZE00100082|          -135.0|
|ITE00100554|          -148.0|
+-----------+----------------+


In [22]:
# Add a column with the per-station minimum divided by 10 and rounded to two
# decimals (the raw readings appear to be tenths of a degree Celsius), then
# display the result. results_df itself is not modified.
(
    results_df
    .withColumn(
        "temperature_in_celcius",
        func.round(func.col("min(temperature)") / 10, 2),
    )
    .show()
)

+-----------+----------------+----------------------+
| station_id|min(temperature)|temperature_in_celcius|
+-----------+----------------+----------------------+
|EZE00100082|          -135.0|                 -13.5|
|ITE00100554|          -148.0|                 -14.8|
+-----------+----------------+----------------------+
