In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from math import radians, cos, sin, asin, sqrt

## Exploring the Challenger weather data

In [2]:
spark = SparkSession.builder.appName("Challenger weather analysis").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/16 20:31:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/16 20:31:07 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


stations.csv
- Station ID (Int)
- WBAN ID (Int)  "Weather Bureau Army Navy"
- GPS Latitude (Decimal)
- GPS Longitude (Decimal)

Stations are identified by the combination of their Station ID and WBAN ID.
Either of these IDs might be NULL.

In [3]:
stations = spark.read.csv('data/stations.csv', inferSchema=True, header=True)
stations.printSchema()

root
 |-- station_id: integer (nullable = true)
 |-- wban_id: integer (nullable = true)
 |-- gps_lat: double (nullable = true)
 |-- gps_long: double (nullable = true)



In [4]:
# drop rows with unspecified latitude
stations = stations.filter(
    stations.gps_lat.isNotNull() &
    stations.gps_long.isNotNull() &
    (stations.gps_lat != 0) &
    (stations.gps_long != 0)
).dropDuplicates()
stations.show()

[Stage 2:>                                                          (0 + 1) / 1]

+----------+-------+-------+--------+
|station_id|wban_id|gps_lat|gps_long|
+----------+-------+-------+--------+
|     13150|   null| 60.633|     6.1|
|     14710|   null|  60.05|    9.15|
|     20830|   null|   58.4|    19.2|
|     20870|   null| 57.767|  14.083|
|     22600|   null| 63.133|   18.25|
|     22870|   null|   63.8|  20.867|
|     25520|   null| 58.067|  15.233|
|     27330|   null| 61.033|  28.567|
|     28790|   null|   64.9|  29.017|
|     29660|   null| 60.896|  26.938|
|     31610|   null|   55.7|  -3.217|
|     33180|   null| 53.772|  -3.029|
|     35500|   null| 52.567|   0.717|
|     37690|   null| 51.233|  -0.383|
|     40831|   null|  66.35| -14.767|
|     41760|   null| 64.933| -15.783|
|     61240|   null| 55.017|  10.567|
|     75173|   null| 44.183|  -0.283|
|     80750|   null| 42.358|  -3.621|
|     85320|   null| 38.831|   -9.34|
+----------+-------+-------+--------+
only showing top 20 rows



                                                                                

Identify all weather stations within 100 km of Cape Canaveral using the haversine function (Note: not all of the stations necessarily recorded a temperature on any given day.)

In [5]:
# first, get haversine distance between any two specified lat-long pairs

# use cape canaveral's lat/long as base
cc_lat = 28.3922
cc_long = -80.6077

# haversine function borrowed from https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
def haversine(lon1, lat1, lon2=cc_long, lat2=cc_lat):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

# convert haversine() function into a spark UDF
haversine_udf = udf(lambda lat, long: haversine(lat, long))

In [9]:
# add column to df containing distance between each station and Cape Canaveral

# get distance between each lat-long pair and cape canaveral
haversine_col = haversine_udf(col('gps_lat'), col('gps_long'))
# add column to stations df containing distances
stations_cc_dist = stations.withColumn('cc_dist', haversine_col)
stations_cc_dist.show()

[Stage 10:>                                                         (0 + 1) / 1]

+----------+-------+-------+--------+------------------+
|station_id|wban_id|gps_lat|gps_long|           cc_dist|
+----------+-------+-------+--------+------------------+
|     13150|   null| 60.633|     6.1|14362.627898499035|
|     14710|   null|  60.05|    9.15|14076.082831990354|
|     20830|   null|   58.4|    19.2|13129.822797265268|
|     20870|   null| 57.767|  14.083|13507.373440055057|
|     22600|   null| 63.133|   18.25|13527.122096864574|
|     22870|   null|   63.8|  20.867|13336.543221927428|
|     25520|   null| 58.067|  15.233|13435.685110051569|
|     27330|   null| 61.033|  28.567| 12480.29795254318|
|     28790|   null|   64.9|  29.017|12652.946183934917|
|     29660|   null| 60.896|  26.938|12618.098342207893|
|     31610|   null|   55.7|  -3.217|14614.883145513937|
|     33180|   null| 53.772|  -3.029|14428.437965857225|
|     35500|   null| 52.567|   0.717|14074.894578695299|
|     37690|   null| 51.233|  -0.383|14027.929467386235|
|     40831|   null|  66.35| -1

                                                                                

In [10]:
# filter stations to retain only rows where distance is leq 100km
stations_near_cc = stations_cc_dist.filter(col('cc_dist') <= 1000)
stations_near_cc.show()

+----------+-------+-------+--------+------------------+
|station_id|wban_id|gps_lat|gps_long|           cc_dist|
+----------+-------+-------+--------+------------------+
|    895360|   null| -78.65|  35.633| 825.9674131550656|
|    895280|   null|-82.767|  28.583|212.09237546626963|
+----------+-------+-------+--------+------------------+





Create a new column in the dataframe using the UDF to calculate the distance between each station and the given lat-long pair.
Filter the dataframe to keep only the rows where the distance is less than or equal to 100km.

yyyy.csv (yyyy = year)
- StationID (Int)
- WBANID (Int)
- Month (Int)
- Day (Int)
- Temperature / degrees F (Decimal)

In [7]:
temps = spark.read.csv('data/1986.csv', inferSchema=True, header=True)
temps.printSchema()



root
 |-- station_id: integer (nullable = true)
 |-- wban_id: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- temp_f: double (nullable = true)



                                                                                

In [8]:
temps.show()

+----------+-------+-----+---+------+
|station_id|wban_id|month|day|temp_f|
+----------+-------+-----+---+------+
|     10010|   null|    1|  1|  17.2|
|     10010|   null|    1|  2|  12.1|
|     10010|   null|    1|  3|  10.4|
|     10010|   null|    1|  4|  17.4|
|     10010|   null|    1|  5|  26.5|
|     10010|   null|    1|  6|  30.1|
|     10010|   null|    1|  7|  29.7|
|     10010|   null|    1|  8|  29.6|
|     10010|   null|    1|  9|  29.6|
|     10010|   null|    1| 10|  33.0|
|     10010|   null|    1| 11|  32.5|
|     10010|   null|    1| 12|  27.4|
|     10010|   null|    1| 13|  22.2|
|     10010|   null|    1| 14|  11.3|
|     10010|   null|    1| 15|   2.5|
|     10010|   null|    1| 16|   3.0|
|     10010|   null|    1| 17|  13.4|
|     10010|   null|    1| 18|  29.8|
|     10010|   null|    1| 19|  27.5|
|     10010|   null|    1| 20|  25.2|
+----------+-------+-----+---+------+
only showing top 20 rows

