# Weather Data

This notebook was loaded with:

```bash
PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ./dse/bin/dse pyspark --num-executors 5 --driver-memory 6g --executor-memory 6g
```

The general plan is to do some exploration and cleaning in jupyter notebooks, then run our actual models by submitting python scripts and letting the jobs run as we'd expect.

We'll clean the data and load them into cassandra tables to be used by the jobs.

In [1]:
%pylab inline 

Populating the interactive namespace from numpy and matplotlib


In [19]:
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import max, min, isnull, count, datediff, lag, avg, sum, coalesce, rank, lit, when,col, udf, to_date, year, mean, month, date_format, array
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from pyspark.ml.feature import StringIndexer
from datetime import datetime
from pyspark.sql.window import Window
import pyspark
import matplotlib
import matplotlib.pyplot as plt
from datetime import date, timedelta

We already explored the weather. We'll just clean it up by taking the weather station we thought was most important and complete, and create a 3 and 5 day rolling weather average.

In [3]:
import pandas as pd
# load climate dataset
df_climate = pd.read_csv('data/climate835468.csv')

# format datetime string
#df_climate['DATE'] =  pd.to_datetime(df_climate['DATE'], format='%Y%m%d')

# extract rows for Northerly Island station, drop missing data
df_northerlyisland = df_climate[(df_climate['STATION_NAME'] == 'CHICAGO NORTHERLY ISLAND IL US') & (df_climate['TMAX'] != -9999) & (df_climate['TMIN'] != -9999)]

# drop all columns except date and temperature
df_temperature = df_northerlyisland[['DATE', 'TMAX', 'TMIN']]


  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df_climate.head()

Unnamed: 0,STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,PRCP,SNWD,SNOW,TAVG,TMAX,TMIN,AWND
0,GHCND:US1ILCK0148,OAK LAWN 1.9 SE IL US,182.3,41.6936,-87.729,20100124,0.34,-9999.0,-9999.0,-9999,-9999,-9999,-9999.0
1,GHCND:US1ILCK0148,OAK LAWN 1.9 SE IL US,182.3,41.6936,-87.729,20100308,0.1,-9999.0,-9999.0,-9999,-9999,-9999,-9999.0
2,GHCND:US1ILCK0148,OAK LAWN 1.9 SE IL US,182.3,41.6936,-87.729,20100310,0.08,-9999.0,-9999.0,-9999,-9999,-9999,-9999.0
3,GHCND:US1ILCK0148,OAK LAWN 1.9 SE IL US,182.3,41.6936,-87.729,20100311,0.03,-9999.0,-9999.0,-9999,-9999,-9999,-9999.0
4,GHCND:US1ILCK0148,OAK LAWN 1.9 SE IL US,182.3,41.6936,-87.729,20100312,0.26,-9999.0,-9999.0,-9999,-9999,-9999,-9999.0


In [5]:
df_temperature.dtypes

DATE    int64
TMAX    int64
TMIN    int64
dtype: object

In [6]:
df_temperature["DATE"] = df_temperature["DATE"].astype(str)
df_temperature.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


DATE    object
TMAX     int64
TMIN     int64
dtype: object

From data exploration, we noted that high temperature appears to be associated with failed outcome. Here, we will extract daily maximum (TMAX) and minimum temperature (TMIN) recorded at CHICAGO NORTHERLY ISLAND station, which represents the most comprehensive record from years 2010 to present and is located closest to the city center.

In [7]:
df = sqlContext.createDataFrame(df_temperature)

In [8]:
string2Date = udf (lambda s: datetime.strptime(s, '%Y%m%d'), DateType())
df = df.withColumn("DATE", string2Date(df["DATE"]))

Now, we'll create some rolling averages...

In [11]:
df.filter((col("DATE")>='2011-05-28') & (col("DATE")<'2011-06-10')).collect()

[Row(DATE=datetime.date(2011, 6, 4), TMAX=91, TMIN=64),
 Row(DATE=datetime.date(2011, 6, 5), TMAX=77, TMIN=59),
 Row(DATE=datetime.date(2011, 6, 6), TMAX=94, TMIN=60),
 Row(DATE=datetime.date(2011, 6, 7), TMAX=97, TMIN=80),
 Row(DATE=datetime.date(2011, 6, 8), TMAX=96, TMIN=70),
 Row(DATE=datetime.date(2011, 6, 9), TMAX=70, TMIN=52)]

In [12]:
df.filter((col("DATE")=='2011-06-03') | (col("DATE")=='2010-06-03') | (col("DATE")=='2012-06-03')| (col("DATE")=='2013-06-03') | (col("DATE")=='2014-06-03') | (col("DATE")=='2015-06-03') | (col("DATE")=='2010-06-03')).collect()

[Row(DATE=datetime.date(2010, 6, 3), TMAX=67, TMIN=59),
 Row(DATE=datetime.date(2012, 6, 3), TMAX=81, TMIN=61),
 Row(DATE=datetime.date(2013, 6, 3), TMAX=59, TMIN=48),
 Row(DATE=datetime.date(2014, 6, 3), TMAX=76, TMIN=63),
 Row(DATE=datetime.date(2015, 6, 3), TMAX=64, TMIN=52)]

So, we'll have to make a judgement call. It looks like the days surrounding the missing day might be better than what the same date was year over year. That makes sense intuitively. So, we'll use KNN to fill in the missing dates with values.

In [17]:
df.agg(min(col("DATE"))).collect()

[Row(min(DATE)=datetime.date(2010, 1, 1))]

In [20]:
df.agg(max(col("DATE"))).collect()

[Row(max(DATE)=datetime.date(2016, 10, 31))]

In [123]:
start = date(2010, 1, 1)
end = date(2016, 10, 31)

In [165]:
d = df.select("DATE").orderBy("DATE").toPandas()

In [166]:
d["DATE"] = pd.to_datetime(d["DATE"])

In [167]:
d.dtypes

DATE    datetime64[ns]
dtype: object

In [178]:
d2 = set(date(x.year, x.month, x.day) for x in d["DATE"])

In [193]:
date_set = set(start + timedelta(x) for x in range((end - start).days))

In [214]:
missing = sorted(date_set - d2)
missing

[datetime.date(2010, 12, 31),
 datetime.date(2011, 3, 19),
 datetime.date(2011, 3, 20),
 datetime.date(2011, 3, 22),
 datetime.date(2011, 3, 23),
 datetime.date(2011, 3, 24),
 datetime.date(2011, 3, 25),
 datetime.date(2011, 3, 26),
 datetime.date(2011, 3, 27),
 datetime.date(2011, 4, 12),
 datetime.date(2011, 5, 25),
 datetime.date(2011, 5, 26),
 datetime.date(2011, 5, 28),
 datetime.date(2011, 5, 29),
 datetime.date(2011, 5, 30),
 datetime.date(2011, 5, 31),
 datetime.date(2011, 6, 1),
 datetime.date(2011, 6, 2),
 datetime.date(2011, 6, 3),
 datetime.date(2011, 8, 23),
 datetime.date(2011, 8, 24),
 datetime.date(2012, 4, 21),
 datetime.date(2013, 5, 23),
 datetime.date(2013, 5, 24),
 datetime.date(2013, 10, 2),
 datetime.date(2013, 10, 3),
 datetime.date(2014, 7, 24),
 datetime.date(2014, 7, 25),
 datetime.date(2014, 9, 9),
 datetime.date(2016, 7, 9),
 datetime.date(2016, 7, 10),
 datetime.date(2016, 7, 11),
 datetime.date(2016, 7, 12),
 datetime.date(2016, 10, 1),
 datetime.date(201

In [217]:
missing_int = [10000*x.year + 100*x.month + x.day for x in missing]

In [219]:
#add the days to python
missing = pd.DataFrame(missing)

In [220]:
missing.columns=["DATE"]

In [222]:
missing["dv"] = missing_int

In [223]:
missing.head()

Unnamed: 0,DATE,dv
0,2010-12-31,20101231
1,2011-03-19,20110319
2,2011-03-20,20110320
3,2011-03-22,20110322
4,2011-03-23,20110323


In [224]:
from sklearn.neighbors import KNeighborsRegressor as KNN

#this gives us something for the model to predict. It doesn't matter that they are all labels.
knn = KNN(n_neighbors=3)

In [200]:
df_train = df.orderBy("DATE").toPandas()

In [231]:
d2 = set(date(x.year, x.month, x.day) for x in df_train["DATE"])
df_train["dv"] = [10000*x.year + 100*x.month + x.day for x in d2]

In [234]:
df_train.head

<bound method DataFrame.head of             DATE  TMAX  TMIN        dv
0     2010-01-01    21    10  20141124
1     2010-01-02    16     7  20100501
2     2010-01-03    24     6  20100808
3     2010-01-04    21    13  20110807
4     2010-01-05    27    19  20161027
5     2010-01-06    25    15  20150910
6     2010-01-07    25    18  20151009
7     2010-01-08    31    20  20160223
8     2010-01-09    26    15  20121004
9     2010-01-10    21     6  20120904
10    2010-01-11    29    20  20160131
11    2010-01-12    33    23  20160715
12    2010-01-13    37    22  20100901
13    2010-01-14    43    33  20100310
14    2010-01-15    36    27  20150729
15    2010-01-16    30    26  20111112
16    2010-01-17    36    25  20110103
17    2010-01-18    33    29  20110216
18    2010-01-19    38    30  20130108
19    2010-01-20    34    31  20150301
20    2010-01-21    36    31  20140505
21    2010-01-22    36    34  20100406
22    2010-01-23    39    34  20120130
23    2010-01-24    48    33  20

In [238]:
fit_knn = knn.fit(df_train["dv"].values.reshape(-1,1), df_train["TMAX"].values)

max_vals = pd.DataFrame(fit_knn.predict(missing["dv"].values.reshape(-1,1)))

In [240]:
missing["TMAX"] = max_vals

In [241]:
fit_knn = knn.fit(df_train["dv"].values.reshape(-1,1), df_train["TMIN"].values)

min_vals = pd.DataFrame(fit_knn.predict(missing["dv"].values.reshape(-1,1)))

In [242]:
missing["TMIN"] = min_vals

In [245]:
del missing["dv"]
del df_train["dv"]

In [246]:
combined = pd.concat([df_train, missing])

In [250]:
df = sqlContext.createDataFrame(combined)

In [252]:
df = df.withColumn("TMAX_3", avg("TMAX").over(Window.orderBy("DATE").rowsBetween(-3,0)))
df = df.withColumn("TMAX_5", avg("TMAX").over(Window.orderBy("DATE").rowsBetween(-5,0)))

In [253]:
df.head(10)

[Row(DATE=datetime.date(2010, 1, 1), TMAX=21.0, TMIN=10.0, TMAX_3=21.0, TMAX_5=21.0),
 Row(DATE=datetime.date(2010, 1, 2), TMAX=16.0, TMIN=7.0, TMAX_3=18.5, TMAX_5=18.5),
 Row(DATE=datetime.date(2010, 1, 3), TMAX=24.0, TMIN=6.0, TMAX_3=20.333333333333332, TMAX_5=20.333333333333332),
 Row(DATE=datetime.date(2010, 1, 4), TMAX=21.0, TMIN=13.0, TMAX_3=20.5, TMAX_5=20.5),
 Row(DATE=datetime.date(2010, 1, 5), TMAX=27.0, TMIN=19.0, TMAX_3=22.0, TMAX_5=21.8),
 Row(DATE=datetime.date(2010, 1, 6), TMAX=25.0, TMIN=15.0, TMAX_3=24.25, TMAX_5=22.333333333333332),
 Row(DATE=datetime.date(2010, 1, 7), TMAX=25.0, TMIN=18.0, TMAX_3=24.5, TMAX_5=23.0),
 Row(DATE=datetime.date(2010, 1, 8), TMAX=31.0, TMIN=20.0, TMAX_3=27.0, TMAX_5=25.5),
 Row(DATE=datetime.date(2010, 1, 9), TMAX=26.0, TMIN=15.0, TMAX_3=26.75, TMAX_5=25.833333333333332),
 Row(DATE=datetime.date(2010, 1, 10), TMAX=21.0, TMIN=6.0, TMAX_3=25.75, TMAX_5=25.833333333333332)]

In [254]:
df.dtypes

[('DATE', 'date'),
 ('TMAX', 'double'),
 ('TMIN', 'double'),
 ('TMAX_3', 'double'),
 ('TMAX_5', 'double')]

```cql
CREATE  KEYSPACE chicago_data 
   WITH REPLICATION = {'class' : 'SimpleStrategy', 'replication_factor' : 1};
```

```cql
CREATE TABLE chicago_data.temperature (
    "DATE" date,
    "TMAX" int,
    "TMIN" int,
    "TMAX_3" double,
    "TMAX_5" double,
    PRIMARY KEY ("DATE"));
```

In [257]:
 df.write\
    .format("org.apache.spark.sql.cassandra")\
    .mode('append')\
    .options(table="temperature", keyspace="chicago_data")\
    .save()

In [258]:
df.count()

2496