In [1]:
import pandas as pd
import numpy as np

In [2]:
# constants used in pre-processing

TEMP_COEF = 100

PRESS_SHIFT = 1000
PRESS_COEF = 100
PRESS_DEF = 1000

TIME_ZERO = pd.Timestamp('1970-01-01 00:00:00')
TIME_DELTA = '1h'

SEQ_LENGTH = 48
PERIOD_TO_PREDICT = 1

In [3]:
# functions for cleaning the data

def get_labels(data):
    """ returns the list of distinct labels in given data column """
    labels = list(set(data))
    return labels
    

def data_to_dicts(labels):
    """ returns pair of data to one-hot and one-hot to data dictionaries """
    data_to_oh = {x:tuple(1 if y == labels.index(x) else 0 
                    for y in range(len(labels))) 
                    for x in labels}
    
    oh_to_data = {y:x for x, y in data_to_oh.items()}
    
    return data_to_oh, oh_to_data


def normalize_temp(temp):
    return [float(t) / TEMP_COEF for t in temp]


def denormalize_temp(temp):
    return [t * TEMP_COEF for t in temp]


def normalize_press(press):
    press = [float(p) for p in press]
    for i in range(len(press)):
        if press[i] == 0:
            press[i] = press[i-1] if i != 0 else PRESS_DEF 

    return [(p - PRESS_SHIFT) / PRESS_COEF for p in press]


def denormalize_press(press):
    return [p * PRESS_COEF + PRESS_SHIFT for p in press]


def normalize_time(times):
    """ converts date-time data column to a UNIX-style int (number of TIME_DELTA steps since TIME_ZERO) """
    times = [pd.Timestamp(time[:-6]) for time in times]
    times = [((time - TIME_ZERO) // pd.Timedelta(TIME_DELTA)) for time in times]
    return times


# def denormalize_time(time):
# TODO


def one_hot_encode(data, data_to_oh):
    return [data_to_oh[d] for d in data]


def one_hot_decode(oh, oh_to_data):
    return [oh_to_data[o] for o in oh]

In [4]:
df = pd.read_csv("weatherHistory.csv", names = ['time', 'summary', 'precip', 'temp', 'app_temp', 'humidity', 'wind_speed', 'wind_bearing', 'visibility', 'loud_cover', 'pressure', 'daily_summary'], low_memory=False)

df = df.drop([0])
df = df.drop(['app_temp', 'wind_speed', 'wind_bearing', 'visibility', 'loud_cover', 'daily_summary'], axis=1) # TODO

df.set_index('time', inplace=True)
df.index = normalize_time(df.index)

df.head()

Unnamed: 0,summary,precip,temp,humidity,pressure
317736,Partly Cloudy,rain,9.47222222222222,0.89,1015.13
317737,Partly Cloudy,rain,9.355555555555558,0.86,1015.63
317738,Mostly Cloudy,rain,9.377777777777778,0.89,1015.94
317739,Partly Cloudy,rain,8.28888888888889,0.83,1016.41
317740,Mostly Cloudy,rain,8.755555555555553,0.83,1016.51


In [5]:
summary_labels = get_labels(df['summary'])
# print("len(summary_labels):", len(summary_labels))

# our training data contains nans when there is no precipitation
df['precip'] = df['precip'].fillna("clear")
precip_labels = get_labels(df['precip'])
# print("len(precip_labels):", len(precip_labels))

# daily_summary_labels = get_labels(df['daily_summary'])
# print("len(daily_summary_labels):", len(daily_summary_labels))


summary_to_oh, oh_to_summary = data_to_dicts(summary_labels)
precip_to_oh, oh_to_precip = data_to_dicts(precip_labels)

# print(summary_to_oh, oh_to_summary, sep='\n\n')
# print(precip_to_oh, oh_to_precip, sep='\n\n')

df['summary'] = one_hot_encode(df['summary'], summary_to_oh)
# df['summary'].head()
df['precip'] = one_hot_encode(df['precip'], precip_to_oh)
# df['precip'].head()

In [6]:
df['temp'] = normalize_temp(df['temp'])
df['pressure'] = normalize_press(df['pressure'])

# print(denormalize_temp(df['temp'])[:5])
# print(denormalize_press(df['pressure'])[:5])
# print(min(df['temp']), max(df['temp']), '\n', min(df['pressure']), max(df['pressure']))

In [7]:
# we shift values so that each row has a corresponding future row
for col in df.columns:
    df["future_{}".format(col)] = df["{}".format(col)].shift(-PERIOD_TO_PREDICT)

In [8]:
# separating data into test and training data (training data remains in df)

pct = df.index[-int(0.2 * len(df))]
test_data = df[(df.index >= pct)]
# test_data.head()

df = df[(df.index < pct)]
# train_data.head()

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, LSTM#, CuDNNLSTM

  from ._conv import register_converters as _register_converters


In [10]:
# constants used in the model

LSTM_LAYERS = 2
LSTM_UNITS = 128

FC_LAYERS = 1
FC_UNITS = 128

INPUT_DIM = (len(summary_labels) + len(precip_labels) + 3) * SEQ_LENGTH
OUTPUT_DIM = len(summary_labels) + len(precip_labels) + 3

In [11]:
# TODO

model = Sequential()

for i in range(LSTM_LAYERS):
    if i == 0:
        if i != LSTM_LAYERS - 1:
            model.add(LSTM(LSTM_UNITS, input_shape=(INPUT_DIM, SEQ_LENGTH), return_sequences=True, activation='tanh'))
        else:
            model.add(LSTM(LSTM_UNITS, input_shape=(INPUT_DIM, SEQ_LENGTH), return_sequences=False, activation='tanh'))
    else:
        if i != LSTM_LAYERS - 1:
            model.add(LSTM(LSTM_UNITS, return_sequences=True, activation='tanh'))
        else:
            model.add(LSTM(LSTM_UNITS, return_sequences=False, activation='tanh'))
    
for i in range(FC_LAYERS):
    model.add(Dense(FC_UNITS, activation='tanh'))

model.add(Dense(OUTPUT_DIM, activation='tanh'))
model.compile(optimizer='adam', loss='mean_square_error', metrics=['accuracy'])
model.fit(X, y, epochs = 1000, batch_size = 1, validation_data=)

SyntaxError: invalid syntax (<ipython-input-11-bf6c29431943>, line 22)