In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, LSTM, Dropout

#### Helper Functions

In [0]:
def preprocess_categorical(x_train, x_test, categorical):
    """Re-index categorical columns to contiguous integer ids.

    For every column in ``categorical`` the sorted unique training values are
    mapped to ``0 .. n_unique - 1``. A test value that never occurs in the
    training column is mapped to the extra id ``n_unique`` instead of NaN, so
    the encoded test column always holds valid integer indices.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames that contain every column listed in ``categorical``.
    categorical : list of str
        Names of the columns to encode.

    Returns
    -------
    tuple of pandas.DataFrame
        Encoded copies of ``x_train`` and ``x_test``. The frames passed in are
        not modified.
    """
    # Work on copies: the inputs are often slices of a larger frame, and
    # writing into them in place is unreliable and surprises the caller.
    x_train, x_test = x_train.copy(), x_test.copy()

    for cat in categorical:
        raw_vals = np.unique(x_train[cat])
        val_map = {val: idx for idx, val in enumerate(raw_vals)}
        # Id reserved for values that only show up at test time.
        unseen_id = len(raw_vals)

        x_train[cat] = x_train[cat].map(val_map)
        x_test[cat] = (
            x_test[cat].map(val_map).fillna(unseen_id).astype(np.int64)
        )

    return x_train, x_test

def prepare_data(x, y, categorical, numeric, steps):
    """Slice features and labels into sliding windows.

    Window ``i`` covers rows ``i .. i + steps - 1`` of ``x`` and is paired
    with row ``i + steps`` of ``y``, giving ``len(x) - steps`` windows.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame containing every column in ``categorical + numeric``.
    y : pandas.DataFrame
        Label frame, indexed by position in step with ``x``.
    categorical, numeric : list of str
        Column names; the output follows ``categorical + numeric`` order.
    steps : int
        Number of consecutive rows per window.

    Returns
    -------
    _x : list of numpy.ndarray
        One float32 array per column: shape ``(n_windows, steps)`` for
        categorical columns and ``(n_windows, steps, 1)`` for numeric ones.
    _y : numpy.ndarray
        float32 array with one label row per window.

    Raises
    ------
    ValueError
        If ``x`` does not have more than ``steps`` rows, so no window exists.
    """
    n_windows = len(x) - steps
    if n_windows <= 0:
        raise ValueError(
            f"prepare_data needs more than steps={steps} rows to build a "
            f"window, got {len(x)}"
        )

    _x = []
    for col in categorical + numeric:
        # Convert the column once; slicing the NumPy array per window is far
        # cheaper than re-slicing the DataFrame for every window.
        col_values = x[col].to_numpy().astype(np.float32)
        windows = np.stack(
            [col_values[i:i + steps] for i in range(n_windows)], axis=0
        )
        if col in numeric:
            # Numeric features carry an explicit trailing feature axis.
            windows = windows.reshape(n_windows, steps, 1)
        _x.append(windows)

    # The target for window i is the row immediately after the window.
    y_values = y.to_numpy()
    _y = np.array(
        [y_values[i + steps] for i in range(n_windows)]
    ).astype(np.float32)

    return _x, _y

#### Data Processing

In [0]:
# Load the raw daily SO2 measurements.
df = pd.read_csv("./California_SO2_Measures.csv")

# From the date, extract day and month for learning embeddings.
# (`infer_datetime_format` is deprecated in recent pandas and is not needed.)
df["Date"] = pd.to_datetime(df["Date"])
df["Day"] = df["Date"].dt.day
df["Month"] = df["Date"].dt.month

# Collapse the coordinate pair into one key so it can be treated as a single
# categorical feature.
df['lat/long'] = df['SITE_LATITUDE'].astype(
    str) + "-" + df['SITE_LONGITUDE'].astype(str)
df = df.drop(columns=['Source','Date', 'UNITS', 'AQS_PARAMETER_CODE',
                      'SITE_LATITUDE', 'SITE_LONGITUDE'])

# Replace string-valued columns with integer category codes. Plain column
# assignment swaps the whole column, so the new integer dtype is kept.
for col in ["Site Name", "COUNTY", "lat/long"]:
    df[col] = df[col].astype("category").cat.codes

# Categorize columns by type for pre-processing
categorical =  ['Day','Month','Site ID','POC','Site Name', 'COUNTY', 'lat/long']
numeric = ['DAILY_AQI_VALUE', 'DAILY_OBS_COUNT', 'PERCENT_COMPLETE']
label = ['Daily Max 1-hour SO2 Concentration']

# Train/Test split based on months.
# NOTE(review): month 9 is in BOTH sets, so every test row is also a training
# row and the reported test error is optimistic. Dropping 9 from the training
# months makes Month == 9 an unseen category in the test set, so the
# categorical encoding must cope with unseen values before that change is safe.
train = df[df["Month"].isin([1,2,3,4,5,6,7,8,9])].copy(deep=True)
test = df[df["Month"].isin([9])].copy(deep=True)

# Separate features and label. Explicit copies make the frames independent of
# `train` / `test`, so the scaling below writes where we expect it to.
train_x = train[categorical + numeric].copy()
train_y = train[label].copy()

test_x = test[categorical + numeric].copy()
test_y = test[label].copy()

# Standardise each numeric feature with statistics fitted on the training
# split only; the fitted scalers are kept by column name.
# Assign columns directly: the previous `frame.iloc[:][col] = ...` form is
# chained assignment, which writes to a temporary object and is silently
# discarded under pandas Copy-on-Write.
num_scalers = {}
for num in numeric:
    scaler = StandardScaler()
    scaler.fit(train_x[num].to_numpy().reshape(-1, 1))
    train_x[num] = scaler.transform(
        train_x[num].to_numpy().reshape(-1, 1)).ravel()
    test_x[num] = scaler.transform(
        test_x[num].to_numpy().reshape(-1, 1)).ravel()
    num_scalers[num] = scaler

# Standardise the label the same way; `y_scaler` is needed later to map
# predictions back to the original units.
y_scaler = StandardScaler()
y_scaler.fit(train_y.to_numpy().reshape(-1, 1))
train_y[label] = y_scaler.transform(train_y.to_numpy().reshape(-1, 1))
test_y[label] = y_scaler.transform(test_y.to_numpy().reshape(-1, 1))

In [0]:
# Window size (number of consecutive rows per sample); a hyper-parameter.
steps = 5

# First re-index the categorical columns, then cut both splits into
# fixed-length windows paired with the label of the following row.
encoded_train, encoded_test = preprocess_categorical(
    train_x, test_x, categorical)
train_x, train_y = prepare_data(
    encoded_train, train_y, categorical, numeric, steps)
test_x, test_y = prepare_data(
    encoded_test, test_y, categorical, numeric, steps)

#### Model

In [0]:
def _layer_name(column, suffix):
    """Build a layer name from a column name, without spaces or slashes.

    Newer Keras releases reject '/' in layer names (it is reserved for name
    scopes), and the 'lat/long' column would otherwise produce one.
    """
    return "".join([column.replace(" ", "").replace("/", "_"), suffix])


inputs, embeddings = [], []

# One integer-sequence input plus an embedding per categorical column.
for cat in categorical:
    cat_input = Input(shape=(steps,), name=_layer_name(cat, "_inp"))
    unique_cat = train[cat].nunique()
    # Embedding width: half the cardinality (rounded up), capped at 20.
    embedding_size = int(min(np.ceil(unique_cat / 2), 20))
    # One spare index beyond the ids 0 .. unique_cat - 1.
    cat_dim = unique_cat + 1
    inputs.append(cat_input)
    # `input_length` is omitted: it is deprecated, and the sequence length is
    # already fixed by the Input shape.
    embeddings.append(
        Embedding(cat_dim, embedding_size,
                  name=_layer_name(cat, "_emb"))(cat_input))

# Numeric columns are fed as-is, one (steps, 1) sequence per column.
for num in numeric:
    num_input = Input(shape=(steps, 1), name=_layer_name(num, "_inp"))
    inputs.append(num_input)
    embeddings.append(num_input)

# Join all per-column sequences along the feature axis and summarise each
# window with a single LSTM state.
x = Concatenate(name="concat")(embeddings)
x = LSTM(128, kernel_regularizer=l2(0.0001),
         recurrent_regularizer=l2(0.0001),
         return_sequences=False)(x)
x = Dense(64, activation="relu", kernel_regularizer=l2(0.0001))(x)
out = Dense(1, activation='linear', name="output")(x)

model = Model(inputs, out, name="so2_model")
# `learning_rate` replaces the removed `lr` alias.
model.compile(optimizer=Adam(learning_rate=3e-4), loss=MeanSquaredError())
model.fit(train_x, train_y, epochs=30, batch_size=24, verbose=2)

In [0]:
def _rmse_original_units(y_true_scaled, y_pred_scaled):
    """Return the RMSE after undoing the label standardisation.

    Both arguments are in the scaled space produced by ``y_scaler``; they are
    mapped back with ``inverse_transform`` before the error is computed.
    The root is taken explicitly because ``squared=False`` is deprecated and
    removed in recent scikit-learn releases. For the single-output labels
    used here the result is the same.
    """
    y_true = y_scaler.inverse_transform(y_true_scaled)
    y_pred = y_scaler.inverse_transform(y_pred_scaled)
    return np.sqrt(mean_squared_error(y_true, y_pred))


# NOTE: despite their names, `train_mse` / `test_mse` hold RMSE values; the
# names are kept so any later cell that reads them keeps working.
pred_y = model.predict(train_x)
train_mse = _rmse_original_units(train_y, pred_y)
print("Train RMSE: ", train_mse)

pred_y = model.predict(test_x)
test_mse = _rmse_original_units(test_y, pred_y)
print("Test RMSE: ", test_mse)