## Proposal benchmark model

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import time
import os

## Dataset preprocessing

In [13]:
# Run configuration: where the matched imagery/emissions pickle lives and
# which target variable to model.
data_file = "01_Data/02_Imagery/data_and_imagery_test.pkl"
selected_output_variable = "OZONE_AQI"  # alternative target: "NO2"

In [36]:
def process_data(data_file, selected_output_variable):
    """
    Load the matched imagery/emission table and reshape it into model inputs.

    Input:
        data_file: filepath to a pickled table with matched image and emission
            data. The columns read here are 'type', 'imagery', 'AQI_level'
            and 'value'.
        selected_output_variable: value of the 'type' column to keep. When the
            name contains "AQI" the labels are taken from 'AQI_level'
            (classification); otherwise they are taken from 'value'
            (regression).
    Output:
        images: numpy array of shape (resolution * resolution * num_channels, num_images)
        labels: numpy array of shape (1, num_images)
        res: size of the first axis of the first image
        num_channels: size of the third axis of the first image
        m: number of images kept after filtering
    Raises:
        ValueError: if no rows match selected_output_variable.
    """
    # open file (closed again as soon as it is read)
    # NOTE: pickle can execute arbitrary code while loading; only use trusted files.
    with open(data_file, 'rb') as f:
        data = pickle.load(f)

    # filter for output variable
    data = data[data['type'] == selected_output_variable]
    m = len(data)
    if m == 0:
        raise ValueError(
            "no rows with type == %r found in %s" % (selected_output_variable, data_file)
        )

    # Distinguish between preprocessing for classification and regression
    is_classification = "AQI" in selected_output_variable

    if is_classification:
        # report the class balance (number of 'good' and 'moderate' samples)
        aqi = data['AQI_level'].to_numpy()
        print(int((aqi == 'good').sum()))
        print(int((aqi == 'moderate').sum()))

    # get image dims (images are expected to be square: res x res x num_channels)
    first_image = data['imagery'].iloc[0]
    res, num_channels = first_image.shape[0], first_image.shape[2]

    # get X data as np array and check dims: one flattened image per column
    images = np.array(data['imagery'].to_list())
    images = images.reshape(images.shape[0], -1).T
    assert images.shape == (res*res*num_channels, m)

    # get y data as np array and check dims
    label_column = 'AQI_level' if is_classification else 'value'
    labels = data[label_column].to_numpy().reshape(1, m)
    assert labels.shape == (1, m)

    return images, labels, res, num_channels, m

## Train and evaluate model

In [37]:
# ---- Prepare data ----
print("Preparing data...")
print("\tdata filename: ", data_file)
print("\tvariable: ", selected_output_variable)
t0 = time.time()
X, y, _, _, _ = process_data(data_file, selected_output_variable)

# process_data returns one sample per column; sklearn expects one sample per
# row and a flat (m,) target vector.
X = X.T
y = np.ravel(y.T)
print("\tsize of X: ", X.shape)
print("\tsize of Y: ", y.shape)

# Hold out a test set with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print('\tnumber of training samples: ', X_train.shape[0])
print('\tnumber of test samples: ', X_test.shape[0])
print("Done (%0.3fs)" %(time.time() - t0))

# ---- Fit the benchmark MLP ----
# AQI targets are categorical levels -> classifier; anything else -> regressor.
t1 = time.time()
if "AQI" not in selected_output_variable:
    print('\nTraining regressor...')
    model = MLPRegressor(random_state=1, max_iter=500)
else:
    print('\nTraining classifier...')
    model = MLPClassifier(random_state=1, max_iter=500)
model.fit(X_train, y_train)
print('Done (%0.3fs)' %(time.time()-t1))

# ---- Evaluate on the held-out set ----
# NOTE: despite the name, `r2` is only R^2 for the regressor; for the
# classifier sklearn's score() reports mean accuracy.
r2 = model.score(X_test, y_test)
print('\nScore: ', r2)

Preparing data...
	data filename:  01_Data/02_Imagery/data_and_imagery_test.pkl
	variable:  OZONE_AQI
1431
9
	size of X:  (1440, 3468)
	size of Y:  (1440,)
	number of training samples:  1080
	number of test samples:  360
Done (0.098s)

Training classifier...
Done (3.153s)

Score:  0.9888888888888889
