## Proposal benchmark model

In [41]:
import numpy as np
import pandas as pd
import pickle
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import os

## Dataset preprocessing

In [42]:
# Set dataset locations
# Target variable: process_data keeps only the rows whose 'type' column equals
# this value; a name containing "AQI" switches the notebook to classification.
selected_output_variable = "OZONE"
# Pickle file with the matched image and emission data, read by process_data.
# NOTE(review): relative path -- presumably resolved against the notebook's
# working directory; confirm before running from elsewhere.
data_file = "01_Data/02_Imagery/data_and_imagery_test.pkl"

In [43]:
def process_data(data_file, selected_output_variable):
    """
    Load matched image/measurement records and reshape them into numpy arrays.

    Input:
        data_file: filepath to a pickle file holding a pandas DataFrame with
            the columns 'type', 'imagery' and 'value' (plus 'AQI_level' when
            an AQI variable is selected)
        selected_output_variable: value of the 'type' column to keep; if it
            contains "AQI" the labels are read from 'AQI_level' (classification),
            otherwise from 'value' (regression)
    Output:
        images: numpy array of shape (resolution * resolution * num_channels, num_images)
        labels: numpy array of shape (1, num_images)
        res: image resolution (size of the first image axis)
        num_channels: number of image channels (size of the third image axis)
        m: number of records left after filtering
    Raises:
        ValueError: if no record matches selected_output_variable
    """
    # open file; the context manager closes the handle even if unpickling fails
    # NOTE: pickle can execute arbitrary code while loading -- only open trusted files
    with open(data_file, 'rb') as f:
        data = pickle.load(f)

    # filter for output variable
    data = data[data['type'] == selected_output_variable]

    # get image dims
    m = len(data)
    if m == 0:
        raise ValueError(
            "no records of type {!r} found in {!r}".format(selected_output_variable, data_file)
        )
    # Positional access: filtering keeps the original index labels, so label 0
    # is missing whenever the first row of the file has a different type.
    first_image = data['imagery'].iloc[0]
    res, num_channels = first_image.shape[0], first_image.shape[2]

    # get X data as np array and check dims (one flattened image per column)
    images = np.array(data['imagery'].to_list())
    images = images.reshape(images.shape[0], -1).T
    # the expected size assumes square images (res x res x num_channels)
    assert images.shape == (res * res * num_channels, m), "unexpected image array shape"

    # get y data as np array and check dims
    # Distinguish between preprocessing for classification and regression
    label_column = 'AQI_level' if "AQI" in selected_output_variable else 'value'
    labels = data[label_column].to_numpy().reshape(1, m)

    assert labels.shape == (1, m), "unexpected label array shape"

    return images, labels, res, num_channels, m

## Train and evaluate model

In [44]:
# Load the samples for the configured target variable
X, y, _, _, _ = process_data(data_file, selected_output_variable)

# process_data returns one column per sample; scikit-learn expects one row per
# sample and a flat (m,) target vector
X = X.T
y = y.T.ravel()
print("size of X: ", X.shape)
print("size of Y: ", y.shape)

# Hold out a test set (no test_size given, so the library default split is used)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print('number of training samples: ', X_train.shape[0])
print('number of test samples: ', X_test.shape[0])

# Pick the estimator: AQI targets are classified, everything else is regressed
model = (MLPClassifier if "AQI" in selected_output_variable else MLPRegressor)(
    random_state=1,
    max_iter=500,
)
model.fit(X_train, y_train)

# Score on the held-out samples (last expression, so the notebook displays it)
model.score(X_test, y_test)

size of X:  (7, 3468)
size of Y:  (7,)
number of training samples:  5
number of test samples:  2


-10.496945859572934