In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dates

# validation modules
import experiment
import metrics

# requires cleanair
sys.path.append("../containers")
from cleanair.models import ModelData





















































































In [2]:
# read experiment from file
name = 'laptop_test'
cluster = 'laptop'
experiment_dir = '../run_model/experiments/'
exp = experiment.experiment_from_dir(name, cluster)

In [4]:
# load the data_model objects
secret_fp = "../terraform/.secrets/db_secrets.json"
model_data_list = []
for data_id in exp.experiment_df.data_id.unique():
    data_dir = experiment_dir + exp.name + '/data/data' + str(data_id)
    model_data = ModelData(config_dir=data_dir, secretfile=secret_fp)
    model_data_list.append(model_data)

In [20]:
# get a list of model data objects with updated prediction columns
updated_preds = []


for index, row in exp.experiment_df.iterrows():
    row = exp.experiment_df.loc[index]
    pid = row['param_id']
    model = row['model_name']
    did = row['data_id']
#     y_test = np.load(exp.data_config[did]['y_test_fp'])
    Y_pred = np.load(row['y_pred_fp'])
    y_mean = Y_pred[0, :, 0]
    y_var = Y_pred[0, :, 1]
    
    # look at shapes
    data_config = exp.data_config[did]
    x_test = np.load(data_config['x_test_fp'])
    y_test = np.load(data_config['y_test_fp'])
    print("Xs :", x_test.shape)
    print("Ys :", y_test.shape)
    
    # update model_data
    normalised_df = model_data_list[did].normalised_pred_data_df.copy()
    print("normalised df shape:", normalised_df.shape)
    
    gb = normalised_df.dropna(axis=1).groupby(['measurement_start_utc', 'point_id'])
#     print(gb.filter(lambda x: x['lat'].count() == 4).count())
    
    predict_data_dict = model_data_list[did].get_pred_data_arrays(dropna=False).copy()
    print('pred data dict x', predict_data_dict['X'].shape)
    
    # Create new dataframe with predictions
    predict_df = pd.DataFrame(index=normalised_df.index)
    print("Pred index shape:", predict_df.shape)
    print("Y pred shape:", Y_pred.shape)
    print("Y mean shape:", np.reshape(y_mean, y_mean.shape[0]).shape)
    print("Y var shape:", y_var.shape)
    predict_df['predict_mean'] = np.reshape(y_mean, y_mean.shape[0])
    predict_df['predict_var'] = np.reshape(y_var, y_var.shape[0])
    predict_df['fit_start_time'] = exp.data_config[did]['pred_start_date']
    predict_df['tag'] = 0
    
    normalised_pred_data_df = pd.concat([model_data_list[did].normalised_pred_data_df, predict_df], axis=1, ignore_index=False)
    print(normalised_pred_data_df.sample(3))
    updated_preds.append(normalised_pred_data_df)
    

Xs : (3398, 4)
Ys : (3398, 1)
normalised df shape: (7776, 20)
pred data dict x (7776, 4)
Pred index shape: (7776, 0)
Y pred shape: (1, 3398, 2)
Y mean shape: (3398,)
Y var shape: (3398,)


ValueError: Length of values does not match length of index

In [None]:
i = 0
preds = updated_preds[i]
preds['measurement_start_utc'] = pd.to_datetime(preds['measurement_start_utc'])
gb = preds.groupby('point_id')
print(preds.columns)

In [None]:
def plot_pretty_timeseries(time, series, ax, sigmas=None, locator=dates.DayLocator(interval=1)):
    ax.plot_date(time, series, ',', linestyle='-')
    ax.xaxis.grid(True, which="major")
#     ax.xaxis.set_major_locator(locator)
#     ax.xaxis.set_major_formatter(dates.DateFormatter('%d\n%m'))
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H:%M'))
    
    if not sigmas is None:
        factor = 2
        y_min = series - factor*np.sqrt(sigmas)
        y_max = series + factor*np.sqrt(sigmas)
        ax.fill_between(time, y_min, y_max, color='orange', alpha=0.5)
        
        
    return ax

In [None]:
count = 0
for point, row in gb:
    fig0, ax = plt.subplots()
    times = row['measurement_start_utc']
    means = row['predict_mean']
    sigmas = row['predict_var']
    print("times:", times.shape)
    print("means:", means.shape)
    print("sigmas:", sigmas.shape)
    locator = dates.HourLocator(interval=6)
    plot_pretty_timeseries(times, means, ax, sigmas=sigmas, locator=locator)
    plot_pretty_timeseries(times, row['NO2'], ax, locator=locator)
    plt.show()
    count += 1
    if count > 10:
        break

In [None]:
n = 2 # number of rows
c = 2 # number of columns
fig, axs = plt.subplots(n, c, sharex=True, sharey=True)
i = 0
j = 0
for point, row in gb:
    print(point, row['predict_mean'])
    plot_pretty_timeseries(row['measurement_start_utc'], row['predict_mean'], axs[i, j], sigmas=row['predict_var'], locator=dates.HourLocator(interval=6))
    plot_pretty_timeseries(row['measurement_start_utc'], row['NO2'], axs[i, j], locator=dates.HourLocator(interval=6))
    i += j % c
    j += 1
    j = j % c
    if i == n:
        break