In [None]:
import cartopy
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from matplotlib import ticker
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Open files

## Observations

In [None]:
# Load the observation table; the first CSV column becomes the index.
# The context manager closes the file on exit, so no explicit close() is needed.
with open("observations", "r") as f:
    obs_df = pd.read_csv(f, index_col=0)

In [None]:
# Show the loaded observations table as the cell output.
obs_df

## Input parameter ranges

In [None]:
# Load the input parameter ranges; the first CSV column becomes the index.
# The context manager closes the file on exit, so no explicit close() is needed.
with open("inputs", "r") as f:
    inputs_df = pd.read_csv(f, index_col=0)

In [None]:
# Show the loaded input parameter table as the cell output.
inputs_df

## Actual parameter ranges

In [None]:
# Load the model variants table; the first CSV column becomes the index.
# The context manager closes the file on exit, so no explicit close() is needed.
with open("model_variants", "r") as f:
    model_variants_df = pd.read_csv(f, index_col=0)

In [None]:
# Show the loaded model variants table as the cell output.
model_variants_df

## Predictions

In [None]:
# The predictions take up a lot of space, so they are stored in one file per
# day and time of day.
days = [f"{n:02d}" for n in range(1, 15)]
times = ["09_20_00", "12_20_00"]

# One file name per (day, time) pair, ordered by day and then by time.
prediction_sets = [
    f"predictions_07_{day}_17_{time}"
    for day in days
    for time in times
]

In [None]:
# Load only the first prediction file (displayed in the next cell);
# the first CSV column becomes the index.
with open(prediction_sets[0], "r") as f:
    my_predict_df_this_time = pd.read_csv(f, index_col=0)

In [None]:
# Show the predictions loaded from the first prediction file as the cell output.
my_predict_df_this_time

# Identify outlier pixels

## Step 1. Distances and variances

In [None]:
def get_implausibility_from_least_squares_variant(
    obsSdCensor=0.1,
    num_variants=5000,
    observations=None,
    prediction_files=None,
):
    """
    Find, for every observed pixel, the emulated variant closest to the measurement.

    For each pixel with a usable measurement, every variant is scored with
    ``(y - z)**2 / (e + s)``, where ``y`` / ``e`` are the measured mean / variance
    and ``z`` / ``s`` are the emulated mean / variance. The variant with the
    smallest score is kept. Variants whose score is NaN are ignored.

    Parameters
    ----------
    obsSdCensor : float
        Measurements whose ``sdResponse`` is greater than or equal to this value
        are treated as missing.
    num_variants : int
        Number of consecutive prediction rows that belong to one pixel.
    observations : pandas.DataFrame, optional
        Table with ``time``, ``meanResponse`` and ``sdResponse`` columns.
        Defaults to the notebook-level ``obs_df``. It is copied, never modified.
    prediction_files : sequence of str, optional
        CSV files with ``meanResponse`` and ``sdResponse`` columns, one file per
        observation time. Defaults to the notebook-level ``prediction_sets``.

    Returns
    -------
    tuple of three lists
        ``(which_gets_least_squares, distances, variances)``, one entry per
        pixel, ordered by sorted unique time and then by row order within that
        time:

        * index (within the pixel's block) of the best variant,
        * "distance" ``y - z`` for that variant,
        * "variance" ``e + s`` needed to normalize the distance.

        Pixels with a missing, censored or zero measurement, or without any
        valid score, get ``(0, nan, nan)``.

    Raises
    ------
    ValueError
        If a prediction file holds no rows for a pixel that has a usable
        measurement.
    """
    if observations is None:
        observations = obs_df
    if prediction_files is None:
        prediction_files = prediction_sets

    which_gets_least_squares = []
    distances = []
    variances = []

    # Work on a copy so that censoring never leaks into the caller's table.
    my_obs_df = observations.copy()
    my_obs_df.loc[
        my_obs_df.sdResponse >= obsSdCensor, ["meanResponse", "sdResponse"]
    ] = np.nan

    # Get a best-variant for each day + time of day.
    # NOTE(review): times and files are paired purely by position (sorted unique
    # times vs. the order of prediction_files) and zip() stops at the shorter of
    # the two -- confirm both sequences line up.
    for time, prediction_set in zip(np.unique(my_obs_df.time), prediction_files):

        obs_this_time = my_obs_df[my_obs_df.time == time]
        obs_means = obs_this_time["meanResponse"].to_numpy(dtype=float)
        obs_variances = obs_this_time["sdResponse"].to_numpy(dtype=float) ** 2

        with open(prediction_set, "r") as f:
            predictions = pd.read_csv(f, index_col=0)
        predicted_means = predictions["meanResponse"].to_numpy(dtype=float)
        predicted_variances = predictions["sdResponse"].to_numpy(dtype=float) ** 2

        for pixel, (y, e) in enumerate(zip(obs_means, obs_variances)):

            # Placeholder used for pixels without a usable measurement.
            best = (0, float("nan"), float("nan"))

            if not np.isnan(y) and y != 0:
                # Rows [pixel * num_variants, (pixel + 1) * num_variants) hold
                # the variants emulated for this pixel.
                rows = slice(pixel * num_variants, (pixel + 1) * num_variants)
                zs = predicted_means[rows]
                ss = predicted_variances[rows]

                if zs.size == 0:
                    raise ValueError(
                        f"{prediction_set!r} has no prediction rows for pixel "
                        f"{pixel} (expected {num_variants} rows per pixel)"
                    )

                # A zero total variance yields inf / NaN scores; silence the
                # floating point warnings and let the NaN handling below decide.
                with np.errstate(divide="ignore", invalid="ignore"):
                    squares = (y - zs) ** 2 / (e + ss)

                # nanargmin skips NaN scores, so the chosen variant does not
                # depend on where an invalid prediction sits in the block.
                if not np.isnan(squares).all():
                    idx = int(np.nanargmin(squares))
                    best = (idx, y - zs[idx], e + ss[idx])

            which_gets_least_squares.append(best[0])
            distances.append(best[1])
            variances.append(best[2])

    return (which_gets_least_squares, distances, variances)

In [None]:
# Run the best-variant search with its default arguments; the three results
# are lists with one entry per pixel.
idxSmallest, distances, variances = get_implausibility_from_least_squares_variant()

In [None]:
# Standardize each distance by the square root of its variance; NaN inputs
# propagate to NaN.
leastSqs = [
    distance / np.sqrt(variance)
    for distance, variance in zip(distances, variances)
]

## Step 2. Set threshold for outliers

In [None]:
# Histogram of the standardized distances, used to choose the outlier threshold.
# Non-finite entries (NaN for pixels without a usable measurement) are dropped
# explicitly so the binning never depends on how the backend treats them.
finite_least_sqs = np.asarray(leastSqs, dtype=float)
finite_least_sqs = finite_least_sqs[np.isfinite(finite_least_sqs)]

fig, ax = plt.subplots()
ax.hist(finite_least_sqs)
ax.set_xlabel("Standardized distance (distance / sqrt(variance))")
ax.set_ylabel("Number of pixels")
ax.set_title("Best-variant standardized distance per pixel")
plt.show()

In [None]:
# Standardized distances above this value are flagged as outliers.
OUTLIER_THRESHOLD = 15

# NOTE(review): this relies on leastSqs having one entry per row of obs_df, in
# the same row order -- confirm obs_df is ordered by time.
least_sq_values = np.asarray(leastSqs, dtype=float)

# NaN marks pixels without a usable measurement; NaN > threshold is False, so
# those pixels are never flagged as outliers.
obs_df['missing'] = np.isnan(least_sq_values)
obs_df['outlier'] = least_sq_values > OUTLIER_THRESHOLD

In [None]:
# Persist the observations together with the 'missing' / 'outlier' flags.
# newline="" leaves line endings to the CSV writer, which is what pandas asks
# for when it is handed an already-open text file. The context manager closes
# the file on exit, so no explicit close() is needed.
# index=False: the row index is not written to the file.
with open("outliers", "w", newline="") as f:
    obs_df.to_csv(f, index=False)

## Step 3. View missingness / removed outliers

In [None]:
# Map which pixels are missing, flagged as outliers, or retained.
# Only the first observation time is drawn (note the [0:1] slice).
# Reads `obs_df`, which carries the 'missing' and 'outlier' columns; the
# previously referenced `obs_df2` is never defined in this notebook.
for time in np.unique(obs_df.time)[0:1]:

    my_obs = obs_df.loc[(obs_df.time == time), :]

    # Extent handed to set_extent: [x_min, x_max, y_min, y_max] in degrees.
    BBox = [-45, 40, -30, 10]
    # NOTE(review): assumes the longitude / latitude columns are plain degrees.
    data_crs = ccrs.PlateCarree()
    projection = ccrs.PlateCarree(central_longitude=0)

    fig = plt.figure(figsize=(5.67*(2/3), 4), facecolor='white', dpi=1200)
    ax = fig.add_subplot(1, 1, 1, projection=projection)
    ax.coastlines()
    ax.set_extent(BBox, data_crs)

    # Labelled grid lines on the top and left edges only.
    gl = ax.gridlines(draw_labels=True, crs=projection, color='k', linewidth=0.5)
    gl.bottom_labels = False
    gl.right_labels = False
    gl.ylocator = ticker.FixedLocator([-20, -10, 0])
    gl.ylabel_style = {'size': 8}
    gl.xlocator = ticker.FixedLocator([-30, -10, 10, 30])
    gl.xlabel_style = {'size': 8}

    # Split the pixels into the three groups shown on the map.
    missing = my_obs.loc[my_obs.missing, :]
    outlier = my_obs.loc[my_obs.outlier, :]
    retained = my_obs.loc[(~my_obs.missing) & (~my_obs.outlier), :]

    p1 = ax.scatter(
        outlier.longitude,
        outlier.latitude,
        c='g',
        alpha=0.5,
        s=5,
        marker='s',
        label='Outlier',
        transform=data_crs
    )
    p2 = ax.scatter(
        missing.longitude,
        missing.latitude,
        c='r',
        alpha=0.5,
        s=5,
        marker='s',
        label='Missing measurement',
        transform=data_crs
    )
    # Retained pixels are coloured by their measured value.
    p3 = ax.scatter(
        retained.longitude,
        retained.latitude,
        c=retained.meanResponse,
        cmap="Blues",
        alpha=1,
        s=5,
        marker='.',
        label='retained',
        transform=data_crs
    )

    cbar = fig.colorbar(p3, ax=ax, fraction=0.1, orientation="horizontal")
    cbar.ax.tick_params(labelsize=8)
    cbar.set_label(label='Measured AOD', fontsize=8)

    # The retained pixels are explained by the colorbar, so the legend lists
    # only the two flagged groups.
    ax.legend(
        handles=[p1, p2],
        loc='lower center',
        bbox_to_anchor=(0.5, -0.25),
        ncol=2,
        fontsize=8
    )

    plt.show()