# Adjustment Analysis

This notebook contains experiments for analyzing the drivers of whether privacy adjusted forecasts are better or worse than their original counterparts.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import os

from sktime.performance_metrics.forecasting import mean_absolute_error
from sktime.utils.plotting import plot_series

from tsfeatures import tsfeatures

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.inspection import permutation_importance
from sklearn.tree import plot_tree

import graphviz 

Start by defining the path where results will be saved and the path to the forecast files.

In [2]:
# results file path
results_path = "../../Outputs/Results/"
# forecasts file path
forecasts_path = "../../Outputs/Forecasts/"
# names of forecast files
fcast_files = os.listdir(forecasts_path)

Read in the train and test data, and the k-nts+ (k=3) data for $h = 1$.

In [3]:
train_data = pd.read_csv("../../Data/Train/Clean/m3_monthly_micro_h1.csv", header=None, skiprows=1)
protected_data = pd.read_csv("../../Data/Train/Clean/protected_m3_monthly_micro_h1_k-nts-plus_3.csv", header=None, skiprows=1)

In [4]:
# Load the h=1 test set, then turn each row of the training frames into its own
# series with the missing values removed.
test_data = pd.read_csv(forecasts_path + "Test_h1.csv")

train_series, protected_series = [], []
for _, row in train_data.iterrows():
    train_series.append(row.dropna())
for _, row in protected_data.iterrows():
    protected_series.append(row.dropna())
train_data, protected_data = train_series, protected_series

# import original time series features
# features = pd.read_csv("../../Data/Train/Clean/tsfeatures/tsfeatures_h1.csv")

Set up simple linear regression to estimate the slope of the time series.

In [5]:
from sklearn.linear_model import LinearRegression

In [6]:
# Fit a straight line (value against time step 0, 1, 2, ...) to every training
# series and keep the slope coefficient of each fit.
slopes = []
for series in train_data:
    time_steps = np.arange(len(series)).reshape(-1, 1)
    fitted_line = LinearRegression().fit(time_steps, series)
    slopes.append(fitted_line.coef_[0])

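Calculate the coefficient of variation of each series in the original and protected data sets, and the relative change between the two.
The bins based on these quantities are computed further below.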

In [7]:
# Coefficient of variation (sample standard deviation over mean) of every series,
# stored as column vectors.
original_cv = [np.std(series, ddof=1) / np.mean(series) for series in train_data]
protected_cv = [np.std(series, ddof=1) / np.mean(series) for series in protected_data]
coef_vars = np.reshape(original_cv, (-1, 1))
protected_coef_vars = np.reshape(protected_cv, (-1, 1))

# relative change in the coefficient of variation (a fraction, not multiplied by 100)
cv_difference = protected_coef_vars - coef_vars
change_in_coef = cv_difference / coef_vars

***

# Import Time Series Characteristics

Now, we write a loop to read in the original forecasts and the protected forecasts for each model. We calculate whether the adjustment was valuable based on the forecast errors, combine this with the series characteristics computed above, and stack the resulting per-model dataframes into one.

In [8]:
fcast_files = [f for f in fcast_files if "h1_" in f]

In [9]:
fcast_files = [f for f in fcast_files if "h1_original" in f or "k_nts_plus_3" in f]

In [10]:
fcast_files

['ARIMA_h1_k_nts_plus_3.csv',
 'ARIMA_h1_original.csv',
 'DES_h1_k_nts_plus_3.csv',
 'DES_h1_original.csv',
 'Multivariate_LGBM_h1_k_nts_plus_3.csv',
 'Multivariate_LGBM_h1_original.csv',
 'RNN_h1_k_nts_plus_3.csv',
 'RNN_h1_original.csv',
 'SES_h1_k_nts_plus_3.csv',
 'SES_h1_original.csv',
 'TES_h1_k_nts_plus_3.csv',
 'TES_h1_original.csv',
 'VAR_h1_k_nts_plus_3.csv',
 'VAR_h1_original.csv']

In [11]:
models = ["SES", "DES", "TES", "ARIMA", "VAR", "Multivariate_LGBM", "RNN"]

In [12]:
def binner(bin_var):
    """Label every value as "High", "Medium" or "Low" using the quartiles of the input.

    Values at or above the 75th percentile of ``bin_var`` are "High", values
    strictly above the 25th percentile (and below the 75th) are "Medium", and
    all remaining values are "Low".

    Parameters
    ----------
    bin_var : array-like of numbers
        The values to bin.

    Returns
    -------
    pandas.Series
        One label per input value, in input order, on a fresh default index.
    """
    upper_cut = np.quantile(bin_var, 3/4)
    lower_cut = np.quantile(bin_var, 1/4)

    def label(value):
        # the upper cut-off is inclusive, the lower cut-off is exclusive
        if value >= upper_cut:
            return "High"
        return "Medium" if value > lower_cut else "Low"

    return pd.Series([label(value) for value in bin_var])

In [13]:
series_means = [np.mean(x) for x in train_data]

In [14]:
full_data = []

overall_valuable = []

# one pass per forecasting model: compare its protected and original forecasts
for m in models:
    # forecast files of this model; each unpacking below requires exactly one match
    model_files = [f for f in fcast_files if m in f]
    [protected_fcast_file] = [f for f in model_files if "original" not in f]
    [original_fcast_file] = [f for f in model_files if "original" in f]

    # forecasts based on the privacy protected data and on the original data
    fcasts_protected = pd.read_csv(forecasts_path + protected_fcast_file)
    fcasts_original = pd.read_csv(forecasts_path + original_fcast_file)

    # per-series MAE for each set of forecasts
    original_errors = mean_absolute_error(test_data, fcasts_original, multioutput="raw_values")
    protected_errors = mean_absolute_error(test_data, fcasts_protected, multioutput="raw_values")

    # log of the relative MAE (protected error over original error)
    log_mae_ratio = np.log(protected_errors / original_errors).reshape(-1, 1)

    # 1.0 where the protected forecast has the smaller error
    valuable = ((protected_errors < original_errors) * 1.0).reshape(-1, 1)
    overall_valuable.append(valuable)

    # 1.0 where the protected forecast lies above the original forecast
    adjustment_direction = np.array((fcasts_protected > fcasts_original) * 1.0).reshape(-1, 1)

    # absolute forecast difference, normalized by the mean of each training series
    magnitudes = (fcasts_protected - fcasts_original).abs().T.divide(series_means, axis=0)

    numeric_columns = ["is_valuable",
                       "log_mae_ratio",
                       "positive_adjustment",
                       "coef_var",
                       "change_in_coef",
                       "magnitudes"]
    numeric_block = np.concatenate([valuable,
                                    log_mae_ratio,
                                    adjustment_direction,
                                    coef_vars,
                                    change_in_coef,
                                    magnitudes], axis=1)

    combined = pd.DataFrame(numeric_block, columns=numeric_columns)
    combined["model_name"] = m
    combined["slope"] = slopes

    full_data.append(combined)

In [15]:
full_data = pd.concat(full_data, axis=0, ignore_index=True)

In [16]:
# Cut-offs for trimming extreme log ratios: `lower` is the trim_amount-th smallest
# value and `upper` the trim_amount-th largest, with trim_amount being 5% of the rows.
sorted_ratio = np.sort(full_data["log_mae_ratio"].to_numpy())
trim_amount = int(0.05 * len(full_data))

upper = sorted_ratio[-trim_amount]
lower = sorted_ratio[trim_amount - 1]

In [17]:
# positions of the rows whose log ratio lies strictly between the two cut-offs
inside_bounds = (full_data.log_mae_ratio > lower) & (full_data.log_mae_ratio < upper)
indexes_to_keep = np.flatnonzero(inside_bounds.to_numpy()).tolist()

In [18]:
# keep only the untrimmed rows and renumber them from zero
full_data = full_data.iloc[indexes_to_keep].reset_index(drop=True)

In [19]:
# Label the relative change in coefficient of variation: below -0.05 is a decrease,
# above 0.05 an increase, anything else counts as maintained.
bin_change_in_coef = [
    "Decreased" if change < -0.05 else "Increased" if change > 0.05 else "Maintained"
    for change in full_data.change_in_coef
]

full_data["bin_change_in_coef"] = bin_change_in_coef

In [20]:
full_data['magnitude_bins'] = binner(full_data.magnitudes)

In [21]:
full_data['coef_bins'] = binner(full_data.coef_var)

In [22]:
def AvgRelMAE(ratios):
    """Exponentiate the average of ``ratios``.

    The notebook applies this to ``log_mae_ratio`` values, in which case the
    result is the geometric mean of the underlying MAE ratios.

    Parameters
    ----------
    ratios : sequence of numbers
        The (log) ratios to aggregate.

    Returns
    -------
    float
        ``exp(sum(ratios) / len(ratios))``.
    """
    count = len(ratios)
    total = np.sum(ratios)
    return np.exp((1/count)*total)

About 42.5% of privacy adjustments were valuable (fewer than half, so worse than a coin flip: the adjustments tend to make forecasts worse).

In [23]:
np.mean(full_data.is_valuable)

0.4250334672021419

Tabulate the percentage of valuable adjustments by adjustment direction and magnitude. 

In [24]:
pd.pivot_table(data=full_data, values="is_valuable", index="magnitude_bins", columns="positive_adjustment", aggfunc=np.mean, margins=True)

positive_adjustment,0.0,1.0,All
magnitude_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,0.303867,0.405195,0.356091
Low,0.467742,0.490667,0.47925
Medium,0.418639,0.443765,0.432396
All,0.402128,0.445501,0.425033


In [25]:
pd.pivot_table(data=full_data, values="log_mae_ratio", index="magnitude_bins", columns="positive_adjustment", aggfunc=AvgRelMAE, margins=True)

positive_adjustment,0.0,1.0,All
magnitude_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,1.470395,1.351251,1.407732
Low,1.060349,0.99329,1.026137
Medium,1.166326,1.1236,1.142735
All,1.207081,1.141401,1.171937


In [26]:
pd.pivot_table(data=full_data, values="is_valuable", index="coef_bins", columns="bin_change_in_coef", aggfunc=np.mean, margins=True)

bin_change_in_coef,Decreased,Increased,Maintained,All
coef_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High,0.490566,0.447368,0.405172,0.461847
Low,0.382353,0.360502,0.315789,0.356952
Medium,0.425406,0.491416,0.402857,0.440723
All,0.450337,0.416813,0.393617,0.425033


In [27]:
pd.pivot_table(data=full_data, values="log_mae_ratio", index="coef_bins", columns="bin_change_in_coef", aggfunc=AvgRelMAE, margins=True)

bin_change_in_coef,Decreased,Increased,Maintained,All
coef_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High,1.058511,1.298189,1.13599,1.093283
Low,1.152134,1.267741,1.361529,1.27143
Medium,1.18428,1.118152,1.191335,1.164851
All,1.131188,1.205376,1.189729,1.171937


Look at whether forecasts improved as a function of positive/negative adjustment and the sign of the fitted linear slope of the series.

In [28]:
full_data['positive_slope'] = full_data.slope > 0

In [29]:
pd.pivot_table(data=full_data, values="is_valuable", index="positive_slope", columns="positive_adjustment", aggfunc=np.mean, margins=True)

positive_adjustment,0.0,1.0,All
positive_slope,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.395833,0.450163,0.426282
True,0.415556,0.429379,0.421642
All,0.402128,0.445501,0.425033


In [30]:
pd.pivot_table(data=full_data, values="log_mae_ratio", index="positive_slope", columns="positive_adjustment", aggfunc=AvgRelMAE, margins=True)

positive_adjustment,0.0,1.0,All
positive_slope,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,1.239876,1.143497,1.184902
True,1.13999,1.134182,1.137429
All,1.207081,1.141401,1.171937


In [31]:
pd.crosstab(full_data.positive_slope, full_data.positive_adjustment, margins=True)/ full_data.shape[0]

positive_adjustment,0.0,1.0,All
positive_slope,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.321285,0.409639,0.730924
True,0.150602,0.118474,0.269076
All,0.471888,0.528112,1.0


In [32]:
pd.crosstab(full_data.coef_bins, full_data.bin_change_in_coef, margins=True)/ full_data.shape[0]

bin_change_in_coef,Decreased,Increased,Maintained,All
coef_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High,0.159639,0.012718,0.077644,0.25
Low,0.011379,0.213521,0.025435,0.250335
Medium,0.226573,0.155957,0.117135,0.499665
All,0.39759,0.382195,0.220214,1.0


Measure coefficient of variation for differentially private data with $\epsilon = 10$.

In [33]:
dp_protected = pd.read_csv("../../Data/Train/Clean/protected_m3_monthly_micro_h1_DP_10.csv", header=None, skiprows=1)

In [34]:
dp_protected = [x.dropna() for _, x in dp_protected.iterrows()]

In [35]:
dp_protected = [pd.Series([i if i >= 1 else 1 for i in x]) for x in dp_protected]

In [36]:
dp_coef_vars = np.array([np.std(x, ddof=1)/np.mean(x) for x in dp_protected]).reshape(-1,1)

# calculate the percentage change in coefficient of variation
dp_change_in_coef = (dp_coef_vars - coef_vars)/coef_vars

In [37]:
avg_dp_coef = np.mean(dp_coef_vars)

In [38]:
avg_knts_coef = np.mean(protected_coef_vars)

In [39]:
avg_coef = np.mean(coef_vars)

In [40]:
avg_dp_coef/avg_coef

1.1774016397781006

In [41]:
avg_knts_coef/avg_coef

0.9820569178236696

***

In [42]:
an_protected = pd.read_csv("../../Data/Train/Clean/protected_m3_monthly_micro_h1_AN_1.csv", header=None, skiprows=1)

In [43]:
an_protected = [x.dropna() for _, x in an_protected.iterrows()]

In [44]:
an_protected = [pd.Series([i if i >= 1 else 1 for i in x]) for x in an_protected]

In [45]:
an_coef_vars = np.array([np.std(x, ddof=1)/np.mean(x) for x in an_protected]).reshape(-1,1)

# calculate the percentage change in coefficient of variation
an_change_in_coef = (an_coef_vars - coef_vars)/coef_vars

In [46]:
avg_an_coef = np.mean(an_coef_vars)

In [47]:
avg_an_coef/avg_coef

1.3452964457421748

# Calculate Forecast Characteristics

We want to calculate adjustment variables for each protection method across all models and protection parameters.

We will need to loop through forecast files.

In [None]:
protection_methods = {"Top": [0.1, 0.2, 0.4],
                      "Bottom": [0.1, 0.2, 0.4], 
                      "AN": [0.5, 1, 1.5, 2],
                      "DP": [0.1, 1, 4.6, 10, 20]}

# Need to concatenate dataframes so index will line up.

In [None]:
forecast_characteristics = []

# Re-list the forecast directory here: the `fcast_files` name was narrowed earlier
# in the notebook to the original and k-nts+ forecasts only, which would leave
# every Top/Bottom/AN/DP lookup below without a match.
h1_fcast_files = [f for f in os.listdir(forecasts_path) if "h1_" in f]

# mean of each training series, used to normalize the adjustment magnitudes
series_means = [np.mean(x) for x in train_data]
n_series = len(train_data)


def _single_match(candidates, description):
    """Return the only element of `candidates`; raise ValueError naming what was searched otherwise."""
    if len(candidates) != 1:
        raise ValueError(f"expected exactly one forecast file for {description}, found {candidates}")
    return candidates[0]


# for each model
for model in models:

    # import the original forecasts
    original_file = _single_match(
        [f for f in h1_fcast_files if model in f and 'original' in f],
        f"{model} (original)")
    original_fcasts = pd.read_csv(forecasts_path + original_file)

    # original MAE only depends on the model, so compute it once per model
    original_errors = mean_absolute_error(test_data, original_fcasts, multioutput="raw_values")

    # for each protection method and each of its parameters
    for method_name, method_params in protection_methods.items():
        for param in method_params:

            # import the forecasts based on protected data
            protected_file = _single_match(
                [f for f in h1_fcast_files
                 if method_name in f and "_"+str(param)+".csv" in f and model in f],
                f"{model} / {method_name} / {param}")
            protected_fcasts = pd.read_csv(forecasts_path + protected_file)

            # adjustment direction (True where the protected forecast is above the original)
            direction = (protected_fcasts > original_fcasts).T

            # Calculate adjustment magnitude (absolute difference in forecasts), normalized by the mean of the series.
            magnitudes = (protected_fcasts - original_fcasts).abs().T.divide(series_means, axis=0)

            # Calculate binary large and small adjustment indicators (belongs to 4th and 1st quantiles, respectively).
            # qs = np.quantile(magnitudes, q=[0.25, 0.75])
            # large_magnitudes = magnitudes >= qs[1]
            # small_magnitudes = magnitudes <= qs[0]

            # calculate protected MAE
            adjusted_errors = mean_absolute_error(test_data, protected_fcasts, multioutput="raw_values")
            # calculate whether the adjustment improved accuracy
            valuable = pd.Series(adjusted_errors < original_errors)

            # indicators for the protection method, parameter, and model
            method_col = pd.Series(np.repeat(method_name, n_series))
            param_col = pd.Series(np.repeat(str(param), n_series))
            model_col = pd.Series(np.repeat(model, n_series))

            # concatenate adjustment variables separately and reset the index
            adjustment_vars = pd.concat([direction, magnitudes], ignore_index=True, axis=1).reset_index(drop=True)

            # separate concatenations because the index was getting messed up
            characteristics = pd.concat([method_col, param_col, model_col, adjustment_vars, valuable], axis=1)

            characteristics.columns = ["Method", "Parameter", "Model", "Direction", "Magnitude", "Valuable"]

            forecast_characteristics.append(characteristics)

In [None]:
adjustment_df = pd.concat(forecast_characteristics, axis=0)

In [None]:
# numeric_only=True: the Method/Parameter/Model columns hold strings and cannot be averaged
adjustment_df.loc[adjustment_df.Method=="Top",:].mean(numeric_only=True)

In [None]:
# numeric_only=True: the Method/Parameter/Model columns hold strings and cannot be averaged
adjustment_df.loc[adjustment_df.Method=="Top",:].groupby(["Direction"]).mean(numeric_only=True)

In [None]:
# numeric_only=True: the Method/Model columns hold strings and cannot be averaged
adjustment_df.loc[adjustment_df.Method=="Top",:].groupby(["Parameter"]).mean(numeric_only=True)

Include top and bottom adjustment results with k-nts adjustment results.

In [None]:
# numeric_only=True: the Method/Model columns hold strings and cannot be averaged
adjustment_df.loc[adjustment_df.Method=="Top",:].groupby(["Direction", "Parameter"]).mean(numeric_only=True)

In [None]:
# numeric_only=True: the Method/Parameter/Model columns hold strings and cannot be averaged
adjustment_df.loc[adjustment_df.Method=="Bottom",:].mean(axis=0, numeric_only=True)

In [None]:
# numeric_only=True: the Method/Model columns hold strings and cannot be averaged
adjustment_df.loc[adjustment_df.Method=="Bottom",:].groupby(["Direction", "Parameter"]).mean(numeric_only=True)

In [None]:
# numeric_only=True: the Method/Parameter/Model columns hold strings and cannot be averaged
adjustment_df.loc[adjustment_df.Method=="AN",:].mean(axis=0, numeric_only=True)

In [None]:
# numeric_only=True: the Method/Model columns hold strings and cannot be averaged
adjustment_df.loc[adjustment_df.Method=="AN",:].groupby(["Direction", "Parameter"]).mean(numeric_only=True)

In [None]:
# numeric_only=True: the Method/Parameter/Model columns hold strings and cannot be averaged
adjustment_df.loc[adjustment_df.Method=="DP",:].mean(axis=0, numeric_only=True)

In [None]:
# numeric_only=True: the Method/Model columns hold strings and cannot be averaged
adjustment_df.loc[adjustment_df.Method=="DP",:].groupby(["Direction", "Parameter"]).mean(numeric_only=True)

***

Calculations of interest.

Calculate adjustment characteristics for top coding across all models.

Get the proportions of directions and large and small magnitude adjustments for top coding and differential privacy.

In [None]:
# adjustment_df.loc[adjustment_df.Method=="Top",["Direction", "Valuable"]].groupby("Direction").mean()

In [None]:
# adjustment_df.loc[adjustment_df.Method=="Top",["Direction", "Magnitude"]].mean(axis=0)

In [None]:
# adjustment_df.loc[adjustment_df.Method=="DP",["Direction", "Magnitude"]].mean(axis=0)

***

We need to combine our predictor variables.

In [None]:
# X = pd.concat([direction, magnitudes, large_magnitudes, small_magnitudes], axis=1, ignore_index=True).reset_index(drop=True)

In [None]:
# X.columns = ["Direction", "Magnitude", "Large_Magnitude", "Small_Magnitude"]

In [None]:
# X = pd.concat([X, ts_features], axis=1)

In [None]:
full_data

In [None]:
# `full_data` is built above with "slope" as its trend measure and never receives a
# "linearity" column, so selecting "linearity" would raise a KeyError.
# NOTE(review): "slope" is assumed to be the intended replacement - confirm.
rf_data = full_data.loc[:,["is_valuable", "positive_adjustment", "magnitudes", "slope"]]

In [None]:
y = rf_data.is_valuable

In [None]:
X = rf_data.drop('is_valuable', axis=1)

In [None]:
scaler = StandardScaler()

In [None]:
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [None]:
X_scaled

***

## Predicting Adjustment Value Using Random Forest

In [None]:
# fixed seed so the fitted forest (and everything derived from it) is reproducible on re-run
RF = RandomForestClassifier(random_state=42)

In [None]:
# fit on the standardized features: scoring and permutation importance below use X_scaled,
# so fitting on the unscaled X would evaluate the model on inputs on a different scale
RF.fit(X_scaled, y)

In [None]:
RF.score(X_scaled, y)

In [None]:
1-np.mean(y)

In [None]:
# fixed seed so the shuffles, and therefore the importances, are reproducible on re-run
result = permutation_importance(RF, X_scaled, y, n_repeats=20, random_state=42)

In [None]:
importance_indices = np.flip(np.argsort(result.importances_mean))

In [None]:
importances = pd.Series(result.importances_mean[importance_indices], index=X_scaled.columns[importance_indices])

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
# `importances` is sorted by `importance_indices`; reorder the standard deviations the
# same way so that each error bar belongs to the bar it is drawn on
importances.plot.bar(yerr=result.importances_std[importance_indices], ax=ax)
ax.set_title("Permutation Based Time Series Feature Importances")
ax.set_ylabel("Mean Accuracy Decrease")
ax.set_xticklabels(X_scaled.columns[importance_indices])
fig.tight_layout()
plt.show()

In [None]:
dt_features = X_scaled.columns[importance_indices]

In [None]:
dt_features

# Marginal Distributions of Important Variables

In [None]:
def marginal_plot(ax1, feature, x_axis_label, y_axis_label_left, y_axis_label_right):
    """Draw a feature's histogram overlaid with the per-bin mean of the outcome ``y``.

    The histogram is drawn on ``ax1`` with every observation weighted by
    ``1 / len(feature)``, so the bar heights are shares of observations. The mean
    of the notebook-level variable ``y`` within each of the 10 histogram bins is
    drawn as a line on overlaid twin axes. Both y axes are formatted as percentages.

    Parameters
    ----------
    ax1 : matplotlib Axes
        Axes that receives the histogram; twin axes are created from it.
    feature : array-like of numbers
        Feature values, one per observation.
        NOTE(review): assumed to be in the same order as ``y`` and ``y`` is assumed
        to carry a default 0..n-1 index, since the two are joined on index - confirm.
    x_axis_label : str
        Label for the x axis of the histogram.
    y_axis_label_left : str
        Label for the left (histogram) y axis.
    y_axis_label_right : str
        Label for the right (mean of ``y``) y axis.

    Returns
    -------
    None
        The axes are modified in place.
    """
    n_bins = 10
    _, bin_edges = np.histogram(feature, bins=n_bins)

    # np.histogram treats the last bin as closed on the right, but np.digitize sends
    # values equal to the final edge to an extra bin (n_bins + 1). Clip so the maximum
    # stays in the last histogram bin instead of forming a spurious extra point.
    bin_ids = np.clip(np.digitize(feature, bin_edges), 1, n_bins)

    # mean of the global outcome `y` within each bin
    y_line = pd.concat([pd.Series(bin_ids), y], axis=1).groupby(0).mean()

    ax2 = ax1.twinx()
    ax3 = ax2.twiny()

    ax1.hist(feature, alpha=0.5, align='mid', weights=np.ones(len(feature)) / len(feature))

    ax1.set_xlabel(x_axis_label)
    ax1.set_ylabel(y_axis_label_left)
    ax1.yaxis.set_major_formatter(PercentFormatter(1))

    # the line is plotted against bin ids on its own x axis, whose ticks are hidden below
    ax3.plot(y_line)
    ax3.set_xlabel('')
    # note this needs to be ax2 due to subtle overlay issues....
    ax2.set_ylabel(y_axis_label_right)
    ax2.yaxis.set_major_formatter(PercentFormatter(1))

    ax3.axes.get_xaxis().set_ticks([])

In [None]:
# Draw a 3x2 grid of marginal plots, one feature per panel.
# NOTE(review): `X` as built earlier in this notebook is `rf_data` without `is_valuable`,
# i.e. it holds the adjustment direction, the magnitude and one trend column. The
# attributes used below (series_mean, trend, curvature, stability, max_var_shift, hurst)
# are not among those columns - presumably left over from a version that used
# tsfeatures output; confirm which features should be plotted.
fig, ax = plt.subplots(3, 2, figsize=(10,8))
marginal_plot(ax[0,0], X.series_mean, "Difference in Series Means", "Proportion of Series", "")
marginal_plot(ax[0,1], X.trend, "Difference in Strengths of Trend", "", "Proportion Improved Accuracy")
marginal_plot(ax[1,0], X.curvature, "Difference in Absolute Value of Curvature Coefficients", "Proportion of Series", "")
marginal_plot(ax[1,1], X.stability, "Difference in Stability", "", "Proportion Improved Accuracy")
marginal_plot(ax[2,0], X.max_var_shift, "Difference in Maximum Variance Shift", "Proportion of Series", "")
marginal_plot(ax[2,1], X.hurst, "Difference in Hurst Coefficient", "", "Proportion Improved Accuracy")
plt.tight_layout()

## Explaining Adjustment Value Using Decision Tree

Need to figure out how to generalize decision tree classifications, accounting for series with different lengths.

We extract the most important predictors of whether a privacy adjusted forecast had better accuracy.

In [None]:
# fixed seed: ties between equally good splits are broken randomly, so without it
# the fitted tree can differ between runs
DT = DecisionTreeClassifier(max_depth=3, random_state=42)

In [None]:
dt_results = DT.fit(X_scaled.loc[:,dt_features], y)

In [None]:
plt.figure(figsize=(15,10))
plot_tree(DT, filled=True)
plt.title("Decision tree trained on Adjustment Features")
plt.show()

***

Get the leaf indices of each sample.

In [None]:
leaf_indices = DT.apply(X_scaled.loc[:,dt_features])

Get the unique leaf indices.

In [None]:
unique_leaf_indices = np.unique(leaf_indices)

Get the gini impurity of each leaf node.

In [None]:
gini_leaves = DT.tree_.impurity[unique_leaf_indices]

Sort the leaf node indices by Gini impurity, from lowest to highest.

In [None]:
sorted_leaves = unique_leaf_indices[np.argsort(gini_leaves)]

For each leaf, get the proportion of valuable adjustments (the mean of the outcome within the leaf).

In [None]:
[np.mean(y[leaf_indices==x]) for x in sorted_leaves]

In [None]:
# unscaled feature rows of the samples that fall into each of the two lowest-impurity leaves
leaves = [X.loc[leaf_indices==leaf_id, dt_features] for leaf_id in sorted_leaves[:2]]

In [None]:
leaf_means = [x.mean(axis=0) for x in leaves]

Display the mean feature values for the series in each of the leaves with lowest gini.

In [None]:
leaf_means

Now, plot the average mean-normalized original and protected series in each leaf.

In [None]:
# For each of the two lowest-impurity leaves, collect the 'original_series' value of
# every row assigned to that leaf.
# NOTE(review): the code that builds `full_data` in this notebook does not create an
# 'original_series' column, so this lookup appears to rely on a column added elsewhere
# (presumably the position of the source series in `train_data`) - confirm.
leaf_orig_series = []
for i in sorted_leaves[:2]:
    leaf_orig_series.append([full_data.loc[k,'original_series'] for k, l in enumerate(leaf_indices) if l==i])
    #leaf_orig_series.append(X.loc[leaf_indices==i,dt_features]) 

In [None]:
originals = [np.unique(x) for x in leaf_orig_series]

In [None]:
originals

In [None]:
# swap each series id for the corresponding training series, leaf by leaf
originals = [[train_data[series_id] for series_id in leaf_ids] for leaf_ids in originals]

In [None]:
lens = np.unique([len(x) for x in originals[0]])

In [None]:
# for the first leaf: average, per series length, all of its series of that length
mean_series = [
    pd.concat([s for s in originals[0] if len(s) == length], axis=1).mean(axis=1)
    for length in lens
]

In [None]:
plot_series(mean_series[0])

In [None]:
plot_series(mean_series[1])

In [None]:
lens = np.unique([len(x) for x in originals[1]])

In [None]:
# for the second leaf: average, per series length, all of its series of that length
mean_series = [
    pd.concat([s for s in originals[1] if len(s) == length], axis=1).mean(axis=1)
    for length in lens
]

In [None]:
plot_series(mean_series[0])

In [None]:
plot_series(mean_series[1])

In [None]:
plot_series(mean_series[2])

In [None]:
# distinct series lengths among the series of the leaf at position 2
# NOTE(review): `originals` is derived from `sorted_leaves[:2]`, so it appears to hold
# at most two entries and index 2 would raise an IndexError - confirm whether a third
# leaf was meant to be selected.
lens = np.unique([len(x) for x in originals[2]])

In [None]:
# for the leaf at position 2: average, per series length, all of its series of that length
mean_series = [
    pd.concat([s for s in originals[2] if len(s) == length], axis=1).mean(axis=1)
    for length in lens
]

In [None]:
plot_series(mean_series[0])

In [None]:
plot_series(mean_series[1])