# Imports & Functions

In [None]:
from functions.metrics import nrmse_adjusted
from functions.utils import to_NAN, find_first_value
from functions.import_data import import_datasets

from pyspark.sql import SparkSession
from azure.storage.blob import ContainerClient
from blob_credentials import facts_sas_token, facts_container, workspace_sas_token, workspace_container

import pandas as pd
import numpy as np
import rpy2
import impyute
import seaborn as sns

from tqdm import tqdm

from scipy.interpolate import interp1d

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import rpy2.robjects.packages as rpackages
from rpy2.robjects.vectors import StrVector

sns.set_style('darkgrid')

In [None]:
def impute_quadraticInterpolation(df):
    """
    Fill the gaps of every column with a quadratic interpolation.

    For each column, a quadratic interpolator is fitted on the non-missing
    observations (index -> value). The fitted curve is then evaluated on
    every row from the position returned by `find_first_value` down to the
    end of the column, and those rows are overwritten with the result.
    Rows outside the observed range are extrapolated.
    Columns for which `find_first_value` returns 'NaN' are left untouched.

    NOTE(review): the index is used as the x-axis of the interpolation, so
    it presumably has to be numeric -- confirm against the callers.

    Input:
        - df: pd.DataFrame with NaNs
    Output:
        - pd.DataFrame: a copy of `df` with the interpolated values
    """
    result = df.copy()
    for column in tqdm(df.columns):
        series = df.loc[:, column]
        start = find_first_value(series.values)
        # Nothing to fit when the helper reports no starting value.
        if start == 'NaN':
            continue

        # Observed points used to fit the interpolator.
        observed = series.dropna()
        interpolator = interp1d(
            observed.index.values,
            observed.values,
            kind='quadratic',
            fill_value='extrapolate',
        )

        # Evaluate the curve on every row from `start` onwards.
        target_index = df.loc[start:, column].index.values
        result.loc[start:, column] = interpolator(target_index)

    return result

# Spark Session

In [None]:
# Identifier used both in the Spark application name and as the per-user
# folder inside the workspace container.
myname = "marc-samvath-philippe.vigneron"

# Build (or reuse) a Spark session configured to read `wasbs://` paths from
# the two blob containers using their SAS tokens.
spark = (
    SparkSession.builder
    .appName(f"Test-{myname}")
    # The Spark property is "spark.executor.instances" (plural); the singular
    # spelling is not a Spark setting and was silently ignored.
    .config("spark.executor.instances", "1")
    .config("spark.executor.memory", "512m")
    # Hadoop connector providing the Azure blob file system implementation.
    .config('spark.jars.packages', "org.apache.hadoop:hadoop-azure:3.1.1")
    .config("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
    .config("fs.wasbs.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
    # One SAS token per container.
    .config(f"fs.azure.sas.{facts_container}.hecdf.blob.core.windows.net", facts_sas_token)
    .config(f"fs.azure.sas.{workspace_container}.hecdf.blob.core.windows.net", workspace_sas_token)
    .getOrCreate()
)

# Load Data

In [None]:
# Locations of the two generated datasets in the user's workspace folder.
gbm_parquet_path = f'wasbs://{workspace_container}@hecdf.blob.core.windows.net/{myname}/generated_data_gbm.parquet'
kde_parquet_path = f'wasbs://{workspace_container}@hecdf.blob.core.windows.net/{myname}/generated_data_kde.parquet'

# Read each parquet file through Spark and collect it as a pandas DataFrame.
dataset_challenge_gbm = spark.read.parquet(gbm_parquet_path).toPandas()
dataset_challenge_kde = spark.read.parquet(kde_parquet_path).toPandas()

# First dataset returned by the project loader, without its "Date" column.
dataset_challenge = import_datasets()[0]
dataset_challenge.drop(columns=["Date"], inplace=True)

In [None]:
# Converting the values to NaN: build a version of each generated dataset
# with missing entries, using the challenge dataset as second argument.
# NOTE(review): presumably `to_NAN` copies the NaN pattern of
# `dataset_challenge` onto the generated data -- confirm in functions.utils.
dataset_challenge_gbm_nan, dataset_challenge_kde_nan = [
    to_NAN(generated, dataset_challenge)
    for generated in (dataset_challenge_gbm, dataset_challenge_kde)
]

# Last Observation Carried Forward (LOCF)

In [None]:
# Last observation carried forward: every missing value takes the latest
# non-missing value above it in the same column (leading gaps stay NaN).
# `DataFrame.ffill()` is the direct replacement for the deprecated
# `fillna(method='ffill')` call.
kde_locf = dataset_challenge_kde_nan.ffill()
gbm_locf = dataset_challenge_gbm_nan.ffill()

In [None]:
# Score the LOCF imputation of the KDE dataset: complete data, imputed data
# and the data with NaNs are passed to the project metric as raw arrays.
results_kde = nrmse_adjusted(
    dataset_challenge_kde.values,
    kde_locf.values,
    dataset_challenge_kde_nan.values,
)

# Take the first item of every result entry and average them, ignoring NaNs.
kde_locf_scores = [entry[0] for entry in results_kde.values()]
nrmses_kde = np.nanmean(np.array(kde_locf_scores))
print("KDE NRMSE LOCF : %f" % nrmses_kde)

In [None]:
# Score the LOCF imputation of the GBM dataset: complete data, imputed data
# and the data with NaNs are passed to the project metric as raw arrays.
results_gbm = nrmse_adjusted(
    dataset_challenge_gbm.values,
    gbm_locf.values,
    dataset_challenge_gbm_nan.values,
)

# Take the first item of every result entry and average them, ignoring NaNs.
gbm_locf_scores = [entry[0] for entry in results_gbm.values()]
nrmses_gbm = np.nanmean(np.array(gbm_locf_scores))
print("GBM  NRMSE LOCF: %f" % nrmses_gbm)

# Quadratic Interpolation

In [None]:
# Impute both NaN-masked datasets with the quadratic interpolation helper
# defined earlier in this notebook (KDE first, then GBM).
kde_interpolation = impute_quadraticInterpolation(df=dataset_challenge_kde_nan)
gbm_interpolation = impute_quadraticInterpolation(df=dataset_challenge_gbm_nan)

In [None]:
# Score the quadratic interpolation of the KDE dataset: complete data,
# imputed data and the data with NaNs are passed to the project metric.
results_kde = nrmse_adjusted(
    dataset_challenge_kde.values,
    kde_interpolation.values,
    dataset_challenge_kde_nan.values,
)

# Take the first item of every result entry and average them, ignoring NaNs.
nrmses_kde = np.nanmean(np.array([entry[0] for entry in results_kde.values()]))

# The label previously said "LOCF" (copied from the section above) although
# the score reported here belongs to the quadratic interpolation.
print("KDE NRMSE Quadratic Interpolation: %f" % nrmses_kde)

In [None]:
# Score the quadratic interpolation of the GBM dataset: complete data,
# imputed data and the data with NaNs are passed to the project metric.
results_gbm = nrmse_adjusted(
    dataset_challenge_gbm.values,
    gbm_interpolation.values,
    dataset_challenge_gbm_nan.values,
)

# Take the first item of every result entry and average them, ignoring NaNs.
nrmses_gbm = np.nanmean(np.array([entry[0] for entry in results_gbm.values()]))

# The label previously said "LOCF" (copied from the section above) although
# the score reported here belongs to the quadratic interpolation.
print("GBM NRMSE Quadratic Interpolation: %f" % nrmses_gbm)

# Weighted Moving Average

## Conversion to R datasets

In order to run the R package, we convert the dataframes into R datasets and import the necessary R package.

In [None]:
# Convert every pandas DataFrame to its R counterpart. The conversions run
# inside the local converter context so the pandas rules are active.
with localconverter(ro.default_converter + pandas2ri.converter):
    (
        r_dataset_challenge,
        r_dataset_challenge_gbm,
        r_dataset_challenge_kde,
        r_dataset_challenge_gbm_nan,
        r_dataset_challenge_kde_nan,
    ) = [
        ro.conversion.py2rpy(frame)
        for frame in (
            dataset_challenge,
            dataset_challenge_gbm,
            dataset_challenge_kde,
            dataset_challenge_gbm_nan,
            dataset_challenge_kde_nan,
        )
    ]

In [None]:
# Load R's "utils" package through rpy2.
utils = importr('utils')

# Choose the CRAN mirror used for package downloads:
# index 1 is the first mirror in the list.
utils.chooseCRANmirror(ind=1)

In [None]:
# Install imputeTS from CRAN only when it is missing from the R library, so
# re-running this cell does not download and reinstall the package each time.
if not rpackages.isinstalled("imputeTS"):
    utils.install_packages("imputeTS")

# Expose the R package as a Python object.
imputeTS = importr('imputeTS')

## Imputation

**NOTE**: the imputation cell below takes several hours to run.

In [None]:
# Moving-average imputation from the imputeTS R package, called with the
# package's default arguments on each NaN-masked R dataset.
# The two calls are kept as separate statements so that the GBM result stays
# available even if the KDE call fails.
na_ma = imputeTS.na_ma
r_dataset_challenge_gbm_imputed = na_ma(r_dataset_challenge_gbm_nan)
r_dataset_challenge_kde_imputed = na_ma(r_dataset_challenge_kde_nan)

In [None]:
# Converter combining rpy2's default rules with the pandas ones.
pandas_converter = ro.default_converter + pandas2ri.converter

# Bring the imputed R datasets back to Python under that converter.
with localconverter(pandas_converter):
    dataset_challenge_gbm_imputed = ro.conversion.rpy2py(
        r_dataset_challenge_gbm_imputed
    )
    dataset_challenge_kde_imputed = ro.conversion.rpy2py(
        r_dataset_challenge_kde_imputed
    )

In [None]:
# Score the moving-average imputation of the GBM dataset: complete data,
# imputed data and the data with NaNs are passed to the project metric.
results_gbm = nrmse_adjusted(
    dataset_challenge_gbm.values,
    dataset_challenge_gbm_imputed.values,
    dataset_challenge_gbm_nan.values,
)

# Take the first item of every result entry and average them, ignoring NaNs.
gbm_wma_scores = [entry[0] for entry in results_gbm.values()]
nrmses_gbm = np.nanmean(np.array(gbm_wma_scores))
print("GBM Weighted Moving Average Mean NRMSE: %f" % nrmses_gbm)

In [None]:
# Score the moving-average imputation of the KDE dataset: complete data,
# imputed data and the data with NaNs are passed to the project metric.
results_kde = nrmse_adjusted(
    dataset_challenge_kde.values,
    dataset_challenge_kde_imputed.values,
    dataset_challenge_kde_nan.values,
)

# Take the first item of every result entry and average them, ignoring NaNs.
kde_wma_scores = [entry[0] for entry in results_kde.values()]
nrmses_kde = np.nanmean(np.array(kde_wma_scores))
print("KDE Weighted Moving Average Mean NRMSE: %f" % nrmses_kde)