In [None]:
# imports
import ast
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# load util functions
# The project root is the parent of the notebook's working directory (the
# relative '../' input paths used by this notebook rely on the same layout).
# dirname() replaces a fixed-width slice of the cwd string, which only gave
# the right directory when the cwd's folder name was exactly nine characters.
module_path = os.path.dirname(os.getcwd())
sys.path.insert(0, module_path)
try:
    import src.evaluation.util as util
finally:
    # restore sys.path even when the import fails, so re-running this cell
    # never leaves a stale entry behind
    sys.path.remove(module_path)

In [None]:
# input locations, given relative to the notebook's working directory
aggr_class_report_log_path = '../data/'          # handed to util.load_aggr_class_reports
regr_results_log_path = '../logs/regr_results/'  # directory containing regr_results.csv

In [None]:
# read the logged regression results; a row of this table is selected
# by position further down in the notebook
result_file = os.path.join(regr_results_log_path, "regr_results.csv")
df_regr_results = pd.read_csv(result_file)

In [None]:
# display the results table so the positional index of the regression to
# visualize can be looked up (it is set via `index` in the following cell)
df_regr_results

In [None]:
# --> TODO: select index of regression to use for y_true / y_pred visualization <--
index = 0

# fetch the chosen row once; every setting below is read from it
selected_run = df_regr_results.iloc[index]

# name of the metric that serves as the regressor
regressor = selected_run['regressor']

# 'all' is replaced by False for both filters
# (presumably "no restriction" -- confirm against util.filter_data)
cnn = selected_run['model']
cnn = False if cnn == 'all' else cnn

classes = selected_run['classification_task']
classes = False if classes == 'all' else classes

use_delta = selected_run['delta']

# group_by is stored as the text of a dict literal; grouping counts as
# enabled as soon as any of its values is truthy
group_by = ast.literal_eval(selected_run['group_by'])
use_grouping = any(group_by.values())

zscore_threshold = selected_run['zscore_threshold']

# best_params is stored as text as well and is parsed back into a Python object
best_params = ast.literal_eval(selected_run['best_params'])

# cut the last two characters off the stored estimator string
# (presumably a trailing "()" -- verify against the results file)
estimator_name = selected_run['estimator'][:-2]

In [None]:
# load classification results table
df_class_results_raw = util.load_aggr_class_reports(aggr_class_report_log_path)

# keep the rows of the selected metric, discard the 'run' column and expose
# the metric's value under the metric's own name
df_regression = (
    df_class_results_raw
    .query(f'metric == "{regressor}"')
    .drop(columns=['run'])
    .rename(columns={'value': regressor})
)
df_regression = util.calc_delta(df_class_results_raw, df_regression, regressor)

# restrict the data to the grouping / model / classification task of the selected run
df_regression = util.filter_data(use_grouping, group_by, df_regression, cnn, classes)

# outlier filtering is delegated to the util helper, driven by the run's z-score threshold
df_regression = util.filter_outliers(df_regression, regressor, use_grouping, zscore_threshold)

# renumber the remaining rows from 0 and report the resulting size
df_regression = df_regression.reset_index(drop=True)
print(f'\tRegression df.shape: {df_regression.shape}')

In [None]:
# get independent and dependent variables
feature_column = f'{regressor}_delta' if use_delta else regressor
X_df = df_regression[[feature_column]]
y_df = df_regression['ratio']

# split data - random state to reproduce split in initial model selection
# (10% test share for up to 100 rows, 20% above that)
test_split = 0.1 if len(df_regression) <= 100 else 0.2

# the split is stratified on y only when no grouping is used;
# stratify=None is train_test_split's default, i.e. a plain random split
stratify_on = None if use_grouping else y_df
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=test_split,
    random_state=71,
    stratify=stratify_on,
)

# scale X data: statistics are fitted on the training part only and then
# applied unchanged to the test part
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# init estimator, set best params
# (the second value returned by util.init_estimator is not used in this
# notebook; the logged best_params are applied instead)
estimator, temp_params = util.init_estimator(estimator_name)
estimator.set_params(**best_params)

# train estimator and predict on both parts of the split
estimator.fit(X_train, y_train)
y_train_pred = estimator.predict(X_train)
y_pred = estimator.predict(X_test)

# summary of the selected run: descriptive statistics come from the data
# prepared above, the scores are the values stored in the results table
print(f'''
    {df_regr_results.iloc[index]['n']} observations
    ---------
    Regressor: {regressor}
        {round(df_regression[regressor].mean(), 5)} mean
        {round(df_regression[regressor].std(), 5)} std
    Regressand: False Labels Ratio
        {round(df_regression['ratio'].mean(), 5)} mean
        {round(df_regression['ratio'].std(), 5)} std

    Estimator: {estimator_name}
        {round(df_regr_results.iloc[index]['neg_rmse_train'], 5)} : neg_rmse_train
        {round(df_regr_results.iloc[index]['neg_rmse_test'], 5)} : neg_rmse_test
        {round(df_regr_results.iloc[index]['r2_train'], 5)} : r2_train
        {round(df_regr_results.iloc[index]['r2_test'], 5)} : r2_test
''')

# R2 of the estimator refitted in this cell, so the plot below can be
# checked against the logged scores printed above
print(f'''
    Refit check: {estimator_name}
        {round(r2_score(y_train, y_train_pred), 5)} : r2_train (refit)
        {round(r2_score(y_test, y_pred), 5)} : r2_test (refit)
''')

# visualize estimator performance
sns.set(rc={'figure.figsize':(11,8)})
ax = sns.scatterplot(x=y_test, y=y_pred)

# identity line: a point lying on it was predicted exactly
axis_min = min(y_test.min(), y_pred.min())
axis_max = max(y_test.max(), y_pred.max())
ax.plot([axis_min, axis_max], [axis_min, axis_max], linestyle='--', color='grey')

ax.set(
    xlabel='True false labels ratio',
    ylabel='Predicted false labels ratio',
    title=f'{estimator_name}: predicted vs. true false labels ratio (test set)',
)
plt.show()