In [None]:
from collections import defaultdict
from pathlib import Path
import math
from typing import Union, Optional, List, Iterable, Dict, Tuple, Any

import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer
from matplotlib import pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm

from Analysis.hyperparameter_tuning import mean_relative_error, mean_absolute_error
from Analysis.regression_ATE import get_one_domain_out_cv, create_regression_dataset
from Analysis.analysis_utils import feature_cols, get_feature_sets, confidence_intervals
from Baselines.correlate_metrics import draw_regression, smooth

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Prevent text getting cut off in saved figures
mpl.rcParams['savefig.bbox'] = 'tight'

In [None]:
# Dataset configuration; all five values are forwarded to create_regression_dataset.
shap = False  # first positional argument (presumably toggles SHAP-based features -- TODO confirm)
n_grams = 'UNI'  # second positional argument; also embedded in the names of saved tables/figures
n_concepts = 6  # third positional argument; also passed to feature_cols / get_feature_sets
use_acc = True  # selects the '_acc' label column and the accuracy column used in the plots
sort_by_ate = False  # forwarded as rows_sorted

In [None]:
X, y = create_regression_dataset(shap, n_grams, n_concepts, rows_sorted=sort_by_ate, use_acc=use_acc)

In [None]:
X

In [None]:
label_column = 'performance_degradation' + ('_acc' if use_acc else '')

In [None]:
X['source_acc'] - y

### Preprocessing
Mainly scaling the data. No need to remove duplicates, normalize each row separately.
Check for outliers.

Standardizing the input features to zero mean and unit variance.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
feature_scaler = StandardScaler()

In [None]:
import os

# Output directories for the generated LaTeX tables and figures.
# The locations can be overridden with the THESIS_TABLES_PATH / THESIS_IMAGES_PATH
# environment variables so the notebook also runs on machines without the original
# drive layout; the defaults are the paths the notebook has always used.
TABLES_PATH = Path(os.environ.get('THESIS_TABLES_PATH', r'E:\OneDrive - Technion\Technion\Graduate\Thesis tables'))
IMAGES_PATH = Path(os.environ.get('THESIS_IMAGES_PATH', r'E:\OneDrive - Technion\Technion\Graduate\Thesis images'))
for output_dir in (TABLES_PATH, IMAGES_PATH):
    output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Every feature is standardized except the raw source-performance columns.
# A filtered copy is built instead of calling .remove() on the list returned by
# feature_cols, so the helper's return value is never mutated and a column that
# is absent from the feature list does not raise.
unscaled_columns = {'source_f1', 'source_acc'}
features_to_scale = [column for column in feature_cols(n_concepts) if column not in unscaled_columns]

In [None]:
X.loc[:, features_to_scale] = feature_scaler.fit_transform(X[features_to_scale])

Converting the accuracy-related columns from fractions to percentages

In [None]:
acc_columns = ['source_acc']

In [None]:
# Express the accuracy columns (and the label, when accuracy is the target) in percent.
# NOTE(review): this cell is not idempotent -- X and y are modified in place, so running
# it a second time without re-creating them multiplies the values by 100 again.
X.loc[:, acc_columns] = X[acc_columns] * 100
if use_acc:
    y *= 100

### Exploration
Visualisations, correlations, distribution by source and target domains etc.

In [None]:
# Features and label side by side in one frame (X itself is left untouched)
joint_df = X.assign(**{label_column: y})
joint_df.head()

In [None]:
joint_df

In [None]:
joint_df[label_column].mean(), joint_df[label_column].std()

In [None]:
joint_df[label_column].plot(kind='kde')

In [None]:
# Scatter of the source-domain performance against the performance drop,
# saved in both vector formats used by the thesis.
source_perf_column = 'source_acc' if use_acc else 'source_f1'
plt.scatter(x=source_perf_column, y=label_column, data=joint_df)
plt.xlabel('source accuracy')
plt.ylabel('performance drop')
for extension in ('eps', 'pdf'):
    plt.savefig(str(IMAGES_PATH / f'source_acc_correlation.{extension}'), format=extension)
plt.show()

In [None]:
# Mean and standard deviation of the label per source domain, largest mean first.
# The label is addressed through `label_column` (it used to be hard-coded as
# 'performance_degradation_acc', which failed when use_acc is False), and only the
# two statistics that are kept are computed.
per_source_data = (
    joint_df.groupby('source')[label_column]
    .agg(['mean', 'std'])
    .sort_values('mean', ascending=False)
)

In [None]:
# Turn the 'source' index back into a column and prettify the domain names: the first
# '_'-separated token is dropped and the remaining tokens are joined with spaces.
# NOTE(review): not idempotent -- re-running this cell resets the index again and blanks
# the already-cleaned names (they no longer contain '_').
per_source_data = per_source_data.reset_index(drop=False)
per_source_data['source'] = per_source_data['source'].apply(lambda domain: " ".join(domain.split('_')[1:]))

In [None]:
per_source_data.to_latex(str(TABLES_PATH / 'source_domain_stats.tex'), float_format="${:0.4f}$".format, index=False, escape=False)

In [None]:
per_target_data = joint_df.groupby('target').aggregate({label_column: [list, 'mean', 'std']})
# per_target_data.plot(kind='bar', figsize=(136, 64))

In [None]:
# Keep only the mean/std statistics per target domain, largest mean first.
# The list column is dropped via `label_column` (it used to be hard-coded as
# 'performance_degradation_acc', which raised a KeyError when use_acc is False).
per_target_data = (
    per_target_data
    .drop(columns=[(label_column, 'list')])
    .droplevel(level=0, axis=1)
    .sort_values('mean', ascending=False)
)
per_target_data = per_target_data.reset_index(drop=False)
# Drop the first '_'-separated token of the domain name and join the rest with spaces
per_target_data['target'] = per_target_data['target'].apply(lambda domain: " ".join(domain.split('_')[1:]))
per_target_data

In [None]:
per_target_data.to_latex(str(TABLES_PATH / 'target_domain_stats.tex'), float_format="${:0.4f}$".format, index=False, escape=False)

In [None]:
# Combine the per-source and per-target statistics into one table with two-level columns.
# Start from the source statistics, indexed by the (cleaned) domain name.
per_domain_data = per_source_data.rename(columns={'source': 'Domain'}).set_index('Domain')
# Re-wrap the same values under an 'as source' top-level column.
per_domain_data = pd.DataFrame(data=per_domain_data.values, index=per_domain_data.index, columns=[['as source'] * len(per_domain_data.columns), per_domain_data.columns])
# Attach the target statistics under an 'as target' top-level column.
# NOTE(review): presumably aligned on the domain-name index -- confirm that source and
# target domain names match after the clean-up applied in the cells above.
per_domain_data[[('as target', 'mean'), ('as target', 'std')]] = per_target_data.set_index('target')
# Order the domains by their mean label value as a source, largest first.
per_domain_data = per_domain_data.sort_values(('as source', 'mean'), ascending=False)
per_domain_data

In [None]:
per_domain_data.to_latex(str(TABLES_PATH / 'per_domain_stats.tex'), float_format="${:0.4f}$".format, index=True, escape=False, multicolumn=True, multicolumn_format='c', column_format='lcccc')

### Basic models
Try different models with default parameters, using basic feature configurations: all features, all non-ate and non-SHAP features, all but SHAP, all but ATE

In [None]:
# Plain metric functions (called as metric(y_true, y_pred)); same order as metric_names.
metric_funcs = [
    r2_score,
    mean_relative_error,
    mean_squared_error,
    mean_absolute_error
]
# The same metrics wrapped as scikit-learn scorers. Error metrics are created with
# greater_is_better=False, so their scores come back negated and are flipped where reported.
metrics = [
    make_scorer(r2_score),
    make_scorer(mean_relative_error, greater_is_better=False),
    make_scorer(mean_squared_error, greater_is_better=False),
    make_scorer(mean_absolute_error, greater_is_better=False)
]
# NOTE: the scorer registered under 'RMSE' is mean_squared_error; the square root is
# only applied later by the code that reports the scores.
metric_names = ['r2', 'relative_error', 'RMSE', 'absolute_error']
# Metrics whose scorer output is negated (lower is better)
error_metrics = ['relative_error', 'RMSE', 'absolute_error']

# Column names for the lower/upper confidence bound of every metric
confidence_interval_cols = []
for metric_name in metric_names:
    confidence_interval_cols.extend([f'{metric_name}_low', f'{metric_name}_high'])

In [None]:
# Estimators compared throughout the notebook, keyed by their class name.
# GradientBoostingRegressor is seeded: it draws random feature permutations (and random
# feature subsets once max_features is tuned), so without a fixed random_state the tuned
# scores and feature importances change from run to run.
models = {
    type(model).__name__: model
    for model in (LinearRegression(), ElasticNet(), GradientBoostingRegressor(random_state=0))
}
# Cross-validation splits produced by get_one_domain_out_cv for the full dataset
domains_cv = get_one_domain_out_cv(X)
# Layout of the tuned-score table filled in by the hyperparameter tuning section
columns = ['model_name', 'feature_set'] + metric_names + confidence_interval_cols
scores_df = pd.DataFrame(columns=columns)

In [None]:
# Baseline: cross-validate every model with its default hyper-parameters on all features.
# Rows are collected in a list and turned into a frame once (DataFrame.append no longer
# exists in pandas >= 2.0). The per-model result is kept in its own name so this cell
# does not overwrite the global `cv_results` dict used by the tuning section.
default_scoring = dict(zip(metric_names, metrics))
default_rows = []
for model_name, model in models.items():
    default_cv_scores = cross_validate(model, X[feature_cols(n_concepts)], y, cv=domains_cv, scoring=default_scoring, error_score='raise')
    default_rows.append({
        'model_name': model_name,
        'feature_set': 'all_features',
        **{name: default_cv_scores[f'test_{name}'].mean() for name in metric_names},
    })
default_metrics_df = pd.DataFrame(default_rows, columns=['model_name', 'feature_set'] + metric_names)
# Scorers built with greater_is_better=False return negated errors; flip them back.
default_metrics_df[error_metrics] *= -1
# The 'RMSE' scorer is mean_squared_error, so take the root of the mean MSE to report
# an actual RMSE (same convention as the tuned results).
default_metrics_df['RMSE'] = default_metrics_df['RMSE'].astype(float) ** 0.5

In [None]:
default_metrics_df

### Hyperparameter Tuning
Tuning hyperparameters separately for every model

In [None]:
[LinearRegression(), Lasso(), Ridge(), ElasticNet(), RandomForestRegressor(), GradientBoostingRegressor()]

In [None]:
import numpy as np

In [None]:
def get_models_param_grid(num_features):
    """Build the hyper-parameter search grid for every supported model class.

    Args:
        num_features: number of input features the models will be trained on;
            used to derive the candidate values of ``max_features`` for the tree ensembles.

    Returns:
        Dict mapping an estimator class name to its ``GridSearchCV`` parameter grid.
    """
    # Regularization strengths, evenly spaced on a log scale from e^-3 to e^5.
    alphas = np.e ** np.linspace(-3, 5, 10)
    # Ensemble sizes between 1 and 127. The exponent grid collapses to many repeated
    # integers after truncation, so duplicates are removed to avoid re-fitting
    # identical candidates.
    n_estimators = np.unique((2 ** np.linspace(1, 7) - 1).astype(int))
    # Candidate feature-subset sizes: one feature, sqrt(n) features, all features.
    # They must be ints: scikit-learn reads a float max_features as a *fraction* of the
    # features, so the former float sqrt(n) was rejected for n > 1. The former 'auto'
    # option meant "all features" for both regressors and no longer exists in recent
    # scikit-learn releases; it is covered by `num_features`.
    max_features = sorted({1, int(np.sqrt(num_features)), num_features})
    model_param_grids = {
        'LinearRegression': {},
        'Lasso': {
            'alpha': alphas,
            'warm_start': [True, False]
        },
        'Ridge': {
            'alpha': alphas
        },
        'ElasticNet': {
            'alpha': alphas,
            'l1_ratio': np.linspace(0, 1, 10),
            'warm_start': [True, False],
            'max_iter': [int(1e4)]
        },
        'RandomForestRegressor': {
            'n_estimators': n_estimators,
            'max_depth': list(range(1, 7)),
            'bootstrap': [True, False],
            'max_features': max_features
        },
        'GradientBoostingRegressor': {
            'n_estimators': n_estimators,
            'max_depth': list(range(1, 7)),
            'max_features': max_features
        }
    }
    return model_param_grids

In [None]:
def get_tuned_model_stats(features, metrics: Dict[str, Any], refit: Union[str, bool] = False) -> Tuple[pd.DataFrame, Dict]:
    """Grid-search every model in ``models`` on the given features and summarize the best scores.

    Relies on the notebook globals ``models``, ``domains_cv``, ``X``, ``y``, ``error_metrics``
    and on ``confidence_intervals`` / ``get_models_param_grid``.

    For each model and each metric the candidate with the highest mean test score over the
    CV splits is selected (independently per metric). Metrics listed in ``error_metrics``
    are reported with their sign flipped back to positive, and a metric named ``'RMSE'``
    is reported as the square root of the mean score.

    Args:
        features: column names of ``X`` to train on.
        metrics: mapping of metric name to scorer, passed to ``GridSearchCV`` as ``scoring``.
        refit: name of the metric used to refit the best estimator, or ``False`` to skip
            refitting (in which case no best params / estimator are available).

    Returns:
        A tuple ``(summary_df, details)``: ``summary_df`` has one row per model with the
        columns ``model_name``, one column per metric and ``<metric>_low`` / ``<metric>_high``
        confidence bounds; ``details`` maps each model name to a dict with the keys
        ``'params'``, ``'estimator'`` and ``'scores'``.
    """
    grids = get_models_param_grid(len(features))
    metric_list = list(metrics.keys())
    confidence_interval_cols = []
    for metric_name in metric_list:
        confidence_interval_cols.extend([f'{metric_name}_low', f'{metric_name}_high'])
    n_splits = len(domains_cv)

    agg_cv_results = {}
    summary_rows = []
    for model_name, model in models.items():
        gscv = GridSearchCV(model, grids[model_name], scoring=metrics, n_jobs=10, cv=domains_cv,
                            refit=refit, verbose=0)
        gscv.fit(X=X[features], y=y)
        results_df = pd.DataFrame.from_dict(gscv.cv_results_, orient='columns')

        summary_row = {'model_name': model_name}
        best_scores = {}
        for metric_name in metric_list:
            split_columns = [f'split{i}_test_{metric_name}' for i in range(n_splits)]
            # Candidate with the highest mean score; scorers are "greater is better",
            # including the negated error scorers.
            best_index = results_df[split_columns].mean(axis=1).idxmax()
            fold_scores = np.array([gscv.cv_results_[column][best_index] for column in split_columns])
            best_score = fold_scores.mean()
            if metric_name in error_metrics:
                # Undo the sign flip applied by make_scorer(greater_is_better=False)
                fold_scores = -fold_scores
                best_score = -best_score
            # NOTE(review): for 'RMSE' the bounds are computed on the per-fold scores
            # before any square root, i.e. they are in MSE units -- confirm this is intended.
            low, high = confidence_intervals(fold_scores)
            if metric_name == 'RMSE':
                best_score = math.sqrt(best_score)
            best_scores[metric_name] = best_score
            summary_row[metric_name] = best_score
            summary_row[f'{metric_name}_low'] = low
            summary_row[f'{metric_name}_high'] = high
        summary_rows.append(summary_row)

        # best_params_ / best_estimator_ only exist when the search refitted a model;
        # with refit=False they would raise AttributeError.
        agg_cv_results[model_name] = {
            'params': gscv.best_params_ if refit else None,
            'estimator': gscv.best_estimator_ if refit else None,
            'scores': best_scores
        }

    # Built in one go from the collected rows instead of filling an empty frame via .loc
    agg_cv_results_df = pd.DataFrame(summary_rows, columns=['model_name'] + metric_list + confidence_interval_cols)
    return agg_cv_results_df, agg_cv_results

In [None]:
cv_results = {}

In [None]:
# Tune every model on every feature set. The per-feature-set summaries are collected in
# a list and concatenated once (DataFrame.append no longer exists in pandas >= 2.0).
# scores_df is rebuilt from scratch, so re-running this cell does not duplicate rows.
tuned_frames = []
with tqdm(get_feature_sets(num_concepts=n_concepts).items(), desc='Tuning regressors') as pbar:
    for feature_names, features in pbar:
        pbar.set_postfix(features=feature_names)
        best_metrics, local_cv_results = get_tuned_model_stats(features, dict(zip(metric_names, metrics)), refit='absolute_error')
        best_metrics['feature_set'] = feature_names
        tuned_frames.append(best_metrics)
        cv_results[feature_names] = local_cv_results
if tuned_frames:
    # Same column layout as the scores_df template
    scores_df = pd.concat(tuned_frames, ignore_index=True).reindex(columns=columns)

In [None]:
# LaTeX exports of the tuned scores: (model class, file prefix, file suffix, columns).
error_table_columns = ['feature_set', 'relative_error', 'absolute_error', 'RMSE']
mae_ci_table_columns = ['feature_set', 'absolute_error', 'absolute_error_low', 'absolute_error_high']
latex_table_specs = (
    ('GradientBoostingRegressor', 'gbr', 'feature_sets', error_table_columns),
    ('ElasticNet', 'en', 'feature_sets', error_table_columns),
    ('ElasticNet', 'en', 'mae_with_ci', mae_ci_table_columns),
    ('GradientBoostingRegressor', 'gbr', 'mae_with_ci', mae_ci_table_columns),
)
for table_model, file_prefix, file_suffix, table_columns in latex_table_specs:
    table_rows = scores_df[scores_df['model_name'] == table_model]
    table_rows[table_columns].to_latex(
        str(TABLES_PATH / f'{file_prefix}_{n_grams}_{file_suffix}.tex'),
        index=False,
        escape=False,
        float_format="${:0.4f}$".format,
    )


In [None]:
scores_df[scores_df['model_name'] == 'GradientBoostingRegressor'][['feature_set', 'relative_error', 'absolute_error', 'RMSE']]

In [None]:
scores_df.to_excel(r'E:\OneDrive - Technion\Technion\Graduate\Research\analysis tables\kmeans_metrics_per_feature_set.xlsx')

In [None]:
# One bar chart per (model, error metric) showing the tuned score of every feature set.
for plotted_model, file_tag in (('GradientBoostingRegressor', 'gbr'), ('ElasticNet', 'en')):
    plotted_scores = scores_df[scores_df['model_name'] == plotted_model]
    for metric_name in ['absolute_error', 'relative_error', 'RMSE']:
        plotted_scores[['feature_set', metric_name]].plot(kind='bar', x='feature_set')
        plt.ylabel(metric_name)
        plt.savefig(str(IMAGES_PATH / f'generalization_results_{metric_name}_{n_grams}_{file_tag}.eps'), format='eps')

In [None]:
print(scores_df)

In [None]:
# Side-by-side comparison of the models: rows are feature sets, columns are
# (model name, metric) pairs restricted to the metrics listed here.
desired_metrics = ['relative_error', 'absolute_error']
per_model_data = pd.concat([scores_df[scores_df['model_name'] == model_name].set_index('feature_set')[desired_metrics] for model_name in scores_df['model_name'].unique()], keys=scores_df['model_name'].unique().tolist(), axis=1)
# Full table; column_format 'l|cc|cc|cc' is laid out for exactly three models with two metrics each.
per_model_data.to_latex(str(TABLES_PATH / f'all_model_comparison_{n_grams}.tex'), index=True, multicolumn=True, float_format="${:0.4g}$".format, multicolumn_format='c', escape=False, column_format='l|cc|cc|cc')
# Two-model subsets of the same table
per_model_data[['ElasticNet', 'GradientBoostingRegressor']].to_latex(str(TABLES_PATH / f'part_model_comparison_{n_grams}.tex'), index=True, multicolumn=True, float_format="${:0.4g}$".format, multicolumn_format='c', escape=False, column_format='l|cc|cc')
per_model_data[['LinearRegression', 'ElasticNet']].to_latex(str(TABLES_PATH / f'linear_model_comparison_{n_grams}.tex'), index=True, multicolumn=True, float_format="${:0.4g}$".format, multicolumn_format='c', escape=False, column_format='l|cc|cc')

#### Analyzing the tuned models

Analyzing the feature importance

In [None]:
from itertools import product

In [None]:
def get_weights_equivalent(results_dict: Dict[str, Any], feature_set: str, estimator_class: str) -> Iterable[float]:
    """Return the per-feature weights of a tuned estimator.

    The estimator is looked up as ``results_dict[feature_set][estimator_class]['estimator']``.
    Linear models (exact types LinearRegression, Lasso, Ridge, ElasticNet) yield ``coef_``;
    anything else must be a gradient-boosting or random-forest regressor and yields
    ``feature_importances_``.
    """
    fitted = results_dict[feature_set][estimator_class]['estimator']
    linear_types = (LinearRegression, Lasso, Ridge, ElasticNet)
    if type(fitted) not in linear_types:
        assert isinstance(fitted, (GradientBoostingRegressor, RandomForestRegressor)), f'expected tree ensemble type, got {type(fitted)}'
        return fitted.feature_importances_
    return fitted.coef_

def features_by_importance(results_dict: Dict[str, Any], num_concepts=n_concepts) -> pd.DataFrame:
    """List the weight of every feature for every (feature set, model) combination.

    Args:
        results_dict: tuned results keyed by feature-set name and then by model name,
            as consumed by ``get_weights_equivalent``.
        num_concepts: forwarded to ``get_feature_sets`` to obtain the feature sets.

    Returns:
        A frame with one row per (feature set, model, feature) and the columns
        ``feature_set``, ``model``, ``feature_importances`` (the ``(name, weight)`` pair),
        ``feature_name`` and ``feature_importance`` (float). Within each
        (feature set, model) group the rows are ordered by decreasing weight.
    """
    output_columns = ['feature_set', 'model', 'feature_importances', 'feature_name', 'feature_importance']
    records = []
    for (feature_set_name, feature_set), model_name in product(get_feature_sets(num_concepts).items(), models.keys()):
        weights = get_weights_equivalent(results_dict, feature_set_name, model_name)
        ranked = sorted(zip(feature_set, weights), key=lambda pair: -pair[1])
        # Flat records are emitted directly (instead of exploding a list column and
        # assigning new columns through .loc) so that feature_importance gets a numeric
        # dtype and an empty feature set simply contributes no rows.
        for feature_name, weight in ranked:
            records.append({
                'feature_set': feature_set_name,
                'model': model_name,
                'feature_importances': (feature_name, weight),
                'feature_name': feature_name,
                'feature_importance': float(weight),
            })
    return pd.DataFrame(records, columns=output_columns)

In [None]:
# NOTE(review): outlier_coefficient is only referenced by the disabled filter at the end of this cell.
outlier_coefficient = 10
feature_importances_df = features_by_importance(cv_results)
# Keep only the features that received a non-zero weight / importance
feature_importances_df = feature_importances_df[feature_importances_df['feature_importance'].abs() > 0]
# Disabled: drop rows whose importance lies more than `outlier_coefficient` standard deviations from the mean
# feature_importances_df = feature_importances_df[(feature_importances_df['feature_importance'] - feature_importances_df['feature_importance'].mean()).abs() <= feature_importances_df['feature_importance'].std() * outlier_coefficient]

In [None]:
# Mean importance per feature over all feature sets without SHAP features, top 15 per model.
# Only the numeric 'feature_importance' column is averaged: calling mean() on the whole
# group (which also holds string/tuple columns) raises a TypeError in pandas >= 2.0.
shap_feature_sets = ['shap', 'shap + baseline', 'shap + baseline + concept DF']
no_shap_importances = feature_importances_df[~feature_importances_df['feature_set'].isin(shap_feature_sets)]
for importance_model, file_tag in (('LinearRegression', 'lr'), ('ElasticNet', 'en'), ('GradientBoostingRegressor', 'gbr')):
    model_importances = no_shap_importances[no_shap_importances['model'] == importance_model]
    (
        model_importances
        .assign(feature_importance=model_importances['feature_importance'].astype(float))
        .groupby('feature_name')[['feature_importance']]
        .mean()
        .sort_values(by='feature_importance', ascending=False)[:15]
        .plot(kind='bar')
    )
    plt.savefig(str(IMAGES_PATH / f'{n_grams}_{file_tag}_no_shap_feature_importance.eps'), format='eps')

Analyzing error distribution for different feature sets

In [None]:
model_to_analyze = 'GradientBoostingRegressor'

In [None]:
def display_error_distribution(X, y, feature_set: str, results_dict: Dict, model_to_analyze: str, metrics: Dict, error_type: str = 'absolute_error'):
    """Plot the distribution of per-sample errors of a tuned model under domain-wise CV.

    The model class and its best parameters are taken from
    ``results_dict[feature_set][model_to_analyze]``; a fresh model is fitted on every
    training split returned by ``get_one_domain_out_cv(X)`` and scored sample by sample
    on the matching test split.

    Args:
        X: feature frame.
        y: label series, row-aligned with ``X``.
        feature_set: name of the feature set (key of ``get_feature_sets(n_concepts)``).
        results_dict: tuned results keyed by feature set and model name.
        model_to_analyze: name of the model whose errors are analyzed.
        metrics: mapping of metric name to a function ``metric(y_true, y_pred)``.
        error_type: which metric's distribution to plot (KDE).

    Returns:
        Frame with one row per test sample and one column per metric.
    """
    best_model_stats = results_dict[feature_set][model_to_analyze]
    model_class = type(best_model_stats['estimator'])
    model_params = best_model_stats['params']
    X_features = X[get_feature_sets(n_concepts)[feature_set]]
    raw_errors = {metric: [] for metric in metrics}
    for train_idx, test_idx in get_one_domain_out_cv(X):
        model = model_class(**model_params)
        # X and y are both sliced by position. Previously X was filtered by index *label*
        # while y was taken by position, which pairs the wrong rows whenever the index is
        # not a sorted 0..n-1 range or the split indices are not sorted.
        model.fit(X_features.iloc[train_idx], y.iloc[train_idx])
        y_pred = model.predict(X_features.iloc[test_idx])
        y_true = y.iloc[test_idx]
        for metric_name, scorer in metrics.items():
            # Score every sample on its own to obtain a per-sample error distribution
            raw_errors[metric_name].extend(
                scorer(pd.Series([sample_true]), pd.Series([sample_pred]))
                for sample_true, sample_pred in zip(y_true, y_pred)
            )
    errors_df = pd.DataFrame.from_dict(raw_errors)
    errors_df[error_type].plot(kind='kde', title=feature_set)
    return errors_df

In [None]:
_ = display_error_distribution(X, y, 'ates', cv_results, model_to_analyze, dict(zip(metric_names[1:], metric_funcs[1:])), error_type='absolute_error')
plt.figure()
_ = display_error_distribution(X, y, 'shap', cv_results, model_to_analyze, dict(zip(metric_names[1:], metric_funcs[1:])), error_type='absolute_error')

Displaying regression plot for the learned models

In [None]:
def draw_regression_curve(feature_set: str, num_concepts: int = n_concepts, confidence: float = 0.95):
    """Plot predicted vs. true target performance per held-out domain, with quantile bands.

    For every split of ``get_one_domain_out_cv(X, add_domain_name=True)`` three copies of the
    tuned GradientBoostingRegressor for ``feature_set`` are fitted with the quantile loss at
    ``1 - confidence``, ``0.5`` and ``confidence``. The median prediction is drawn through
    ``draw_regression``; the smoothed outer quantiles are drawn as dotted black lines.
    The figure is saved as PNG and EPS under ``IMAGES_PATH``.

    NOTE(review): with the default confidence=0.95 the outer quantiles are 0.05 and 0.95,
    i.e. a 90% central band -- confirm this is the intended meaning of ``confidence``.

    Returns:
        The current matplotlib axes.
    """
    selected_features = get_feature_sets(num_concepts)[feature_set]
    collected = defaultdict(list)
    plt.figure(figsize=(7, 4))
    ax = plt.gca()
    folds = get_one_domain_out_cv(X, add_domain_name=True)
    for train_labels, test_labels, held_out_domain in tqdm(folds):
        # Test rows whose source is the held-out domain itself are excluded
        not_from_held_out = X['source'] != held_out_domain
        x_train = X[X.index.isin(train_labels)][selected_features]
        x_test = X[X.index.isin(test_labels) & not_from_held_out][selected_features]
        y_train = y[y.index.isin(train_labels)]
        domain_y_true = y[y.index.isin(test_labels) & not_from_held_out]

        template = cv_results[feature_set]['GradientBoostingRegressor']['estimator']
        base_params = template.get_params()
        # 'loss' and 'alpha' are overridden for the quantile models below
        for overridden in ('loss', 'alpha'):
            base_params.pop(overridden, None)

        quantile_levels = {'low': 1 - confidence, 'median': 0.5, 'high': confidence}
        estimators = {
            band: type(template)(**base_params, loss='quantile', alpha=level)
            for band, level in quantile_levels.items()
        }
        for quantile_model in estimators.values():
            quantile_model.fit(x_train, y_train)
        y_preds = {band: quantile_model.predict(x_test) for band, quantile_model in estimators.items()}

        for band, predictions in y_preds.items():
            collected[band].extend(predictions)
        collected['source_acc'].extend(x_test['source_acc'].tolist())
        ax = draw_regression(x_test['source_acc'] - y_preds['median'], x_test['source_acc'] - domain_y_true, x_label='y_pred', y_label='y_true', label=held_out_domain)

    interval_df = pd.DataFrame.from_dict(collected, orient='columns')
    interval_df.loc[:, 'x_axis'] = interval_df['source_acc'] - interval_df['median']
    interval_df = interval_df.drop_duplicates(subset=['x_axis']).sort_values('x_axis')
    print(len(interval_df.index))
    lower_band = smooth(interval_df['source_acc'] - interval_df['low'], 0.5)
    upper_band = smooth(interval_df['source_acc'] - interval_df['high'], 0.5)
    ax.plot(
        interval_df['x_axis'], lower_band, 'k:',
        interval_df['x_axis'], upper_band, 'k:',
    )
    for extension in ('png', 'eps'):
        plt.savefig(str(IMAGES_PATH / f'{n_grams}_{feature_set}_regression_match_plot.{extension}'), format=extension)
    return plt.gca()

In [None]:
draw_regression_curve('all - performance', num_concepts=n_concepts)

### Analyzing the fit capabilities of the models

In [None]:
def get_model_fit_metrics(X: pd.DataFrame, y: pd.Series, metrics: Dict[str, Any]) -> pd.DataFrame:
    """Score every tuned estimator in ``cv_results`` on the given data.

    Each scorer in ``metrics`` is called as ``scorer(estimator, X[features], y)`` for every
    feature set of ``get_feature_sets(num_concepts=n_concepts)`` and every estimator stored
    for it. The 'RMSE' column is square-rooted and all ``error_metrics`` columns are
    flipped back to positive values.

    Returns:
        Frame with the columns ``feature_set``, ``model_class`` and one column per metric.
    """
    rows = [
        dict(
            feature_set=set_name,
            model_class=class_name,
            **{
                metric_name: scorer(tuned['estimator'], X[set_features], y)
                for metric_name, scorer in metrics.items()
            },
        )
        for set_name, set_features in get_feature_sets(num_concepts=n_concepts).items()
        for class_name, tuned in cv_results[set_name].items()
    ]
    fit_df = pd.DataFrame.from_records(rows)
    # The 'RMSE' scorer returns a negated MSE: root it while keeping the sign negative ...
    fit_df['RMSE'] = -(np.sqrt((-fit_df['RMSE'])))
    # ... then turn every negated error score into a positive error
    fit_df[error_metrics] *= -1
    return fit_df

In [None]:
fit_results_df = get_model_fit_metrics(X, y, dict(zip(metric_names, metrics)))
fit_results_df

In [None]:
# One bar chart per (model, error metric) showing the fit score of every feature set.
for fitted_model, file_tag in (('GradientBoostingRegressor', 'gbr'), ('ElasticNet', 'en')):
    fitted_model_results = fit_results_df[fit_results_df['model_class'] == fitted_model]
    for metric_name in ['relative_error', 'RMSE', 'absolute_error']:
        fitted_model_results[['feature_set', metric_name]].plot(kind='bar', x='feature_set')
        plt.ylabel(metric_name)
        plt.savefig(str(IMAGES_PATH / f'fit_results_{metric_name}_{n_grams}_{file_tag}.eps'), format='eps')
        plt.show()

In [None]:
# Same fit-score bar charts for the plain linear regression
lr_fit_results = fit_results_df[fit_results_df['model_class'] == 'LinearRegression']
for metric_name in ['relative_error', 'RMSE', 'absolute_error']:
    lr_fit_results[['feature_set', metric_name]].plot(kind='bar', x='feature_set')
    plt.ylabel(metric_name)
    plt.savefig(str(IMAGES_PATH / f'fit_results_{metric_name}_{n_grams}_lr.eps'), format='eps')
    plt.show()

### The effect of #_concepts
Testing how the number of concepts fed to the estimator affects its performance.

In [None]:
# Re-run the tuning for every number of concepts from 0 to n_concepts.
# Metric names get a 'best_' prefix here, so get_tuned_model_stats leaves the scores
# as raw scorer outputs (error metrics stay negated, 'RMSE' is not rooted).
per_concept_num_cv_results = {}
specialized_metric_names = [f'best_{metric}' for metric in metric_names]
per_concept_leading_cols = ['model_name', 'feature_set', 'num_concepts'] + specialized_metric_names
# Summaries are collected and concatenated once (DataFrame.append no longer exists in pandas >= 2.0)
per_concept_frames = []
for local_n_concepts in tqdm(range(0, n_concepts + 1), desc='n_concepts', leave=False):
    per_concept_num_cv_results[local_n_concepts] = {}
    for feature_names, features in get_feature_sets(local_n_concepts).items():
        if len(features) == 0:
            continue
        best_metrics, local_cv_results = get_tuned_model_stats(features, dict(zip(specialized_metric_names, metrics)), refit='best_absolute_error')
        best_metrics['feature_set'] = feature_names
        best_metrics['num_concepts'] = local_n_concepts
        per_concept_frames.append(best_metrics)
        per_concept_num_cv_results[local_n_concepts][feature_names] = local_cv_results
if per_concept_frames:
    per_concept_num_scores_df = pd.concat(per_concept_frames, ignore_index=True)
    # Identification and score columns first, confidence-bound columns after them
    per_concept_trailing_cols = [col for col in per_concept_num_scores_df.columns if col not in per_concept_leading_cols]
    per_concept_num_scores_df = per_concept_num_scores_df[per_concept_leading_cols + per_concept_trailing_cols]
else:
    per_concept_num_scores_df = pd.DataFrame(columns=per_concept_leading_cols)
# Negated MSE -> negated RMSE (the sign is kept so that "larger is better" still holds)
per_concept_num_scores_df.loc[:, 'best_RMSE'] = -1 * np.sqrt(-1 * per_concept_num_scores_df['best_RMSE'].astype(float).values)

In [None]:
per_concept_num_scores_df.loc[:, 'best_RMSE'] = per_concept_num_scores_df.apply(lambda row: per_concept_num_cv_results[row['num_concepts']][row['feature_set']][row['model_name']]['scores']['best_RMSE'], axis=1)
per_concept_num_scores_df.head()

In [None]:
# Turn the negated MSE stored in 'best_RMSE' into a negated RMSE.
# NOTE(review): not idempotent -- running this cell twice takes the root twice; re-run the
# cell that reloads 'best_RMSE' from per_concept_num_cv_results before running it again.
per_concept_num_scores_df.loc[:, 'best_RMSE'] = -1 * (np.sqrt(-1 * per_concept_num_scores_df['best_RMSE'].astype(float).values))

In [None]:
per_concept_num_scores_df[['num_concepts', 'best_RMSE', 'best_absolute_error', 'best_relative_error']].groupby('num_concepts').max().plot(figsize=(16, 12), marker='o')
plt.savefig(str(IMAGES_PATH / f'{n_grams}_performance_per_n_concepts.eps'), format='eps')

In [None]:
n_grams