# Install OlliePy

In [None]:
%%capture
!pip install -U Olliepy

# Import packages

In [None]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from olliepy import RegressionErrorAnalysisReport

# Import data

In [None]:
# Load the training and test splits from CSV files under ./data.
train_df = pd.read_csv('./data/BMI_train.csv')
test_df = pd.read_csv('./data/BMI_test.csv')

In [None]:
# Preview the first rows of the training data.
train_df.head()

# Build model

In [None]:
def plot_error_distribution(y_true, y_pred):
    """Draw a 100-bin histogram of the signed prediction errors.

    The error is ``y_pred - y_true``, so negative values are
    under-estimations and positive values are over-estimations.

    Args:
        y_true: Actual target values.
        y_pred: Predicted target values, element-wise comparable to ``y_true``.
    """
    residuals = y_pred - y_true
    pd.Series(data=residuals, name='Error').hist(bins=100)

In [None]:
def score_model(X_test, y_true, y_pred):
    """Print MAE, R^2 and adjusted R^2 for a set of predictions.

    Args:
        X_test: Two-dimensional feature matrix; only its ``shape`` is read
            (row count and column count feed the adjusted R^2 correction).
        y_true: Actual target values.
        y_pred: Predicted target values.
    """
    n_samples = X_test.shape[0]
    # NOTE(review): the usual adjusted R^2 uses the number of predictors p in
    # (n - 1) / (n - p - 1). Here p is taken as ``columns - 1``; if X_test
    # already excludes the target column this under-counts p by one. Confirm
    # the intended layout of X_test before relying on this figure.
    n_predictors = X_test.shape[1] - 1

    r2 = r2_score(y_true, y_pred)
    adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_predictors - 1)
    mae = mean_absolute_error(y_true, y_pred)

    print(f'MAE: {mae}')
    print(f'R^2: {r2}')
    print(f'Adjusted R^2: {adj_r2}')

In [None]:
# Column the model is trained to predict.
target_feature = 'BMI'

# Input columns grouped by type; these lists are reused when building the
# preprocessing step and when configuring the report.
numerical_features = ['weight', 'height']
categorical_features = ['gender']

In [None]:
# Preprocessing: one-hot encode the categorical columns and route every
# remaining column through a StandardScaler (via ``remainder``).
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
transformation_pipeline = make_column_transformer(
    (one_hot_encoder, categorical_features),
    remainder=StandardScaler(),
)

# Full model: preprocessing followed by a random forest regressor with a
# fixed seed, using all available cores (n_jobs=-1).
forest_regressor = RandomForestRegressor(random_state=77, n_jobs=-1)
pipeline = make_pipeline(transformation_pipeline, forest_regressor)

In [None]:
# Split both frames into a feature matrix (X) and a target vector (y).
#
# Later cells add an 'error' column to train_df and test_df. If this cell is
# re-run after that point, the residuals would otherwise end up in the feature
# matrices (and be fed to the model through the transformer's ``remainder``).
# Dropping 'error' with errors='ignore' keeps the cell idempotent: it is a
# no-op on the first run and removes the leak on any later run.
X_train = (train_df.drop(target_feature, axis=1)
           .drop(columns='error', errors='ignore'))
X_test = (test_df.drop(target_feature, axis=1)
          .drop(columns='error', errors='ignore'))
y_train = train_df.loc[:, target_feature]
y_test = test_df.loc[:, target_feature]

In [None]:
# Fit preprocessing and model on the training split. The trailing semicolon
# keeps the notebook from echoing the fitted estimator's repr.
pipeline.fit(X_train, y_train);

# Train results

In [None]:
# In-sample predictions on the training split.
y_train_pred = pipeline.predict(X_train)

In [None]:
# Histogram of the training errors (prediction minus actual).
plot_error_distribution(y_train, y_train_pred)

In [None]:
# Store the signed error (prediction minus actual) per training row;
# negative values are under-estimations.
train_df['error'] = y_train_pred - y_train

# Test results

In [None]:
# Predictions on the held-out test split.
y_test_pred = pipeline.predict(X_test)

In [None]:
# Print MAE, R^2 and adjusted R^2 for the test predictions.
score_model(X_test, y_test, y_test_pred)

# Calculate test error classes

In [None]:
# Store the signed error (prediction minus actual) per test row;
# negative values are under-estimations.
test_df['error'] = y_test_pred - y_test

In [None]:
# Histogram of the test errors; useful for choosing the error-class ranges.
plot_error_distribution(y_test, y_test_pred)

In [None]:
# Named ranges of the signed error (prediction minus actual), listed from the
# most negative to the most positive. Each value is a pair of bounds —
# presumably (lower, upper); how the boundaries are treated (inclusive or
# exclusive) is decided by OlliePy — TODO confirm.
# NOTE(review): errors below -8.0 or above 3.0 fall outside every range here.
error_classes = dict(
    EXTREME_UNDER_ESTIMATION=(-8.0, -4.0),
    HIGH_UNDER_ESTIMATION=(-4.0, -3.0),
    MEDIUM_UNDER_ESTIMATION=(-3.0, -1.0),
    LOW_UNDER_ESTIMATION=(-1.0, -0.5),
    ACCEPTABLE=(-0.5, 0.5),
    OVER_ESTIMATING=(0.5, 3.0),
)

# OlliePy Report

In [None]:
from olliepy import RegressionErrorAnalysisReport

In [None]:
# Configure the OlliePy regression error analysis report.
# The target column is taken from ``target_feature`` (defined with the other
# feature lists) instead of repeating the literal, so the model and the
# report cannot drift apart. Both frames must already carry the 'error'
# column named in ``error_column_name``.
report = RegressionErrorAnalysisReport(
    train_df=train_df,
    test_df=test_df,
    target_feature_name=target_feature,
    error_column_name='error',
    error_classes=error_classes,
    acceptable_error_class='ACCEPTABLE',
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    title='BMI Regression Report',
    subtitle='BMI distribution shift',
    output_directory='.',
    report_folder_name='BMI_REPORT',
    generate_encryption_secret=False,
)

In [None]:
# Build the report from the configuration above.
report.create_report()

## Serve report and display in a new browser tab

In [None]:
# Serve the report from a local server; per the heading above, 'server' mode
# shows it in a new browser tab.
report.serve_report_from_local_server(mode='server')

## Serve report and display in Jupyter

In [None]:
# Serve the report from a local server and display it inside the notebook
# ('jupyter' mode).
report.serve_report_from_local_server(mode='jupyter')

## Save the report as a zip archive to share it, or to download and view it locally when working on a cloud platform

In [None]:
# Save the report to disk and zip it (zip_report=True).
report.save_report(zip_report=True)