# Error Analysis for Tabular Data.

In [None]:
# Platform information: load the `watermark` IPython extension and print its default report.
%load_ext watermark
%watermark

In [None]:
#Import all libraries
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from raiwidgets import ErrorAnalysisDashboard
from probatus.interpret import ShapModelInterpreter
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2
from src.model import select_features
from src.model import tune_parameters,show_model_results

In [None]:
# Package versions: ask watermark (via its --iversions flag) to report the versions of the imported packages.
%watermark --iversions

### Read & Clean Data

In [None]:
# Load the HELOC dataset and, in the same chain, apply the cleaning rule that
# came out of the error analysis: keep only rows where both
# `NumSatisfactoryTrades` and `ExternalRiskEstimate` are non-negative.
data = (
    pd.read_csv('../data/heloc_dataset_v1.csv')
    .loc[
        lambda frame: (frame['NumSatisfactoryTrades'] >= 0)
        & (frame['ExternalRiskEstimate'] >= 0)
    ]
)

In [None]:
# Build the binary target: 1 when the label text contains 'Bad', otherwise 0.
y = data['RiskPerformance'].map(lambda label: int('Bad' in label))
print(f"Class balance :\n{y.value_counts(normalize=True)}")

# Every column other than the label is used as a feature.
X = data.drop(columns='RiskPerformance')

# Stratified 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=345,
    stratify=y,
)

# Bundle the four splits under the keys the project helpers are called with.
data_dict = dict(xtrain=X_train, ytrain=y_train, xtest=X_test, ytest=y_test)

### Feature Selection

In [None]:

# Run the project's feature-selection helper on the train/test splits.
# `selected_features` is used below to index the feature columns; `fs_plot` is
# presumably the plot produced by the helper (TODO confirm against src.model).
selected_features,fs_plot = select_features(data=data_dict)

We can see that 5 features are the most important, so we will go ahead with only these.

This also makes the error analysis easier.

In [None]:
# Restrict both feature frames (train and test) to the selected columns;
# the target entries in `data_dict` are left untouched.
data_dict.update(
    {
        split: data_dict[split][selected_features]
        for split in ('xtrain', 'xtest')
    }
)

### Train Model

In [None]:
# Tune hyper-parameters, starting from a default XGBoost classifier.
base_model = XGBClassifier()
model_param = tune_parameters(data=data_dict, model=base_model)

# Add monotonic constraints: one sign per feature column, in column order.
# NOTE(review): the 5-tuple assumes exactly five selected features in this
# order — confirm against `selected_features` if feature selection changes.
model_param['monotone_constraints'] = (-1, -1, -1, +1, -1)
print(f"Creating model with parameters : {model_param}")

# Build the classifier from the tuned + constrained parameters and evaluate
# THAT classifier. The original cell passed the untuned default model here,
# so `clf` (and with it the tuning and the constraints) was never used.
clf = XGBClassifier(**model_param)
# `show_model_results` is expected to return the model used downstream for
# `predict_proba` — presumably fitted; verify in src.model.
model = show_model_results(data=data_dict, model=clf)

So it is a `decent` model. Now let's apply the various error analysis methods and see whether we can use them to
improve the model.

## Error Analysis

The error analysis library by Microsoft works with predictions.

Thus we will convert the probabilities to predictions, using the above output.
0.57 seems to be a good threshold.

In [None]:
# Probability of the positive class (column 1) for every test row.
y_test_proba = model.predict_proba(data_dict['xtest'])[:, 1]

# Hard labels: 1 where the probability is strictly above the 0.57 threshold, else 0.
predictions = (y_test_proba > 0.57).astype(int)

# Feature names are taken from the training frame's columns.
features = data_dict['xtrain'].columns

# Launch the dashboard on the test set; the trailing `;` suppresses the cell's repr output.
ErrorAnalysisDashboard(
    dataset=data_dict['xtest'],
    true_y=data_dict['ytest'],
    features=features,
    pred_y=predictions,
);

After error analysis we see that our model does not perform well on certain sub-populations.
Once you know that, there are a few things you can do; one of them is model assertion.

The main idea of an assertion is that you define certain thresholds, i.e. value ranges for which the model has seen data. Outside those ranges the model has no knowledge, so its output should say so.

Let's try model stacking using only the data points on which this model is making an error.

In [None]:
# Explain the model with SHAP through probatus.
# `ShapModelInterpreter` is imported in the notebook's import cell rather than
# mid-notebook, so all dependencies are visible (and fail fast) at the top.
shap_interpreter = ShapModelInterpreter(model)

# Splits are passed positionally in the order: train X, test X, train y, test y.
# `approximate=False` is forwarded as a keyword — presumably to the SHAP
# computation; confirm against the probatus documentation.
feature_importance = shap_interpreter.fit_compute(
    data_dict['xtrain'],
    data_dict['xtest'],
    data_dict['ytrain'],
    data_dict['ytest'],
    approximate=False,
)

# Feature-importance plot; the trailing `;` suppresses the returned object's repr.
shap_interpreter.plot('importance');

We see that `ExternalRiskEstimate` is the most important feature. It also contributes to a lot of errors.

Let's look at the data and see if we can spot any anomalies.

In [None]:
# SHAP dependence plot for `ExternalRiskEstimate`; the returned object is kept in `ax`.
ax = shap_interpreter.plot('dependence', target_columns=['ExternalRiskEstimate'])

In [None]:
# SHAP dependence plot for `NumSatisfactoryTrades`; rebinds `ax` to this plot's return value.
ax = shap_interpreter.plot('dependence', target_columns=['NumSatisfactoryTrades'])