# Isolation forest for outlier detection

In [None]:
# Record the platform string and interpreter version this notebook ran on.
import platform
import sys

print(platform.platform())
print("Python", sys.version)

## Load data

In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt 
import seaborn as sns
sns.set_theme(style='darkgrid', palette='plasma')

from sklearn.ensemble import IsolationForest, HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
class cfg:
    """Notebook-wide constants, read as plain class attributes (never instantiated here)."""

    # random_state handed to the isolation forest
    seed: int = 42
    # number of CV splits (only referenced by the commented-out CV cell)
    nfolds: int = 10
    # n_jobs handed to the isolation forest
    njobs: int = 2

In [None]:
# Read the two training tables from disk.
raw_train_path = '../data/raw/train.csv'
combined_train_path = '../data/extra/train_comb.csv'

# the raw table uses its first CSV column as the row index ...
train = pd.read_csv(raw_train_path, index_col=0)
# ... while the combined table is read with pandas' default index
train_comb = pd.read_csv(combined_train_path)

# preview the first rows of the raw table
train.head()

In [None]:
# Response column; every other column of `train` is treated as a feature.
TARGET = 'MedHouseVal'
FEATURES = [col for col in train.columns if col != TARGET]

# feature matrix: the training frame without the response column
x = train.drop(columns=TARGET)

# quick summary of what was loaded
print(f'Target: {TARGET}\nFeatures: {FEATURES}')
print('Train set shape:', train.shape)
print('Combined train set shape:', train_comb.shape)

## Isolation Forest

In [None]:
# Isolation forest used as the outlier detector; hyper-parameters are gathered
# in one place, with parallelism and seed taken from `cfg`.
iso_params = dict(
    n_estimators=500,
    max_samples=0.3,
    contamination='auto',
    verbose=0,
    n_jobs=cfg.njobs,
    random_state=cfg.seed,
)
iso = IsolationForest(**iso_params)

# fit on the feature matrix only; the return value is bound to `_`
# (presumably to keep the notebook from echoing the estimator repr)
_ = iso.fit(x)

In [None]:
# predict anomaly scores and outlier labels for the training features
scores = iso.score_samples(x)
isin = iso.predict(x)  # rows labelled -1 are the ones reported as outliers

# count the outliers once with a vectorized reduction instead of running the
# builtin sum() over the boolean mask twice
n_outliers = np.count_nonzero(isin == -1)

print(f'The amount of predicted outliers in the train set is {n_outliers} ({n_outliers/train.shape[0]:.2%})')

In [None]:
# histogram of the anomaly scores over the training rows (40 bins, KDE overlay)
sns.histplot(scores, bins=40, kde=True)

In [None]:
# correlation of the anomaly scores with the response:
# np.corrcoef returns the 2x2 correlation matrix, [0, 1] is the off-diagonal entry
c = np.corrcoef(train[TARGET], scores)[0, 1]
print(f'Target correlation with anomaly scores on train set is {c:.4f}')

In [None]:
# for comparison: correlation of each feature column with the response
train[FEATURES].corrwith(train[TARGET])

In [None]:
# distribution of the response, split by the isolation-forest label in `isin`
sns.violinplot(data=train, x=isin, y=TARGET)

## Does the prediction improve?

In [None]:
# Three variants of the training frame to compare in cross-validation.

# 1) the original data, untouched
df1 = train.copy()

# 2) only the rows the isolation forest labelled 1 (i.e. not flagged as outliers)
df2 = df1[isin == 1]

# 3) the original data plus the anomaly score as an extra column.
#    assign() attaches the array positionally and under a string name; the
#    previous pd.concat with a bare Series aligned on index labels (train's
#    index comes from the CSV, the Series had a fresh 0..n-1 index) and added
#    a column labelled integer 0, mixing int and str column names.
df3 = train.assign(Scores=scores)

In [None]:
# cv = KFold(n_splits=cfg.nfolds, shuffle=True, random_state=cfg.seed)
# for df, name in zip([df1, df2, df3], ['Original', 'Without outliers', 'With anomaly score']):

#     model = HistGradientBoostingRegressor()
#     x = df.drop(TARGET, axis=1)
#     y = df[TARGET]

#     scores = cross_validate(model, x, y, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'], cv=cv, n_jobs=cfg.njobs)
#     mse, mae = np.mean(-scores['test_neg_mean_squared_error']), np.mean(-scores['test_neg_mean_absolute_error'])
    
#     print(f'{name:20}mse: {mse:.4f}, mae: {mae:.4f}')

In [None]:
# # save results
# tmp = train.copy()
# tmp['Scores'] = scores
# tmp['IsOut'] = isout.astype(object)