# Random Forest Imputations
Random Forest has worked best so far. Until now, we've mainly used zero imputation and a K-Means imputation with a Random Forest. Let's take a look at what other imputation methods would do, both for the case of one Random Forest model for the two chains together and for two Random Forest models for the two chains separately.

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from util import get_train_dataset, get_features, fix_test, evaluate
import numpy as np
from sklearn import impute

# Load the training data through the project helper.
df = get_train_dataset()
# df = df.sample(n=1000)

# Keep the target column aside before deriving the model features.
df_reaction_column = df['reaction']
df_features = get_features(df)

# Replace both indexes with a fresh 0..n-1 range so that the column
# assignment below pairs rows by position rather than by the original labels.
df_features.reset_index(drop=True, inplace=True)
df_reaction_column.reset_index(drop=True, inplace=True)

# Sanity checks: one label per feature row, and no NaN labels.
assert df_features.shape[0] == df_reaction_column.shape[0]
assert np.isnan(df_reaction_column).sum().sum() == 0

# add the reaction column to the features (will be dropped later on)
df_features['reaction'] = df_reaction_column

In [3]:
# Split the combined frame back into the feature matrix and the target vector.
x = df_features.drop(columns=['reaction'])
y = df_features['reaction']

# Sanity checks: the target must not remain among the features,
# and every row must carry a (non-NaN) label.
assert 'reaction' not in x.columns
assert not np.isnan(y).any()

In [4]:
# get sklearn version
from sklearn import __version__ as sklearn_version
print('The scikit-learn version is {}.'.format(sklearn_version))

The scikit-learn version is 1.1.0.


## Imputations

In [5]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0)

### Simple Imputer

#### Zero Imputation

In [6]:
# zero imputation
imputer_zero = impute.SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
x_zero_imputed = imputer_zero.fit_transform(x)

In [7]:
evaluate(clf, x_zero_imputed, y)

[0.88211648 0.88849494 0.87463934 0.85895543 0.86702376]
ROC: 0.874 (+/- 0.021)


#### Mean Imputation

In [8]:
# Mean imputation
imputer_mean = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
x_mean_imputed = imputer_mean.fit_transform(x)

In [9]:
evaluate(clf, x_mean_imputed, y)

[0.87717905 0.88396071 0.87404344 0.86072808 0.86775395]
ROC: 0.873 (+/- 0.016)


#### Most Frequent Imputation

In [10]:
# Most frequent imputation
imputer_most_frequent = impute.SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x_most_frequent_imputed = imputer_most_frequent.fit_transform(x)

  mode = stats.mode(array)


In [11]:
evaluate(clf, x_most_frequent_imputed, y)

[0.87694658 0.88724911 0.8685433  0.85760671 0.86910991]
ROC: 0.872 (+/- 0.020)


#### Median Imputation

In [12]:
# Median imputation
imputer_median = impute.SimpleImputer(missing_values=np.nan, strategy='median')
x_median_imputed = imputer_median.fit_transform(x)

In [13]:
evaluate(clf, x_median_imputed, y)

[0.87087147 0.8889853  0.87565571 0.8607092  0.86855012]
ROC: 0.873 (+/- 0.019)


#### Multivariate Imputation

In [14]:
# Takes way too long

# Multivariate imputer that estimates each feature from all the others.
# A strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion
# src: https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html

# Extra import required: https://stackoverflow.com/questions/55846680/can-not-import-iterativeimputer-from-sklearn-impute
from sklearn.experimental import enable_iterative_imputer

# imputer = impute.IterativeImputer(max_iter=10, random_state=0)
# x_iterative_imputed = imputer.fit_transform(x)

In [15]:
# evaluate(clf, x_iterative_imputed, y)

### Random Forest Imputation
Use a random forest regression model to impute the missing values.


In [None]:
# Random Forest imputation
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
imputer_random_forest = impute.IterativeImputer(estimator=RandomForestRegressor(random_state=0), random_state=0)
x_random_forest_imputed = imputer_random_forest.fit_transform(x)

In [None]:
evaluate(clf, x_random_forest_imputed, y)

## Missing Indicator

In [None]:
# Note: It's important to create the missing indicator based on df and not on the features generated from it, since the features will always have missing values (because of the positional features)
df.head()

In [None]:
# Do a constant 0 imputation, but add an indicator whether the sample contained any missing value or not
# Create a vector representing whether a row has a missing value or not
missing_indicator = impute.MissingIndicator(missing_values=np.nan)
missing_indicator.fit_transform(df)[:5]

In [None]:
# replace each row in missing_indicator with its "any" value
x_any_missing = missing_indicator.transform(df).any(axis=1)
x_any_missing[:5]

In [None]:
# Create a copy of x
x_missing_indicator = x.copy()
assert x_missing_indicator.shape[0] == x_any_missing.shape[0]

In [None]:
# Add a column to x_missing_indicator with the missing indicator
x_missing_indicator['any_missing'] = x_any_missing

In [None]:
x_missing_indicator

In [None]:
imputer_zero = impute.SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
x_zero_imputed_and_missing_indicator = imputer_zero.fit_transform(x_missing_indicator)

In [None]:
evaluate(clf, x_zero_imputed_and_missing_indicator, y)

## Test set without missing values

In [None]:
from sklearn.model_selection import train_test_split
from util import get_train_dataset, get_features, fix_test, evaluate, calculate_auc_and_plot

# Re-read the training data via the project helper for this section.
df = get_train_dataset()
# df = df.sample(n=1000) # for faster debugging

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Keep only complete rows in the held-out set. Rebinding the name instead of
# using `inplace=True` avoids mutating the frame handed back by the split,
# which pandas may flag with a SettingWithCopyWarning.
test = test.dropna()

In [None]:
x = get_features(train)
y = train['reaction']

In [None]:
x_test = get_features(test, test=True)
x_test  = fix_test(x_test, x.columns)
y_test = test['reaction']

In [None]:
clf = RandomForestClassifier(random_state=0)

In [None]:
def evaluate_imputer(imputer):
    """Fit ``imputer`` and the shared ``clf`` on the training split and print the test AUC.

    Reads the notebook-level names ``x``, ``y``, ``x_test``, ``y_test`` and
    ``clf``. The imputer is fitted on ``x`` only and then applied to ``x_test``.

    Args:
        imputer: object exposing ``fit_transform`` and ``transform``.

    Returns:
        None. The value returned by ``calculate_auc_and_plot`` is printed.
    """
    # Fit the imputer on the training features, then train the classifier on the result.
    clf.fit(imputer.fit_transform(x), y)

    # Score the test rows; column 1 of predict_proba is used as the prediction.
    test_scores = clf.predict_proba(imputer.transform(x_test))[:, 1]
    auc = calculate_auc_and_plot(y_test, test_scores)
    print(f'AUC: {auc:.3f}')

In [None]:
def get_columns_starting_with(df, prefix):
    """Return the sub-frame of ``df`` whose column names start with ``prefix``.

    Args:
        df: DataFrame whose column index supports the ``.str`` accessor.
        prefix: string passed to ``str.startswith`` on the column names.

    Returns:
        ``df`` restricted to the matching columns, in their original order.
    """
    has_prefix = df.columns.str.startswith(prefix)
    matching_columns = df.columns[has_prefix]
    return df[matching_columns]

In [None]:
# One classifier per chain; both use the same seed.
clf1 = RandomForestClassifier(random_state=0)
clf2 = RandomForestClassifier(random_state=0)
def evaluate_imputer_seperate_chains(imputer1, imputer2):
    """Train one model per chain and print the AUC of their averaged predictions.

    Columns prefixed ``alfa_`` go through ``imputer1`` and ``clf1``; columns
    prefixed ``beta_`` go through ``imputer2`` and ``clf2``. Reads the
    notebook-level names ``x``, ``y``, ``x_test`` and ``y_test``.

    Args:
        imputer1: imputer (``fit_transform`` / ``transform``) for the ``alfa_`` columns.
        imputer2: imputer (``fit_transform`` / ``transform``) for the ``beta_`` columns.

    Returns:
        None. The value returned by ``calculate_auc_and_plot`` is printed.
    """
    # Split the training features by column prefix.
    train_alpha = get_columns_starting_with(x, 'alfa_')
    train_beta = get_columns_starting_with(x, 'beta_')

    # Fit each imputer on its own chain and train the matching classifier.
    clf1.fit(imputer1.fit_transform(train_alpha), y)
    clf2.fit(imputer2.fit_transform(train_beta), y)

    # Same prefix split for the test features, then pass each part through
    # fix_test together with the corresponding training columns.
    test_alpha = get_columns_starting_with(x_test, 'alfa_')
    test_beta = get_columns_starting_with(x_test, 'beta_')
    test_alpha = fix_test(test_alpha, train_alpha.columns)
    test_beta = fix_test(test_beta, train_beta.columns)

    # Column 1 of predict_proba from each model, after imputing with the
    # already-fitted imputers.
    proba_alpha = clf1.predict_proba(imputer1.transform(test_alpha))[:, 1]
    proba_beta = clf2.predict_proba(imputer2.transform(test_beta))[:, 1]

    # Combine the two chains with an unweighted mean.
    y_pred = (proba_alpha + proba_beta) / 2
    auc = calculate_auc_and_plot(y_test, y_pred)

    print(f'AUC: {auc:.3f}')

### Zero Imputation

In [None]:
# Zero imputation
imputer_zero = impute.SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imputer_zero2 = impute.SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)

In [None]:
evaluate_imputer(imputer_zero)

In [None]:
evaluate_imputer_seperate_chains(imputer_zero, imputer_zero2)

In [None]:
# Mean imputation
imputer_mean = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_mean2 = impute.SimpleImputer(missing_values=np.nan, strategy='mean')

In [None]:
evaluate_imputer(imputer_mean)

In [None]:
evaluate_imputer_seperate_chains(imputer_mean, imputer_mean2)

In [None]:
# Most frequent imputation
imputer_most_frequent = impute.SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_most_frequent2 = impute.SimpleImputer(missing_values=np.nan, strategy='most_frequent')
evaluate_imputer(imputer_most_frequent)

In [None]:
evaluate_imputer_seperate_chains(imputer_most_frequent, imputer_most_frequent2)

In [None]:
# Median imputation
imputer_median = impute.SimpleImputer(missing_values=np.nan, strategy='median')
imputer_median2 = impute.SimpleImputer(missing_values=np.nan, strategy='median')
evaluate_imputer(imputer_median)

In [None]:
evaluate_imputer_seperate_chains(imputer_median, imputer_median2)

### Missing Indicator

In [None]:
# Flag the rows of the raw frames that contain at least one NaN.
# The indicator is fitted on `train` and re-used unchanged for `test`.
missing_indicator = impute.MissingIndicator(missing_values=np.nan)
x_any_missing_train = missing_indicator.fit_transform(train).any(axis=1)
# NOTE(review): `test` had its NaN rows removed with dropna() earlier in this
# notebook, so this flag is expected to be False for every test row; confirm
# that the indicator can add any signal in this setup.
x_any_missing_test = missing_indicator.transform(test).any(axis=1)
# One flag per feature row is needed for the column assignment that follows.
assert x_any_missing_train.shape[0] == x.shape[0]
assert x_any_missing_test.shape[0] == x_test.shape[0]

In [None]:
x_missing_indicator = x.copy()
x_missing_indicator_test = x_test.copy()

# Add a column to x_missing_indicator with the missing indicator
x_missing_indicator['any_missing'] = x_any_missing_train
x_missing_indicator_test['any_missing'] = x_any_missing_test

In [None]:
imputer_zero = impute.SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
x_zero_imputed_and_missing_indicator = imputer_zero.fit_transform(x_missing_indicator)
x_zero_imputed_and_missing_indicator_test = imputer_zero.transform(x_missing_indicator_test)

In [None]:
clf.fit(x_zero_imputed_and_missing_indicator, y)
y_pred = clf.predict_proba(x_zero_imputed_and_missing_indicator_test)[:, 1]
auc = calculate_auc_and_plot(y_test, y_pred)
print(f'AUC: {auc:.3f}')