In [None]:
import json
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from prettytable import PrettyTable


from eliot import to_file
from saiva.model.shared.constants import MODEL_TYPE
# Register stdout as a destination for eliot log messages.
to_file(sys.stdout)

# Lower-case the imported model-type constant; it is interpolated into column
# names further down (see IDEN_COLS).
MODEL_TYPE = MODEL_TYPE.lower()
print('MODEL:', MODEL_TYPE)

In [None]:
# Show frames in full when displayed: no limit on rows, columns or total width.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# `None` means "no limit" (requires pandas >= 1.0). The former `-1` sentinel
# was deprecated in pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Directories holding the processed and the raw data sets.
processed_path = Path('/data/processed')
raw_path = Path('/data/raw')

In [None]:
base_df = pd.read_parquet(processed_path/'final_df.parquet')

In [None]:
# Regex -> group label for the cumulative features. Every key has the form
# '^<prefix>_<window>_<category>_.*' and maps to '<prefix>_<window>_<category>'.
# The nesting order (prefix, then category, then window) reproduces the
# insertion order of the hand-written table this replaces.
CUMULATIVE_GROUP_MAPPING = {
    rf'^{prefix}_{window}_{category}_.*': f'{prefix}_{window}_{category}'
    for prefix in ('cumsum', 'cumidx')
    for category in ('alert', 'dx', 'med', 'order', 'labs')
    for window in ('2_day', '7_day', '14_day', '30_day', 'all')
}

In [None]:
# Load the feature-group definitions. The context manager closes the file
# handle as soon as the JSON has been parsed instead of leaking it.
with open('./feature_groups.json', "r") as f:
    feature_groups = json.load(f)


# Not the most efficient code but not optimizing since the cell runs pretty fast
def get_feature_group_counts():
    """Count, for each feature group, how many columns of ``base_df`` belong to it.

    Reads the module-level ``base_df`` and ``feature_groups`` at call time, so
    calling it again after columns were dropped gives the updated counts.

    Returns:
        dict: group name -> number of current ``base_df`` columns that are
        members of ``feature_groups[group]``.
    """
    training_feats = base_df.columns
    return {
        grp: sum(1 for feat in training_feats if feat in members)
        for grp, members in feature_groups.items()
    }


In [None]:
def get_cumulative_group_counts():
    """Count the current ``base_df`` columns in each cumulative feature group.

    Each column name is rewritten with the regexes in
    ``CUMULATIVE_GROUP_MAPPING``; names that match no pattern stay unchanged
    and are ignored by the final lookup.

    Returns:
        dict: one entry per group label in ``CUMULATIVE_GROUP_MAPPING`` (in
        mapping order), with 0 for groups that have no column.
    """
    column_names = pd.Series(list(base_df.columns))
    group_labels = column_names.replace(CUMULATIVE_GROUP_MAPPING, regex=True)
    counts_by_label = group_labels.value_counts().to_dict()

    return {
        label: counts_by_label.get(label, 0)
        for label in CUMULATIVE_GROUP_MAPPING.values()
    }


In [None]:
# Snapshot the per-group column counts before any feature reduction happens.
feature_group_count = get_feature_group_counts()
cumulative_group_count = get_cumulative_group_counts()

feature_drop_stats = {
    grp: {'before_drop_count': feature_group_count[grp]}
    for grp in feature_groups
}
cumulative_feature_drop_stats = {
    grp: {'before_drop_count': count}
    for grp, count in cumulative_group_count.items()
}
feature_drop_stats

### Drop all columns with 100% null values, except the identifier (IDEN) columns

In [None]:
# Columns that the all-null and single-value drops below must never remove.
# The last two names depend on MODEL_TYPE.
IDEN_COLS = ['censusdate', 'facilityid', 'masterpatientid', 'LFS', 'primaryphysicianid',
         'payername', 'to_from_type', 'client', 'admissionstatus',
         f'positive_date_{MODEL_TYPE}', f'target_3_day_{MODEL_TYPE}']

In [None]:
# Names of the columns that hold nothing but nulls; identifier columns are
# never candidates.
non_idens_cols_all_null = [
    name
    for name in base_df.columns
    if name not in IDEN_COLS and base_df[name].isnull().all()
]
non_idens_cols_all_null

In [None]:
base_df.drop(non_idens_cols_all_null, axis=1, inplace=True)

### Drop columns with single value in it

In [None]:
# Collect the non-identifier columns in which every row holds the same
# non-null value. value_counts() ignores nulls, so requiring the single
# value's count to equal the row count also rules out columns with nulls.
cols_with_single_value = []
n_rows = len(base_df)
for col in base_df.columns:
    if col in IDEN_COLS:
        # Cheap check first: identifier columns are always kept.
        continue
    counts = base_df[col].value_counts()  # computed once per column
    if len(counts) == 1 and counts.iloc[0] == n_rows:
        cols_with_single_value.append(col)
# cols_with_single_value

In [None]:
base_df.drop(cols_with_single_value,inplace=True,axis=1)

In [None]:
# Record which columns were removed by the two drops above, keyed by reason.
cols_to_drop = {
    'single_valued_columns': cols_with_single_value,
    'all_null_columns': non_idens_cols_all_null,
}

# Persist both lists in a single JSON file.
with open('all_null_dropped_col_names.json', 'w') as dropped_cols_file:
    json.dump(cols_to_drop, dropped_cols_file)

### Remove engineered features whose values are at least 99.9% zero, 9999 or null

In [None]:
def na_analysis(df):
    """Summarise how "empty" each engineered feature column of ``df`` is.

    Only columns whose name matches one of the substrings in the pattern below
    are inspected. For each of them the "empty" count is the largest of: the
    number of zeros, the number of 9999 values and the number of nulls.

    Args:
        df: DataFrame whose column names are strings.

    Returns:
        list: one ``[column_name, total_rows, empty_count, empty_percentage]``
        entry per inspected column, in column order.
    """
    total_rows = df.shape[0]
    # NOTE(review): 'na_indictator' looks like a misspelling of 'na_indicator';
    # it is left untouched because it has to match the real column names -- confirm.
    candidate_cols = df.columns[df.columns.str.contains('cumidx|cumsum|days_since_last_event|na_indictator|vtl_|notes_')]

    rows = []
    for col in candidate_cols:
        values = df[col]
        empty_count = max(values.eq(0).sum(), values.eq(9999).sum(), values.isnull().sum())
        rows.append([col, total_rows, empty_count, (empty_count / total_rows) * 100])

    return rows


# Tabulate the per-column summary, with the emptiest columns first.
df_na = pd.DataFrame(
    na_analysis(base_df),
    columns=['column_name','total_count','null_values','%_null_values']
).sort_values(by='%_null_values', ascending=False)

df_na.head(10)

In [None]:
print(base_df.shape)

In [None]:
# Drop every analysed column that is at least 99.9% empty, sparing anything
# whose name starts with 'hosp_target'.
is_mostly_empty = df_na['%_null_values'] >= 99.9
is_target_col = df_na['column_name'].str.startswith('hosp_target')
drop_cols = df_na.loc[is_mostly_empty & ~is_target_col, 'column_name']
base_df.drop(columns=drop_cols, inplace=True)

In [None]:
len(drop_cols)

In [None]:
base_df.shape

In [None]:
get_cumulative_group_counts()

In [None]:
def _dropped_fraction(before, after):
    """Return the share of features removed, or 0 when there were none to start with."""
    if before > 0:
        return (before - after) / before
    return 0


def _record_after_drop_stats(stats, after_counts):
    """Add after-drop counts, drop percentages and a 'Total' row to ``stats`` in place.

    Args:
        stats: dict of group -> {'before_drop_count': int}, updated in place.
        after_counts: dict of group -> column count after the drops.
    """
    total_before = 0
    total_after = 0
    for grp, after in after_counts.items():
        before = stats[grp]['before_drop_count']
        stats[grp]['after_drop_count'] = after
        stats[grp]['dropped_percentage'] = "{:.0%}".format(_dropped_fraction(before, after))
        total_before += before
        total_after += after
    stats['Total'] = {
        'before_drop_count': total_before,
        'after_drop_count': total_after,
        'dropped_percentage': "{:.0%}".format(_dropped_fraction(total_before, total_after)),
    }


# Re-count the groups now that columns have been dropped.
feature_group_count = get_feature_group_counts()
cumulative_group_count = get_cumulative_group_counts()

# Both stat tables go through the same helper, so a group that had no columns
# before the drop reports 0% instead of raising ZeroDivisionError.
_record_after_drop_stats(feature_drop_stats, feature_group_count)
_record_after_drop_stats(cumulative_feature_drop_stats, cumulative_group_count)

print(cumulative_feature_drop_stats)

with open('./feature_drop_stats.json', 'w') as outfile:
    json.dump(feature_drop_stats, outfile)
with open('./cumulative_feature_drop_stats.json', 'w') as outfile:
    json.dump(cumulative_feature_drop_stats, outfile)

In [None]:
## Write feature_drop_stats and cumulative_feature_drop_stats as ascii tables

In [None]:
# Render the feature-group drop statistics as an ASCII table.
group_table = PrettyTable()
group_table.title = 'Feature Group Drop Stats'
group_table.field_names = ["Feature Group", "Before Feature Reduction", "After Feature Reduction", "% of Dropped Features"]

# Groups are listed alphabetically; the last key (the 'Total' row, which was
# inserted last) is kept at the bottom.
group_names = list(feature_drop_stats)
total_key = group_names.pop()
for name in sorted(group_names) + [total_key]:
    stats = feature_drop_stats[name]
    group_table.add_row([
        name,
        stats['before_drop_count'],
        stats['after_drop_count'],
        stats['dropped_percentage'],
    ])

with open('./feature_group_drop_stats.txt', 'w') as table_file:
    table_file.write(str(group_table))

In [None]:
# Render the cumulative-group drop statistics as an ASCII table, keeping the
# dictionary's own ordering.
cumulative_table = PrettyTable()
cumulative_table.title = 'Feature Cumulative Group Drop Stats'
cumulative_table.field_names = ["Feature Group", "Before Feature Reduction", "After Feature Reduction", "% of Dropped Features"]
for name, stats in cumulative_feature_drop_stats.items():
    cumulative_table.add_row([
        name,
        stats['before_drop_count'],
        stats['after_drop_count'],
        stats['dropped_percentage'],
    ])

with open('./feature_cumulative_drop_stats.txt', 'w') as table_file:
    table_file.write(str(cumulative_table))

In [None]:
print(base_df.shape)

In [None]:
base_df.to_parquet(processed_path/'final_cleaned_df.parquet')

## =======================END=====================

In [None]:
# base_df = pd.read_parquet(processed_path/'05-result.parquet')


In [None]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
import pickle

### Pre-process the dataframe before running feature selection

In [None]:
# Split the frame into targets (every column containing 'hosp_target') and
# predictors (everything else).
output_cols = [name for name in base_df.columns if 'hosp_target' in name]
predictor_cols = base_df.columns.difference(output_cols)
x_df = base_df[predictor_cols]
y_df = base_df[output_cols]
print(x_df.shape)
print(y_df.shape)

In [None]:
# Identifier-like columns that must not be used as predictors.
exclude_cols = ['masterpatientid','censusdate', 'facilityid', 'bedid', 'client']

kept_cols = x_df.columns.difference(exclude_cols)
x_df = x_df[kept_cols]
x_df.shape

# Missing targets count as negatives; the 3-day target is then cast to float32.
y_df = y_df.fillna(False).astype({'hosp_target_3_day_hosp': 'float32'})
target_3_day = y_df['hosp_target_3_day_hosp']

In [None]:
def fill_na_train(df):
    """Impute missing values in ``df`` with per-column medians.

    Returns:
        tuple: ``(filled, medians)`` where ``filled`` is a copy of ``df`` with
        NaNs replaced and ``medians`` is a Series holding the median of every
        column that contained at least one NaN.
    """
    cols_with_na = df.isna().any()
    medians = df.loc[:, cols_with_na].median()
    return df.fillna(medians), medians


def fill_na_valid_or_test(df, na_filler):
    """Impute missing values in ``df`` with previously computed fill values.

    ``na_filler`` is passed straight to ``DataFrame.fillna``, e.g. the medians
    returned by ``fill_na_train``.
    """
    filled = df.fillna(na_filler)
    return filled


x_df, na_filler = fill_na_train(x_df)
x_df = x_df.astype('float32')

In [None]:
print(x_df.shape)
print(y_df.shape)
print(len(target_3_day))

In [None]:
x_df.to_parquet(processed_path/'x_df.parquet')
with open(processed_path/'target_3_day.pickle','wb') as f: pickle.dump(target_3_day, f, protocol=4)

In [None]:
x_df = pd.read_parquet(processed_path/'x_df.parquet')
with open(processed_path/'target_3_day.pickle','rb') as f: target_3_day = pickle.load(f)

## Feature Selection 

In [None]:
%%time

# Correlation of every feature with the 3-day target.

corr_matrix = x_df.corrwith(y_df['hosp_target_3_day_hosp'])

# Two-column frame (feature name, correlation), strongest positive first.
_df = corr_matrix.rename_axis('cols').reset_index(name='value')
_df.sort_values(by='value',ascending=False).head(2000)

In [None]:
%%time
## Remove constant features

# A feature is treated as constant when its standard deviation is exactly zero.
constant_features = [feat for feat in x_df.columns if x_df[feat].std() == 0]

print(constant_features)

# x_df.drop(labels=constant_features, axis=1, inplace=True)


In [None]:
%%time

# Remove duplicated features

# Flag every column that holds exactly the same values as an earlier column.
# Columns already flagged are skipped: anything equal to them is also equal to
# the earlier column they duplicate, so it has been found already. This keeps
# each name in the list once and avoids redundant comparisons.
duplicated_features = []
already_flagged = set()
all_columns = list(x_df.columns)
for i, col_1 in enumerate(all_columns):
    if col_1 in already_flagged:
        continue

    for col_2 in all_columns[i + 1:]:
        if col_2 not in already_flagged and x_df[col_1].equals(x_df[col_2]):
            already_flagged.add(col_2)
            duplicated_features.append(col_2)

print(duplicated_features)

# x_df.drop(labels=duplicated_features, axis=1, inplace=True)

In [None]:
%%time


# calculate the mutual information between the variables and the target
# this returns the mutual information value of each feature.
# the smaller the value the less information the feature has about the target
#
# The estimator is randomised, so the seed is fixed (same value as the
# LogisticRegression further down) to make the ranking reproducible.

mi = mutual_info_classif(x_df.fillna(0), target_3_day, random_state=7)
print(mi)

# let's add the variable names and order the features
# according to the MI for clearer visualisation
mi = pd.Series(mi, index=x_df.columns).sort_values(ascending=False)
mi.to_csv('mi-date_cols.csv', header=True)
# and now let's plot the ordered MI values per feature (already sorted above)
mi.plot.bar(figsize=(20, 8))

In [None]:
%%time

# Keep the 10 features that score highest on mutual information with the
# target; the selected column names are displayed below.
# NOTE(review): the scorer is passed unseeded here, so the selection may vary
# between runs -- confirm whether that matters.
sel_ = SelectKBest(mutual_info_classif, k=10).fit(x_df.fillna(0), target_3_day)
x_df.columns[sel_.get_support()]

In [None]:
%%time

# Run the chi2 test between each of the variables and the target.
# The result holds two arrays: the test statistics first and the p-values
# second (the p-values are read as f_score[1] in the next cell).
# NOTE(review): despite its name, f_score holds chi2 results, not F-scores.
# NOTE(review): chi2 presumably requires non-negative feature values --
# confirm that x_df satisfies this.

f_score = chi2(x_df, target_3_day)
f_score

Keep in mind that, contrary to MI, where we were interested in the higher MI values,
for the Fisher score, the smaller the p_value, the more significant the feature is for predicting the target.

One thing to keep in mind when using Fisher score or univariate selection methods,
is that in very big datasets, most of the features will show a small p_value,
and therefore look like they are highly predictive.
This is in fact an effect of the sample size. So care should be taken when selecting features
using these procedures. An ultra tiny p_value does not highlight an ultra-important feature,
it rather indicates that the dataset contains too many samples.

If the dataset contained several categorical variables, we could then combine this procedure with
SelectKBest or SelectPercentile, as I did in the previous lecture.

In [None]:
%%time

# Label the p-values with their feature names and show them smallest first.
# `pvalues` itself stays in column order.

pvalues = pd.Series(f_score[1], index=x_df.columns)
pvalues.sort_values(ascending=True)


In [None]:
%%time

# LASSO regularization: fit an L1-penalised logistic regression on all
# features and let SelectFromModel pick features from the fitted model.

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

logistic = LogisticRegression(C=1, penalty='l1',solver='liblinear',random_state=7).fit(x_df,target_3_day)
model = SelectFromModel(logistic, prefit=True)

# x_new_df = model.transform(x_df)

# Boolean mask over the columns of x_df: True for the features that were kept.
model.get_support()

In [None]:
%%time

# Names of the features kept by the L1 model, plus a short summary.
support_mask = model.get_support()
selected_feat = x_df.columns[support_mask]
n_zero_coefs = np.sum(model.estimator_.coef_ == 0)

print('total features: {}'.format(x_df.shape[1]))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(n_zero_coefs))



In [None]:
%%time

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Step-forward feature selection: select 20 features (k_features=20), scored
# by r2 (scoring='r2') with 3-fold cross-validation.
# NOTE(review): earlier comments mentioned 10 features and roc_auc, and the
# estimator is a regressor although the target looks binary -- confirm that
# RandomForestRegressor with r2 is intended.

sfs1 = SFS(RandomForestRegressor(),
           k_features=20,
           forward=True,
           floating=False,
           verbose=2,
           scoring='r2',
           cv=3)

sfs1 = sfs1.fit(np.array(x_df), target_3_day)
# Map the selected positional indices back to column names.
selected_feat= x_df.columns[list(sfs1.k_feature_idx_)]
selected_feat

In [None]:
!pip install mlxtend