In [45]:
from imodels.util import data_util
import pandas as pd
import numpy as np

In [56]:
# Build two summary tables for the PECARN datasets listed in the loop below:
#   metadata     - one row per dataset (patients, outcome count, outcome %, #features)
#   feature_data - one row per feature (% missing or N/A, % nonzero)
metadata_columns = ['Name', 'Patients', 'Outcome', '% Outcome', 'Features']
featuredata_columns = ['Feature Name', '% Missing or N/A', '% Nonzero']

# Suffixes of one-hot columns (in the "prop" frame) that encode a missing or
# not-applicable value for a feature.
missing_suffixes = ['nan', '91.0', '92', '92.0', 'unknown']
# Suffixes of one-hot columns (in the "pred" frame) that encode a positive value.
positive_suffixes = ['yes', '1.0', 'Yes']
# Columns left out of the per-dataset feature list.
# NOTE(review): presumably these duplicate another age encoding - confirm.
excluded_columns = {
    'tbi': ['AgeinYears', 'AgeTwoPlus'],
    'iai': ['Age<2_no', 'Age<2_yes'],
}


def _base_feature_name(column_name):
    """Strip the last '_'-separated segment of a column name, if there is one."""
    prefix, sep, _ = column_name.rpartition('_')
    return prefix if sep else column_name


def _pct_ones(column):
    """Return the percentage (0-100) of entries of `column` that equal 1.

    Counting with a comparison instead of `value_counts()[1.0]` avoids a
    KeyError when the column contains no 1s (value_counts omits absent values),
    and the denominator is the length of the column's own frame.
    """
    n_rows = len(column)
    if n_rows == 0:
        return 0.0
    return (column == 1).sum() * 100 / n_rows


metadata_rows = []
feature_rows = []
for dset_name in ['tbi', 'iai', 'csi']:
    X, y, feat_names = data_util.get_clean_dataset(f'{dset_name}_pecarn_pred.csv', data_source='imodels')
    X_df = pd.DataFrame(X, columns=feat_names)

    X_prop, y_prop, feature_names_prop = data_util.get_clean_dataset(f'{dset_name}_pecarn_prop.csv', data_source='imodels')
    X_df_prop = pd.DataFrame(X_prop, columns=feature_names_prop)

    X_df_clean = X_df.drop(columns=excluded_columns.get(dset_name, []))

    shape = X.shape
    # Counts per distinct label, in sorted label order; index 1 is read as the outcome class.
    class_counts = np.unique(y, return_counts=True)[1]
    # Collapse one-hot columns ("<feature>_<value>") back to one entry per feature.
    unique_feats = np.unique([_base_feature_name(name) for name in X_df_clean.columns])

    metadata_rows.append([
        dset_name.capitalize(),
        shape[0],
        class_counts[1],
        np.round(class_counts[1] * 100 / np.sum(class_counts), decimals=1),
        unique_feats.shape[0],
    ])

    for feat in unique_feats:
        # % of rows flagged by any missing / not-applicable indicator column.
        missing = 0
        for suffix in missing_suffixes:
            indicator = f'{feat}_{suffix}'
            if indicator in X_df_prop.columns:
                missing += _pct_ones(X_df_prop[indicator])
        missing = np.round(missing, decimals=2)

        # % of rows with a positive value. Track explicitly whether any binary
        # column exists, so a genuine 0% is reported as 0.0 rather than 'N/A'.
        nonzero = 0
        has_binary_column = False
        for suffix in positive_suffixes:
            indicator = f'{feat}_{suffix}'
            if indicator in X_df.columns:
                has_binary_column = True
                nonzero += _pct_ones(X_df[indicator])
        # A bare column with fewer than three distinct values is treated as binary.
        if feat in X_df.columns and X_df[feat].unique().shape[0] < 3:
            has_binary_column = True
            nonzero += _pct_ones(X_df[feat])

        nonzero = np.round(nonzero, decimals=2) if has_binary_column else 'N/A'

        feature_rows.append([feat, missing, nonzero])


metadata = pd.DataFrame(metadata_rows, columns=metadata_columns)
feature_data = pd.DataFrame(feature_rows, columns=featuredata_columns)

In [54]:
# Emit the dataset summary as LaTeX source. escape=False leaves cell text
# untouched, so '%' (a LaTeX comment character) is escaped by hand. The raw
# string avoids the invalid '\%' escape sequence while producing the same text.
print(metadata.to_latex(index=False, escape=False).replace('%', r'\%'))

\begin{tabular}{lrrrr}
\toprule
Name &  Patients &  Outcome &  \% Outcome &  Features \\
\midrule
 Tbi &     42428 &      376 &        0.9 &        61 \\
 Iai &     12044 &      203 &        1.7 &        21 \\
 Csi &      3313 &      540 &       16.3 &        35 \\
\bottomrule
\end{tabular}



In [80]:
# Split the per-feature table into two parts at a single shared boundary.
# Both slices use the same row so that no row falls between them (the previous
# [:61] / [62:] pair silently dropped row 61).
# NOTE(review): 61 equals the 'Features' count reported for Tbi - confirm that
# splitting at the end of the Tbi features is the intent.
fd_split_row = 61
# reset_index() keeps the old row labels as an extra 'index' column on the left part.
fd_left = feature_data[:fd_split_row].reset_index()
fd_right = feature_data[fd_split_row:]

In [81]:
# Emit the left feature table as LaTeX source. With escape=False the '%' in the
# column headers would start a LaTeX comment and swallow the rest of the header
# row, so it is escaped by hand (same treatment as the metadata table).
# NOTE(review): feature names containing '_' are still emitted unescaped.
print(fd_left.to_latex(index=False, escape=False).replace('%', r'\%'))

\begin{tabular}{rlrl}
\toprule
 index &       Feature Name &  % Missing or N/A & % Nonzero \\
\midrule
     0 &                AMS &              0.74 &     12.95 \\
     1 &        AMSAgitated &             87.05 &      1.79 \\
     2 &             AMSOth &             87.05 &      1.82 \\
     3 &          AMSRepeat &             87.05 &      1.04 \\
     4 &           AMSSleep &             87.05 &      6.67 \\
     5 &            AMSSlow &             87.05 &      3.22 \\
     6 &            ActNorm &              7.09 &     85.38 \\
     7 &         AgeInMonth &              0.00 &       N/A \\
     8 &       Amnesia_verb &             38.41 &     10.45 \\
     9 &               Clav &              0.30 &     64.38 \\
    10 &           ClavFace &             35.92 &     29.99 \\
    11 &            ClavFro &             35.92 &     20.48 \\
    12 &           ClavNeck &             35.92 &      1.38 \\
    13 &            ClavOcc &             35.92 &      9.62 \\
    14 &       

In [25]:
# X, y, feature_names = data_util.get_clean_dataset(f'tbi_pecarn_pred.csv', data_source='imodels')
# X_df = pd.DataFrame(X, columns=feature_names)

In [26]:
# X_prop, y_prop, feature_names_prop = data_util.get_clean_dataset(f'tbi_pecarn_prop.csv', data_source='imodels')
# X_df_prop = pd.DataFrame(X_prop, columns=feature_names_prop)

In [82]:
# X_df_prop.columns.tolist()
# # X_df_prop.columns.tolist()
# # (X_df['subinj_Head2'] + X_df['subinj_Face2']).value_counts()

In [83]:
# for col in [ X_df_prop[col].value_counts()[1.0] for col in X_df.columns if 'OSI' in col]:
#     X_df_prop[col].value_counts()[1.0]
# (X_df[[col for col in X_df.columns if 'OSI' in col]].sum(axis=1) > 0).value_counts()

In [84]:
# [col for col in X_df.columns if 'OSI' in col][1:]

In [85]:
# X_df['OSI'].value_counts()

In [86]:
# [col for col in X_df_prop.columns if 'Amnesia' in col]

In [87]:
# unique_feats = np.unique([''.join(name.split('_')[:-1]) if len(name.split('_')) > 1 and 'subinj' not in name else name for name in X_df.columns])

In [88]:
# for feat in unique_feats:
#     if f'{feat}_nan' in X_df_prop.columns:
#         print(f'{feat}_nan', X_df_prop[f'{feat}_nan'].value_counts()[1.0])

In [89]:
# X_df_prop['Vomit_nan'].value_counts()