In [6]:
import csv

import numpy as np
import pandas as pd

# Raise pandas' display limits so the very wide one-hot encoded frames shown
# further down are not truncated.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# UCI Dataset Preprocessing

In [7]:
# Input (raw downloads) and output (cleaned CSV) folders used by every dataset cell.
RAW_DATA_FOLDER = 'data/raw'
CLEAN_DATA_FOLDER = 'data/preprocessed'

def preprocess(train_df,
               numerical_cols, category_cols, preserved_cols):
    """Build a model-ready frame from selected columns of ``train_df``.

    The result is the column-wise concatenation, in this order, of:

    1. ``numerical_cols`` cast to float and standardised column by column as
       ``(x - mean) / std`` (pandas' default ``std``, i.e. the sample
       standard deviation);
    2. ``category_cols`` passed through ``pd.get_dummies``. No indicator is
       created for missing values (``dummy_na=False``), so a row with a
       missing category is all-zero across that column's indicators;
    3. ``preserved_cols`` copied through unchanged.

    Every NaN left in the result is then replaced by 0. For the standardised
    columns this amounts to mean imputation; preserved columns are zero-filled
    as well. A numeric column with zero standard deviation divides by zero and
    normally ends up as all zeros after that fill.

    Columns of ``train_df`` that appear in none of the three lists are dropped.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Source data. It is NOT modified: all work happens on a private copy.
    numerical_cols, category_cols, preserved_cols : list of str
        Column names for each treatment; any of them may be empty, but not
        all three at once.

    Returns
    -------
    pandas.DataFrame
        New frame sharing ``train_df``'s index.

    Raises
    ------
    ValueError
        If all three column lists are empty (nothing to concatenate).
    """
    if len(numerical_cols) == 0 and len(category_cols) == 0 and len(preserved_cols) == 0:
        raise ValueError(
            'preprocess() needs at least one column in numerical_cols, '
            'category_cols or preserved_cols')

    # Work on a copy: the float cast below is written back into the frame and
    # would otherwise silently change the dtypes of the caller's DataFrame.
    train_df = train_df.copy()

    to_concat = []

    # numerical cols
    if len(numerical_cols) > 0:
        train_df[numerical_cols] = train_df[numerical_cols].astype(float)

        mu = train_df[numerical_cols].mean(axis=0)
        std = train_df[numerical_cols].std(axis=0)

        train_nums = (train_df[numerical_cols] - mu)/std

        to_concat.append(train_nums)

    # categorical cols
    if len(category_cols) > 0:
        train_cats = pd.get_dummies(
            train_df[category_cols],
            dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)
        to_concat.append(train_cats)

    # preserved_cols
    if len(preserved_cols) > 0:
        train_preserved = train_df[preserved_cols]
        to_concat.append(train_preserved)

    preprocessed_train_df = pd.concat(to_concat, axis=1)
    # Zero-fill whatever is still missing (see docstring for what that means
    # for each column group).
    preprocessed_train_df = preprocessed_train_df.fillna(0)

    return preprocessed_train_df

## Adult dataset

In [8]:
NAME = 'adult'

# Attribute listing in "name: values." form, one attribute per line. Only the
# text before the first ':' of each line (the column name) is used below.
attributes = """
age: continuous.
workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
fnlwgt: continuous.
education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
education-num: continuous.
marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
sex: Female, Male.
capital-gain: continuous.
capital-loss: continuous.
hours-per-week: continuous.
native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
"""
# The text block starts and ends with a newline, so splitting yields empty
# entries at both ends; keep the non-empty names and put the target last.
cols = [line.partition(':')[0] for line in attributes.split('\n')]
cols = [name for name in cols if name] + ['label']

# Column names are supplied through names=cols rather than read from the files.
train_df = pd.read_csv(f'{RAW_DATA_FOLDER}/adult/adult.data', names=cols, index_col=False)
# The first parsed row of adult.test is discarded.
# NOTE(review): presumably a non-data banner line - confirm against the raw file.
test_df = pd.read_csv(f'{RAW_DATA_FOLDER}/adult/adult.test', names=cols, index_col=False).drop(0)
# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# the train and test row labels overlap, leaving duplicate index labels that
# make any later label-based selection or alignment ambiguous.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

In [9]:
# Sanity check: (rows, columns) of the stacked train + test frame.
combined_df.shape

(48842, 15)

In [10]:
# preprocessing: assign each column to the treatment preprocess() applies to it
numerical_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
category_cols = [
    'workclass', 'education', 'marital-status', 'occupation', 
    'relationship', 'race', 'sex', 'native-country'
]
preserved_cols = ['label']

# Turn the '?' placeholder into NaN. With regex=True the pattern is searched
# for inside each string cell, so any cell containing a literal '?' becomes
# NaN. The pattern is a raw string: '\?' in a normal literal is an invalid
# escape sequence that newer Python versions warn about.
combined_df[category_cols] = combined_df[category_cols].replace({r'\?': np.nan}, regex=True)

# original missingness (fraction of missing values per column)
print(combined_df.isna().sum(axis=0) / len(combined_df))

age               0.000000
workclass         0.057307
fnlwgt            0.000000
education         0.000000
education-num     0.000000
marital-status    0.000000
occupation        0.057512
relationship      0.000000
race              0.000000
sex               0.000000
capital-gain      0.000000
capital-loss      0.000000
hours-per-week    0.000000
native-country    0.017546
label             0.000000
dtype: float64


In [11]:
# Standardise the numeric columns, one-hot encode the categorical ones and
# carry the label through.
combined_df = preprocess(
    train_df=combined_df,
    numerical_cols=numerical_cols,
    category_cols=category_cols,
    preserved_cols=preserved_cols,
)
# Encode the income strings as 0/1; the spellings with a trailing '.' get the
# same code as their plain counterparts.
combined_df['label'] = combined_df['label'].replace({
    ' <=50K': 0, ' <=50K.': 0,
    ' >50K': 1, ' >50K.': 1,
})

In [12]:
# Preview the first five rows, then write the cleaned frame to disk without
# the index column.
display(combined_df.head(5))
combined_df.to_csv(path_or_buf=f'{CLEAN_DATA_FOLDER}/{NAME}_clean.csv', index=False)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,education_ 7th-8th,education_ 9th,education_ Assoc-acdm,education_ Assoc-voc,education_ Bachelors,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital-status_ Divorced,marital-status_ Married-AF-spouse,marital-status_ Married-civ-spouse,marital-status_ Married-spouse-absent,marital-status_ Never-married,marital-status_ Separated,marital-status_ Widowed,occupation_ Adm-clerical,occupation_ Armed-Forces,occupation_ Craft-repair,occupation_ Exec-managerial,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,native-country_ Cambodia,native-country_ Canada,native-country_ China,native-country_ Columbia,native-country_ Cuba,native-country_ Dominican-Republic,native-country_ Ecuador,native-country_ El-Salvador,native-country_ England,native-country_ France,native-country_ Germany,native-country_ Greece,native-country_ Guatemala,native-country_ Haiti,native-country_ Holand-Netherlands,native-country_ Honduras,native-country_ Hong,native-country_ Hungary,native-country_ India,native-country_ Iran,native-country_ Ireland,native-country_ Italy,native-country_ Jamaica,native-country_ 
Japan,native-country_ Laos,native-country_ Mexico,native-country_ Nicaragua,native-country_ Outlying-US(Guam-USVI-etc),native-country_ Peru,native-country_ Philippines,native-country_ Poland,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,label
0,0.025996,-1.061968,1.1365,0.146931,-0.217125,-0.034087,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0.8283,-1.007094,1.1365,-0.144802,-0.217125,-2.213009,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,-0.046941,0.246031,-0.419331,-0.144802,-0.217125,-0.034087,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,1.04711,0.426659,-1.197247,-0.144802,-0.217125,-0.034087,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,-0.776309,1.408515,1.1365,-0.144802,-0.217125,-0.034087,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Bank

In [15]:
NAME = 'bank'
# Column reference for the bank marketing data, with the target listed as
# 'label'. Names carry no surrounding whitespace so they match the column
# names used by the loading cell (e.g. 'job' in category_cols).
cols = [
    'age',  # (numeric)
    'job',  # type of job (categorical: "admin.", "blue-collar", "entrepreneur", "housemaid", "management", "retired", "self-employed", "services", "student", "technician", "unemployed", "unknown")
    'marital',  # marital status (categorical: "divorced", "married", "single", "unknown"; note: "divorced" means divorced or widowed)
    'education',  # (categorical: "basic.4y", "basic.6y", "basic.9y", "high.school", "illiterate", "professional.course", "university.degree", "unknown")
    'default',  # has credit in default? (categorical: "no", "yes", "unknown")
    'housing',  # has housing loan? (categorical: "no", "yes", "unknown")
    'loan',  # has personal loan? (categorical: "no", "yes", "unknown")
    # related with the last contact of the current campaign
    'contact',  # contact communication type (categorical: "cellular", "telephone")
    'month',  # last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
    'day_of_week',  # last contact day of the week (categorical: "mon", "tue", "wed", "thu", "fri")
    'duration',  # last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
    # other attributes
    'campaign',  # number of contacts performed during this campaign and for this client (numeric, includes last contact)
    'pdays',  # number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
    'previous',  # number of contacts performed before this campaign and for this client (numeric)
    'poutcome',  # outcome of the previous marketing campaign (categorical: "failure", "nonexistent", "success")
    # social and economic context attributes
    'emp.var.rate',  # employment variation rate - quarterly indicator (numeric)
    'cons.price.idx',  # consumer price index - monthly indicator (numeric)
    'cons.conf.idx',  # consumer confidence index - monthly indicator (numeric)
    'euribor3m',  # euribor 3 month rate - daily indicator (numeric)
    'nr.employed',  # number of employees - quarterly indicator (numeric)
    'label',  # (desired target) has the client subscribed a term deposit? (binary: "yes", "no")
]

In [16]:
# Semicolon-separated file whose header row supplies the column names.
# `quoting` expects one of the csv.QUOTE_* integer constants, so the named
# constant csv.QUOTE_ALL (== 1) is passed instead of a bare True.
df = pd.read_csv(f'{RAW_DATA_FOLDER}/{NAME}/bank-additional-full.csv',
                 quoting=csv.QUOTE_ALL, sep=';')
# 'unknown' is this dataset's missing-value marker.
df = df.replace({'unknown': np.nan})
print(len(df))
print('Original Missingness: \n', df.isna().sum(axis=0) / len(df))
df = df.rename({'y': 'label', 'contact': 'cellular_contact'}, axis=1)
df['cellular_contact'] = df['cellular_contact'].replace({'telephone': 0, 'cellular': 1})

binary_cols = ['default', 'housing', 'loan', 'label']
# Missing entries stay NaN here and are zero-filled inside preprocess().
df[binary_cols] = df[binary_cols].replace({'yes': 1, 'no': 0})

# NOTE(review): 'cons.conf.idx' is present in the file (see the missingness
# printout) but appears in none of the lists below, so preprocess() drops it
# from the cleaned CSV. Confirm whether that omission is intentional before
# adding it to numerical_cols - doing so changes the saved schema.
numerical_cols = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'euribor3m', 'nr.employed']
category_cols = ['job', 'marital', 'education', 'month', 'day_of_week', 'poutcome']
preserved_cols = ['cellular_contact'] + binary_cols
df = preprocess(df, numerical_cols, category_cols, preserved_cols)
df.to_csv(f'{CLEAN_DATA_FOLDER}/{NAME}_clean.csv', index=False)

41188
Original Missingness: 
 age               0.000000
job               0.008012
marital           0.001942
education         0.042027
default           0.208726
housing           0.024036
loan              0.024036
contact           0.000000
month             0.000000
day_of_week       0.000000
duration          0.000000
campaign          0.000000
pdays             0.000000
previous          0.000000
poutcome          0.000000
emp.var.rate      0.000000
cons.price.idx    0.000000
cons.conf.idx     0.000000
euribor3m         0.000000
nr.employed       0.000000
y                 0.000000
dtype: float64


## Thyroid

In [29]:
NAME = 'thyroid'
# One raw '<task>.data' file is processed per entry.
tasknames = [
    'allbp',  # classes: increased binding protein, decreased binding protein, negative
]
# Column names applied, in this order, when the raw files are read.
cols = [
    'age',  # continuous
    'sex',  # M / F
    # flags taking the values f / t
    'on thyroxine', 'query on thyroxine', 'on antithyroid medication',
    'sick', 'pregnant', 'thyroid surgery', 'I131 treatment',
    'query hypothyroid', 'query hyperthyroid',
    'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
    # lab values: '<X> measured' is an f / t flag, '<X>' is continuous
    'TSH measured', 'TSH',
    'T3 measured', 'T3',
    'TT4 measured', 'TT4',
    'T4U measured', 'T4U',
    'FTI measured', 'FTI',
    'TBG measured', 'TBG',
    'referral source',  # WEST, STMW, SVHC, SVI, SVHD, other
    'label',
]

In [30]:
# Column roles do not depend on the task, so they are defined once, outside
# the loop.
binary_cols = [
    'sex', #M, F.
    'on thyroxine', #f, t.
    'query on thyroxine', #f, t.
    'on antithyroid medication', #f, t.
    'sick', #f, t.
    'pregnant', #f, t.
    'thyroid surgery', #f, t.
    'I131 treatment', #f, t.
    'query hypothyroid', #f, t.
    'query hyperthyroid', #f, t.
    'lithium', #f, t.
    'goitre', #f, t.
    'tumor', #f, t.
    'hypopituitary', #f, t.
    'psych', #f, t.
    'TSH measured', #f, t.
    'T3 measured', #f, t.
    'TT4 measured', #f, t.
    'T4U measured', #f, t.
    'FTI measured', #f, t.
    'TBG measured', #f, t.
]
numerical_cols = [
    'age', #continuous.
    'TSH', #continuous.
    'T3', #continuous.
    'TT4', #continuous.
    'T4U', #continuous.
    'FTI', #continuous.
    'TBG', #continuous.
]
category_cols = ['referral source']
preserved_cols = binary_cols + ['label']

for task in tasknames:
    print(f'================ {task} ================')
    df = pd.read_csv(f'{RAW_DATA_FOLDER}/{NAME}/{task}.data',
                     names=cols, index_col=False)
    print(len(df))

    # '?' is this dataset's missing-value marker.
    df = df.replace({'?': np.nan})
    print('Original Missingness: \n', df.isna().sum(axis=0) / len(df))

    # The raw label keeps only the text before the first '.|'; the binary
    # target is 1 for every class other than 'negative'. Series.map works on
    # the label column alone instead of building a row object per record.
    label_cat = df['label'].map(lambda raw_label: raw_label.split('.|')[0])
    df['label'] = (label_cat != 'negative').astype(int)

    # Encode flags as 0/1; missing entries stay NaN here and are zero-filled
    # inside preprocess().
    df[binary_cols] = df[binary_cols].replace({'t': 1, 'f': 0, 
                                               'M': 1, 'F': 0})

    df = preprocess(df, numerical_cols, category_cols, preserved_cols)
    df.to_csv(f'{CLEAN_DATA_FOLDER}/{NAME}_{task}_clean.csv', index=False)

2800
Original Missingness: 
 age                          0.000357
sex                          0.039286
on thyroxine                 0.000000
query on thyroxine           0.000000
on antithyroid medication    0.000000
sick                         0.000000
pregnant                     0.000000
thyroid surgery              0.000000
I131 treatment               0.000000
query hypothyroid            0.000000
query hyperthyroid           0.000000
lithium                      0.000000
goitre                       0.000000
tumor                        0.000000
hypopituitary                0.000000
psych                        0.000000
TSH measured                 0.000000
TSH                          0.101429
T3 measured                  0.000000
T3                           0.208929
TT4 measured                 0.000000
TT4                          0.065714
T4U measured                 0.000000
T4U                          0.106071
FTI measured                 0.000000
FTI                  

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG,referral source_STMW,referral source_SVHC,referral source_SVHD,referral source_SVI,referral source_other,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured,TBG measured,label
0,-0.529991,-0.157214,0.576078,0.450028,0.730941,-0.054372,0.0,0,1,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0
1,-1.409706,-0.026674,-0.030277,-0.199828,0.0,0.0,0.0,0,0,0,0,1,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
2,-0.285626,-0.172133,0.0,-0.002046,-0.452245,0.280137,0.0,0,0,0,0,1,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0
3,0.887328,-0.210362,-0.151548,1.862759,0.0,0.0,0.0,0,0,0,0,1,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
4,0.887328,-0.184254,-1.000444,-1.358267,-0.658016,-1.24036,0.0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0


Training AUC:  0.8984469046632327
2800
Original Missingness: 
 age                          0.000357
sex                          0.039286
on thyroxine                 0.000000
query on thyroxine           0.000000
on antithyroid medication    0.000000
sick                         0.000000
pregnant                     0.000000
thyroid surgery              0.000000
I131 treatment               0.000000
query hypothyroid            0.000000
query hyperthyroid           0.000000
lithium                      0.000000
goitre                       0.000000
tumor                        0.000000
hypopituitary                0.000000
psych                        0.000000
TSH measured                 0.000000
TSH                          0.101429
T3 measured                  0.000000
T3                           0.208929
TT4 measured                 0.000000
TT4                          0.065714
T4U measured                 0.000000
T4U                          0.106071
FTI measured             

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG,referral source_STMW,referral source_SVHC,referral source_SVHD,referral source_SVI,referral source_other,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured,TBG measured,label
0,-0.529991,-0.157214,0.576078,0.450028,0.730941,-0.054372,0.0,0,1,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0
1,-1.409706,-0.026674,-0.030277,-0.199828,0.0,0.0,0.0,0,0,0,0,1,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
2,-0.285626,-0.172133,0.0,-0.002046,-0.452245,0.280137,0.0,0,0,0,0,1,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0
3,0.887328,-0.210362,-0.151548,1.862759,0.0,0.0,0.0,0,0,0,0,1,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
4,0.887328,-0.184254,-1.000444,-1.358267,-0.658016,-1.24036,0.0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0


Training AUC:  0.9809320316114294
2800
Original Missingness: 
 age                          0.000357
sex                          0.039286
on thyroxine                 0.000000
query on thyroxine           0.000000
on antithyroid medication    0.000000
sick                         0.000000
pregnant                     0.000000
thyroid surgery              0.000000
I131 treatment               0.000000
query hypothyroid            0.000000
query hyperthyroid           0.000000
lithium                      0.000000
goitre                       0.000000
tumor                        0.000000
hypopituitary                0.000000
psych                        0.000000
TSH measured                 0.000000
TSH                          0.101429
T3 measured                  0.000000
T3                           0.208929
TT4 measured                 0.000000
TT4                          0.065714
T4U measured                 0.000000
T4U                          0.106071
FTI measured             

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG,referral source_STMW,referral source_SVHC,referral source_SVHD,referral source_SVI,referral source_other,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured,TBG measured,label
0,-0.529991,-0.157214,0.576078,0.450028,0.730941,-0.054372,0.0,0,1,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0
1,-1.409706,-0.026674,-0.030277,-0.199828,0.0,0.0,0.0,0,0,0,0,1,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
2,-0.285626,-0.172133,0.0,-0.002046,-0.452245,0.280137,0.0,0,0,0,0,1,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0
3,0.887328,-0.210362,-0.151548,1.862759,0.0,0.0,0.0,0,0,0,0,1,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
4,0.887328,-0.184254,-1.000444,-1.358267,-0.658016,-1.24036,0.0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0


Training AUC:  0.9420190274841437
2800
Original Missingness: 
 age                          0.000357
sex                          0.039286
on thyroxine                 0.000000
query on thyroxine           0.000000
on antithyroid medication    0.000000
sick                         0.000000
pregnant                     0.000000
thyroid surgery              0.000000
I131 treatment               0.000000
query hypothyroid            0.000000
query hyperthyroid           0.000000
lithium                      0.000000
goitre                       0.000000
tumor                        0.000000
hypopituitary                0.000000
psych                        0.000000
TSH measured                 0.000000
TSH                          0.101429
T3 measured                  0.000000
T3                           0.208929
TT4 measured                 0.000000
TT4                          0.065714
T4U measured                 0.000000
T4U                          0.106071
FTI measured             

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG,referral source_STMW,referral source_SVHC,referral source_SVHD,referral source_SVI,referral source_other,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured,TBG measured,label
0,-0.529991,-0.157214,0.576078,0.450028,0.730941,-0.054372,0.0,0,1,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0
1,-1.409706,-0.026674,-0.030277,-0.199828,0.0,0.0,0.0,0,0,0,0,1,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
2,-0.285626,-0.172133,0.0,-0.002046,-0.452245,0.280137,0.0,0,0,0,0,1,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0
3,0.887328,-0.210362,-0.151548,1.862759,0.0,0.0,0.0,0,0,0,0,1,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1
4,0.887328,-0.184254,-1.000444,-1.358267,-0.658016,-1.24036,0.0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0


Training AUC:  0.9798543411670503
2800
Original Missingness: 
 age                          0.000357
sex                          0.039286
on thyroxine                 0.000000
query on thyroxine           0.000000
on antithyroid medication    0.000000
sick                         0.000000
pregnant                     0.000000
thyroid surgery              0.000000
I131 treatment               0.000000
query hypothyroid            0.000000
query hyperthyroid           0.000000
lithium                      0.000000
goitre                       0.000000
tumor                        0.000000
hypopituitary                0.000000
psych                        0.000000
TSH measured                 0.000000
TSH                          0.101429
T3 measured                  0.000000
T3                           0.208929
TT4 measured                 0.000000
TT4                          0.065714
T4U measured                 0.000000
T4U                          0.106071
FTI measured             

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG,referral source_STMW,referral source_SVHC,referral source_SVHD,referral source_SVI,referral source_other,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured,TBG measured,label
0,-0.529991,-0.157214,0.576078,0.450028,0.730941,-0.054372,0.0,0,1,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0
1,-1.409706,-0.026674,-0.030277,-0.199828,0.0,0.0,0.0,0,0,0,0,1,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
2,-0.285626,-0.172133,0.0,-0.002046,-0.452245,0.280137,0.0,0,0,0,0,1,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0
3,0.887328,-0.210362,-0.151548,1.862759,0.0,0.0,0.0,0,0,0,0,1,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
4,0.887328,-0.184254,-1.000444,-1.358267,-0.658016,-1.24036,0.0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0


Training AUC:  0.9075700746118169
