In [29]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

sys.path.append(os.path.abspath(".."))
from common import common

In [30]:
# Root of the project's working directory; data files are resolved beneath it.
# Set the CS5344_BASE_PATH environment variable to run on another machine.
# The fallback is the original author's local path, so existing runs are unchanged.
base_path = os.environ.get('CS5344_BASE_PATH', '/Users/suyeetan/Downloads/CS5344_Project/work/')

In [31]:
def get_thyroid_df():
    """Read the raw thyroid CSV and return it with its column metadata.

    The file is read from ``datasources/thyroid/thyroidDF.csv`` under the
    module-level ``base_path``.

    Returns:
        tuple: ``(all_df, main_labels, config)`` where ``all_df`` is the frame
        exactly as read from disk, ``main_labels`` is its column index, and
        ``config`` names the target column and groups feature columns by the
        preprocessing they are meant to receive.
    """
    # Columns meant to be standardized.
    numerical_columns = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']
    config = dict(
        TARGET_COLUMN='target',
        NUMERICAL_COLUMNS=numerical_columns,
        # Columns meant to be one hot encoded (none declared).
        CATEGORICAL_COLUMNS=[],
        # Columns meant to be label encoded (none declared).
        ORDINAL_COLUMNS=[],
    )

    csv_path = f'{base_path}/datasources/thyroid/thyroidDF.csv'
    all_df = pd.read_csv(csv_path)

    # mode() returns a Series, so this prints the Series repr rather than a
    # bare label.
    most_common_target = all_df[config['TARGET_COLUMN']].mode()
    print('Normal class: ', most_common_target)

    return (all_df, all_df.columns, config)

In [39]:
def map_column_values(df):
    """Return a copy of ``df`` with its string-coded columns turned into 0/1.

    * ``sex``: ``'M'`` -> 1, ``'F'`` -> 0
    * ``referral_source``: ``'SVI'`` -> 1, ``'other'`` -> 0
    * every other listed flag column: ``'t'`` -> 1, ``'f'`` -> 0

    Any value not present in the relevant lookup (including missing values)
    becomes NaN, because ``Series.map`` is used with a dict. The input frame
    is not modified.
    """
    flag_columns = [
        'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_meds', 'sick',
        'pregnant', 'thyroid_surgery', 'I131_treatment', 'query_hypothyroid',
        'query_hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary',
        'psych', 'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
        'FTI_measured', 'TBG_measured',
    ]

    # Insertion order fixes the processing order: sex, the flags, then
    # referral_source.
    lookup_by_column = {'sex': {'M': 1, 'F': 0}}
    lookup_by_column.update(dict.fromkeys(flag_columns, {'t': 1, 'f': 0}))
    lookup_by_column['referral_source'] = {'SVI': 1, 'other': 0}

    result = df.copy()
    for name, lookup in lookup_by_column.items():
        result[name] = result[name].map(lookup)
    return result

def fill_na(df):
    """Return a copy of ``df`` with missing values imputed.

    ``sex`` and ``referral_source`` have their gaps filled with 0. The lab
    measurements ``TSH``, ``T3``, ``TT4``, ``T4U``, ``FTI`` and ``TBG`` are
    filled with their own column mean. No other column (``age`` included) is
    touched, and the input frame is not modified.
    """
    zero_filled = ('sex', 'referral_source')
    mean_filled = ('TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG')

    result = df.copy()
    for name in zero_filled:
        result[name] = result[name].fillna(0)
    for name in mean_filled:
        column = result[name]
        # The mean is taken over the non-missing entries of this same column.
        result[name] = column.fillna(column.mean())
    return result

def get_processed_thyroid_df():
    """Load the thyroid data and return it cleaned and label-encoded.

    Steps, in order: load via ``get_thyroid_df``, drop ``patient_id``, rename
    the ``"-"`` target label to ``"None"``, convert string-coded columns with
    ``map_column_values``, impute with ``fill_na``, then label-encode the
    target through ``common.label_encode``.

    ``config`` gains three keys:
      * ``TARGET_DICT``: encoded integer -> original label
      * ``INV_TARGET_DICT``: original label -> encoded integer
      * ``NORMAL_TARGET``: encoded integer of the ``"None"`` label

    Returns:
        tuple: ``(all_df, main_labels, config)`` with ``main_labels`` being
        the columns of the processed frame.

    NOTE(review): ``common.label_encode`` is defined outside this file; it is
    used here as returning ``(encoder, frame)`` where the encoder exposes
    ``classes_`` -- confirm against that module.
    """
    def _relabel_normal(label):
        # "-" is renamed so the normal class has a readable name.
        return "None" if label == "-" else label

    raw_df, _, config = get_thyroid_df()
    target = config['TARGET_COLUMN']

    # Rows with missing values are kept; only the id column is removed.
    frame = raw_df.drop(columns='patient_id')

    frame[target] = frame[target].apply(_relabel_normal)
    frame = fill_na(map_column_values(frame))

    # Encode the target and record the mapping in both directions.
    encoder, frame = common.label_encode(frame, [target])
    code_to_label = dict(enumerate(encoder.classes_))
    config['TARGET_DICT'] = code_to_label
    config['INV_TARGET_DICT'] = {label: code for code, label in code_to_label.items()}
    print('TARGET_DICT', config['TARGET_DICT'])

    config['NORMAL_TARGET'] = config['INV_TARGET_DICT']['None']
    print('NORMAL_TARGET', config['NORMAL_TARGET'])

    # No one-hot encoding is applied here (CATEGORICAL_COLUMNS is empty in the
    # loader's config).
    main_labels = frame.columns
    print('main_labels', main_labels)

    return (frame, main_labels, config)

In [40]:
# Build the processed frame (df), its column index (m) and the config dict (c).
df, m, c = get_processed_thyroid_df()

Normal class:  0    -
Name: target, dtype: object
TARGET_DICT {0: 'A', 1: 'AK', 2: 'B', 3: 'C', 4: 'C|I', 5: 'D', 6: 'D|R', 7: 'E', 8: 'F', 9: 'FK', 10: 'G', 11: 'GI', 12: 'GK', 13: 'GKJ', 14: 'H|K', 15: 'I', 16: 'J', 17: 'K', 18: 'KJ', 19: 'L', 20: 'LJ', 21: 'M', 22: 'MI', 23: 'MK', 24: 'N', 25: 'None', 26: 'O', 27: 'OI', 28: 'P', 29: 'Q', 30: 'R', 31: 'S'}
NORMAL_TARGET 25
main_labels Index(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_meds', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured', 'TSH',
       'T3_measured', 'T3', 'TT4_measured', 'TT4', 'T4U_measured', 'T4U',
       'FTI_measured', 'FTI', 'TBG_measured', 'TBG', 'referral_source',
       'target'],
      dtype='object')


In [22]:
# Show the distinct values found in each column of df, one column per line.
for col in df.columns:
    distinct_values = df[col].unique()
    print(col, distinct_values)


age [40 35 77 73 74 60 66 42 29 44 78 28 58 46 97 57 24]
sex [0 1]
on_thyroxine [0 1]
query_on_thyroxine [0]
on_antithyroid_meds [0]
sick [0]
pregnant [0]
thyroid_surgery [0 1]
I131_treatment [0]
query_hypothyroid [0]
query_hyperthyroid [0]
lithium [0]
goitre [0]
tumor [0 1]
hypopituitary [0]
psych [0]
TSH_measured [1]
TSH [7.0e+01 1.4e-01 9.0e-02 1.1e+00 1.8e+00 1.4e+00 8.5e-02 1.2e+00 2.1e+00
 9.0e-01 1.0e+00 3.3e+00 1.5e+00 2.2e+00 2.3e+00 5.9e-01 2.7e+01 6.5e-02]
T3_measured [1]
T3 [0.4 1.9 1.8 1.6 1.1 1.4 2.5 2.1 1.5 2.4 2.  0.9 1.2 2.6]
TT4_measured [1]
TT4 [  3.9  73.  120.   89.  131.  116.   92.  138.  106.  122.   97.  135.
  75.  161.   93.  110.  139.  176. ]
T4U_measured [1]
T4U [0.83 1.16 0.96 0.74 1.04 0.81 0.84 0.8  0.98 1.14 0.85 1.01 0.82 1.27
 0.89 0.88 1.09 1.07]
FTI_measured [1]
FTI [  5.  63. 124. 119. 126. 143. 110. 173. 108. 107. 114. 135. 125.  92.
 127. 145. 104. 128. 164.]
TBG_measured [1]
TBG [28. 37. 45. 24. 25. 22. 21. 15. 27. 36. 30. 32. 35. 86. 29. 26.]


In [23]:
# Preview the first rows of df (head() with its default row count).
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target
167,40,0,0,0,0,0,0,0,0,0,...,1,3.9,1,0.83,1,5.0,1,28.0,0,0
5256,35,0,0,0,0,0,0,1,0,0,...,1,73.0,1,1.16,1,63.0,1,37.0,0,5
6044,77,0,0,0,0,0,0,0,0,0,...,1,120.0,1,0.96,1,124.0,1,45.0,1,5
6045,73,1,0,0,0,0,0,0,0,0,...,1,89.0,1,0.74,1,119.0,1,24.0,1,5
6747,77,0,0,0,0,0,0,0,0,0,...,1,131.0,1,1.04,1,126.0,1,25.0,1,3


In [38]:
# Report which columns of df contain at least one missing value.
# null_columns is a boolean Series indexed by column name.
null_columns = df.isna().any()
print("Columns with null values:")
print(null_columns.loc[null_columns].index.tolist())

Columns with null values:
['sex', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG', 'referral_source']


In [41]:
# (rows, columns) of df.
df.shape

(9172, 30)