In [29]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

sys.path.append(os.path.abspath(".."))
from common import common

In [30]:
# Root directory of the project's working data. The location is machine
# specific, so it can be overridden by setting the CS5344_BASE_PATH
# environment variable before starting the kernel; the original hardcoded
# path stays as the default so existing setups keep working unchanged.
base_path = os.environ.get(
    'CS5344_BASE_PATH',
    '/Users/suyeetan/Downloads/CS5344_Project/work/',
)

In [31]:
def get_thyroid_df():
    """Load the raw thyroid CSV and build its column configuration.

    Reads ``datasources/thyroid/thyroidDF.csv`` below the module-level
    ``base_path`` and prints the most frequent value(s) of the target column.

    Returns:
        tuple: ``(all_df, main_labels, config)`` where ``all_df`` is the
        DataFrame exactly as read from disk, ``main_labels`` is its column
        index, and ``config`` is a dict holding the target column name plus
        the numerical / categorical / ordinal column lists.
    """
    # Columns intended for standardization (per the original notes).
    numerical_columns = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']

    config = dict(
        TARGET_COLUMN='target',
        NUMERICAL_COLUMNS=numerical_columns,
        # Intended for one-hot encoding; none are declared for this dataset.
        CATEGORICAL_COLUMNS=[],
        # Intended for label encoding; none are declared for this dataset.
        ORDINAL_COLUMNS=[],
    )

    csv_path = f'{base_path}/datasources/thyroid/thyroidDF.csv'
    all_df = pd.read_csv(csv_path)

    # mode() yields a Series, so ties (if any) are all printed.
    most_common = all_df[config['TARGET_COLUMN']].mode()
    print('Normal class: ', most_common)

    return (all_df, all_df.columns, config)

In [39]:
def map_column_values(df):
    """Return a copy of ``df`` with its string-coded columns mapped to 0/1.

    * ``sex``: ``'M'`` -> 1, ``'F'`` -> 0
    * ``referral_source``: ``'SVI'`` -> 1, ``'other'`` -> 0
    * every other listed flag column: ``'t'`` -> 1, ``'f'`` -> 0

    Any value without an entry in the relevant lookup (including missing
    values) becomes NaN, as is standard for ``Series.map`` with a dict.
    The input frame is not modified; a missing column raises ``KeyError``.
    """
    true_false = {'t': 1, 'f': 0}
    flag_columns = (
        'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_meds', 'sick',
        'pregnant', 'thyroid_surgery', 'I131_treatment', 'query_hypothyroid',
        'query_hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary',
        'psych', 'TSH_measured', 'T3_measured', 'TT4_measured',
        'T4U_measured', 'FTI_measured', 'TBG_measured',
    )

    # Column name -> value lookup, in the order the columns are processed.
    lookups = {'sex': {'M': 1, 'F': 0}}
    lookups.update((name, true_false) for name in flag_columns)
    lookups['referral_source'] = {'SVI': 1, 'other': 0}

    encoded = df.copy()
    for name, lookup in lookups.items():
        encoded[name] = encoded[name].map(lookup)
    return encoded

def fill_na(df):
    """Return a copy of ``df`` with missing values imputed.

    * ``sex`` and ``referral_source``: missing entries become 0.
    * ``TSH``, ``T3``, ``TT4``, ``T4U``, ``FTI``, ``TBG``: missing entries
      become that column's own mean (computed over the non-missing values).

    Other columns are left untouched and the input frame is not modified.
    """
    zero_filled = ('sex', 'referral_source')
    mean_filled = ('TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG')

    filled = df.copy()
    for name in zero_filled:
        filled[name] = filled[name].fillna(0)
    for name in mean_filled:
        column = filled[name]
        filled[name] = column.fillna(column.mean())
    return filled

def get_processed_thyroid_df():
    """Load the thyroid data and return it cleaned and label-encoded.

    Steps, in order:
      1. load the raw frame and config via ``get_thyroid_df``;
      2. drop the ``patient_id`` column;
      3. rename the ``"-"`` target label to ``"None"``;
      4. map string-coded columns to 0/1 (``map_column_values``) and impute
         missing values (``fill_na``);
      5. label-encode the target column through ``common.label_encode``
         (presumably returns the fitted encoder and the encoded frame --
         confirm against the ``common`` module).

    The config gains ``TARGET_DICT`` (code -> label), ``INV_TARGET_DICT``
    (label -> code) and ``NORMAL_TARGET`` (the code of the ``"None"`` label).
    No one-hot encoding is applied here.

    Returns:
        tuple: ``(all_df, main_labels, config)`` with the processed frame,
        its column index and the extended config dict.

    Raises:
        KeyError: if ``"None"`` is not among the encoder's classes.
    """
    raw_df, _, config = get_thyroid_df()
    target_column = config['TARGET_COLUMN']

    # The identifier column is removed before any value mapping.
    frame = raw_df.drop(columns='patient_id')

    # "-" in the target is relabelled as the string "None".
    frame[target_column] = frame[target_column].apply(
        lambda label: label if label != "-" else "None"
    )
    frame = fill_na(map_column_values(frame))

    # Encode the target and record both directions of the label mapping.
    le, frame = common.label_encode(frame, [target_column])

    target_dict = dict(enumerate(le.classes_))
    inv_target_dict = {label: code for code, label in target_dict.items()}
    config['TARGET_DICT'] = target_dict
    config['INV_TARGET_DICT'] = inv_target_dict
    print('TARGET_DICT', target_dict)

    normal_target = inv_target_dict['None']
    config['NORMAL_TARGET'] = normal_target
    print('NORMAL_TARGET', normal_target)

    main_labels = frame.columns
    print('main_labels', main_labels)

    return (frame, main_labels, config)