In [9]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from fancyimpute import KNN

sys.path.append(os.path.abspath(".."))
from common import common

In [4]:
# Root directory of the project workspace; dataset paths are built from it.
# The original machine-specific location is kept as the default so existing
# runs behave the same, but it can be overridden without editing the notebook
# by setting the CS5344_BASE_PATH environment variable before starting the kernel.
base_path = os.environ.get(
    'CS5344_BASE_PATH',
    '/Users/suyeetan/Downloads/CS5344_Project/work/',
)

In [96]:
def get_thyroid_df():
    """Read the raw thyroid CSV and return it with its column labels and config.

    The file is read from ``{base_path}/datasources/thyroid/thyroidDF.csv``.
    As a side effect the mode of the target column is printed, labelled
    'Normal class'.

    Returns:
        tuple: ``(all_df, main_labels, config)`` where ``all_df`` is the frame
        exactly as read from disk, ``main_labels`` is its column index and
        ``config`` is a dict with the keys ``TARGET_COLUMN``,
        ``NUMERICAL_COLUMNS``, ``CATEGORICAL_COLUMNS`` and ``ORDINAL_COLUMNS``.
    """
    settings = dict(
        TARGET_COLUMN='target',
        # Numerical columns (these are to be standardized)
        NUMERICAL_COLUMNS=['age', 'TT4', 'T3', 'T4U', 'FTI', 'TSH'],
        # Categorical columns (these are to be one hot encoded)
        CATEGORICAL_COLUMNS=[],
        # Ordinal columns (these are to be label encoded)
        ORDINAL_COLUMNS=[],
    )

    csv_path = f'{base_path}/datasources/thyroid/thyroidDF.csv'
    raw_df = pd.read_csv(csv_path)

    # Most frequent target value(s); printed as the pandas Series that mode() returns.
    most_common = raw_df[settings['TARGET_COLUMN']].mode()
    print('Normal class: ', most_common)

    # The column index doubles as the header list.
    return (raw_df, raw_df.columns, settings)

In [95]:
def rolette_fill(df, column, val1, val2):
    """Fill missing entries of ``column`` by roulette-wheel sampling of two values.

    Each missing entry is replaced by ``val1`` with probability equal to the
    share of ``val1`` among the non-missing entries of ``column``, and by
    ``val2`` otherwise.

    Random draws come from NumPy's global random state (``np.random.rand``),
    one draw per missing entry in row order; call ``np.random.seed`` before
    this function if the fill has to be reproducible.

    Args:
        df: DataFrame to fill. It is modified in place.
        column: Label of the column whose missing entries are filled.
        val1: Value chosen when the draw is at or below the observed share of
            ``val1``. If ``val1`` never occurs among the observed entries its
            share is 0.
        val2: Value chosen otherwise.

    Returns:
        The same ``df`` object, with no missing entries left in ``column``.

    Raises:
        ValueError: If ``column`` has missing entries but no observed ones,
            so no share can be estimated.
    """
    # Positional boolean mask, so the fill also works when the index has duplicates.
    missing = df[column].isna().to_numpy()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return df

    n_observed = len(df) - n_missing
    if n_observed == 0:
        raise ValueError(
            f"Cannot estimate fill proportions: column {column!r} has no observed values"
        )

    # .get() turns "val1 never observed" into a share of 0 instead of a KeyError.
    threshold = df[column].value_counts().get(val1, 0) / n_observed

    # A single vectorised draw consumes the global RNG stream in the same order
    # as one rand() call per missing row would.
    draws = np.random.rand(n_missing)
    df.loc[missing, column] = [val2 if draw > threshold else val1 for draw in draws]
    return df
    
def get_processed_thyroid_df():
    """Build the cleaned, imputed and numerically encoded thyroid modelling frame.

    Pipeline:
      1. Load the raw data with ``get_thyroid_df``.
      2. Keep only rows whose diagnosis code appears in the code-to-class
         mapping and collapse the codes into 'Negative', 'Hyperthyroid' and
         'Hypothyroid'.
      3. Drop the ``*_measured`` flags, ``referral_source``, ``patient_id``
         and ``TBG``.
      4. Keep rows with ``age < 100`` (rows with a missing age fail the
         comparison and are dropped too) and at least 21 non-null values.
      5. Fill missing ``sex`` with ``rolette_fill`` and the remaining missing
         values in the imputation columns with fancyimpute ``KNN(k=13)``.
      6. Select the model columns and encode ``pregnant`` ('t'/'f' -> 1/0)
         and the target class as integers.

    The ``sex`` fill draws from NumPy's global random state, so call
    ``np.random.seed`` beforehand if the result must be reproducible.

    Returns:
        tuple: ``(model_df, main_labels, config)`` where ``model_df`` holds
        the columns age, TT4, T3, T4U, FTI, TSH, pregnant and the target,
        ``main_labels`` is ``model_df.columns`` and ``config`` is the dict
        from ``get_thyroid_df`` extended with ``INV_TARGET_DICT``
        (class name -> int), ``TARGET_DICT`` (int -> class name) and
        ``NORMAL_TARGET`` (int code of 'Negative').
    """
    # Raw diagnosis code -> coarse class. 'H' was accepted by the original row
    # filter but had no entry here, which would have produced NaN targets; it
    # is mapped like the other hypothyroid codes (E, F, G, H|K).
    mapping = {
        '-': "Negative",
        'A': 'Hyperthyroid', 'AK': "Hyperthyroid", 'B': "Hyperthyroid",
        'C': "Hyperthyroid", 'C|I': 'Hyperthyroid', 'D': "Hyperthyroid",
        'D|R': "Hyperthyroid",
        'E': "Hypothyroid", 'F': "Hypothyroid", 'FK': "Hypothyroid",
        'G': "Hypothyroid", 'GK': "Hypothyroid", 'GI': "Hypothyroid",
        'GKJ': 'Hypothyroid', 'H': 'Hypothyroid', 'H|K': 'Hypothyroid',
    }

    all_df, _, config = get_thyroid_df()
    target_column = config['TARGET_COLUMN']

    # Filter on the mapping keys so the accepted codes and the mapping cannot drift apart.
    all_df = all_df[all_df[target_column].isin(list(mapping))]

    # Non-inplace drop: avoids mutating a filtered slice of the raw frame.
    all_df = all_df.drop(
        ['TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
         'FTI_measured', 'TBG_measured', 'referral_source', 'patient_id'],
        axis=1,
    )
    all_df = all_df.drop('TBG', axis=1)

    # Map and filter
    all_df[target_column] = all_df[target_column].map(mapping)
    all_df = all_df[all_df['age'] < 100]
    # Keep only rows that have at least 21 non-null values.
    all_df = all_df.dropna(thresh=21)

    # Fill values: 'sex' by proportional random fill, the rest by KNN imputation.
    all_df = rolette_fill(all_df.copy(), 'sex', 'F', 'M')
    impute_columns = ['sex', 'age', 'TT4', 'T3', 'T4U', 'FTI', 'TSH']
    b_fill_df = all_df.loc[:, impute_columns].copy()
    sex_to_num = {
        'F': 0,
        'M': 1,
    }
    # KNN needs a purely numeric matrix, so 'sex' is encoded first.
    b_fill_df['sex'] = b_fill_df['sex'].map(sex_to_num)
    knn = KNN(k=13)
    knn_imputed_df = pd.DataFrame(
        knn.fit_transform(b_fill_df),
        index=b_fill_df.index,
        columns=impute_columns,
    )
    # Write the imputed values back, aligned on index and column labels.
    all_df.update(knn_imputed_df)

    model_columns = ['age', 'TT4', 'T3', 'T4U', 'FTI', 'TSH', 'pregnant', target_column]
    # Explicit copy so the column assignments below never touch all_df.
    model_df = all_df.loc[:, model_columns].copy()
    # map() yields a numeric column directly; replace() on an object column
    # relies on silent downcasting, which pandas has deprecated. Values other
    # than 't'/'f' become NaN.
    model_df['pregnant'] = model_df['pregnant'].map({'t': 1, 'f': 0})
    target_to_num = {
        'Negative': 0,
        'Hypothyroid': 1,
        'Hyperthyroid': 2,
    }
    model_df[target_column] = model_df[target_column].map(target_to_num)

    config['INV_TARGET_DICT'] = target_to_num
    config['TARGET_DICT'] = {v: k for k, v in config['INV_TARGET_DICT'].items()}
    print('TARGET_DICT', config['TARGET_DICT'])

    config['NORMAL_TARGET'] = target_to_num['Negative']
    print('NORMAL_TARGET', config['NORMAL_TARGET'])

    main_labels = model_df.columns

    return (model_df, main_labels, config)

In [93]:


# df1,m,c= get_thyroid_df()
# df = df1.copy()
# df = df[df['target'].isin(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'AK', 'C|I', 'H|K', 'GK', 'FK', 'GI', 'GKJ', 'D|R', '-'])]
# df.drop(['TSH_measured','T3_measured','TT4_measured','T4U_measured','FTI_measured','TBG_measured','referral_source','patient_id'],axis=1 ,inplace=True)
# mapping = {'-':"Negative",
#            'A':'Hyperthyroid','AK':"Hyperthyroid",'B':"Hyperthyroid", 'C':"Hyperthyroid", 'C|I': 'Hyperthyroid', 'D':"Hyperthyroid", 
#            'D|R':"Hyperthyroid",
#            'E': "Hypothyroid", 'F': "Hypothyroid", 'FK': "Hypothyroid", "G": "Hypothyroid", "GK": "Hypothyroid", "GI": "Hypothyroid", 
#            'GKJ': 'Hypothyroid', 'H|K': 'Hypothyroid',
#           }
# # '-', 'S', 'F', 'AK', 'R', 'I', 'M', 'N', 'G', 'K', 'A', 'KJ', 'L',
# #        'MK', 'Q', 'J', 'C|I', 'O', 'LJ', 'H|K', 'D', 'GK', 'MI', 'P',
# #        'FK', 'B', 'GI', 'C', 'GKJ', 'OI', 'D|R', 'E']
# df['target'] = df['target'].map(mapping)
# df = df[df['age'] < 100]
# df = df.drop('TBG', axis=1)
# df = df.dropna(thresh=21)


# df = rolette_fill(df.copy(), 'sex', 'F', 'M')
# columns = ['sex', 'age', 'TT4', 'T3', 'T4U', 'FTI', 'TSH']
# b_fill_df = df.loc[:, columns]
# b_fill_df = b_fill_df.fillna(np.nan)
# sex_to_num = {
#     'F':0,
#     'M':1,
# }
# b_fill_df['sex'] = b_fill_df['sex'].map(sex_to_num)
# knn = KNN(k=13)
# knn_imputed_df = knn.fit_transform(b_fill_df)
# knn_imputed_df = pd.DataFrame(knn_imputed_df, index=b_fill_df.index)
# knn_imputed_df = knn_imputed_df.rename(columns=dict(zip(knn_imputed_df.columns,columns)))
# df.update(knn_imputed_df)

Normal class:  0    -
Name: target, dtype: object
Imputing row 1/6652 with 1 missing, elapsed time: 3.953
Imputing row 101/6652 with 0 missing, elapsed time: 3.956
Imputing row 201/6652 with 1 missing, elapsed time: 3.957
Imputing row 301/6652 with 1 missing, elapsed time: 3.958
Imputing row 401/6652 with 0 missing, elapsed time: 3.959
Imputing row 501/6652 with 0 missing, elapsed time: 3.960
Imputing row 601/6652 with 1 missing, elapsed time: 3.960
Imputing row 701/6652 with 0 missing, elapsed time: 3.961
Imputing row 801/6652 with 0 missing, elapsed time: 3.962
Imputing row 901/6652 with 1 missing, elapsed time: 3.963
Imputing row 1001/6652 with 0 missing, elapsed time: 3.964
Imputing row 1101/6652 with 0 missing, elapsed time: 3.964
Imputing row 1201/6652 with 1 missing, elapsed time: 3.965
Imputing row 1301/6652 with 0 missing, elapsed time: 3.965
Imputing row 1401/6652 with 0 missing, elapsed time: 3.966
Imputing row 1501/6652 with 0 missing, elapsed time: 3.967
Imputing row 1601/

In [70]:
# def get_null(df):
#     null_cols = df.columns[df.isnull().any()]
#     nulls_df = df.loc[:, null_cols]
#     nulls_df = pd.DataFrame(nulls_df.isna().sum()/df.shape[0]*100)
#     nulls_df = nulls_df.rename(columns={0: 'Missing Values %'})
#     return nulls_df

# # get_null(knn_imputed_df).head()
# # knn_imputed_df.isnull().any()
# # knn_imputed_df.loc[:, df.isna().any()]
# # df.isnull().any()
# df1['target'].unique()
# # knn_imputed_df.isnull().any()
# model_df['target'].unique()

array([1, 0, 2])

In [64]:
# columns = ['age', 'TT4', 'T3', 'T4U', 'FTI', 'TSH', 'pregnant', 'target']
# model_df = df.loc[:, columns]
# model_df['pregnant'] = model_df['pregnant'].replace({'t': 1, 'f': 0})
# target_to_num = {
#     'Negative': 0,
#     'Hypothyroid':1,
#     'Hyperthyroid':2,
# }
# model_df['target'] = model_df['target'].map(target_to_num)
# x = model_df.iloc[:, :-1]
# y = model_df.iloc[:,-1]



  model_df['pregnant'] = model_df['pregnant'].replace({'t': 1, 'f': 0})


In [74]:
# all_df,m,c = get_thyroid_df()
# x = all_df.iloc[:, :-1]
# y = all_df.iloc[:,-1]

# x_smot, y_smot = SMOTE().fit_resample(x, y)


# x_train, x_test, y_train, y_test = train_test_split(x_smot, y_smot, test_size=0.33, random_state=89)

# knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(x_train, y_train)
# knn_predictions = knn.predict(x_test)
# cr = classification_report(y_test, knn_predictions, output_dict=True)

Normal class:  0    -
Name: target, dtype: object


ValueError: could not convert string to float: 'F'

In [67]:
# cr

{'0': {'precision': 0.9748743718592965,
  'recall': 0.9122257053291536,
  'f1-score': 0.9425101214574899,
  'support': 1914.0},
 '1': {'precision': 0.9655347187024835,
  'recall': 0.98858329008822,
  'f1-score': 0.9769230769230769,
  'support': 1927.0},
 '2': {'precision': 0.9499248873309964,
  'recall': 0.9880208333333333,
  'f1-score': 0.9685984171559867,
  'support': 1920.0},
 'accuracy': 0.9630272522131574,
 'macro avg': {'precision': 0.9634446592975922,
  'recall': 0.9629432762502356,
  'f1-score': 0.9626772051788511,
  'support': 5761.0},
 'weighted avg': {'precision': 0.9634352949755064,
  'recall': 0.9630272522131574,
  'f1-score': 0.9627155185974482,
  'support': 5761.0}}