In [12]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

sys.path.append(os.path.abspath(".."))
from common import common

In [13]:
# Base directory used to build dataset file paths below; the value is read
# from the project-local `common` module.
base_path = common.base_path

In [14]:
def get_hepatitis_df():
    """Load the raw hepatitis dataset together with its column metadata.

    Reads ``HepatitisCdata.csv`` from ``{base_path}/datasources/hepatitis``
    and prints the mode of the target column as the "normal" class.

    Returns:
        tuple: ``(all_df, main_labels, config)`` where
            * ``all_df`` is the DataFrame exactly as read from the CSV,
            * ``main_labels`` is the column index of that DataFrame,
            * ``config`` is a dict naming the target column and the
              numerical / categorical / ordinal feature columns.
    """
    csv_path = f'{base_path}/datasources/hepatitis/HepatitisCdata.csv'

    # Column roles for this dataset:
    #   NUMERICAL_COLUMNS   -> intended to be standardized
    #   CATEGORICAL_COLUMNS -> intended to be one hot encoded
    #   ORDINAL_COLUMNS     -> intended to be label encoded
    settings = dict(
        TARGET_COLUMN='Category',
        NUMERICAL_COLUMNS=[
            'Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL',
            'CHE', 'CHOL', 'CREA', 'GGT', 'PROT',
        ],
        CATEGORICAL_COLUMNS=['Sex'],
        ORDINAL_COLUMNS=[],
    )

    raw_df = pd.read_csv(csv_path)
    column_labels = raw_df.columns

    # The most frequent target value is reported as the "normal" class.
    most_common_target = raw_df[settings['TARGET_COLUMN']].mode()
    print('Normal class: ', most_common_target)

    return (raw_df, column_labels, settings)

In [32]:
def get_processed_hepatitis_df():
    """Load the hepatitis dataset and apply the preprocessing steps.

    Steps, in order:
        1. Drop the ``'Unnamed: 0'`` column that comes with the CSV.
        2. Fill missing values in the numerical columns with each column's
           median.
        3. Label-encode the target column via ``common.label_encode`` and
           record the index <-> label mappings in ``config``.
        4. One-hot encode the categorical columns via
           ``common.one_hot_encode``.

    Side effects:
        Prints the target mapping, the encoded "normal" target and the final
        column labels (in addition to whatever ``get_hepatitis_df`` prints).

    Returns:
        tuple: ``(all_df, main_labels, config)`` where ``all_df`` is the
        processed DataFrame, ``main_labels`` its column index, and ``config``
        the dataset config extended with ``TARGET_DICT``,
        ``INV_TARGET_DICT`` and ``NORMAL_TARGET``.

    Raises:
        KeyError: if the ``'Unnamed: 0'`` column is absent, or if the label
            ``'0=Blood Donor'`` is not among the encoded target classes.
    """
    all_df, main_labels, config = get_hepatitis_df()
    target_column = config['TARGET_COLUMN']
    numerical_columns = config['NUMERICAL_COLUMNS']

    # Drop column
    all_df = all_df.drop(columns=['Unnamed: 0'])

    # Fix missing values: impute each numerical column with its own median.
    # (The original referenced an undefined name `c` here instead of `config`,
    # which raises NameError on a fresh kernel.)
    all_df[numerical_columns] = all_df[numerical_columns].fillna(
        all_df[numerical_columns].median()
    )

    # Label Encoder
    # NOTE(review): assumes common.label_encode returns an encoder exposing
    # `classes_` whose positions match the encoded integer values - confirm.
    le, all_df = common.label_encode(all_df, [target_column])

    config['TARGET_DICT'] = dict(enumerate(le.classes_))
    config['INV_TARGET_DICT'] = {
        label: index for index, label in config['TARGET_DICT'].items()
    }
    print('TARGET_DICT', config['TARGET_DICT'])

    # Encoded value of the class treated as "normal" for this dataset.
    config['NORMAL_TARGET'] = config['INV_TARGET_DICT']['0=Blood Donor']
    print('NORMAL_TARGET', config['NORMAL_TARGET'])

    # One Hot Encoder (the fitted encoder itself is not needed afterwards)
    _ohe, all_df = common.one_hot_encode(all_df, config['CATEGORICAL_COLUMNS'])

    # Column labels changed after dropping and one-hot encoding; refresh them.
    main_labels = all_df.columns
    print('main_labels', main_labels)

    return (all_df, main_labels, config)

In [37]:
# Run the full preprocessing pipeline; the returned
# (DataFrame, column labels, config) tuple is echoed as this cell's output.
get_processed_hepatitis_df()

Normal class:  0    0=Blood Donor
Name: Category, dtype: object
TARGET_DICT {0: '0=Blood Donor', 1: '0s=suspect Blood Donor', 2: '1=Hepatitis', 3: '2=Fibrosis', 4: '3=Cirrhosis'}
NORMAL_TARGET 0
main_labels Index(['Category', 'Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL',
       'CREA', 'GGT', 'PROT', 'Sex_f', 'Sex_m'],
      dtype='object')


(     Category  Age   ALB    ALP    ALT    AST   BIL    CHE  CHOL   CREA  \
 0           0   32  38.5   52.5    7.7   22.1   7.5   6.93  3.23  106.0   
 1           0   32  38.5   70.3   18.0   24.7   3.9  11.17  4.80   74.0   
 2           0   32  46.9   74.7   36.2   52.6   6.1   8.84  5.20   86.0   
 3           0   32  43.2   52.0   30.6   22.6  18.9   7.33  4.74   80.0   
 4           0   32  39.2   74.1   32.6   24.8   9.6   9.15  4.32   76.0   
 ..        ...  ...   ...    ...    ...    ...   ...    ...   ...    ...   
 610         4   62  32.0  416.6    5.9  110.3  50.0   5.57  6.30   55.7   
 611         4   64  24.0  102.8    2.9   44.4  20.0   1.54  3.02   63.0   
 612         4   64  29.0   87.3    3.5   99.0  48.0   1.66  3.63   66.7   
 613         4   46  33.0   66.2   39.0   62.0  20.0   3.56  4.20   52.0   
 614         4   59  36.0   66.2  100.0   80.0  12.0   9.07  5.30   67.0   
 
        GGT  PROT  Sex_f  Sex_m  
 0     12.1  69.0    0.0    1.0  
 1     15.6  76.5 