In [1]:
import time
import numpy as np
import pandas as pd
from os.path import join
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def read_data():
    folder = 'data'
    df = pd.read_excel(join(folder, '10.25.14 us and uk 900.xlsx'))
    # move class to first column
    df = df[['[Dx] Diagnosis Chosen'] + [c for c in df if c not in ['[Dx] Diagnosis Chosen']]]  
    df['[Dx] Diagnosis Chosen'].fillna('None', inplace=True)  # fill class column Nan with 'None'
    df = df.drop('[Initial_Dx] ', 1)  # drop Initial_Dx column
    x = df.iloc[:, 1:]
    y = df.iloc[:, 0]
    return x, y

def get_discrete_features(x):
    features = list(x.columns.values)
    discrete_features = []
    for f in features:
        if x[f].dtype == np.object:
            discrete_features.append(f)
    return discrete_features

In [2]:
def preprocess(x, y):  
    discrete_features = get_discrete_features(x)
    x[discrete_features] = x[discrete_features].astype(str)  # convert dtype 'object' to 'str'
    # one hot encoding for categorical feature values
    x_dummies = pd.get_dummies(x, columns=discrete_features, dummy_na=True)  
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')  # convert NaN to column mean value
    x_new = imp_mean.fit_transform(x_dummies)  
    return x_new, y
start = time.time()
x, y = read_data()
x_new, y_new = preprocess(x, y)
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)  # n_estimators: # trees, n_jobs -1 all cpu cores
accuracy = cross_val_score(rf, x_new, y_new, cv=5, scoring='accuracy')  # k-fold cross validation, k=cv
acc = accuracy.mean()
print('accuracy: ', acc)
print('--- running time: %.4f seconds ---' % (time.time() - start))

accuracy:  0.8318458880707228
--- running time: 8.6913 seconds ---


In [3]:
classes = np.unique(y)
classes

array(['None', 'ankylosing', 'psoriatic', 'rheumatoid', 'sjogren',
       'still', 'systemic'], dtype=object)

### preprocess dataset

In [4]:
# try different preprocessing
def preprocess2(x):
    dis_f = get_discrete_features(x)
    binary_f = []
    for f in dis_f:
        if len(x[f].unique()) == 2:
            binary_f.append(f)
    #for f in binary_f:
        #print(x[f].unique())

    cleanup_nums = {'[Gender] ':  {"Male": 1, "Female": 0}}  # binary transform
    x.replace(cleanup_nums, inplace=True)
    b_to_nums = {'Yes': 1, 'No': 0}  # binary transform
    nums = {f:b_to_nums for f in binary_f[1:]}
    x.replace(nums, inplace=True)
    non_binary = [i for i in dis_f if i not in binary_f]
    # convert dtype 'object' to 'str', necessary for get_dummies function
    x[non_binary] = x[non_binary].astype(str) 
    x_dummies = pd.get_dummies(x, columns=non_binary, dummy_na=True)  # one hot encoding
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')  # convert NaN to column mean value
    x_new = imp_mean.fit_transform(x_dummies)
    return x_new

start = time.time()
x, y = read_data()
x_new = preprocess2(x)
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
accuracy = cross_val_score(rf, x_new, y, cv=5, scoring='accuracy')
acc = accuracy.mean()
print('accuracy: ', acc)
print('--- running time: %.4f seconds ---' % (time.time() - start))

accuracy:  0.8341065846540815
--- running time: 8.6634 seconds ---


In [5]:
freq = y.value_counts()
freq

systemic      241
rheumatoid    199
ankylosing    157
sjogren       125
psoriatic      93
still          66
None           19
Name: [Dx] Diagnosis Chosen, dtype: int64