In [105]:
import time
import numpy as np
import pandas as pd
from os.path import join
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def read_data():
    folder = 'data'
    df = pd.read_csv(join(folder, '10.25.14 us and uk 900 - Values Transformed - No Date No Post.csv'))
    # move class to first column
    df = df[['Dx_Chosen'] + [c for c in df if c not in ['Dx_Chosen']]]  
    df['Onset_Age'].fillna(df['Onset_Age'].mean(), inplace=True)
    df['Dx_Chosen'].fillna('None', inplace=True)  # fill class column Nan with 'None'
    # drop Initial_Dx column, drop the last 3 columns as they only contain 1 unique value
    df = df.drop(['Initial_Dx','Dx_Change','New_Dx','Part3_5_FS_ARDS',
                 'Part3_5_FS_lung','Part3_5_still_ARDS_24p'], 1)  
    x = df.iloc[:, 1:]
    y = df.iloc[:, 0]
    return x, y

def get_discrete_features(x):
    features = list(x.columns.values)
    discrete_features = []
    for f in features:
        if x[f].dtype == np.object:
            discrete_features.append(f)
    return discrete_features

start = time.time()
x, y = read_data()
dis_f = get_discrete_features(x)

x_dummies = pd.get_dummies(x, columns=dis_f, dummy_na=True)  # one hot encoding
#x_dummies.isnull().sum()

x_train, x_test, y_train, y_test = train_test_split(x_dummies, y, test_size=0.2, random_state=0)
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(classification_report(y_test, y_pred))
print('--- running time: %.4f seconds ---' % (time.time() - start))

              precision    recall  f1-score   support

  ankylosing       0.85      0.94      0.89        35
   psoriatic       0.77      0.71      0.74        14
  rheumatoid       0.84      0.82      0.83        39
     sjogren       0.94      0.62      0.75        24
       still       1.00      0.82      0.90        17
    systemic       0.82      0.96      0.88        51

    accuracy                           0.85       180
   macro avg       0.87      0.81      0.83       180
weighted avg       0.86      0.85      0.85       180

--- running time: 0.6996 seconds ---


In [88]:
from sklearn.svm import SVC
start = time.time()
clf = SVC(gamma='auto')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))
print('--- running time: %.4f seconds ---' % (time.time() - start))

              precision    recall  f1-score   support

  ankylosing       0.83      0.69      0.75        35
   psoriatic       0.00      0.00      0.00        14
  rheumatoid       0.36      0.62      0.45        39
     sjogren       0.00      0.00      0.00        24
       still       0.00      0.00      0.00        17
    systemic       0.48      0.78      0.59        51

    accuracy                           0.49       180
   macro avg       0.28      0.35      0.30       180
weighted avg       0.37      0.49      0.41       180

--- running time: 2.1455 seconds ---


  'precision', 'predicted', average, warn_for)


In [104]:
from sklearn.neural_network import MLPClassifier
start = time.time()
mlp = MLPClassifier(hidden_layer_sizes=(60, 60, 60), random_state=0)
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
print(classification_report(y_test, y_pred))
print('--- running time: %.4f seconds ---' % (time.time() - start))

              precision    recall  f1-score   support

  ankylosing       0.86      0.89      0.87        35
   psoriatic       0.72      0.93      0.81        14
  rheumatoid       0.84      0.79      0.82        39
     sjogren       0.77      0.71      0.74        24
       still       1.00      0.82      0.90        17
    systemic       0.87      0.90      0.88        51

    accuracy                           0.84       180
   macro avg       0.84      0.84      0.84       180
weighted avg       0.85      0.84      0.84       180

--- running time: 1.5927 seconds ---


### feature importance

In [116]:
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
features = x_dummies.columns
for i in range(9):
    print("%2d) %-*s %f" % (i + 1, 30, features[indices[i]], importances[indices[i]]))

 1) Part3_6_Ank_nan                0.047970
 2) Part3_3_sys_nan                0.038582
 3) Part3_3_sys_other_Text_nan     0.019043
 4) Part3_3_sys_yes                0.014967
 5) Country_united_kingdom         0.013343
 6) Part3_1_rheu_nan               0.013260
 7) Ex5_30d_Ps_pre_arthritis_nan   0.012752
 8) Ex5_30c_Ps_Dx_yes              0.011491
 9) Part3_4_sjo_nan                0.011096


In [126]:
from sklearn.preprocessing import scale
#from sklearn.preprocessing import MinMaxScaler
"""
reference: https://stackoverflow.com/questions/35249760/
using-scikit-to-determine-contributions-of-each-feature-to-a-specific-class-pred/35255612
"""
def class_feature_importance(X, Y, feature_importances):
    N, M = X.shape
    X = scale(X)
    out = {}
    for c in set(Y):
        out[c] = np.mean(X[Y==c, :], axis=0)*feature_importances
    return out

result = class_feature_importance(x_dummies, y, importances)
result = pd.DataFrame(result)
result

Unnamed: 0,ankylosing,systemic,rheumatoid,psoriatic,still,sjogren
0,-0.004952,-0.001636,0.003424,0.001256,0.000314,0.002736
1,-0.000999,0.000449,0.000361,0.000010,-0.001139,0.000387
2,0.000488,-0.000219,-0.000176,-0.000005,0.000556,-0.000189
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,-0.005511,0.011030,-0.002661,-0.005747,-0.002217,-0.004928
...,...,...,...,...,...,...
2422,-0.000048,0.000004,-0.000166,0.000075,0.000427,0.000036
2423,0.000013,-0.000016,0.000016,-0.000008,-0.000121,0.000060
2424,-0.000005,0.000056,-0.000030,-0.000034,-0.000127,0.000038
2425,0.000090,-0.000087,0.000275,0.000024,-0.000234,-0.000274


In [129]:
ankylosing = result['ankylosing']
index = np.argsort(ankylosing)[::-1]
for i in range(9):
    print("%2d) %-*s %f" % (i + 1, 50, features[index[i]], importances[index[i]]))

 1) Part3_6_Ank_nan                                    0.047970
 2) Part3_3_sys_yes                                    0.014967
 3) Country_united_kingdom                             0.013343
 4) Onset_Age                                          0.008442
 5) Part3_6_Ank_other_Text_nan                         0.003761
 6) Ex5_30_Rash_24_yes,_i_first_noticed_rashes_between_0_and_<_12_months_from_initial_onset 0.004861
 7) Ex4_7_Sac_0_12_no                                  0.003647
 8) Ex5_12_Sw_24_yes,_i_noticed_swelling_in_the_locations_i_experienced_paintenderness_between_0_and_<_12_months_from_initial_onset 0.004729
 9) Ex5_30c_Ps_Dx_no                                   0.006057


In [130]:
systemic = result['systemic']
index = np.argsort(systemic)[::-1]
for i in range(9):
    print("%2d) %-*s %f" % (i + 1, 50, features[index[i]], importances[index[i]]))

 1) Part3_3_sys_nan                                    0.038582
 2) Part3_3_sys_other_Text_nan                         0.019043
 3) Country_united_states                              0.007155
 4) Ex5_30c_Ps_Dx_nan                                  0.008210
 5) Part3_6_Ank_yes,_i_was_told_for_the_first_time_prior_to_or_at_diagnosis_that_my_blood_tests_were_positive_for_the_hla-b27_gene. 0.010114
 6) Ex5_30c_Ps_Dx_yes                                  0.011491
 7) Ex5_30_Rash_butterfly_no                           0.004634
 8) Onset_Age                                          0.008442
 9) Ex5_24_Hair_Loss_24_no                             0.003313
