In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import time
import shap
import xgboost
from sklearn.preprocessing import MinMaxScaler


# testdaten für die presi holen
import os
os.getcwd()
os.chdir(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

df = pd.read_csv('data/aidsenc.csv', sep =",")

In [2]:
df.shape

(2139, 18)

In [3]:
df.columns.shape

(18,)

In [4]:
df.columns

Index(['Unnamed: 0', 'age', 'wtkg', 'karnof', 'cd80', 'cid', 'trt_1', 'trt_2',
       'trt_3', 'hemo_1', 'homo_1', 'drugs_1', 'z30_1', 'race_1', 'gender_1',
       'strat_2', 'strat_3', 'symptom_1'],
      dtype='object')

## Datenspliting

In [3]:
X=df.drop(['cid'],axis=1)
y=df['cid']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=52)

print('X Train: {}'.format(X_train.shape)) 
print('Y Train: {}'.format(y_train.shape)) 
print('X Test: {}'.format(X_test.shape)) 
print('Y Test: {}'.format(y_test.shape))


X Train: (1497, 17)
Y Train: (1497,)
X Test: (642, 17)
Y Test: (642,)


## Verleich Trainings- und Testdaten

In [4]:
import numpy as np
import scipy.stats
from scipy.stats import wilcoxon

In [5]:
numset = df[['age', 'wtkg', 'karnof', 'cd80']]
#dfnum = pd.df.select(numset, axis=1)


In [9]:
numset.head()

Unnamed: 0,age,wtkg,karnof,cd80
0,48,89.8128,100,566
1,61,49.4424,90,392
2,45,88.452,90,2063
3,47,85.2768,100,1590
4,43,66.6792,100,870


In [6]:
def calculate_mean(dataframe, feat):
        return np.mean(dataframe[feat].to_numpy())



for feature in numset.columns:
    print("Feature: {}, mean train: {}, mean test: {}".format(feature, calculate_mean(X_train, feature), calculate_mean(X_test, feature)))


Feature: age, mean train: 35.30995323981296, mean test: 35.10436137071651
Feature: wtkg, mean train: 75.24372622578491, mean test: 74.84919165109035
Feature: karnof, mean train: 95.4442217768871, mean test: 95.45171339563863
Feature: cd80, mean train: 984.3974615898463, mean test: 991.8271028037383


## Transformierung

In [7]:
#from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train['karnof'] = scaler.fit_transform(X_train['karnof'].to_numpy().reshape(-1,1))
X_train['age'] = scaler.fit_transform(X_train['age'].to_numpy().reshape(-1,1))
X_train['wtkg'] = scaler.fit_transform(X_train['wtkg'].to_numpy().reshape(-1,1))
X_train['cd80'] = scaler.fit_transform(X_train['cd80'].to_numpy().reshape(-1,1))

X_test['karnof'] = scaler.fit_transform(X_test['karnof'].to_numpy().reshape(-1,1))
X_test['age'] = scaler.fit_transform(X_test['age'].to_numpy().reshape(-1,1))
X_test['wtkg'] = scaler.fit_transform(X_test['wtkg'].to_numpy().reshape(-1,1))
X_test['cd80'] = scaler.fit_transform(X_test['cd80'].to_numpy().reshape(-1,1))


In [8]:
def calculate_mean(dataframe, feat):
        return np.mean(dataframe[feat].to_numpy())



for feature in numset.columns:
    print("Feature: {}, mean train: {}, mean test: {}".format(feature, calculate_mean(X_train, feature), calculate_mean(X_test, feature)))


Feature: age, mean train: 0.4018957455140166, mean test: 0.39835105811580196
Feature: wtkg, mean train: 0.34313592238851587, mean test: 0.41276409118591406
Feature: karnof, mean train: 0.8481407258962368, mean test: 0.8483904465212874
Feature: cd80, mean train: 0.24937878573801067, mean test: 0.17538512572912152


## Support Vector Machine

In [13]:
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel
clf.fit(X_train, y_train) #training

#Predict the response for test dataset
y_pred = clf.predict(X_test)

from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) # how often is the classifier correct?


Accuracy: 0.7414330218068536


## Random Forest

## Logistische Regression

In [15]:
from sklearn.linear_model import LogisticRegression
# training
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

#prediction
y_pred = logreg.predict(X_test)
print("Accuracy Score: " + str(logreg.score(X_test,y_test)))

Accuracy Score: 0.7398753894080997


## Evaluation

In [16]:

def acu(y_pred,y_test):
    return metrics.accuracy_score(y_test, y_pred)

def train_evaluate_time(method):
    start_time = time.time()
    methods[method].fit(X_train, y_train)
    running_time = round(time.time() - start_time,2)
    model_predict = methods[method].predict(X_test)
    model_acu = acu(model_predict,y_test)
    print("Genauigkeit von {}: {}; running time: {}".format(method, model_acu, running_time))


methods = {"Log Regression": LogisticRegression(), "svm": svm.SVC()}


for method in methods:
    train_evaluate_time(method)

Genauigkeit von Log Regression: 0.7398753894080997; running time: 0.01
Genauigkeit von svm: 0.7398753894080997; running time: 0.03


## Interpretation

In [None]:
import matplotlib as plt
# Shap value nehmen
explainer = shap.KernelExplainer(methods["svm"].predict, X_train)
shap_values = explainer.shap_values(X_test)
#shap_sample = explainer.shap.sample(X_test, 150)
plt.figure(figsize=(10,10))
plt.title("SHAP Values: {}".format(method))
shap.summary_plot(shap_sample, X_train)
plt.show()

Using 1497 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
  0%|          | 3/642 [09:26<33:37:38, 189.45s/it]