In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd


# testdaten für die presi holen
import os
os.getcwd()
os.chdir(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

aidspresi = pd.read_csv('data/aids.csv', sep =",")

In [2]:
aidspresi.shape

(2139, 25)

In [3]:
aidspresi = aidspresi.drop(["num","time","oprior","z30","preanti","str2","treat","offtrt","cd40","cd420","cd820"],axis=1)

In [4]:
aidspresi.columns.shape

(14,)

In [5]:
aidspresi.columns

Index(['trt', 'age', 'wtkg', 'hemo', 'homo', 'drugs', 'karnof', 'zprior',
       'race', 'gender', 'strat', 'symptom', 'cd80', 'cid'],
      dtype='object')

## Datenspliting

In [6]:
X=aidspresi.drop(['cid'],axis=1)
y=aidspresi['cid']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=52)

print('X Train: {}'.format(X_train.shape)) 
print('Y Train: {}'.format(y_train.shape)) 
print('X Test: {}'.format(X_test.shape)) 
print('Y Test: {}'.format(y_test.shape))


X Train: (1497, 13)
Y Train: (1497,)
X Test: (642, 13)
Y Test: (642,)


## Verleich Trainings- und Testdaten

In [7]:
import numpy as np
import scipy.stats
from scipy.stats import wilcoxon

In [8]:
notsteti = ['trt', 'hemo', 'homo', 'drugs', 'karnof', 'zprior','race', 'gender', 'strat', 'symptom', 'cid']
aidsnume = aidspresi.drop(notsteti, axis=1)


In [9]:
def calculate_mean(dataframe, feat):
        return np.mean(dataframe[feat].to_numpy())



for feature in aidsnume.columns:
    print("Feature: {}, mean train: {}, mean test: {}".format(feature, calculate_mean(X_train, feature), calculate_mean(X_test, feature)))


Feature: age, mean train: 35.30995323981296, mean test: 35.10436137071651
Feature: wtkg, mean train: 75.24372622578491, mean test: 74.84919165109035
Feature: cd80, mean train: 984.3974615898463, mean test: 991.8271028037383


## Transformierung

In [10]:
from sklearn.preprocessing import StandardScaler
# standardisierung von `age`, `wtkg` und `cd80`

scaler = StandardScaler()
X_train['age'] = scaler.fit_transform(X_train['age'].to_numpy().reshape(-1,1))
X_train['wtkg'] = scaler.fit_transform(X_train['wtkg'].to_numpy().reshape(-1,1))
X_train['cd80'] = scaler.fit_transform(X_train['cd80'].to_numpy().reshape(-1,1))

X_test['age'] = scaler.fit_transform(X_test['age'].to_numpy().reshape(-1,1))
X_test['wtkg'] = scaler.fit_transform(X_test['wtkg'].to_numpy().reshape(-1,1))
X_test['cd80'] = scaler.fit_transform(X_test['cd80'].to_numpy().reshape(-1,1))


In [11]:
def calculate_mean(dataframe, feat):
        return np.mean(dataframe[feat].to_numpy())



for feature in aidsnume.columns:
    print("Feature: {}, mean train: {}, mean test: {}".format(feature, calculate_mean(X_train, feature), calculate_mean(X_test, feature)))


Feature: age, mean train: -3.7971555685242497e-16, mean test: 1.7984921271186335e-17
Feature: wtkg, mean train: -6.099181131942076e-16, mean test: -6.446902547979102e-16
Feature: cd80, mean train: 4.805775016413503e-17, mean test: 6.087204122555375e-17


## Support Vector Machine

In [12]:
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel
clf.fit(X_train, y_train) #training

#Predict the response for test dataset
y_pred = clf.predict(X_test)

from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) # how often is the classifier correct?


Accuracy: 0.7398753894080997


In [13]:
import time
import shap
import xgboost

## Logistische Regression

In [14]:
from sklearn.linear_model import LogisticRegression
# training
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

#prädiction
y_pred = logreg.predict(X_test)
print("Accuracy Score: " + str(logreg.score(X_test,y_test)))

Accuracy Score: 0.7429906542056075


## Evaluation

In [15]:

def acu(y_pred,y_test):
    return metrics.accuracy_score(y_test, y_pred)

def train_evaluate_time(method):
    start_time = time.time()
    methods[method].fit(X_train, y_train)
    running_time = round(time.time() - start_time,2)
    model_predict = methods[method].predict(X_test)
    model_acu = acu(model_predict,y_test)
    print("Genauigkeit von {}: {}; running time: {}".format(method, model_acu, running_time))


methods = {"Log Regression": LogisticRegression(), "svm": svm.SVC()}


for method in methods:
    train_evaluate_time(method)

Genauigkeit von Log Regression: 0.7429906542056075; running time: 0.02
Genauigkeit von svm: 0.7398753894080997; running time: 0.05


## Interpretation

In [None]:
import matplotlib as plt
# Shap value nehmen
explainer = shap.KernelExplainer(methods["svm"].predict, X_train)
shap_values = explainer.shap_values(X_test)
#shap_sample = explainer.shap.sample(X_test, 150)
plt.figure(figsize=(10,10))
plt.title("SHAP Values: {}".format(method))
shap.summary_plot(shap_sample, X_train)
plt.show()

Using 1497 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
  2%|▏         | 11/642 [1:13:02<151:22:39, 863.64s/it]