## Pakete

In [None]:
!pip install scikit-learn

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

## Allgemeine Information zum Datensatz

In [2]:
# Load the data set from the CSV file into a DataFrame.
aids = pd.read_csv("data/aids.csv", sep=",")

In [4]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
aids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 25 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   num      2139 non-null   int64  
 1   time     2139 non-null   int64  
 2   trt      2139 non-null   int64  
 3   age      2139 non-null   int64  
 4   wtkg     2139 non-null   float64
 5   hemo     2139 non-null   int64  
 6   homo     2139 non-null   int64  
 7   drugs    2139 non-null   int64  
 8   karnof   2139 non-null   int64  
 9   oprior   2139 non-null   int64  
 10  z30      2139 non-null   int64  
 11  zprior   2139 non-null   int64  
 12  preanti  2139 non-null   int64  
 13  race     2139 non-null   int64  
 14  gender   2139 non-null   int64  
 15  str2     2139 non-null   int64  
 16  strat    2139 non-null   int64  
 17  symptom  2139 non-null   int64  
 18  treat    2139 non-null   int64  
 19  offtrt   2139 non-null   int64  
 20  cd40     2139 non-null   int64  
 21  cd420    2139 

In [5]:
# Preview the first rows of the data set.
aids.head()

Unnamed: 0,num,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,...,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820,cid
0,0,948,2,48,89.8128,0,0,0,100,0,...,0,1,0,1,0,422,477,566,324,0
1,1,1002,3,61,49.4424,0,0,0,90,0,...,1,3,0,1,0,162,218,392,564,1
2,2,961,3,45,88.452,0,1,1,90,0,...,1,3,0,1,1,326,274,2063,1893,0
3,3,1166,3,47,85.2768,0,1,0,100,0,...,1,3,0,1,0,287,394,1590,966,0
4,4,1090,0,43,66.6792,0,1,0,100,0,...,1,3,0,0,0,504,353,870,782,0


## Dictionary für das Labeling erstellen

Die Beschreibung der einzelnen Features befindet sich im `README`. Für die nicht kontinuierlichen Features werden Labels erstellt. Die Information in diesen Features liegt bereits numerisch kodiert vor; daher muss kein Replacement in den Daten erfolgen.

In [11]:
# Value labels for the non-continuous features, keyed by column name.
# Each inner dict maps the numeric code stored in the data to a readable label.
# The mappings are kept in `feature_labels` so that later cells can reuse them;
# the data itself stays numerically coded.
# NOTE(review): 'befor' and 'symtomatic' are misspelled in the label texts;
# they are left unchanged here so the recorded cell output remains valid.
feature_labels = {
    # treatment arm
    "trt": {0: 'ZDV only', 1: 'ZDV + ddI', 2: 'ZDV + Zal', 3: 'ddI only'},
    # hemophilia, 0=no, 1=yes
    "hemo": {0: 'no hemophilia', 1: 'hemophilia'},
    # homosexual activity, 0=no, 1=yes
    "homo": {0: 'no homosexual', 1: 'homosexual'},
    # intravenous drug use, 0=no, 1=yes
    "drugs": {0: 'no IV drugs use', 1: 'IV drugs use'},
    # prior antiretroviral therapy other than ZDV (zidovudine), 0=no, 1=yes
    "oprior": {
        0: 'no prior antiretroviral therapy (no ZDV)',
        1: 'prior antiretroviral therapy (no ZDV)',
    },
    # ZDV therapy within 30 days before the 175 randomisation, 0=no, 1=yes
    "z30": {
        0: 'no ZDV-Therapy 30 days befor randomisation',
        1: 'ZDV-Therapy 30 days befor randomisation',
    },
    # ZDV therapy before the 175 randomisation, 0=no, 1=yes
    "zprior": {
        0: 'no ZDV-Therapy befor randomisation',
        1: 'ZDV-Therapy befor randomisation',
    },
    # ethnicity, 0=White, 1=non-white
    "race": {0: 'White', 1: 'Non-White'},
    # gender
    "gender": {0: 'Female', 1: 'Male'},
    # antiretroviral history, 0=naive, 1=experienced
    "str2": {0: 'naive', 1: 'experienced'},
    # antiretroviral history stratification (codes start at 1)
    "strat": {
        1: 'Antiretroviral Naive',
        2: '> 1 but <= 52 weeks of prior antiretroviral therapy',
        3: '> 52 weeks',
    },
    # (viral) symptoms, 0=asymptomatic, 1=symptomatic
    "symptom": {0: 'asymptomatic', 1: 'symtomatic'},
    # treatment indicator, 0=ZDV only, 1=others
    "treat": {0: 'ZDV only', 1: 'others'},
    # target, 1=failure, 0=censoring
    "cid": {0: 'censoring', 1: 'failure'},
}

# Print one line per feature: the column name followed by its code -> label map.
for feature, mapping in feature_labels.items():
    print(feature, mapping)


trt {0: 'ZDV only', 1: 'ZDV + ddI', 2: 'ZDV + Zal', 3: 'ddI only'}
hemo {0: 'no hemophilia', 1: 'hemophilia'}
homo {0: 'no homosexual', 1: 'homosexual'}
drugs {0: 'no IV drugs use', 1: 'IV drugs use'}
oprior {0: 'no prior antiretroviral therapy (no ZDV)', 1: 'prior antiretroviral therapy (no ZDV)'}
z30 {0: 'no ZDV-Therapy 30 days befor randomisation', 1: 'ZDV-Therapy 30 days befor randomisation'}
zprior {0: 'no ZDV-Therapy befor randomisation', 1: 'ZDV-Therapy befor randomisation'}
race {0: 'White', 1: 'Non-White'}
gender {0: 'Female', 1: 'Male'}
str2 {0: 'naive', 1: 'experienced'}
strat {1: 'Antiretroviral Naive', 2: '> 1 but <= 52 weeks of prior antiretroviral therapy', 3: '> 52 weeks'}
symptom {0: 'asymptomatic', 1: 'symtomatic'}
treat {0: 'ZDV only', 1: 'others'}
cid {0: 'censoring', 1: 'failure'}


In [None]:
from scipy.stats import chisquare

# Select the `homo` column. A DataFrame is indexed with square brackets;
# calling it like a function (`aids([...])`) raises a TypeError.
b = aids["homo"]
# TODO: `a` is not defined in this cell; define it before enabling the call below.
#chisquare([a], [b])

In [28]:
# Count how often each value of the target column `cid` occurs.
aids["cid"].value_counts()

cid
0    1618
1     521
Name: count, dtype: int64