In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sp
import tabulate as tb
from statsmodels.miscmodels.ordinal_model import OrderedModel

SAHtrain = pd.read_csv('trainSAHdata.csv')
SAHtrain['evd']=SAHtrain['evd'].fillna(SAHtrain['evd'].value_counts().index[0])

SAHtest = pd.read_csv('testSAHdata.csv')


var = ['race', 'etiology', 'treatment','hh','disposition','gender','htn', 
          'dm', 'smoking', 'cad', 'dyslipidemia', 'shunt','evd']

#legend for the encoded variables
SAHtrain['gender'] = SAHtrain['gender'].replace([1,2],['female','male'])
SAHtrain['race'] = SAHtrain['race'].replace([1,2,3,4,5],['white','african american','asian','other','hispanic'])
SAHtrain['etiology'] = SAHtrain['etiology'].replace([1,2,21],['aneurysmal','angio negative','aneurysmal'])
SAHtrain['treatment'] = SAHtrain['treatment'].replace([1,2,3,4],['clip',
                                                                   'endovascular thrombectomy',
                                                                   'no treatment',
                                                                   'angio negative'
                                                                  ])
SAHtrain['disposition'] = SAHtrain['disposition'].replace([1,2,3,4,5,6,7],['good',
                                                                           'good',
                                                                          'bad',
                                                                           'bad',
                                                                           'bad',
                                                                           'bad',
                                                                          'bad'
                                                                          ])
SAHtest['gender'] = SAHtest['gender'].replace([1,2],['female','male'])
SAHtest['hh'] = SAHtest['hh'].replace([6],[5])
SAHtest['race'] = SAHtest['race'].replace([1,2,3,4,5],['white','african american','asian','other','hispanic'])
SAHtest['etiology'] = SAHtest['etiology'].replace([1,2,21],['aneurysmal','angio negative','aneurysmal'])
SAHtest['treatment'] = SAHtest['treatment'].replace([1,2,3,4],['clip',
                                                                   'endovascular thrombectomy',
                                                                   'no treatment',
                                                                   'angio negative'
                                                                  ])
SAHtest['disposition'] = SAHtest['disposition'].replace([1,2,3,4,5,6,7],['good',
                                                                           'good',
                                                                          'bad',
                                                                           'bad',
                                                                           'bad',
                                                                           'bad',
                                                                          'bad'
                                                                          ])

In [2]:
SAHtrain.info()
SAHtest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     911 non-null    int64  
 1   record_id      911 non-null    int64  
 2   age            911 non-null    int64  
 3   race           911 non-null    object 
 4   gender         911 non-null    object 
 5   htn            911 non-null    float64
 6   dm             911 non-null    float64
 7   smoking        911 non-null    float64
 8   cad            911 non-null    float64
 9   dyslipidemia   911 non-null    float64
 10  treatment      911 non-null    object 
 11  shunt          911 non-null    float64
 12  hh             911 non-null    float64
 13  etiology       911 non-null    object 
 14  evd            911 non-null    float64
 15  disposition    911 non-null    object 
 16  imagepath_new  911 non-null    object 
dtypes: float64(8), int64(3), object(6)
memory usage: 121.1

In [3]:
#For all patients in training set give count and percentage of each variable

print('mean age =', SAHtrain.age.mean(),'std dev =', SAHtrain.age.std())
for x in var:
    counts = SAHtrain[x].value_counts()
    percs = SAHtrain[x].value_counts(normalize = True)
    print(x, pd.concat([counts,percs], axis=1, keys=['count', 'percentage']))

mean age = 54.298572996706916 std dev = 13.464678584726599
race                   count  percentage
white               401    0.440176
african american    293    0.321625
other               165    0.181120
hispanic             28    0.030735
asian                24    0.026345
etiology                 count  percentage
aneurysmal        647    0.710209
angio negative    264    0.289791
treatment                            count  percentage
endovascular thrombectomy    450    0.493963
angio negative               264    0.289791
clip                         126    0.138310
no treatment                  71    0.077936
hh      count  percentage
1.0    386    0.423710
3.0    324    0.355653
4.0    113    0.124040
5.0     66    0.072448
2.0     22    0.024149
disposition       count  percentage
good    651    0.714599
bad     260    0.285401
gender         count  percentage
female    602    0.660812
male      309    0.339188
htn      count  percentage
1.0    504    0.553238
0.0    407    

In [4]:
#For all patients in testing set give count and percentage of each variable

print('mean age =', SAHtest.age.mean(),'std dev =', SAHtest.age.std())
for x in var:
    counts = SAHtest[x].value_counts()
    percs = SAHtest[x].value_counts(normalize = True)
    print(x, pd.concat([counts,percs], axis=1, keys=['count', 'percentage']))

mean age = 54.8640350877193 std dev = 14.422935889810471
race                   count  percentage
white                88    0.385965
african american     86    0.377193
other                39    0.171053
hispanic              8    0.035088
asian                 7    0.030702
etiology                 count  percentage
aneurysmal        164    0.719298
angio negative     64    0.280702
treatment                            count  percentage
endovascular thrombectomy    101    0.442982
angio negative                63    0.276316
clip                          41    0.179825
no treatment                  23    0.100877
hh      count  percentage
3.0     91    0.399123
1.0     84    0.368421
5.0     28    0.122807
4.0     21    0.092105
2.0      4    0.017544
disposition       count  percentage
good    154    0.675439
bad      74    0.324561
gender         count  percentage
female    150    0.657895
male       78    0.342105
htn      count  percentage
1.0    129    0.565789
0.0     99    0.

In [5]:
# make a new column called 'traintest'
# if the record_id is found in SAHtrain, assign 1 but if the record_id is found in SAHtest, assign 0
df = SAHtrain.concat(SAHtest)
df.info()

#df = df.assign(traintest = df['record_id'].isin(SAHtrain['record_id']).astype(int))
#df['hh'] = df['hh'].replace([6],[5])

#for x in var:
#    counts = df[x].value_counts()
#    percs = df[x].value_counts(normalize = True)
#    print(x, pd.concat([counts,percs], axis=1, keys=['count', 'percentage']))

#print('mean age =', df.age.mean(),'std dev =', df.age.std())

#ages = sp.f_oneway(df['age'],
#               df['age'][df['traintest'] == 1],
#               df['age'][df['traintest'] == 0])
#print(ages)

#for x in var:
#    x_stat, x_p, x_dof, x_expected = sp.chi2_contingency(pd.crosstab(df['traintest'], df[x]))
#    print(x, x_stat, x_dof, x_p)
#    print(pd.DataFrame(x_expected))

AttributeError: 'DataFrame' object has no attribute 'concat'

array([1., 3., 4., 5., 2.])