In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

np.random.seed(130298)

In [2]:
%pwd

'C:\\Users\\Eleonora\\statistical learning for healthcare data\\heart-failure-project'

In [26]:
data = pd.read_csv('example.csv')
data.set_index('inpatient.number', inplace = True)
data.shape

(2008, 108)

In [3]:
categorical_columns = ['DestinationDischarge','admission.ward','admission.way','occupation','discharge.department','visit.times',
                       'gender', 'type.of.heart.failure', 'NYHA.cardiac.function.classification', 'Killip.grade','CCI.score',
                       'consciousness','eye.opening','verbal.response', 'movement','oxygen.inhalation', 'outcome.during.hospitalization',
                       'GCS','ageCat']

prova = categorical_columns.copy()
binary_columns = ['myocardial.infarction', 'congestive.heart.failure', 'peripheral.vascular.disease',
                  'cerebrovascular.disease', 'dementia', 'Chronic.obstructive.pulmonary.disease',
                  'peptic.ulcer.disease', 'diabetes', 'moderate.to.severe.chronic.kidney.disease',
                  'solid.tumor', 'liver.disease', 'type.II.respiratory.failure', 'death.within.28.days', 're.admission.within.28.days', 
                  'death.within.3.months','re.admission.within.3.months', 'death.within.6.months', 're.admission.within.6.months',
                  'return.to.emergency.department.within.6.months','respiratory.support.']
# NB 'respiratory.support.' is in the binary variables because we will rebin it in 0/1

prova.extend(binary_columns)
continuous_columns = [col_name for col_name in data.columns if col_name not in prova]


We perform a split by taking the readmission in 6 months as a reference

In [21]:
from sklearn.model_selection import train_test_split
X = data.drop(columns='re.admission.within.6.months')
y = data['re.admission.within.6.months']

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.15, random_state=1, shuffle=True,stratify=y)
X_train,X_val,y_train,y_val = train_test_split(X_train, y_train, test_size=0.177, random_state=1,shuffle=True,stratify=y_train)

Remove missing values 

In [28]:
X_train.dropna(inplace = True)
X_train[continuous_columns].shape

(464, 69)

While applying StandardScaler, each feature of your data should be normally distributed such that it will scale the distribution to a mean of zero and a standard deviation of one.

In [23]:
from sklearn.preprocessing import StandardScaler
data.dropna(inplace=True)
x = X_train.loc[:, continuous_columns].values
y = y_train
x = StandardScaler().fit_transform(x) # normalizing the features

Do PCA

In [29]:
from sklearn.decomposition import PCA
pca = PCA(n_components=69)
principalComponents = pca.fit_transform(x)
pca_data = pd.DataFrame(data = principalComponents)

In [30]:
pca_data = pd.concat([pca_data, y_train], axis = 1)

In [31]:
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

Explained variation per principal component: [1.03977203e-01 7.40045702e-02 6.92770034e-02 5.84380538e-02
 4.78375171e-02 3.90298687e-02 3.76704270e-02 3.44990445e-02
 3.27514460e-02 3.04746597e-02 2.75063684e-02 2.51235598e-02
 2.29380371e-02 2.22221004e-02 2.07259403e-02 2.05098316e-02
 1.94143468e-02 1.86469913e-02 1.75116047e-02 1.68238348e-02
 1.55504437e-02 1.49228172e-02 1.35716311e-02 1.34209294e-02
 1.29215379e-02 1.22232489e-02 1.19547258e-02 1.15179144e-02
 1.05046176e-02 9.78474845e-03 9.63789815e-03 9.56890045e-03
 8.81690564e-03 8.29980477e-03 8.12409948e-03 7.29744116e-03
 7.14575161e-03 6.51996865e-03 6.37220029e-03 5.57979673e-03
 5.48591635e-03 5.05722846e-03 4.71316162e-03 4.41713746e-03
 4.31971552e-03 4.04062232e-03 3.70916237e-03 3.45245306e-03
 3.08923612e-03 2.89779139e-03 2.78747988e-03 2.50034980e-03
 2.36915085e-03 1.98866684e-03 1.58393154e-03 1.21376237e-03
 9.27459570e-04 8.07364650e-04 5.72568564e-04 3.73926593e-04
 2.79123893e-04 1.86914888e-04 7.5956467

You will use the sklearn library to import the PCA module, and in the PCA method, you will pass the number of components (n_components=2) and finally call fit_transform on the aggregate data. Here, several components represent the lower dimension in which you will project your higher dimension data.



In [18]:
from sklearn.decomposition import PCA
pca_data = PCA(n_components=2)
principalComponents_data = pca_data.fit_transform(x)

In [33]:
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

Explained variation per principal component: [1.03977203e-01 7.40045702e-02 6.92770034e-02 5.84380538e-02
 4.78375171e-02 3.90298687e-02 3.76704270e-02 3.44990445e-02
 3.27514460e-02 3.04746597e-02 2.75063684e-02 2.51235598e-02
 2.29380371e-02 2.22221004e-02 2.07259403e-02 2.05098316e-02
 1.94143468e-02 1.86469913e-02 1.75116047e-02 1.68238348e-02
 1.55504437e-02 1.49228172e-02 1.35716311e-02 1.34209294e-02
 1.29215379e-02 1.22232489e-02 1.19547258e-02 1.15179144e-02
 1.05046176e-02 9.78474845e-03 9.63789815e-03 9.56890045e-03
 8.81690564e-03 8.29980477e-03 8.12409948e-03 7.29744116e-03
 7.14575161e-03 6.51996865e-03 6.37220029e-03 5.57979673e-03
 5.48591635e-03 5.05722846e-03 4.71316162e-03 4.41713746e-03
 4.31971552e-03 4.04062232e-03 3.70916237e-03 3.45245306e-03
 3.08923612e-03 2.89779139e-03 2.78747988e-03 2.50034980e-03
 2.36915085e-03 1.98866684e-03 1.58393154e-03 1.21376237e-03
 9.27459570e-04 8.07364650e-04 5.72568564e-04 3.73926593e-04
 2.79123893e-04 1.86914888e-04 7.5956467

In [34]:
np.cumsum(pca.explained_variance_ratio_)

array([0.1039772 , 0.17798177, 0.24725878, 0.30569683, 0.35353435,
       0.39256422, 0.43023464, 0.46473369, 0.49748513, 0.52795979,
       0.55546616, 0.58058972, 0.60352776, 0.62574986, 0.6464758 ,
       0.66698563, 0.68639998, 0.70504697, 0.72255857, 0.73938241,
       0.75493285, 0.76985567, 0.7834273 , 0.79684823, 0.80976977,
       0.82199302, 0.83394774, 0.84546566, 0.85597027, 0.86575502,
       0.87539292, 0.88496182, 0.89377873, 0.90207853, 0.91020263,
       0.91750007, 0.92464582, 0.93116579, 0.93753799, 0.94311779,
       0.94860371, 0.95366094, 0.9583741 , 0.96279123, 0.96711095,
       0.97115157, 0.97486073, 0.97831319, 0.98140242, 0.98430021,
       0.98708769, 0.98958804, 0.9919572 , 0.99394586, 0.99552979,
       0.99674356, 0.99767102, 0.99847838, 0.99905095, 0.99942488,
       0.999704  , 0.99989091, 0.99996687, 0.99999133, 0.99999979,
       1.        , 1.        , 1.        , 1.        ])

In [36]:
np.where(cum_expl_var>0.7)

(array([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68]),)