In [136]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

In [137]:
# Path to the input CSV, given relative to the notebook's working directory.
data_path = '../data/fallo_cardiaco.csv'

# Read the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=data_path)

In [138]:
# Display the first five rows (the DataFrame.head default) of the loaded data.
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [139]:
# Separate the target column (DEATH_EVENT) from the feature columns.
labels = df['DEATH_EVENT']
data = df.drop(columns=['DEATH_EVENT'])

# Hold out 10% of the rows for testing. stratify=labels asks train_test_split to keep
# the DEATH_EVENT class proportions similar in both partitions, and the fixed
# random_state makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)

# Report the resulting shapes (user-facing messages are in Spanish).
print(f'Tamaño conjunto entrenamiento: {X_train.shape} ; labels: {y_train.shape}')
print(f'Tamaño conjunto de test: {X_test.shape} ; labels: {y_test.shape}')

Tamaño conjunto entrenamiento: (269, 12) ; labels: (269,)
Tamaño conjunto de test: (30, 12) ; labels: (30,)


In [140]:
# Count the positive labels (value 1) in each partition with a vectorised sum.
num_deads_train = (y_train == 1).sum()
num_deads_test = (y_test == 1).sum()

# The printed values are fractions in [0, 1] (count / number of rows); they are not
# multiplied by 100 even though the message says "Porcentaje".
print(f'Porcentaje de muertes en train: {num_deads_train / len(X_train)}')
print(f'Porcentaje de muertes en test: {num_deads_test / len(X_test)}')

Porcentaje de muertes en train: 0.31970260223048325
Porcentaje de muertes en test: 0.3333333333333333


In [141]:
# Expand the features with polynomial terms up to degree 2. The transformer is
# fitted on the training split only and then applied unchanged to both splits.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)

X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

In [142]:
# Standardise the expanded features. The scaling statistics are learned from the
# training split only and reused for the test split, so no test information
# leaks into the preprocessing.
std_scaler = StandardScaler()
std_scaler.fit(X_train_poly)

X_train_std = std_scaler.transform(X_train_poly)
X_test_std = std_scaler.transform(X_test_poly)

In [143]:
# Show the (rows, columns) shape of the standardised training matrix produced
# by the polynomial expansion followed by scaling.
print(X_train_std.shape)

(269, 91)


In [144]:
# Reduce dimensionality with PCA. A float n_components between 0 and 1 tells
# scikit-learn to keep as many components as needed to explain that fraction
# (here 95%) of the variance. The projection is learned on the training split
# only and then applied to the test split.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

In [145]:
# Show the (rows, components) shape of the training matrix after PCA.
print(X_train_pca.shape)

(269, 22)


In [146]:
# Fit a logistic-regression classifier (scikit-learn default settings) on the
# PCA-reduced training data.
logreg = LogisticRegression()
logreg.fit(X_train_pca, y_train)

# Predict labels for the held-out test split.
pred = logreg.predict(X_test_pca)

# accuracy_score expects (y_true, y_pred), so the ground truth goes first.
# Accuracy itself is symmetric, so the printed value does not depend on the order,
# but keeping the documented order matters if this call is ever swapped for an
# asymmetric metric such as precision or recall.
print(accuracy_score(y_test, pred))

0.8333333333333334
