Metas en este cuaderno:
* encontrar funciones que permitan establecer las variables X = expuestos y no expuestos
* establecer la variable D, donde D=1 denota controles y D=0 los casos.
* incorporar las variables C, que son el conjunto de covariables.
* tendremos un indicador binario S, donde S=1 significa que la unidad está presente en la muestra.

In [1]:
import sys
assert sys.version_info >= (3, 5)
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') # Para evitar los molestos avisos.
%matplotlib inline

In [2]:
import scipy.stats as stats
import seaborn as sns
from sklearn.cluster import KMeans 

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

In [62]:
from sklearn.preprocessing import OneHotEncoder # importo el modulo para crear el objeto OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [24]:
import pandas as pd
DATA_PATH = "../data/union/End"


def _read_csv_in(folder, filename):
    """Read the CSV file named ``filename`` located inside ``folder``."""
    return pd.read_csv(os.path.join(folder, filename))


def load_data_prep(data_path=DATA_PATH):
    """Return ``dataset_final.csv`` from ``data_path`` as a DataFrame."""
    return _read_csv_in(data_path, "dataset_final.csv")


def load_data_demo(data_path=DATA_PATH):
    """Return ``dataset_demog.csv`` from ``data_path`` as a DataFrame."""
    return _read_csv_in(data_path, "dataset_demog.csv")

In [25]:
# Load the two source tables.
df = load_data_prep()
df_dem = load_data_demo()
# Left join on "cod": every row of df is kept and the df_dem columns are
# attached to the rows whose "cod" matches.
df1 = pd.merge(df, df_dem, how="left", on="cod")


In [27]:
# Predictor columns and target column used by the models below.
# 'gender' is left out of this list for now because it is one-hot encoded
# separately.
predictors = [
    'ccbd_qa',
    'age',
    'ccbd_diameter',
]
outcome = 'label'
# (the binary matrix for 'gender' is obtained next)

Categorias del encoder: [array(['F', 'M'], dtype=object)]


In [57]:
# One-hot encode the 'gender' column.
# get_dummies replaces 'gender' with the dummy columns, so the guard keeps
# this cell safe to re-run: a second execution would otherwise fail because
# 'gender' no longer exists in df1.
if 'gender' in df1.columns:
    df1 = pd.get_dummies(df1, columns=['gender'])

# Predictors plus the two gender dummies.
x_simple = df1[predictors + ["gender_F", "gender_M"]]

# Spearman rank-correlation matrix between all selected features.
my_r = x_simple.corr(method="spearman")
print(my_r)

                ccbd_qa       age  ccbd_diameter  gender_F  gender_M
ccbd_qa        1.000000 -0.115373       0.159116  0.110405 -0.110405
age           -0.115373  1.000000      -0.237001  0.114119 -0.114119
ccbd_diameter  0.159116 -0.237001       1.000000 -0.120694  0.120694
gender_F       0.110405  0.114119      -0.120694  1.000000 -1.000000
gender_M      -0.110405 -0.114119       0.120694 -1.000000  1.000000


In [37]:
# Alternative kept for reference: for a variable that becomes binary once encoded, this gives the same result as get_dummies.
#encoder = OneHotEncoder(sparse=False) # build the encoder; sparse=False returns a regular dense matrix instead of a sparse one
#gender =  pd.DataFrame(df1['gender'])
#encoder.fit(np.array(gender).reshape(-1,1)) # fit after reshaping the gender vector, because it is not in matrix form
#Gender = pd.DataFrame(encoder.transform(np.array(gender).reshape(-1,1)))

In [60]:
# Split the data into training and test sets.
# Features AND target are both taken from df1 (the merged, one-hot encoded
# frame) so each feature row is paired with the label from the same frame,
# instead of mixing df1 features with labels read from the pre-merge df.
X_train, X_test, y_train, y_test = train_test_split(
    df1[predictors + ['gender_F', 'gender_M']],
    df1[outcome],
    test_size=0.2,
    random_state=42,
)

# List of penalties to try (not referenced by the cells visible here).
penalties = ['l1', 'l2', 'elasticnet', 'none']


In [63]:
std_scale = StandardScaler()  # standardizer reused for both splits below

# Fit on the training split only, so the test split never contributes to the
# centering/scaling statistics. (A fit on X_test followed by a fit on X_train
# would simply be overwritten by the second call.)
std_scale.fit(X_train)

# Apply the fitted standardizer to obtain the scaled feature matrices.
X_test_scaled = std_scale.transform(X_test)
X_scaled = std_scale.transform(X_train)

In [65]:
# Elastic-net penalized logistic regression, with an equal l1/l2 mix
# (l1_ratio=0.5), the saga solver and a generous iteration cap.
model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    max_iter=10000,
)

In [66]:
# Train the model on the standardized training features.
model.fit(X_scaled, y_train)

# Predicted labels and second-class probabilities (column 1 of
# predict_proba) for the standardized test set.
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Classification report
print("Informe de Clasificación:")
print(classification_report(y_test, y_pred))

# AUC-ROC computed from the predicted probabilities
auc_roc = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC: {auc_roc}")
print("\n")

Informe de Clasificación:
              precision    recall  f1-score   support

           0       0.56      0.56      0.56         9
           1       0.64      0.64      0.64        11

    accuracy                           0.60        20
   macro avg       0.60      0.60      0.60        20
weighted avg       0.60      0.60      0.60        20

AUC-ROC: 0.6868686868686869




In [67]:
# Re-train the same model object, this time on the unscaled training features.
model.fit(X_train, y_train)

# Predicted labels and second-class probabilities (column 1 of
# predict_proba) for the unscaled test set.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Classification report
print("Informe de Clasificación:")
print(classification_report(y_test, y_pred))

# AUC-ROC computed from the predicted probabilities
auc_roc = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC: {auc_roc}")
print("\n")

Informe de Clasificación:
              precision    recall  f1-score   support

           0       0.83      0.56      0.67         9
           1       0.71      0.91      0.80        11

    accuracy                           0.75        20
   macro avg       0.77      0.73      0.73        20
weighted avg       0.77      0.75      0.74        20

AUC-ROC: 0.7777777777777778




In [83]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

In [97]:
# Fresh elastic-net logistic regression. `model` is rebound to the very same
# estimator object, so both names refer to one model.
model_ = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    max_iter=10000,
)
model = model_

In [101]:
# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
validation_size = 0.20
seed = 42
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    df1[predictors + ['gender_F', 'gender_M']],
    df1[outcome],
    test_size=validation_size,
    random_state=seed,
)

In [106]:
# Fit the elastic-net model on the (unscaled) training split.
model_.fit(X=X_train, y=Y_train)

In [102]:
# 10-fold shuffled cross-validation accuracy on the training split.
name = 'Logistic Regression'
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
cv_results = model_selection.cross_val_score(
    estimator=model_,
    X=X_train,
    y=Y_train,
    scoring='accuracy',
    cv=kfold,
)
# Report mean accuracy and its standard deviation across folds.
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)

Logistic Regression: 0.530357 (0.168473)


In [107]:
# Predict with model_, the estimator that was fitted and cross-validated on
# this split, rather than relying on `model` happening to alias it.
predictions = model_.predict(X_validation)
print(accuracy_score(Y_validation, predictions))

0.75


In [109]:
# Confusion matrix on the validation split (rows: true labels, columns: predictions).
print(confusion_matrix(y_true=Y_validation, y_pred=predictions))

[[ 5  4]
 [ 1 10]]


In [110]:
# Classification report for the validation split
print("Informe de Clasificación:")
print(classification_report(y_true=Y_validation, y_pred=predictions))

Informe de Clasificación:
              precision    recall  f1-score   support

           0       0.83      0.56      0.67         9
           1       0.71      0.91      0.80        11

    accuracy                           0.75        20
   macro avg       0.77      0.73      0.73        20
weighted avg       0.77      0.75      0.74        20



In [68]:

# Unpenalized logistic regression on the standard-scaled features.
# penalty=None replaces the deprecated string 'none', which newer
# scikit-learn releases reject.
regLog = LogisticRegression(penalty=None)
regLog.fit(X_scaled, y_train)  # train the model

# Predict the labels of the test set
y_pred = regLog.predict(X_test_scaled)

# Classification report
print("Informe de Clasificación:")
print(classification_report(y_test, y_pred))

# AUC-ROC from the second-class probabilities (column 1 of predict_proba)
y_prob = regLog.predict_proba(X_test_scaled)[:, 1]
auc_roc = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC: {auc_roc}")
print("\n")

Informe de Clasificación:
              precision    recall  f1-score   support

           0       0.56      0.56      0.56         9
           1       0.64      0.64      0.64        11

    accuracy                           0.60        20
   macro avg       0.60      0.60      0.60        20
weighted avg       0.60      0.60      0.60        20

AUC-ROC: 0.7070707070707071




In [70]:
from sklearn.preprocessing import RobustScaler
std_robust = RobustScaler()  # robust scaler reused for both splits below

# Fit on the training split only, so the test split never contributes to the
# scaling statistics. (A fit on X_test followed by a fit on X_train would
# simply be overwritten by the second call.)
std_robust.fit(X_train)

# Apply the fitted scaler to obtain the scaled feature matrices.
X_test_stdrob = std_robust.transform(X_test)
X_stdrob = std_robust.transform(X_train)

In [71]:

# Unpenalized logistic regression on the robust-scaled features.
# penalty=None replaces the deprecated string 'none', which newer
# scikit-learn releases reject.
regLog = LogisticRegression(penalty=None)
regLog.fit(X_stdrob, y_train)  # train the model

# Predict the labels of the test set
y_pred = regLog.predict(X_test_stdrob)

# Classification report
print("Informe de Clasificación:")
print(classification_report(y_test, y_pred))

# AUC-ROC from the second-class probabilities (column 1 of predict_proba)
y_prob = regLog.predict_proba(X_test_stdrob)[:, 1]
auc_roc = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC: {auc_roc}")
print("\n")

Informe de Clasificación:
              precision    recall  f1-score   support

           0       0.56      0.56      0.56         9
           1       0.64      0.64      0.64        11

    accuracy                           0.60        20
   macro avg       0.60      0.60      0.60        20
weighted avg       0.60      0.60      0.60        20

AUC-ROC: 0.7070707070707071




In [75]:
# Display the robust-scaled training matrix produced by std_robust.transform.
X_stdrob

array([[-0.4821762 , -0.70588235, -0.32721347,  1.        , -1.        ],
       [ 0.5308127 ,  0.58823529,  0.49570338,  0.        ,  0.        ],
       [ 0.00513857, -0.58823529,  0.41433632,  1.        , -1.        ],
       [ 0.34254494, -0.23529412, -0.63049069,  1.        , -1.        ],
       [ 0.7578397 , -0.58823529, -1.39633501,  0.        ,  0.        ],
       [-0.8735076 ,  0.47058824,  0.00729298,  1.        , -1.        ],
       [-0.07860819, -0.47058824,  0.00678444,  0.        ,  0.        ],
       [-1.15977816,  0.58823529, -0.54029345,  0.        ,  0.        ],
       [-1.11211139, -0.58823529, -1.47918148,  0.        ,  0.        ],
       [-0.76578749,  0.47058824, -0.8221887 ,  0.        ,  0.        ],
       [ 0.6645156 ,  1.        , -0.26595702,  1.        , -1.        ],
       [ 1.14319276,  0.64705882, -1.49906093,  0.        ,  0.        ],
       [ 0.37092707, -0.41176471,  0.23343331,  1.        , -1.        ],
       [-0.24816114, -0.11764706,  0.5

In [72]:
# Display the unscaled training features for comparison with the scaled matrix.
X_train

Unnamed: 0,ccbd_qa,age,ccbd_diameter,gender_F,gender_M
40,0.186186,21,26.7297,True,False
67,0.287513,43,30.2897,False,True
15,0.234931,23,29.9377,True,False
68,0.268681,29,25.4177,True,False
88,0.310222,23,22.1046,False,True
...,...,...,...,...,...
60,0.227764,41,24.5296,True,False
71,0.218639,40,30.9252,False,True
14,0.058901,49,19.1773,False,True
92,0.331515,29,28.1051,False,True


In [None]:
# NOTE(review): `X` and `y` are not defined in any cell visible in this
# notebook, and this cell has no execution count. Presumably a training
# matrix and its labels (e.g. X_scaled / y_train) were intended — TODO confirm
# and bind them explicitly before running.
# penalty=None replaces the deprecated string 'none', which newer
# scikit-learn releases reject.
regLog = LogisticRegression(penalty=None)  # re-initialize the model
regLog.fit(X, y)  # fit the model
score = regLog.score(X, y)  # mean accuracy on the same data used for fitting
beta_0 = regLog.intercept_  # intercept (kept as the array returned by sklearn)
beta_1 = regLog.coef_[0][0]  # coefficient of the first feature
beta_2 = regLog.coef_[0][1]  # coefficient of the second feature