# Cross Validation

In [24]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_validate, KFold, ShuffleSplit
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from ISLP.models import sklearn_sm

In [2]:
heart_attack = pd.read_csv('../02-logistic-regression/data/Medicaldataset.csv')

In [3]:
heart_attack

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result
0,64,1,66,160,83,160.0,1.80,0.012,negative
1,21,1,94,98,46,296.0,6.75,1.060,positive
2,55,1,64,160,77,270.0,1.99,0.003,negative
3,64,1,70,120,55,270.0,13.87,0.122,positive
4,55,1,64,112,65,300.0,1.08,0.003,negative
...,...,...,...,...,...,...,...,...,...
1314,44,1,94,122,67,204.0,1.63,0.006,negative
1315,66,1,84,125,55,149.0,1.33,0.172,positive
1316,45,1,85,168,104,96.0,1.24,4.250,positive
1317,54,1,58,117,68,443.0,5.80,0.359,positive


In [4]:
X = heart_attack.drop(columns='Result')
y = heart_attack['Result']
y = y.map({'negative':0, 'positive':1})

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

In [7]:
scaler = StandardScaler()

In [8]:
def Pipeline(X, y, scaler):
    X_scaled = pd.DataFrame(scaler.fit_transform(X, y), columns=X.columns, index=X.index)
    return X_scaled, scaler

In [10]:
X_scaled, scaler = Pipeline(X_train, y_train, scaler)
model = sm.GLM(y_train, X_scaled, family=sm.families.Binomial()).fit()
print(model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 Result   No. Observations:                  659
Model:                            GLM   Df Residuals:                      651
Model Family:                Binomial   Df Model:                            7
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -395.64
Date:                Mon, 02 Jun 2025   Deviance:                       791.28
Time:                        16:50:30   Pearson chi2:                 1.78e+03
No. Iterations:                     6   Pseudo R-squ. (CS):             0.1164
Covariance Type:            nonrobust                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Age                     

In [11]:
vals = [VIF(X_scaled, i) for i in range(1, X_scaled.shape[1])]
vif = pd.DataFrame({'vif':vals},
                   index=X_scaled.columns[1:])
vif

Unnamed: 0,vif
Gender,1.006368
Heart rate,1.017856
Systolic blood pressure,1.642202
Diastolic blood pressure,1.648465
Blood sugar,1.014557
CK-MB,1.004522
Troponin,1.019892


In [12]:
X_test_scaled = scaler.transform(X_test)

In [14]:
preds = model.predict(X_test_scaled)

In [16]:
labels = np.array([0]*660 )
labels[preds>0.5] = 1
np.mean(labels == y_test)

0.6742424242424242

## Reduction

In [17]:
x_reduced = X_train.drop(columns=['Diastolic blood pressure', 'Systolic blood pressure', 'Heart rate'])
scaler_reduced = StandardScaler()
x_reduced_scaled, scaler_reduced = Pipeline(x_reduced, y_train, scaler_reduced)
model_reduced = sm.GLM(y_train, x_reduced_scaled, family=sm.families.Binomial()).fit()
print(model_reduced.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 Result   No. Observations:                  659
Model:                            GLM   Df Residuals:                      654
Model Family:                Binomial   Df Model:                            4
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -396.39
Date:                Mon, 02 Jun 2025   Deviance:                       792.78
Time:                        16:57:44   Pearson chi2:                 1.83e+03
No. Iterations:                     6   Pseudo R-squ. (CS):             0.1144
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Age             0.4767      0.090      5.287      

In [19]:
x_test_reduced = X_test.drop(columns=['Diastolic blood pressure', 'Systolic blood pressure', 'Heart rate'])
x_test_reduced_scaled = scaler_reduced.transform(x_test_reduced)

In [20]:
preds = model_reduced.predict(x_test_reduced_scaled)

In [21]:
labels = np.array([0]*660 )
labels[preds>0.5] = 1
np.mean(labels == y_test)

0.6878787878787879

## Cross Validation 
Taking a look through Scikit-Learn, there is a function calling LogisticRegressionCV which I will try out. It seem to fit everything that I want to test

In [49]:
X_reduced = X.drop(columns=['Diastolic blood pressure', 'Systolic blood pressure', 'Heart rate'])
model = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))

In [52]:
cv_results = cross_validate(model, X_reduced, y, cv=5, verbose=3)
print(cv_results['test_score'].mean())

[CV] END ......................................., score=0.803 total time=   0.0s
[CV] END ......................................., score=0.780 total time=   0.0s
[CV] END ......................................., score=0.788 total time=   0.0s
[CV] END ......................................., score=0.788 total time=   0.0s
[CV] END ......................................., score=0.798 total time=   0.0s
0.7915139993086762


In [51]:
cv = KFold(n_splits=10, shuffle=True, random_state=0)
cv_results2 = cross_validate(model, X_reduced, y, cv=cv, verbose=3)
print(cv_results['test_score'].mean())

[CV] END ......................................., score=0.765 total time=   0.0s
[CV] END ......................................., score=0.803 total time=   0.0s
[CV] END ......................................., score=0.848 total time=   0.0s
[CV] END ......................................., score=0.735 total time=   0.0s
[CV] END ......................................., score=0.773 total time=   0.0s
[CV] END ......................................., score=0.811 total time=   0.0s
[CV] END ......................................., score=0.818 total time=   0.0s
[CV] END ......................................., score=0.765 total time=   0.0s
[CV] END ......................................., score=0.811 total time=   0.0s
[CV] END ......................................., score=0.756 total time=   0.0s
0.7915139993086762


In [56]:
validation = ShuffleSplit(n_splits=5, test_size=196)
results = cross_validate(model, X_reduced, y, cv=validation)
print(results['test_score'].mean())

0.7826530612244899
