# Logistic regression
In this notebook we perform binary logistic regression on the diabetes dataset. Our analysis involves the following steps:
- initial exploration (plain )
- cross validation
- 

In [1]:
# Imports 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, classification_report
import os
import sys

sys.path.append(os.path.abspath("../scripts"))
from data_loader import DataLoader

## Initial exploration

In [2]:
# Load data
data_loader = DataLoader()
X_train, y_train  = data_loader.training_data
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (202944, 21)
y_train shape: (202944,)
X_val shape: (25368, 21)
y_val shape: (25368,)
X_test shape: (25368, 21)
y_test shape: (25368,)


In [3]:
type(y_train)

pandas.core.series.Series

In [4]:
# Train first logistic regression model

# Initialize the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the preprocessed training data
model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = model.predict(X_val)

# Evaluate the model's performance
accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred)

print(f"Validation Accuracy: {accuracy}")
print("Classification Report:\n", report)

Validation Accuracy: 0.8482734153263954
Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     21371
         1.0       0.55      0.19      0.29      3997

    accuracy                           0.85     25368
   macro avg       0.71      0.58      0.60     25368
weighted avg       0.82      0.85      0.82     25368



In [5]:
# Test out different penalty terms --> also see hyperparameter tuning later

# model = LogisticRegression(max_iter=1000, random_state=42, penalty=None)
# model = LogisticRegression(max_iter=1000, random_state=42, penalty= "l1", solver="liblinear")
model = LogisticRegression(max_iter=1000, random_state=42, penalty= "elasticnet", solver="saga", l1_ratio=0.5)
model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy}")
print("Classification Report:\n", report)

Validation Accuracy: 0.8481945758435825
Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.97      0.92     21371
         1.0       0.55      0.19      0.29      3997

    accuracy                           0.85     25368
   macro avg       0.71      0.58      0.60     25368
weighted avg       0.82      0.85      0.82     25368



### resampling methods

In [6]:
# test random undersampling
X_train_undersampling_random, y_train_undersampling_random = data_loader.training_data_undersampling_random
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

print(f"X_train_undersampling_random shape: {X_train_undersampling_random.shape}")
print(f"y_train_undersampling_random shape: {y_train_undersampling_random.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_undersampling_random shape: (63964, 21)
y_train_undersampling_random shape: (63964,)
X_val shape: (25368, 21)
y_val shape: (25368,)
X_test shape: (25368, 21)
y_test shape: (25368,)


In [7]:
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_undersampling_random, y_train_undersampling_random)

y_val_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred)

print(f"Validation Accuracy: {accuracy}")
print("Classification Report:\n", report)

Validation Accuracy: 0.7295411542100284
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.72      0.82     21371
         1.0       0.34      0.76      0.47      3997

    accuracy                           0.73     25368
   macro avg       0.64      0.74      0.64     25368
weighted avg       0.85      0.73      0.76     25368



In [8]:
# test random oversampling
X_train_oversampling_random, y_train_oversampling_random = data_loader.training_data_oversampling_random
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

print(f"X_train_oversampling_random shape: {X_train_oversampling_random.shape}")
print(f"y_train_oversampling_random shape: {y_train_oversampling_random.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_oversampling_random shape: (341924, 21)
y_train_oversampling_random shape: (341924,)
X_val shape: (25368, 21)
y_val shape: (25368,)
X_test shape: (25368, 21)
y_test shape: (25368,)


In [9]:
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_oversampling_random, y_train_oversampling_random)

y_val_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred)

print(f"Validation Accuracy: {accuracy}")
print("Classification Report:\n", report)

Validation Accuracy: 0.7284768211920529
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.72      0.82     21371
         1.0       0.34      0.76      0.47      3997

    accuracy                           0.73     25368
   macro avg       0.64      0.74      0.64     25368
weighted avg       0.85      0.73      0.76     25368



In [10]:
# test smote oversampling
X_train_oversampling_smote, y_train_oversampling_smote = data_loader.training_data_oversampling_smote
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

print(f"X_train_oversampling_smote shape: {X_train_oversampling_smote.shape}")
print(f"y_train_oversampling_smote shape: {y_train_oversampling_smote.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_oversampling_smote shape: (341924, 21)
y_train_oversampling_smote shape: (341924,)
X_val shape: (25368, 21)
y_val shape: (25368,)
X_test shape: (25368, 21)
y_test shape: (25368,)


In [11]:
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_oversampling_smote, y_train_oversampling_smote)

y_val_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred)

print(f"Validation Accuracy: {accuracy}")
print("Classification Report:\n", report)

Validation Accuracy: 0.7296988331756543
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.73      0.82     21371
         1.0       0.34      0.75      0.47      3997

    accuracy                           0.73     25368
   macro avg       0.64      0.74      0.64     25368
weighted avg       0.85      0.73      0.76     25368



## Hyperparameter Tuning
The logistic regression model from sklearn has the following hyperparameters: 
- ...

In [None]:
# Hyperparameter tuning with Grid Search
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization type
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga', 'sag'],  # Optimization algorithm
    # 'max_iter': [100, 200, 500, 1000],  # Number of iterations
    'tol': [1e-3, 1e-2, 1e-1],  # Tolerance for stopping criteria
    'l1_ratio': [0.0, 0.1, 0.5, 0.9, 1.0]  # Only used if penalty is 'elasticnet' and solver is 'saga'
}

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Initialize the model
model = LogisticRegression(max_iter=1000, random_state=42)

# Choose the search method - here we're using GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # Or use other scoring metrics like 'f1', 'roc_auc' depending on your goals
    n_jobs=-1,  # Use all available processors
    verbose=1  # To see the progress
)

grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

In [None]:
# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_model.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy}")
print("Classification Report:\n", report)

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Regularization strength
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization type
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga', 'sag'],  # Optimization algorithm
    # 'max_iter': [100, 200, 500, 1000],  # Number of iterations
    'tol': [1e-4, 1e-3, 1e-2, 1e-1],  # Tolerance for stopping criteria
    'l1_ratio': [0.0, 0.1, 0.5, 0.9, 1.0]  # Only used if penalty is 'elasticnet' and solver is 'saga'
}

# Initialize the model
model = LogisticRegression(random_state=42, max_iter=1000)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,  # Parameter grid remains the same
    n_iter=100,  # Number of random parameter combinations to try
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # Or other scoring metric of choice
    n_jobs=-1,  # Use all processors
    verbose=1  # To track progress
)

# Fit the random search on training data
random_search.fit(X_train, y_train)

# Get the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


300 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
37 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I549663/Library/CloudStorage/OneDrive-Personal/Dokumente/Studium/MMDS/Kurse/IE500_DataMining/Project/ie500_data_mining_project/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I549663/Library/CloudStorage/OneDrive-Personal/Dokumente/Studium/MMDS/Kurse/IE500_DataMining/Project/ie500_data_mining_project/venv/lib/python3.9/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/I549663/Library/CloudStorag

Best Parameters: {'tol': 0.1, 'solver': 'sag', 'penalty': 'l2', 'l1_ratio': 0.1, 'C': 0.01}
Best Cross-Validation Accuracy: 0.8477166171701086


In [13]:
best_model = random_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_model.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy}")
print("Classification Report:\n", report)

Validation Accuracy: 0.8478003784295175
Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.97      0.91     21371
         1.0       0.54      0.21      0.30      3997

    accuracy                           0.85     25368
   macro avg       0.71      0.59      0.61     25368
weighted avg       0.82      0.85      0.82     25368



In [14]:
# TODO: improve grid search / randomized search, implement successive halfing

## Logistic Regression with PCA

In [15]:
# TODO

## Other stuff

In [None]:
# some notes: The “lbfgs” solver is used by default for its robustness. For large datasets the “saga” solver is usually faster.
# look at scores for resampled data. maybe decide to proceed with only resampled data