# Import data

In [2]:
import pandas as pd
import os
import sys
from sklearn.metrics import accuracy_score, classification_report

sys.path.append(os.path.abspath("../scripts"))
from data_loader import DataLoader

data_loader = DataLoader()
X_train, y_train = data_loader.training_data
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

# Exporting models

In [3]:
import joblib
from datetime import datetime

# Save model to pkl file for later reuse
def save_model (model, model_name):
    # Get the current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save the best model to a file with a timestamp
    model_filename = f'../models/baseline/baseline_model_{model_name}_{timestamp}.pkl'
    joblib.dump(model, model_filename)

    print(f"Initial model saved to '{model_filename}'")

All Baselines are on the unsampled, whole dataset.

# Baseline: Majority class

Standard baseline.

"most frequent": Always picks the most common class.

The majority class for our dataset is "no diabetes".

In [4]:
from sklearn.dummy import DummyClassifier

# Initialize the support vector machine model
baseline_majority = DummyClassifier(strategy="most_frequent")

# Train the model on the preprocessed training data
baseline_majority.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred_majority = baseline_majority.predict(X_val)

# Evaluate the model's performance
accuracy_majority = accuracy_score(y_val, y_val_pred_majority)
report_majority = classification_report(y_val, y_val_pred_majority)

print(f"Validation Accuracy: {accuracy_majority}")
print("Classification Report:\n", report_majority)

Validation Accuracy: 0.8423961352657005
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      1.00      0.91     21797
         1.0       0.00      0.00      0.00      4078

    accuracy                           0.84     25875
   macro avg       0.42      0.50      0.46     25875
weighted avg       0.71      0.84      0.77     25875



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
save_model(baseline_majority, "majority")

Initial model saved to '../models/baseline/baseline_model_majority_20241129_225242.pkl'


# Baseline: Distribution

Another standard baseline.

"stratified": Makes random guesses based on the original class distribution.

That is in our case: 
Predicts no-diabetes with a probability of 218334/253680 = 0.86
and diabetes with a probability of 35346/253680 = 0.14

In [6]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the support vector machine model
baseline_stratified = DummyClassifier(strategy="stratified")

# Train the model on the preprocessed training data
baseline_stratified.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred_stratified = baseline_stratified.predict(X_val)

# Evaluate the model's performance
accuracy_stratified = accuracy_score(y_val, y_val_pred_stratified)
report_stratified = classification_report(y_val, y_val_pred_stratified)

print(f"Validation Accuracy: {accuracy_stratified}")
print("Classification Report:\n", report_stratified)

Validation Accuracy: 0.7394396135265701
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.85      0.85     21797
         1.0       0.17      0.16      0.16      4078

    accuracy                           0.74     25875
   macro avg       0.51      0.50      0.51     25875
weighted avg       0.74      0.74      0.74     25875



The Distribution baseline has a lower accuracy than the Majority Class baseline. But for the positive class (i.e., diabetes) precision, recall and f1-score are significantly higher, because they are 0 for Majority Class. Correspondingly, recall and f1-score of the negative class (i.e., no-diabetes) decreased from the Majority Class to Distribution baseline.

In [7]:
save_model(baseline_stratified, "stratified")

Initial model saved to '../models/baseline/baseline_model_stratified_20241129_225334.pkl'


# Baseline: Use feature with highest correlation

The feature that has the highest correlation with our target (diabetes) is GenHealth (with 0.29). This correlation is still far below 0.5. Thus, we can be sure that it is no false predictor. 

We pick the feature having the highest correlation with our target (diabetes). 
Then, we calculate the mean of the target variable for each the ordinal values for that selected feature.
Where we find the highest difference/gap between these means, we put our decision boundary.
For this feature's values below that decision boundary, diabetes is predicted.
For this feature's values above that decision boundary, no diabetes is predicted.

In [8]:
import pandas as pd

# Calculate correlation of each feature in X_train with y_train
correlations = X_train.apply(lambda x: x.corr(y_train))
# Find the feature with the highest correlation
most_correlated_feature = correlations.idxmax()
print(f"Most correlated feature: {most_correlated_feature}")
    
# Get unique values of this feature
unique_values = X_train[most_correlated_feature].dropna().unique()
means = {}

# Combine X_train and y_train into a single DataFrame
combined_df = pd.concat([X_train, y_train.rename('y_train')], axis=1)
# Rename negative labels from 0 to -1

# Group by the unique values in the specified column and calculate the mean of y_train
# Get correlation with GenHlth for each of its values
for unique_val in unique_values:
    #correlations = combined_df[genhlth_dummies.columns].corrwith(combined_df['Diabetes_binary'])
    subset = combined_df[combined_df[most_correlated_feature] == unique_val]
    mu = subset['y_train'].mean()
    means[unique_val] = mu

# Sort dictionary of correlations increasingly by its key
means = dict(sorted(means.items()))
keys = list(means.keys())
values = list(means.values())
print(f"Means (grouped by class): {means}")

# Find the largest gap
largest_gap = 0
indices_largest_gap = (None, None)
for i in range(len(values) - 1):
    gap = values[i+1] - values[i]
    if gap > largest_gap:
        largest_gap = gap
        indices_largest_gap = (keys[i], keys[i+1])

# Switch two values of largest gap 
# so that the first corresponds to no diabetes and the second to diabetes
if means[indices_largest_gap[0]] > means[indices_largest_gap[1]]:
    helper_switch = indices_largest_gap[0]
    indices_largest_gap[0] = indices_largest_gap[1]
    indices_largest_gap[1] = helper_switch
print(f"Indices of the largest gap: {indices_largest_gap}\n")

# Make prediction on validation set
y_val_pred_one_feature = pd.DataFrame({
    'Target': X_val[most_correlated_feature].apply(lambda x: 1 if x >= indices_largest_gap[1] else 0)
})

# Evaluate the model's performance
accuracy_one_feature = accuracy_score(y_val, y_val_pred_one_feature)
report_one_feature = classification_report(y_val, y_val_pred_one_feature)

print(f"Validation Accuracy: {accuracy_one_feature}")
print("Classification Report:\n", report_one_feature)

Most correlated feature: GenHlth
Means (grouped by class): {1.0: 0.031711541489529205, 2.0: 0.08568687030225491, 3.0: 0.20092068122833273, 4.0: 0.342883087400681, 5.0: 0.4081849274678618}
Indices of the largest gap: (3.0, 4.0)

Validation Accuracy: 0.7940483091787439
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.87      0.88     21797
         1.0       0.36      0.40      0.38      4078

    accuracy                           0.79     25875
   macro avg       0.62      0.63      0.63     25875
weighted avg       0.80      0.79      0.80     25875



### One-Feature Evaluation on Testset

In [3]:
import pandas as pd

# Calculate correlation of each feature in X_train with y_train
correlations = X_train.apply(lambda x: x.corr(y_train))
# Find the feature with the highest correlation
most_correlated_feature = correlations.idxmax()
print(f"Most correlated feature: {most_correlated_feature}")
    
# Get unique values of this feature
unique_values = X_train[most_correlated_feature].dropna().unique()
means = {}

# Combine X_train and y_train into a single DataFrame
combined_df = pd.concat([X_train, y_train.rename('y_train')], axis=1)
# Rename negative labels from 0 to -1

# Group by the unique values in the specified column and calculate the mean of y_train
# Get correlation with GenHlth for each of its values
for unique_val in unique_values:
    #correlations = combined_df[genhlth_dummies.columns].corrwith(combined_df['Diabetes_binary'])
    subset = combined_df[combined_df[most_correlated_feature] == unique_val]
    mu = subset['y_train'].mean()
    means[unique_val] = mu

# Sort dictionary of correlations increasingly by its key
means = dict(sorted(means.items()))
keys = list(means.keys())
values = list(means.values())
print(f"Means (grouped by class): {means}")

# Find the largest gap
largest_gap = 0
indices_largest_gap = (None, None)
for i in range(len(values) - 1):
    gap = values[i+1] - values[i]
    if gap > largest_gap:
        largest_gap = gap
        indices_largest_gap = (keys[i], keys[i+1])

# Switch two values of largest gap 
# so that the first corresponds to no diabetes and the second to diabetes
if means[indices_largest_gap[0]] > means[indices_largest_gap[1]]:
    helper_switch = indices_largest_gap[0]
    indices_largest_gap[0] = indices_largest_gap[1]
    indices_largest_gap[1] = helper_switch
print(f"Indices of the largest gap: {indices_largest_gap}\n")

# Make prediction on validation set
y_val_pred_one_feature = pd.DataFrame({
    'Target': X_test[most_correlated_feature].apply(lambda x: 1 if x >= indices_largest_gap[1] else 0)
})

# Evaluate the model's performance
accuracy_one_feature = accuracy_score(y_test, y_val_pred_one_feature)
report_one_feature = classification_report(y_test, y_val_pred_one_feature)

print(f"Validation Accuracy: {accuracy_one_feature}")
print("Classification Report:\n", report_one_feature)

Most correlated feature: GenHlth
Means (grouped by class): {np.float64(1.0): np.float64(0.031711541489529205), np.float64(2.0): np.float64(0.08568687030225491), np.float64(3.0): np.float64(0.20092068122833273), np.float64(4.0): np.float64(0.342883087400681), np.float64(5.0): np.float64(0.4081849274678618)}
Indices of the largest gap: (np.float64(3.0), np.float64(4.0))

Validation Accuracy: 0.7939636464990344
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.87      0.88     42314
         1.0       0.36      0.39      0.37      7915

    accuracy                           0.79     50229
   macro avg       0.62      0.63      0.63     50229
weighted avg       0.80      0.79      0.80     50229



With ChatGPT, to transform it into a pkl file:

In [9]:
import pickle
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

class OneFeatureModel:
    def __init__(self, feature, threshold):
        """
        Initialize with the selected feature and threshold.
        """
        self.feature = feature
        self.threshold = threshold

    def predict(self, X):
        """
        Predict labels based on the feature and threshold.
        """
        return X[self.feature].apply(lambda x: 1 if x >= self.threshold else 0)

    def evaluate(self, X_val, y_val):
        """
        Evaluate the model on validation data.
        """
        y_val_pred = self.predict(X_val)
        accuracy = accuracy_score(y_val, y_val_pred)
        report = classification_report(y_val, y_val_pred)
        return accuracy, report

In [11]:
# Initialize the model with the feature and threshold from your script
baseline_one_feature = OneFeatureModel(
    feature=most_correlated_feature,
    threshold=indices_largest_gap[1]
)

In [12]:
# Save the model to a .pkl file
save_model(baseline_one_feature, "one_feature")

Initial model saved to '../models/baseline/baseline_model_one_feature_20241129_230329.pkl'


In [None]:
# TODO delete this cell, is just test/example

# Load the model
with open('../models/baseline/baseline_model_one_feature_20241129_230329.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Use the loaded model for prediction or evaluation
y_val_pred_test = loaded_model.predict(X_val)
accuracy_test, report_test = loaded_model.evaluate(X_val, y_val)

print(f"Validation Accuracy: {accuracy_test}")
print("Classification Report:\n", report_test)

<class '__main__.OneFeatureModel'>
Validation Accuracy: 0.7940483091787439
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.87      0.88     21797
         1.0       0.36      0.40      0.38      4078

    accuracy                           0.79     25875
   macro avg       0.62      0.63      0.63     25875
weighted avg       0.80      0.79      0.80     25875



# Baseline: KNN on First component of PCA

Is this too good for a baseline? Should we use another (more dummy) classifier than K-NN for this first PCA component?

Not a conventional baseline, but since it is not overly complicated, it is suitable as a baseline. 

In [29]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# Apply PCA to reduce dimensions (you can choose the number of components, here it's set to 2 for visualization)
pca = PCA(n_components=1)  # just the first component (it's a baseline)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

# Initialize the K-NN classifier (with k=5, you can adjust the number of neighbors)
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_train_pca, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_val_pca)

# Measure performance
accuracy = accuracy_score(y_val, y_pred)

# Output results
print(f"Accuracy: {accuracy:.4f}")

# Detailed classification report
print("\nClassification Report:\n")
print(classification_report(y_val, y_pred))

# Explained variance by the first component
# = variance of that principal component / total variance
# = variance of that principal component / sum of variances of all individual principal components
explained_variance_ratio_first = pca.explained_variance_ratio_[0]
print(f"Explained variance by the first component: {explained_variance_ratio_first}")

Accuracy: 0.8167

Classification Report:

              precision    recall  f1-score   support

         0.0       0.85      0.95      0.90     21797
         1.0       0.26      0.09      0.13      4078

    accuracy                           0.82     25875
   macro avg       0.55      0.52      0.51     25875
weighted avg       0.76      0.82      0.78     25875

Explained variance by the first component: 0.4719632380321297


With ChatGPT, to transform it into a pkl file:

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
import pickle

class PCANNModel:
    def __init__(self, n_components=1, n_neighbors=5):
        """
        Initialize the PCA + KNN model.
        """
        self.pca = PCA(n_components=n_components)
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    def fit(self, X_train, y_train):
        """
        Fit PCA and KNN on the training data.
        """
        # Apply PCA
        self.X_train_pca = self.pca.fit_transform(X_train)
        # Train K-NN on the PCA-transformed data
        self.knn.fit(self.X_train_pca, y_train)

    def predict(self, X):
        """
        Predict labels for new data.
        """
        # Transform data using PCA
        X_pca = self.pca.transform(X)
        # Predict using K-NN
        return self.knn.predict(X_pca)

    def evaluate(self, X, y_true):
        """
        Evaluate the model on validation data.
        """
        y_pred = self.predict(X)
        accuracy = accuracy_score(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        return accuracy, report

    def explained_variance(self):
        """
        Get the explained variance ratio of the PCA components.
        """
        return self.pca.explained_variance_ratio_

In [35]:
# Create and train the model
baseline_pca_nn = PCANNModel(n_components=1, n_neighbors=5)
baseline_pca_nn.fit(X_train, y_train)

In [36]:
# Evaluate the model
accuracy, report = baseline_pca_nn.evaluate(X_val, y_val)
explained_variance = baseline_pca_nn.explained_variance()

# Print results
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)
print(f"Explained variance by the first component: {explained_variance[0]:.4f}")

Accuracy: 0.8167

Classification Report:
               precision    recall  f1-score   support

         0.0       0.85      0.95      0.90     21797
         1.0       0.26      0.09      0.13      4078

    accuracy                           0.82     25875
   macro avg       0.55      0.52      0.51     25875
weighted avg       0.76      0.82      0.78     25875

Explained variance by the first component: 0.4720


In [44]:
# Get the current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Save the model to a .pkl file
with open(f'../models/baseline/baseline_model_pca_nn_{timestamp}.pkl', 'wb') as file:
    pickle.dump(baseline_pca_nn, file)
    print("Model saved to " + f'../models/baseline/baseline_model_pca_nn_{timestamp}.pkl')

Model saved to ../models/baseline/baseline_model_pca_nn_20241129_234922.pkl


In [46]:
# TODO delete this cell, is just test/example

# Load the pipeline from the .pkl file
with open('../models/baseline/baseline_model_pca_nn_20241129_234922.pkl', 'rb') as file:
    loaded_model_2 = pickle.load(file)

# Use the loaded pipeline for predictions
y_pred = loaded_model_2.predict(X_val)

# Evaluate the loaded model
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_val, y_pred))

Accuracy: 0.8167

Classification Report:
               precision    recall  f1-score   support

         0.0       0.85      0.95      0.90     21797
         1.0       0.26      0.09      0.13      4078

    accuracy                           0.82     25875
   macro avg       0.55      0.52      0.51     25875
weighted avg       0.76      0.82      0.78     25875



In [1]:
# TODO in report: argue why it makes sense -> Explain/use/include the theory of PCA.

Here, we do not plot the explained variance of different numbers of principal components, because using any more than one component seems to be too much for a baseline that should be kept simple.

# Conclusion

|  | Majority class | Distribution | Highest correlation | KNN on 1st Principal Component |
|----------|----------|----------|----------|----------|
| Accuracy       | 0.84     | 0.74     | 0.79     | 0.82     |
| Precision 0    | 0.84     | 0.84     | 0.89     | 0.85     |
| Precision 1    | 0     | 0.15     | 0.36     | 0.26     |
| Recall 0       | 1     | 0.85     | 0.87     | 0.95     |
| Recall 1       | 0     | 0.14     | 0.40     | 0.09     |
| F1 Score 0     | 0.91     | 0.84     | 0.88     | 0.90     |
| F1 Score 1     | 0     | 0.15     | 0.38     | 0.13     |
