# Import data

In [3]:
import pandas as pd
import os
import sys
from sklearn.metrics import accuracy_score, classification_report

# Make the helper modules in ../scripts importable from this notebook
sys.path.append(os.path.abspath("../scripts"))
from data_loader import DataLoader

# Each attribute is unpacked into a (features, target) pair for one split
data_loader = DataLoader()
X_train, y_train = data_loader.training_data
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

# Exporting models

In [4]:
import joblib
from datetime import datetime

def save_model(model, model_name):
    """
    Save a model to a .pkl file under ``../models/baseline`` for later reuse.

    The file name contains ``model_name`` and a second-resolution timestamp,
    so exports made at different times do not overwrite each other.

    Parameters
    ----------
    model : object
        The object to serialise with ``joblib.dump``.
    model_name : str
        Short identifier that is inserted into the file name.
    """
    # Get the current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_filename = f'../models/baseline/baseline_model_{model_name}_{timestamp}.pkl'

    # joblib.dump does not create missing folders, so make sure the target exists
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)
    joblib.dump(model, model_filename)

    print(f"Initial model saved to '{model_filename}'")

All baselines use the whole dataset, without any resampling.

# Baseline: Majority class

Standard baseline.

"most_frequent": Always predicts the most common class.

The majority class for our dataset is "no diabetes".

In [None]:
from sklearn.dummy import DummyClassifier

# Initialize the dummy model: it always predicts the most frequent training label
baseline_majority = DummyClassifier(strategy="most_frequent")

# Train the model on the preprocessed training data
# (a DummyClassifier ignores the feature values and only looks at y_train)
baseline_majority.fit(X_train, y_train)

Evaluate on Validation data:

In [16]:
# Make predictions on the validation set
y_val_pred_majority = baseline_majority.predict(X_val)

# Evaluate the model's performance.
# The majority baseline never predicts the positive class, so precision for
# that class is undefined (0/0). zero_division=0 reports it as 0 explicitly
# instead of emitting an UndefinedMetricWarning for every affected metric.
accuracy_majority = accuracy_score(y_val, y_val_pred_majority)
report_majority = classification_report(y_val, y_val_pred_majority, zero_division=0)

print(f"Accuracy: {accuracy_majority}")
print("Classification Report:\n", report_majority)

Accuracy: 0.8423961352657005
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      1.00      0.91     21797
         1.0       0.00      0.00      0.00      4078

    accuracy                           0.84     25875
   macro avg       0.42      0.50      0.46     25875
weighted avg       0.71      0.84      0.77     25875



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluate on Test data:

In [None]:
# Make predictions on the test set
y_test_pred_majority = baseline_majority.predict(X_test)

# Evaluate the model's performance.
# The majority baseline never predicts the positive class, so precision for
# that class is undefined (0/0). zero_division=0 reports it as 0 explicitly
# instead of emitting an UndefinedMetricWarning for every affected metric.
# NOTE: accuracy_majority / report_majority are reused here and overwrite the
# values computed on the validation set.
accuracy_majority = accuracy_score(y_test, y_test_pred_majority)
report_majority = classification_report(y_test, y_test_pred_majority, zero_division=0)

print(f"Accuracy: {accuracy_majority}")
print("Classification Report:\n", report_majority)

Accuracy: 0.8424217085747278
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      1.00      0.91     42314
         1.0       0.00      0.00      0.00      7915

    accuracy                           0.84     50229
   macro avg       0.42      0.50      0.46     50229
weighted avg       0.71      0.84      0.77     50229



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Export of Baseline

In [5]:
# Save the fitted majority-class baseline to a timestamped .pkl file
save_model(baseline_majority, "majority")

Initial model saved to '../models/baseline/baseline_model_majority_20241129_225242.pkl'


# Baseline: Distribution

Another standard baseline.

"stratified": Makes random guesses based on the original class distribution.

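That is in our case: 
Predicts no-diabetes with a probability of 218334/253680 ≈ 0.86
and diabetes with a probability of 35346/253680 ≈ 0.14.
Note that the classifier estimates these probabilities from the class frequencies in `y_train`, so the values it actually uses can differ from the figures above.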

In [18]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the dummy model.
# The "stratified" strategy draws random labels, so a fixed random_state is
# needed to get the same predictions (and metrics) on every run.
baseline_stratified = DummyClassifier(strategy="stratified", random_state=42)

# Train the model on the preprocessed training data
baseline_stratified.fit(X_train, y_train)

Evaluate on Validation data:

In [19]:
# Make predictions on the validation set (random draws from the training class distribution)
y_val_pred_stratified = baseline_stratified.predict(X_val)

# Evaluate the model's performance: overall accuracy plus per-class precision/recall/F1
accuracy_stratified = accuracy_score(y_val, y_val_pred_stratified)
report_stratified = classification_report(y_val, y_val_pred_stratified)

print(f"Accuracy: {accuracy_stratified}")
print("Classification Report:\n", report_stratified)

Accuracy: 0.7352657004830918
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.84      0.84     21797
         1.0       0.16      0.15      0.15      4078

    accuracy                           0.74     25875
   macro avg       0.50      0.50      0.50     25875
weighted avg       0.73      0.74      0.73     25875



Evaluate on Test data:

In [20]:
# Make predictions on the test set (random draws from the training class distribution)
y_test_pred_stratified = baseline_stratified.predict(X_test)

# Evaluate the model's performance.
# NOTE: accuracy_stratified / report_stratified are reused here and overwrite
# the values computed on the validation set.
accuracy_stratified = accuracy_score(y_test, y_test_pred_stratified)
report_stratified = classification_report(y_test, y_test_pred_stratified)

print(f"Accuracy: {accuracy_stratified}")
print("Classification Report:\n", report_stratified)

Accuracy: 0.7330426645961496
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.84      0.84     42314
         1.0       0.16      0.16      0.16      7915

    accuracy                           0.73     50229
   macro avg       0.50      0.50      0.50     50229
weighted avg       0.73      0.73      0.73     50229



The Distribution baseline has a lower accuracy than the Majority Class baseline. But for the positive class (i.e., diabetes) precision, recall and f1-score are significantly higher, because they are 0 for Majority Class. Correspondingly, recall and f1-score of the negative class (i.e., no-diabetes) decreased from the Majority Class to Distribution baseline.

### Export of Baseline

In [7]:
# Save the fitted distribution (stratified) baseline to a timestamped .pkl file
save_model(baseline_stratified, "stratified")

Initial model saved to '../models/baseline/baseline_model_stratified_20241129_225334.pkl'


# Baseline: Use feature with highest correlation

The feature that has the highest correlation with our target (diabetes) is GenHlth (general health), with a correlation of 0.29. Since this correlation is still far below 0.5, we can be reasonably confident that it is not a false predictor.

We pick the feature having the highest correlation with our target (diabetes). 
Then, we calculate the mean of the target variable for each of the ordinal values of that selected feature.
Where we find the largest increase (gap) between the means of two neighbouring values, we put our decision boundary; the upper of the two values becomes the threshold.
For feature values at or above that threshold, diabetes is predicted.
For feature values below that threshold, no diabetes is predicted.

In [31]:
import pickle
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

class OneFeatureModel:
    """
    Threshold classifier that looks at a single feature only.

    Fitting selects the column with the highest (signed) Pearson correlation
    with the target, computes the mean target value for every distinct value
    of that column and places the decision boundary at the largest increase
    between two neighbouring values. Samples whose feature value is greater
    than or equal to the boundary are predicted as 1, all others as 0.

    Attributes set by ``fit``:
        feature: name of the selected column of ``X_train``.
        threshold: smallest feature value that is predicted as class 1.
    """

    def __init__(self, X_train, y_train):
        """
        Create the model and immediately fit it on the given training data.
        """
        self.fit(X_train, y_train)

    def fit(self, X_train, y_train):
        """
        Select the feature and the threshold from the training data.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Feature columns.
        y_train : pandas.Series
            Target values; aligned with ``X_train`` through the index.

        Raises
        ------
        ValueError
            If the mean of the target never increases from one feature value
            to the next, so that no decision threshold can be placed.
        """
        # Calculate correlation of each feature in X_train with y_train
        correlations = X_train.apply(lambda x: x.corr(y_train))
        # Find the feature with the highest correlation
        self.feature = correlations.idxmax()
        print(f"Most correlated feature: {self.feature}")

        # Mean of the target for every distinct (non-NaN) value of the feature.
        # groupby sorts the keys increasingly, which the gap search relies on.
        means = y_train.groupby(X_train[self.feature]).mean().to_dict()
        keys = list(means)
        print(f"Means (grouped by class): {means}")

        # Find the largest increase between two neighbouring feature values.
        # The strict comparison keeps the first pair when several gaps tie.
        largest_gap = 0
        indices_largest_gap = None
        for lower, upper in zip(keys, keys[1:]):
            gap = means[upper] - means[lower]
            if gap > largest_gap:
                largest_gap = gap
                indices_largest_gap = (lower, upper)

        # Without a positive gap there is no value from which on the positive
        # class becomes more likely; fail with a clear message instead of an
        # opaque lookup error.
        if indices_largest_gap is None:
            raise ValueError(
                f"The target mean never increases with '{self.feature}'; "
                "cannot place a decision threshold."
            )
        print(f"Indices of the largest gap: {indices_largest_gap}\n")

        # The upper value of the gap is the first one predicted as positive
        self.threshold = indices_largest_gap[1]

    def predict(self, X):
        """
        Predict labels based on the feature and threshold.

        Returns a pandas Series with 1 where the selected feature is greater
        than or equal to the threshold and 0 otherwise (missing values
        compare as False and therefore yield 0).
        """
        return (X[self.feature] >= self.threshold).astype(int)

    def evaluate(self, X_val, y_val):
        """
        Evaluate the model on validation/test data.

        Returns a tuple ``(accuracy, report)`` where ``report`` is the text
        produced by ``classification_report``.
        """
        y_val_pred = self.predict(X_val)
        accuracy = accuracy_score(y_val, y_val_pred)
        report = classification_report(y_val, y_val_pred)
        return accuracy, report

In [32]:
# Create the baseline; the constructor already fits it on the training data
baseline_one_feature = OneFeatureModel(X_train, y_train)

Most correlated feature: GenHlth
Means (grouped by class): {1.0: 0.031711541489529205, 2.0: 0.08568687030225491, 3.0: 0.20092068122833273, 4.0: 0.342883087400681, 5.0: 0.4081849274678618}
Indices of the largest gap: (3.0, 4.0)



Evaluate on Validation data:

In [33]:
# Evaluate the one-feature baseline on the validation set
accuracy, report = baseline_one_feature.evaluate(X_val, y_val)

# Print accuracy (4 decimals) and the per-class report
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)

Accuracy: 0.7940

Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.87      0.88     21797
         1.0       0.36      0.40      0.38      4078

    accuracy                           0.79     25875
   macro avg       0.62      0.63      0.63     25875
weighted avg       0.80      0.79      0.80     25875



Evaluate on Test data:

In [34]:
# Evaluate the one-feature baseline on the test set
# (accuracy / report are reused and overwrite the validation results)
accuracy, report = baseline_one_feature.evaluate(X_test, y_test)

# Print accuracy (4 decimals) and the per-class report
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)

Accuracy: 0.7940

Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.87      0.88     42314
         1.0       0.36      0.39      0.37      7915

    accuracy                           0.79     50229
   macro avg       0.62      0.63      0.63     50229
weighted avg       0.80      0.79      0.80     50229



### Export of Baseline

In [35]:
# Save the fitted one-feature baseline to a timestamped .pkl file
save_model(baseline_one_feature, "one_feature")

Initial model saved to '../models/baseline/baseline_model_one_feature_20241201_165606.pkl'


# Baseline: KNN on First component of PCA

Is this too good for a baseline? Should we use another (more dummy) classifier than K-NN for this first PCA component?

Not a conventional baseline, but since it is not overly complicated, it is suitable as a baseline. 

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
import pickle

class PCANNModel:
    """
    K-nearest-neighbours classifier that works on PCA-reduced features.

    The input is first projected onto the leading principal components and
    the K-NN classifier is then fitted on, and queried with, the projection.
    """

    def __init__(self, n_components=1, n_neighbors=5, random_state=None):
        """
        Initialize the PCA and KNN model.

        Parameters
        ----------
        n_components : int
            Number of principal components that are kept.
        n_neighbors : int
            Number of neighbours used by the K-NN classifier.
        random_state : int or None
            Forwarded to PCA. It only has an effect when PCA ends up using a
            randomised/arpack solver; pass an int to make that case
            reproducible. The default (None) keeps the previous behaviour.
        """
        self.pca = PCA(n_components=n_components, random_state=random_state)
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    def fit(self, X_train, y_train):
        """
        Fit PCA and KNN on the training data.
        """
        # PCA; the projected training data is kept on the instance
        self.X_train_pca = self.pca.fit_transform(X_train)
        # K-NN on the PCA-transformed data
        self.knn.fit(self.X_train_pca, y_train)

    def predict(self, X):
        """
        Predict labels for new data.
        """
        # Project with the PCA fitted in fit(), then classify with K-NN
        X_pca = self.pca.transform(X)
        return self.knn.predict(X_pca)

    def evaluate(self, X, y_true):
        """
        Evaluate the model on validation/test data.

        Returns a tuple ``(accuracy, report)`` where ``report`` is the text
        produced by ``classification_report``.
        """
        y_pred = self.predict(X)
        accuracy = accuracy_score(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        return accuracy, report

    def explained_variance(self):
        """
        Get the explained variance ratio of the PCA components.
        """
        return self.pca.explained_variance_ratio_

In [37]:
# Create the model: keep only the first principal component, classify with 5 neighbours
baseline_pca_nn = PCANNModel(n_components=1, n_neighbors=5)
# Fit PCA and K-NN on the training data
baseline_pca_nn.fit(X_train, y_train)

How much of the variance is explained by the first principal component which we use?

In [38]:
# explained_variance() returns the PCA's explained_variance_ratio_;
# index 0 is the single component this baseline keeps
explained_variance = baseline_pca_nn.explained_variance()
print(f"Explained variance by the first component: {explained_variance[0]:.4f}")

Explained variance by the first component: 0.4720


Evaluate on Validation data:

In [39]:
# Evaluate the PCA + K-NN baseline on the validation set
accuracy, report = baseline_pca_nn.evaluate(X_val, y_val)

# Print accuracy (4 decimals) and the per-class report
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)


Accuracy: 0.8167

Classification Report:
               precision    recall  f1-score   support

         0.0       0.85      0.95      0.90     21797
         1.0       0.26      0.09      0.13      4078

    accuracy                           0.82     25875
   macro avg       0.55      0.52      0.51     25875
weighted avg       0.76      0.82      0.78     25875



Evaluate on Test data:

In [40]:
# Evaluate the PCA + K-NN baseline on the test set
# (accuracy / report are reused and overwrite the validation results)
accuracy, report = baseline_pca_nn.evaluate(X_test, y_test)

# Print accuracy (4 decimals) and the per-class report
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)

Accuracy: 0.8155

Classification Report:
               precision    recall  f1-score   support

         0.0       0.85      0.95      0.90     42314
         1.0       0.25      0.08      0.13      7915

    accuracy                           0.82     50229
   macro avg       0.55      0.52      0.51     50229
weighted avg       0.75      0.82      0.78     50229



### Export of Baseline

In [41]:
# Build the timestamped target path once and use it for both writing and logging
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
pca_nn_model_path = f'../models/baseline/baseline_model_pca_nn_{timestamp}.pkl'

# Serialise the fitted PCA + K-NN baseline with pickle
with open(pca_nn_model_path, 'wb') as model_file:
    pickle.dump(baseline_pca_nn, model_file)
    print("Model saved to " + pca_nn_model_path)

Model saved to ../models/baseline/baseline_model_pca_nn_20241201_165612.pkl


In [42]:
# TODO in report: argue why it makes sense -> Explain/use/include the theory of PCA.

Here, we do not plot the explained variance of different numbers of principal components, because using any more than one component seems to be too much for a baseline that should be kept simple.

# Conclusion

On Validation data:

|  | Majority class | Distribution | Highest correlation | KNN on 1st Principal Component |
|----------|----------|----------|----------|----------|
| Accuracy       | 0.84     | 0.74     | 0.79     | 0.82     |
| Precision 0    | 0.84     | 0.84     | 0.89     | 0.85     |
| Precision 1    | 0     | 0.16     | 0.36     | 0.26     |
| Recall 0       | 1     | 0.84     | 0.87     | 0.95     |
| Recall 1       | 0     | 0.15     | 0.40     | 0.09     |
| F1 Score 0     | 0.91     | 0.84     | 0.88     | 0.90     |
| F1 Score 1     | 0     | 0.15     | 0.38     | 0.13     |


On Test data:

|  | Majority class | Distribution | Highest correlation | KNN on 1st Principal Component |
|----------|----------|----------|----------|----------|
| Accuracy       | 0.84     | 0.73     | 0.79     | 0.82     |
| Precision 0    | 0.84     | 0.84     | 0.88     | 0.85     |
| Precision 1    | 0     | 0.16     | 0.36     | 0.25     |
| Recall 0       | 1     | 0.84     | 0.87     | 0.95     |
| Recall 1       | 0     | 0.16     | 0.39     | 0.08     |
| F1 Score 0     | 0.91     | 0.84     | 0.88     | 0.90     |
| F1 Score 1     | 0     | 0.16     | 0.37     | 0.13     |
