# Import data

In [3]:
import os
import sys

sys.path.append(os.path.abspath("../scripts"))
from data_loader import DataLoader

data_loader = DataLoader()
X_train, y_train = data_loader.training_data
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

# Baseline: Majority class

The majority class for our dataset is "no diabetes".

In [8]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the support vector machine model
baseline_majority = DummyClassifier(strategy="most_frequent")

# Train the model on the preprocessed training data
baseline_majority.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred_majority = baseline_majority.predict(X_val)

# Evaluate the model's performance
accuracy_majority = accuracy_score(y_val, y_val_pred_majority)
report_majority = classification_report(y_val, y_val_pred_majority)

print(f"Validation Accuracy: {accuracy_majority}")
print("Classification Report:\n", report_majority)

Validation Accuracy: 0.842439293598234
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      1.00      0.91     21371
         1.0       0.00      0.00      0.00      3997

    accuracy                           0.84     25368
   macro avg       0.42      0.50      0.46     25368
weighted avg       0.71      0.84      0.77     25368



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
# same as majority class, just with distributions

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the support vector machine model
baseline_prior = DummyClassifier(strategy="prior")

# Train the model on the preprocessed training data
baseline_prior.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred_prior = baseline_prior.predict(X_val)

# Evaluate the model's performance
accuracy_prior = accuracy_score(y_val, y_val_pred_prior)
report_prior = classification_report(y_val, y_val_pred_prior)

print(f"Validation Accuracy: {accuracy_prior}")
print("Classification Report:\n", report_prior)

Validation Accuracy: 0.842439293598234
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      1.00      0.91     21371
         1.0       0.00      0.00      0.00      3997

    accuracy                           0.84     25368
   macro avg       0.42      0.50      0.46     25368
weighted avg       0.71      0.84      0.77     25368



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Baseline: Distribution

In [79]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the support vector machine model
baseline_stratified = DummyClassifier(strategy="stratified")

# Train the model on the preprocessed training data
baseline_stratified.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred_stratified = baseline_stratified.predict(X_val)

# Evaluate the model's performance
accuracy_stratified = accuracy_score(y_val, y_val_pred_stratified)
report_stratified = classification_report(y_val, y_val_pred_stratified)

print(f"Validation Accuracy: {accuracy_stratified}")
print("Classification Report:\n", report_stratified)

Validation Accuracy: 0.7375433617155471
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.85      0.84     21371
         1.0       0.16      0.16      0.16      3997

    accuracy                           0.74     25368
   macro avg       0.50      0.50      0.50     25368
weighted avg       0.74      0.74      0.74     25368



# Baseline: Use feature with highest correlation

The feature that has the highest correlation with our target (diabetes) is GenHealth.

Seems to have same accuracy as the baselines before. So redundant? Why include or why not?

In [82]:
# print(X_train.head())
# type(X_train) # dataframe

# TODO: train parameter: automatically:
#       ... find value with highest correlation (e.g., GenHlth)
#       ... find threshold (e.g., 0.3)

X_train_GenHlth = X_train[['GenHlth']]

def train_with_one_feature(X_train, y_train):
    # Calculate correlation of each feature in X_train with y_train
    correlations = X_train.apply(lambda x: x.corr(y_train))
    
    # Find the feature with the highest correlation
    most_correlated_feature = correlations.idxmax()
    
    # Find threshold
    threshold = 0.3 # TODO if we include this classifier
    
    return most_correlated_feature, threshold

most_correlated_feature, threshold = train_with_one_feature(X_train, y_train)

def predict_with_one_feature(df, feature, threshold):
    y_hat = pd.DataFrame({
        'Target': df[feature].apply(lambda x: 1 if x < threshold else 0)
    })
    return y_hat

# Make predictions on the validation set
y_val_pred_one_feature = predict_with_one_feature(X_val, most_correlated_feature, threshold)

# Evaluate the model's performance
accuracy_one_feature = accuracy_score(y_val, y_val_pred_one_feature)
report_one_feature = classification_report(y_val, y_val_pred_one_feature)

print(f"Validation Accuracy: {accuracy_one_feature}")
print("Classification Report:\n", report_one_feature)

Validation Accuracy: 0.842439293598234
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      1.00      0.91     21371
         1.0       0.00      0.00      0.00      3997

    accuracy                           0.84     25368
   macro avg       0.42      0.50      0.46     25368
weighted avg       0.71      0.84      0.77     25368



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Baseline: First component of PCA

Is this too good for a baseline? Should we use another (more dummy) classifier than K-NN for this first PCA component?

In [34]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Apply PCA to reduce dimensions (you can choose the number of components, here it's set to 2 for visualization)
pca = PCA(n_components=1)  # just the first component (it's a baseline)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_val)

# copy from exercise 5
def predict_knn(X_train, X_test, y_train, y_test):

    # Initialize the K-NN classifier (with k=5, you can adjust the number of neighbors)
    knn = KNeighborsClassifier(n_neighbors=5)
    
    # Train the model
    knn.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = knn.predict(X_test)
    
    # Measure performance
    accuracy = accuracy_score(y_test, y_pred)
    
    # Output results
    print(f"Accuracy: {accuracy:.4f}")
    
    # Detailed classification report
    print("\nClassification Report:\n")
    print(classification_report(y_test, y_pred))

predict_knn(X_train_pca, X_test_pca, y_train, y_test)

Accuracy: 0.8059

Classification Report:

              precision    recall  f1-score   support

         0.0       0.84      0.95      0.89     21370
         1.0       0.16      0.06      0.08      3998

    accuracy                           0.81     25368
   macro avg       0.50      0.50      0.49     25368
weighted avg       0.74      0.81      0.76     25368

