<a href="https://colab.research.google.com/github/aai540-group3/project/blob/main/Final_Project_Team_3_Deliverable_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
%%bash
pip install --quiet --upgrade pip
pip install --quiet --progress-bar=off uv
uv pip install --system \
    datasets \
    pandas \
    numpy \
    scikit-learn \
    huggingface-hub[cli,hf_transfer]

Audited 5 packages in 69ms


In [22]:
import pandas as pd

from datasets import load_dataset

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             roc_auc_score)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (OneHotEncoder,
                                   PolynomialFeatures, StandardScaler)

In [23]:
# Load dataset
dataset = load_dataset("imodels/diabetes-readmission", token=None)
df = dataset['train'].to_pandas()

In [24]:
# Split data
X = df.drop(columns=['readmitted'])
y = df['readmitted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Identify categorical and numerical columns dynamically
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for all columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [26]:
# Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)
clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

# Fit initial model
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Initial evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
roc_auc = roc_auc_score(y_test, y_pred)

print(f"Initial Accuracy: {accuracy * 100:.2f}%")
print(f"Initial Precision: {precision * 100:.2f}%")
print(f"Initial Recall: {recall * 100:.2f}%")
print(f"Initial ROC-AUC Score: {roc_auc * 100:.2f}%")

Initial Accuracy: 62.53%
Initial Precision: 63.01%
Initial Recall: 46.38%
Initial ROC-AUC Score: 61.43%


In [27]:
# Polynomial Features for ALL Available Features
interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interactions = interaction.fit_transform(X_train[numerical_cols])

# Convert interaction terms into DataFrame and rename columns to avoid duplication
interaction_columns = [f"interaction_{i}" for i in range(X_interactions.shape[1])]
X_train_interactions = pd.DataFrame(X_interactions, columns=interaction_columns)

# Concatenate original and interaction terms
X_train_with_interactions = pd.concat([X_train.reset_index(drop=True), X_train_interactions], axis=1)

# Retrain the model with all columns and interaction terms
clf.fit(X_train_with_interactions, y_train)
y_pred_improved = clf.predict(X_test)

# Improved evaluation
accuracy_improved = accuracy_score(y_test, y_pred_improved)
precision_improved = precision_score(y_test, y_pred_improved, average='binary')
recall_improved = recall_score(y_test, y_pred_improved, average='binary')
roc_auc_improved = roc_auc_score(y_test, y_pred_improved)

print(f"Improved Accuracy: {accuracy_improved * 100:.2f}%")
print(f"Improved Precision: {precision_improved * 100:.2f}%")
print(f"Improved Recall: {recall_improved * 100:.2f}%")
print(f"Improved ROC-AUC Score: {roc_auc_improved * 100:.2f}%")

# Cross-validation
cross_val_scores = cross_val_score(clf, X_train_with_interactions, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy after improvements: {cross_val_scores.mean() * 100:.2f}%")

Improved Accuracy: 62.53%
Improved Precision: 63.01%
Improved Recall: 46.38%
Improved ROC-AUC Score: 61.43%
Cross-Validation Accuracy after improvements: 62.73%
