# Notebook 02 — Simple Model (Logistic Regression)

Train a simple model and evaluate it.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt

# Load the sample dataset (path is relative to the notebook's location).
df = pd.read_csv('../data/sample/diabetes_sample.csv')

# Binary target: any non-zero Diabetes_012 value becomes 1, zero stays 0.
y = df['Diabetes_012'].ne(0).astype(int)
# Features: every column except the target.
X = df.drop(columns=['Diabetes_012'])

# 80/20 hold-out split, stratified on the target, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Median imputation -> standardisation -> class-weighted logistic regression.
# Wrapping the steps in a Pipeline means the imputer and scaler are fitted on
# the training fold only.
model = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(class_weight='balanced', max_iter=1000)),
    ]
)
model.fit(X_train, y_train)

# Hard labels for the classification report; column 1 of predict_proba is the
# score for class 1, used for ROC-AUC.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_prob)
print('ROC-AUC:', round(roc_auc, 4))

report = classification_report(y_test, y_pred)
print('\n', report)

In [None]:
# ROC curve
# Create the Axes explicitly and hand it to RocCurveDisplay: when no `ax` is
# passed, from_predictions opens a figure of its own, so a preceding bare
# plt.figure() would be left behind as an empty extra figure.
fig, ax = plt.subplots()
RocCurveDisplay.from_predictions(y_test, y_prob, ax=ax)
ax.set_title('ROC Curve')
plt.show()

### Quick note
Because the dataset is imbalanced, focus more on **F1** and **ROC-AUC** than accuracy.