# Task 4: Binary Classification - Baseline Model (Logistic Regression)

This notebook loads the preprocessed data saved by `1_consolidate_data.ipynb` and trains/evaluates a baseline Logistic Regression model.

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # For potential NaN handling if needed, though imputation should handle it

# Path of the preprocessed dataset this notebook reads.
input_parquet_path = 'data/preprocessed_data.parquet'

# Load the preprocessed data.
# Each failure prints a targeted hint and is then re-raised: the following
# cells all need `df`, so swallowing the error here would only resurface as
# an unrelated NameError further down the notebook.
print(f"Loading preprocessed data from {input_parquet_path}...")
try:
    df = pd.read_parquet(input_parquet_path)
except FileNotFoundError:
    print(f"Error: File not found at {input_parquet_path}. Please run notebook 1 first.")
    raise
except ImportError:
    # pandas raises ImportError when no Parquet engine is installed.
    print("\nError: 'pyarrow' or 'fastparquet' package is required to read Parquet format.")
    print("Please install it using: pip install pyarrow")
    raise
except Exception as e:
    print(f"\nAn error occurred while loading the Parquet file: {e}")
    raise
else:
    # Only reached when the read succeeded, so the try block guards the
    # read alone and not the summary output below.
    print("Data loaded successfully.")
    print("\nLoaded DataFrame Info:")
    df.info()
    print("\nLoaded DataFrame Head:")
    print(df.head())

In [None]:
# Separate train and test sets based on the 'split' column
train_df = df[df['split'] == 'train']
test_df = df[df['split'] == 'test']

# Fail early with a clear message instead of deep inside model fitting.
if train_df.empty or test_df.empty:
    raise ValueError(
        "Expected rows with split == 'train' and split == 'test'; "
        f"found {len(train_df)} train and {len(test_df)} test rows."
    )

# Separate features (X) and target (y).
# Both the 'split' marker and the target 'Class' are removed from the features.
X_train_scaled = train_df.drop(['Class', 'split'], axis=1)
y_train = train_df['Class']

X_test_scaled = test_df.drop(['Class', 'split'], axis=1)
y_test = test_df['Class']

# Convert target variable 'Class' from labels ('n'/'y') to numeric (0/1) if necessary.
# The check is "not numeric" rather than "== object" so that labels stored as
# pandas string, Arrow string or categorical dtype are converted as well.
class_mapping = {'n': 0, 'y': 1}
if not pd.api.types.is_numeric_dtype(y_train):
    print("\nConverting target variable 'Class' to numeric (n=0, y=1)...")
    y_train = y_train.map(class_mapping)
    y_test = y_test.map(class_mapping)

    # .map() turns every label missing from the mapping (or already missing)
    # into NaN; surface that here rather than as an opaque error during fit.
    unmapped_count = int(y_train.isna().sum() + y_test.isna().sum())
    if unmapped_count:
        raise ValueError(
            f"{unmapped_count} 'Class' value(s) are missing or not one of "
            f"{sorted(class_mapping)}; cannot convert the target to 0/1."
        )

    y_train = y_train.astype('int64')
    y_test = y_test.astype('int64')
    print("Target variable converted.")
    print("y_train value counts:\n", y_train.value_counts())
    print("y_test value counts:\n", y_test.value_counts())

print(f"\nTraining features shape: {X_train_scaled.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Test features shape: {X_test_scaled.shape}")
print(f"Test target shape: {y_test.shape}")

# Evaluation Function

Define a reusable function to display evaluation metrics and plot the confusion matrix.

In [None]:
def evaluate_model(y_true, y_pred, model, model_name):
    """Print accuracy, a classification report and a confusion matrix, then plot the matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (pandas Series, NumPy array or list).
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    model : object
        The classifier that produced ``y_pred``. If it exposes ``classes_``,
        those classes are included in the label set used for the report,
        the matrix and the plot axes.
    model_name : str
        Name shown in the printed header and in the plot title.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    # Work on plain arrays so Series, ndarrays and lists are all accepted.
    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)

    # Build ONE sorted label list and use it everywhere below, so the report
    # rows, the matrix rows/columns and the heatmap ticks can never disagree.
    label_sources = [np.unique(true_labels), np.unique(pred_labels)]
    if hasattr(model, 'classes_'):
        label_sources.append(np.asarray(model.classes_))
    labels = np.unique(np.concatenate(label_sources)).tolist()

    # For the 0/1 encoding always report both classes, even when one of them
    # is absent from this particular evaluation set. Passing `labels`
    # explicitly is what keeps the two target names valid in that case.
    if set(labels) <= {0, 1}:
        labels = [0, 1]
        target_names = ['Class n (0)', 'Class y (1)']
    else:
        target_names = None

    print(f"\n--- {model_name} Evaluation ---")

    # Accuracy
    accuracy = accuracy_score(true_labels, pred_labels)
    print(f"Accuracy: {accuracy:.4f}")

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels, labels=labels,
                                target_names=target_names, zero_division=0))

    # Confusion Matrix (rows = true label, columns = predicted label)
    print("\nConfusion Matrix:")
    cm = confusion_matrix(true_labels, pred_labels, labels=labels)
    print(cm)

    # Plot Confusion Matrix
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title(f'Confusion Matrix - {model_name}')
    plt.show()

# Train and Evaluate Logistic Regression Model

This section fits a Logistic Regression model on the scaled and imputed training data loaded from the Parquet file, then evaluates it on the test split.

In [None]:
# Build and fit the baseline classifier in one expression (fit() returns the
# estimator itself). The fixed seed keeps the run reproducible; the raised
# iteration cap gives the solver more room to converge.
print("\nTraining Logistic Regression model...")
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train)
print("Model training complete.")

# Hard class predictions for the test features.
y_pred_lr = log_reg.predict(X_test_scaled)

# Print the metrics and draw the confusion matrix.
evaluate_model(y_test, y_pred_lr, log_reg, "Logistic Regression")