**Phases 6-7: Data Preprocessing and Model Training**

## **Phase 6: Data Preprocessing**

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [34]:
# Load the cleaned dataset from the processed-data folder.
cleaned_data_path = "../data/processed/cleaned_data.csv"
df = pd.read_csv(cleaned_data_path)

### Remove useless columns

In [35]:
# Drop the id column (not useful for the model).
# The frame is reassigned rather than mutated with inplace=True, and
# errors='ignore' is passed, so this cell is idempotent: re-running it after
# 'id' has already been removed no longer raises a KeyError.
df = df.drop(columns=['id'], errors='ignore')

### Prepare data for training

In [None]:
# Separate the target from the features.
# The X / y naming follows the math notation y = f(X), i.e. "y depends on X":
#   y -> dependent variable: ONLY the 'diagnosis' column
#   X -> independent variables (patient info): everything EXCEPT 'diagnosis'
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

# Hold out 20% of the rows as the test set. stratify=y asks for the same
# class proportions in both splits; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Handle class imbalance

In [37]:
# Share of each class in the training labels, expressed in percent.
class_percentages = y_train.value_counts(normalize=True) * 100

print("\nPercentages:")
print(class_percentages)

# Conclusion:
# The split is close to 60/40, i.e. not heavily imbalanced. SMOTE or other
# oversampling is therefore probably not essential, though it can still be
# tried to see whether recall improves.


Percentages:
diagnosis
0    62.637363
1    37.362637
Name: proportion, dtype: float64


## Scale numerical features

In [38]:
# Scaling
# The StandardScaler is fitted on the training split only; the same fitted
# scaler is then applied to both splits, so the test data never influences
# the scaling parameters.
#
# Side note: SMOTE = "Synthetic Minority Oversampling TEchnique", a tool that
# creates synthetic ("fake") examples of the rare class. It is not applied in
# this cell.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## **Phase 7: Model Training**

In [43]:
# ---- Build the four base estimators (nothing is fitted yet) ----
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# probability=True is required so the SVM can take part in soft voting below.
svm_model = SVC(kernel='linear', probability=True, random_state=42)

# XGBoost handles class imbalance through scale_pos_weight, set here to the
# ratio of negative (0) to positive (1) samples in the training labels.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
pos_weight = n_negative / n_positive
xgb_model = XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=pos_weight,
    random_state=42,
)

# Random Forest handles imbalance through class_weight='balanced'.
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)

# ---- Fit each base estimator on the scaled training data, in order ----
for announcement, estimator in (
    ("Training Logistic Regression model...", lr_model),
    ("Training SVM model...", svm_model),
    ("Training XGBoost model...", xgb_model),
    ("Training Random Forest model...", rf_model),
):
    print(announcement)
    estimator.fit(X_train_scaled, y_train)

# ---- Soft-voting ensemble over LR, SVM and Random Forest ----
# 'soft' combines the predicted probabilities; 'hard' would use direct votes.
# NOTE(review): the original note said XGBoost is excluded because it "uses
# different data format", yet it is fitted on the same X_train_scaled above.
# Confirm whether leaving it out of the ensemble is still intended.
print("Training Voting Classifier...")
voting_model = VotingClassifier(
    estimators=[
        ('lr', lr_model),    # Logistic Regression
        ('svm', svm_model),  # SVM
        ('rf', rf_model),    # Random Forest
    ],
    voting='soft',
)
voting_model.fit(X_train_scaled, y_train)

print("Model training completed!")

Training Logistic Regression model...
Training SVM model...
Training XGBoost model...
Training Random Forest model...
Training Voting Classifier...
Model training completed!


In [46]:
import joblib
from pathlib import Path

# Make sure both output folders exist before writing anything.
Path("../data/processed").mkdir(parents=True, exist_ok=True)
Path("../models").mkdir(parents=True, exist_ok=True)

# Persist the scaled train/test splits and their labels.
joblib.dump(X_train_scaled, "../data/processed/X_train_scaled.pkl")
joblib.dump(y_train, "../data/processed/y_train.pkl")
joblib.dump(X_test_scaled, "../data/processed/X_test_scaled.pkl")
joblib.dump(y_test, "../data/processed/y_test.pkl")

# Save the fitted scaler alongside the models. Every model below was trained
# on scaled features, so new raw data must go through this exact scaler
# before being passed to any of them.
joblib.dump(scaler, "../models/scaler.pkl")

# Save each model
joblib.dump(lr_model, "../models/logistic_regression.pkl")
joblib.dump(svm_model, "../models/svm.pkl")
joblib.dump(rf_model, "../models/random_forest.pkl")
joblib.dump(xgb_model, "../models/xgboost.pkl")
joblib.dump(voting_model, "../models/voting_classifier.pkl")

print("All models saved successfully!")

All models saved successfully!
