**Phases 6-7: Preprocessing and Training**

## **Phase 6: Data Preprocessing**

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [16]:
# Load the cleaned dataset (path is relative to this notebook's working directory)
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_data.csv")

### Remove useless columns

In [17]:
# Drop the 'id' column (not useful for the model).
# Reassign instead of using inplace=True, and pass errors='ignore', so this cell
# is idempotent: re-running it after 'id' is already gone no longer raises KeyError.
df = df.drop(columns=['id'], errors='ignore')

### Prepare data for training

In [20]:
# Separate the target from the features.
#   y -> dependent variable: ONLY the 'diagnosis' column
#   X -> independent variables (patient info): everything EXCEPT 'diagnosis'
# The names follow the math notation y = f(X), i.e. "y depends on X".
y = df['diagnosis']
X = df.drop(columns='diagnosis')

# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Handle class imbalance

In [None]:
# Inspect the class balance of the training target, as percentages per class
print("\nPercentages:", y_train.value_counts(normalize=True).mul(100), sep="\n")

# Conclusion:
# The split is close to 60/40, so the data is only mildly imbalanced. SMOTE or
# other oversampling is probably not essential, but it can still be tried to
# see whether recall improves.


Percentages:
diagnosis
0    62.637363
1    37.362637
Name: proportion, dtype: float64


## Scale numerical features

In [27]:
# Note (relates to the class-imbalance discussion, not to scaling):
# SMOTE = "Synthetic Minority Oversampling TEchnique"
# In plain words: a tool that creates synthetic ("fake") examples of the rare class.

# Scaling
# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits so no test-set statistics are used during fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)