In [4]:
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


df = pd.read_csv("/content/drive/MyDrive/breast_data.csv")
X = df.drop(columns=["diagnosis", "Unnamed: 32", "id"])
y = df["diagnosis"]

# Encode target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# -------------------------
# Preprocessing
# -------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -------------------------
# PCA
# -------------------------
pca = PCA(n_components=0.95)  # retain 95% variance
X_pca = pca.fit_transform(X_scaled)
print(f"Original features: {X.shape[1]}, Reduced features: {X_pca.shape[1]}")

# -------------------------
# Models + Hyperparameters
# -------------------------
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
import xgboost as xgb

models = {
    "SVM": (SVC(), {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"], "gamma": ["scale", "auto"]}),
    "NaiveBayes": (GaussianNB(), {}),
    "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]}),
    "LogisticRegression": (LogisticRegression(max_iter=500), {"C": [0.1, 1, 10]}),
    "DecisionTree": (DecisionTreeClassifier(), {"max_depth": [3, 5, None]}),
    "RandomForest": (RandomForestClassifier(), {"n_estimators": [50, 100], "max_depth": [None, 10]}),
    "AdaBoost": (AdaBoostClassifier(), {"n_estimators": [50, 100]}),
    "GradientBoosting": (GradientBoostingClassifier(), {"n_estimators": [50, 100]}),
    "XGBoost": (xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False),
                {"n_estimators": [50, 100], "max_depth": [3, 5]}),
}

# -------------------------
# Helper function to run CV
# -------------------------
def evaluate_models(X, y, label):
    results = []
    for name, (model, param_grid) in models.items():
        grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid.fit(X, y)
        best_model = grid.best_estimator_
        scores = cross_val_score(best_model, X, y, cv=5, scoring='accuracy')
        results.append({
            "Model": name,
            "Best Params": grid.best_params_,
            "Mean Accuracy": np.mean(scores),
            "Std Dev": np.std(scores)
        })
        print(f"{label} - {name}: Best Params={grid.best_params_}, Accuracy={np.mean(scores):.4f}")
    return pd.DataFrame(results)

# -------------------------
# Evaluate No-PCA and With-PCA
# -------------------------
print("\n=== Evaluating Without PCA ===")
results_no_pca = evaluate_models(X_scaled, y_encoded, "No-PCA")

print("\n=== Evaluating With PCA ===")
results_with_pca = evaluate_models(X_pca, y_encoded, "With-PCA")

# -------------------------
# Save Results
# -------------------------
final_results = pd.merge(results_no_pca, results_with_pca,
                         on="Model", suffixes=("_NoPCA", "_PCA"))
final_results.to_csv("experiment6_results.csv", index=False)
print("\nSaved results to experiment6_results.csv")
print(final_results)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Original features: 30, Reduced features: 10

=== Evaluating Without PCA ===
No-PCA - SVM: Best Params={'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}, Accuracy=0.9772
No-PCA - NaiveBayes: Best Params={}, Accuracy=0.9279
No-PCA - KNN: Best Params={'n_neighbors': 7, 'weights': 'uniform'}, Accuracy=0.9684
No-PCA - LogisticRegression: Best Params={'C': 1}, Accuracy=0.9807
No-PCA - DecisionTree: Best Params={'max_depth': 5}, Accuracy=0.9156
No-PCA - RandomForest: Best Params={'max_depth': None, 'n_estimators': 50}, Accuracy=0.9543
No-PCA - AdaBoost: Best Params={'n_estimators': 100}, Accuracy=0.9772
No-PCA - GradientBoosting: Best Params={'n_estimators': 50}, Accuracy=0.9561


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


No-PCA - XGBoost: Best Params={'max_depth': 5, 'n_estimators': 50}, Accuracy=0.9737

=== Evaluating With PCA ===
With-PCA - SVM: Best Params={'C': 1, 'gamma': 'scale', 'kernel': 'linear'}, Accuracy=0.9772
With-PCA - NaiveBayes: Best Params={}, Accuracy=0.9279
With-PCA - KNN: Best Params={'n_neighbors': 7, 'weights': 'uniform'}, Accuracy=0.9701
With-PCA - LogisticRegression: Best Params={'C': 1}, Accuracy=0.9789
With-PCA - DecisionTree: Best Params={'max_depth': 5}, Accuracy=0.9244
With-PCA - RandomForest: Best Params={'max_depth': None, 'n_estimators': 50}, Accuracy=0.9490
With-PCA - AdaBoost: Best Params={'n_estimators': 100}, Accuracy=0.9648
With-PCA - GradientBoosting: Best Params={'n_estimators': 100}, Accuracy=0.9490


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


With-PCA - XGBoost: Best Params={'max_depth': 5, 'n_estimators': 100}, Accuracy=0.9649

Saved results to experiment6_results.csv
                Model                             Best Params_NoPCA  \
0                 SVM  {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}   
1          NaiveBayes                                            {}   
2                 KNN      {'n_neighbors': 7, 'weights': 'uniform'}   
3  LogisticRegression                                      {'C': 1}   
4        DecisionTree                              {'max_depth': 5}   
5        RandomForest       {'max_depth': None, 'n_estimators': 50}   
6            AdaBoost                         {'n_estimators': 100}   
7    GradientBoosting                          {'n_estimators': 50}   
8             XGBoost          {'max_depth': 5, 'n_estimators': 50}   

   Mean Accuracy_NoPCA  Std Dev_NoPCA  \
0             0.977177       0.008921   
1             0.927930       0.020359   
2             0.968359       0.008971

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
