Abdul Raheman (MSCS-634-B01) ProjectDeliverable_3

In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the breast-cancer dataset shipped with scikit-learn into pandas objects.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Hold out 20% of the rows for evaluating the supervised models.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Supervised path: standardisation statistics come from the training split only,
# and the same statistics are applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Unsupervised path: the full dataset is standardised with its OWN scaler.
# Re-fitting `scaler` here would silently overwrite the train-only statistics,
# so any later `scaler.transform(...)` call would leak test-set information.
full_scaler = StandardScaler()
X_scaled = full_scaler.fit_transform(X)
print("Data prepared successfully")

Data prepared successfully


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_curve, auc

def evaluate_classifier(model, X_eval, y_true):
    """Score an already-fitted binary classifier on an evaluation set.

    Parameters
    ----------
    model : estimator
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_eval : array-like
        Feature matrix to predict on.
    y_true : array-like
        True binary labels aligned with ``X_eval``.

    Returns
    -------
    dict
        Keys: ``y_pred`` (hard predictions), ``y_prob`` (second column of
        ``predict_proba``), ``accuracy``, ``f1``, ``confusion_matrix``,
        ``fpr``, ``tpr`` (ROC curve points) and ``auc`` (area under that curve).
    """
    y_pred = model.predict(X_eval)
    y_prob = model.predict_proba(X_eval)[:, 1]
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return {
        "y_pred": y_pred,
        "y_prob": y_prob,
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
        "fpr": fpr,
        "tpr": tpr,
        "auc": auc(fpr, tpr),
    }


def report_metrics(label, metrics):
    """Print the headline metrics from ``evaluate_classifier`` prefixed by ``label``."""
    print(f"{label} Accuracy: {metrics['accuracy']:.4f}")
    print(f"{label} F1 Score: {metrics['f1']:.4f}")
    print(f"{label} Confusion Matrix:\n", metrics["confusion_matrix"])
    print(f"{label} AUC: {metrics['auc']:.4f}")


# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
dt_metrics = evaluate_classifier(dt, X_test_scaled, y_test)
# Keep the individual names available for later cells.
y_pred_dt, y_prob_dt = dt_metrics["y_pred"], dt_metrics["y_prob"]
acc_dt, f1_dt, cm_dt = dt_metrics["accuracy"], dt_metrics["f1"], dt_metrics["confusion_matrix"]
fpr_dt, tpr_dt, auc_dt = dt_metrics["fpr"], dt_metrics["tpr"], dt_metrics["auc"]
report_metrics("DT", dt_metrics)

# SVM (probability=True is required for predict_proba, which feeds the ROC curve)
svc = SVC(probability=True, random_state=42)
svc.fit(X_train_scaled, y_train)
svc_metrics = evaluate_classifier(svc, X_test_scaled, y_test)
y_pred_svc, y_prob_svc = svc_metrics["y_pred"], svc_metrics["y_prob"]
acc_svc, f1_svc, cm_svc = svc_metrics["accuracy"], svc_metrics["f1"], svc_metrics["confusion_matrix"]
fpr_svc, tpr_svc, auc_svc = svc_metrics["fpr"], svc_metrics["tpr"], svc_metrics["auc"]
report_metrics("SVC", svc_metrics)

DT Accuracy: 0.9474
DT F1 Score: 0.9577
DT Confusion Matrix:
 [[40  3]
 [ 3 68]]
DT AUC: 0.9440
SVC Accuracy: 0.9825
SVC F1 Score: 0.9861
SVC Confusion Matrix:
 [[41  2]
 [ 0 71]]
SVC AUC: 0.9974


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for the SVM, optimising F1 with 5-fold cross-validation.
# `gamma` has no effect on the linear kernel, so it is searched only for the RBF
# kernel; a single flat grid would fit every linear candidate twice.
# The RBF grid is listed first so that, among equally scoring candidates, the
# RBF settings are still preferred in the same order as before.
param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']},
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
]
# NOTE(review): the search runs on features that were standardised before the CV
# split, so fold scores may be slightly optimistic; wrapping the scaler and SVC
# in a Pipeline would remove that — confirm whether this matters for the report.
grid = GridSearchCV(SVC(probability=True, random_state=42), param_grid, cv=5, scoring='f1')
grid.fit(X_train_scaled, y_train)

# Evaluate the refitted best model on the held-out test split.
best_svc = grid.best_estimator_
y_pred_best = best_svc.predict(X_test_scaled)
acc_best = accuracy_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)
cm_best = confusion_matrix(y_test, y_pred_best)
# The second predict_proba column is used as the score for the ROC curve.
y_prob_best = best_svc.predict_proba(X_test_scaled)[:, 1]
fpr_best, tpr_best, _ = roc_curve(y_test, y_prob_best)
auc_best = auc(fpr_best, tpr_best)

print(f"Best Params: {grid.best_params_}")
print(f"Tuned SVC Accuracy: {acc_best:.4f}")
print(f"Tuned SVC F1 Score: {f1_best:.4f}")
print("Tuned SVC Confusion Matrix:\n", cm_best)
print(f"Tuned SVC AUC: {auc_best:.4f}")

Best Params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Tuned SVC Accuracy: 0.9825
Tuned SVC F1 Score: 0.9861
Tuned SVC Confusion Matrix:
 [[41  2]
 [ 0 71]]
Tuned SVC AUC: 0.9974


In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score

# KMeans with 2 clusters on the standardised full dataset.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — pin it explicitly if results must match across versions.
kmeans = KMeans(n_clusters=2, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Metrics: silhouette measures cluster separation on the features alone;
# ARI compares the cluster assignment against the true labels in `y`.
sil_score = silhouette_score(X_scaled, cluster_labels)
ari = adjusted_rand_score(y, cluster_labels)

print(f"KMeans Silhouette Score: {sil_score:.4f}")
print(f"KMeans ARI: {ari:.4f}")

# PCA for visualization description: project the data and the cluster centres
# onto the same two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
centers_pca = pca.transform(kmeans.cluster_centers_)

print("Cluster centers in PCA space:\n", centers_pca)

# Cluster distribution. Convert numpy scalars to plain ints so the printed dict
# reads {0: 188, ...} rather than {np.int32(0): np.int64(188), ...}.
unique, counts = np.unique(cluster_labels, return_counts=True)
cluster_sizes = {int(label): int(size) for label, size in zip(unique, counts)}
print("Cluster sizes:", cluster_sizes)

KMeans Silhouette Score: 0.3447
KMeans ARI: 0.6765
Cluster centers in PCA space:
 [[ 4.40952193 -0.07888523]
 [-2.1758271   0.03892499]]
Cluster sizes: {np.int32(0): np.int64(188), np.int32(1): np.int64(381)}


In [None]:
from itertools import combinations

# How many of the most target-correlated features are turned into items.
N_TOP_FEATURES = 6

# Select the features with the largest absolute correlation with the target.
corr = X.corrwith(y).abs().sort_values(ascending=False).head(N_TOP_FEATURES)
top_features = corr.index.tolist()
print("Top features:", top_features)

# Bin each selected feature into high (1) / low (0) relative to its median.
df = X.copy()
df['target'] = y
for feat in top_features:
    feat_median = df[feat].median()
    df[feat + '_high'] = (df[feat] > feat_median).astype(int)

# Bin target: malignant if target==0
df['malignant'] = (y == 0).astype(int)  # 1 if malignant

# Transactional columns: high feats and malignant
trans_cols = [f + '_high' for f in top_features] + ['malignant']
transactions = df[trans_cols]

# Adjust min_support to 0.3 to include malignant (~0.37)
min_support = 0.3
# The mean of a 0/1 column is the fraction of rows containing that item.
supports = transactions.mean()
freq_single = supports[supports >= min_support]
print("Frequent singletons:\n", freq_single)

# Frequent pairs: support is the fraction of rows where both items are present.
pairs = list(combinations(trans_cols, 2))
pair_supports = {}
for col1, col2 in pairs:
    support = float((transactions[col1] & transactions[col2]).mean())
    if support >= min_support:
        pair_supports[(col1, col2)] = support
print("Frequent pairs:")
for (col1, col2), support in pair_supports.items():
    print(f"  ({col1}, {col2}): {support:.4f}")

# Rules: every frequent pair is tried in both directions.
#   confidence(A => B) = support(A, B) / support(A)
#   lift(A => B)       = confidence(A => B) / support(B)
min_conf = 0.7
rule_records = []
for (col1, col2), supp in pair_supports.items():
    for ante, cons in ((col1, col2), (col2, col1)):
        conf = supp / supports[ante]
        if conf >= min_conf:
            rule_records.append({
                'antecedent': ante,
                'consequent': cons,
                'support': supp,
                'confidence': conf,
                'lift': conf / supports[cons],
            })


def _format_rule(rule):
    """Render one rule record as 'A => B: support=..., conf=..., lift=...'."""
    return (
        f"{rule['antecedent']} => {rule['consequent']}: "
        f"support={rule['support']:.4f}, conf={rule['confidence']:.4f}, lift={rule['lift']:.4f}"
    )


rules = [_format_rule(rule) for rule in rule_records]

# Filter rules involving 'malignant' by inspecting the rule's items rather than
# substring-matching the formatted text.
malignant_rules = [
    _format_rule(rule)
    for rule in rule_records
    if 'malignant' in (rule['antecedent'], rule['consequent'])
]
print("High confidence rules involving malignant:")
for rule_text in malignant_rules:
    print(" ", rule_text)

Top features: ['worst concave points', 'worst perimeter', 'mean concave points', 'worst radius', 'mean perimeter', 'worst area']
Frequent singletons:
 worst concave points_high    0.499121
worst perimeter_high         0.499121
mean concave points_high     0.499121
worst radius_high            0.497364
mean perimeter_high          0.499121
worst area_high              0.499121
malignant                    0.372583
dtype: float64
Frequent pairs:
 {('worst concave points_high', 'worst perimeter_high'): np.float64(0.421792618629174), ('worst concave points_high', 'mean concave points_high'): np.float64(0.45694200351493847), ('worst concave points_high', 'worst radius_high'): np.float64(0.4147627416520211), ('worst concave points_high', 'mean perimeter_high'): np.float64(0.4024604569420035), ('worst concave points_high', 'worst area_high'): np.float64(0.4147627416520211), ('worst concave points_high', 'malignant'): np.float64(0.35676625659050965), ('worst perimeter_high', 'mean concave poin

The rules show strong links between high feature values and malignancy. For example, "worst radius_high => malignant" has conf=0.7279 and lift=1.9537 — a tumor with a high worst radius is almost twice as likely to be malignant as a tumor picked at random from the dataset. The reverse rules have even higher confidence (for example, "malignant => worst radius_high" with conf=0.9717), which means that most malignant cases have high values for these severity features.

For example, if a tumor has a high worst perimeter (support=0.3603), a biopsy should be done as soon as possible. In healthcare AI, association rules make a model's findings easier to understand: they help doctors identify risk factors, which leads to faster diagnoses and more personalized care. By pointing out high-lift feature combinations early, such rules may also help lower mortality.