In [2]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset and
# model paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


# Import libraries
import pickle  # For saving the model

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer  # Mean-imputation step inside the pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC  # Support Vector Classifier



Mounted at /content/drive


In [3]:
# Location of the training CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/CSC/breast_cancer_train.csv'

# Load the dataset into `df`. If the file is missing we print a readable hint
# and then re-raise: swallowing the error would leave `df` undefined (or, worse,
# silently keep a stale `df` from an earlier run), so every later cell would
# fail or produce results for the wrong data.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path.")
    raise
else:
    # Only reached when read_csv succeeded.
    print("Dataset loaded successfully.")

Dataset loaded successfully.


In [4]:
# Column holding the label we want to predict.
target = 'diagnosis'

# The five predictor columns used by the model.
selected_features = [
    'radius_mean',      # Size
    'texture_mean',     # Surface texture
    'perimeter_mean',   # Shape/Size
    'smoothness_mean',  # Surface irregularity
    'concavity_mean'    # Shape contour
]

# Split the loaded frame into the feature matrix and the raw (unencoded) labels.
X, y_raw = df[selected_features], df[target]

print(f"\nSelected Features: {selected_features}")

# B. Encode Target Variable
# The diagnosis is usually 'M' (Malignant) or 'B' (Benign).
# LabelEncoder numbers the classes in sorted order, so 'B' becomes 0 and
# 'M' becomes 1 -- i.e. 1 (Malignant) and 0 (Benign).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# Print mapping to be sure.
# A class's code is simply its position in `classes_`, so no second transform()
# call is needed; .tolist() yields native Python values so the printed dict
# does not contain NumPy scalar reprs such as np.int64(0).
target_encoding = {label: code for code, label in enumerate(label_encoder.classes_.tolist())}
print(f"Target Encoding: {target_encoding}")

# C. Handle Missing Values
# (This specific dataset usually has no missing values in these columns.)
# We only *report* missing values here. The actual filling is done by the
# 'imputer' step of the pipeline below, so the fill values (column means) are
# learned from the training split alone -- filling with X.mean() before the
# split would leak test-set information into training -- and the saved model
# can also handle missing values at prediction time.
if X.isnull().values.any():
    print("Missing values detected. Filling with mean.")

# D. Train-Test Split (80% Train, 20% Test)
# stratify=y keeps the class proportions of y the same in both splits;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# E. Model pipeline: every step is fitted on the data passed to .fit() only,
# and the same fitted transformations are re-applied inside .predict().
model_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Step 1: Fill NaNs with the training-set column mean (no-op when nothing is missing)
    ('scaler', StandardScaler()),       # Step 2: Scale features to Mean=0, Std=1
    ('svm', SVC(kernel='rbf', random_state=42)) # Step 3: SVM Classifier
])


Selected Features: ['radius_mean', 'texture_mean', 'perimeter_mean', 'smoothness_mean', 'concavity_mean']
Target Encoding: {'B': np.int64(0), 'M': np.int64(1)}


In [5]:
# Fit every step of the pipeline on the training split only; X_test / y_test
# are not passed to fit() and stay unseen until evaluation.
print("\nTraining Support Vector Machine (SVM)...")
model_pipeline.fit(X_train, y_train)
print("Training complete.")


Training Support Vector Machine (SVM)...
Training complete.


In [7]:
# Section banner for the evaluation output.
print("\n" + "="*30, "MODEL EVALUATION", "="*30, sep="\n")

# Predict once on the held-out split and reuse the predictions for every metric.
y_pred = model_pipeline.predict(X_test)

# Calculate Metrics
# All four scorers are called as scorer(y_true, y_pred) with their defaults.
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (precision_score, recall_score, f1_score)
)

# Headline numbers, a divider, then the header for the per-class report.
print(
    f"Accuracy:  {acc:.2%}",
    f"Precision: {prec:.2%}",
    f"Recall:    {rec:.2%}",
    f"F1-Score:  {f1:.2%}",
    "-" * 30,
    "Classification Report:\n",
    sep="\n",
)

# Per-class precision / recall / F1; the display names are given in label order.
report_text = classification_report(y_test, y_pred, target_names=['Benign', 'Malignant'])
print(report_text)


# SAVE THE MODEL (Pickle)
# Only `model_pipeline` is written to the file; `label_encoder` is not part of
# the pickle, so whoever loads the model receives numeric class codes from predict().

save_path = '/content/drive/MyDrive/CSC/breast_cancer_model.pkl'

# Save using Pickle ('wb' = binary write; the with-block closes the file afterwards)
with open(save_path, 'wb') as file:
    pickle.dump(model_pipeline, file)

print(f"\nModel saved successfully to: {save_path}")


print("\nVerifying model reload...")

# Reload the model.
# NOTE: pickle.load can execute arbitrary code embedded in the file. That is
# acceptable here only because the file was written by this notebook a moment
# ago; never unpickle a model file from an untrusted source.
with open(save_path, 'rb') as file:
    loaded_model = pickle.load(file)

# Actual round-trip check: the reloaded pipeline must reproduce the in-memory
# pipeline's predictions on the whole test split. Without this comparison the
# "Verification successful." message below would be printed unconditionally.
reloaded_predictions = loaded_model.predict(X_test)
if not np.array_equal(reloaded_predictions, model_pipeline.predict(X_test)):
    raise RuntimeError(
        f"Reloaded model from {save_path} does not reproduce the trained model's predictions."
    )

# Test with a single patient from the test set
# (double brackets keep the row as a one-row DataFrame, the shape predict() expects).
sample_patient = X_test.iloc[[0]]
prediction_code = loaded_model.predict(sample_patient)[0]
# Map the numeric class code back to the original diagnosis label.
prediction_label = label_encoder.inverse_transform([prediction_code])[0]

print(f"Prediction for test sample: {prediction_label} (Code: {prediction_code})")
print("Verification successful.")


MODEL EVALUATION
Accuracy:  95.61%
Precision: 95.12%
Recall:    92.86%
F1-Score:  93.98%
------------------------------
Classification Report:

              precision    recall  f1-score   support

      Benign       0.96      0.97      0.97        72
   Malignant       0.95      0.93      0.94        42

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


Model saved successfully to: /content/drive/MyDrive/CSC/breast_cancer_model.pkl

Verifying model reload...
Prediction for test sample: B (Code: 0)
Verification successful.
