# Breast Cancer Detection Project

This project uses machine learning to classify breast tumors as malignant or benign. We analyze the dataset, train several models, compare their accuracy, and build a simple prediction system.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Apply matplotlib's 'ggplot' style sheet to the figures drawn below
plt.style.use('ggplot')

## 1. Data Loading and Exploration

In [None]:
# Read the raw CSV into a DataFrame
df = pd.read_csv('data.csv')

# Quick sanity checks: dataset dimensions and the largest per-column
# count of missing values
missing_per_column = df.isnull().sum()
print("Shape of dataset:", df.shape)
print("\nIs Null Sum:", missing_per_column.max())

# Preview the first 5 rows (rendered as the cell's output)
df.head(5)

In [None]:
# Remove columns that carry no predictive signal:
# 'id' is just a record identifier, and 'Unnamed: 32' often appears in this
# dataset as a parsing artifact. Only columns actually present are dropped,
# so this cell is safe to re-run.
columns_to_drop = ['id', 'Unnamed: 32']
present_columns = [name for name in columns_to_drop if name in df.columns]
if present_columns:
    df = df.drop(columns=present_columns)

print("Columns after dropping:", df.columns)
df.info()

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Bar chart of how many samples fall into each diagnosis class
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='diagnosis', palette='magma', ax=ax)
ax.set_title('Distribution of Diagnosis (M=Malignant, B=Benign)')
plt.show()

In [None]:
# Convert the diagnosis labels to integers so they can take part in the
# correlation analysis. LabelEncoder numbers classes in sorted label order,
# so with the labels 'B' and 'M' the mapping is B -> 0, M -> 1.
le = LabelEncoder()
le.fit(df['diagnosis'])
df['diagnosis'] = le.transform(df['diagnosis'])

# Pairwise correlations between all columns, drawn as an annotated heatmap
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(20, 18))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## 3. Data Preprocessing

In [None]:
# Split features and target
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']

# Train-test split (80% / 20%, seeded for reproducibility).
# Stratifying on y keeps the benign/malignant ratio the same in both subsets,
# so the test accuracy is not skewed by an unlucky class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling: the scaler is fitted on the training data only and then
# applied to the test data, so no test-set statistics leak into training.
# By default the scaler returns NumPy arrays, so X_train / X_test lose their
# column names here while X itself stays a DataFrame.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Training Set Shape:", X_train.shape)
print("Test Set Shape:", X_test.shape)

## 4. Model Training and Evaluation

In [None]:
# Candidate classifiers, all trained on the same scaled training split.
# Random Forest is seeded so that its accuracy -- and the fitted model that is
# later reused for single-sample predictions -- is reproducible across runs,
# in line with the seeded train/test split.
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Test-set accuracy per model name, stored as a fraction
results = {}

print("Model Training & Evaluation:\n")
for name, model in models.items():
    # Fit on the training split, then score on the held-out test split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name} Accuracy: {acc*100:.2f}%")

## 5. Model Comparison Graph

In [None]:
# Bar chart of test accuracy per model, each bar labelled with its percentage
model_names = list(results.keys())
model_scores = list(results.values())

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=model_names, y=model_scores, palette='viridis', ax=ax)
ax.set_title('Model Accuracy Comparison')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1.1)  # Accuracy axis from 0 to 1, with headroom for the labels
for position, score in enumerate(model_scores):
    ax.text(position, score + 0.02, f"{score*100:.2f}%", ha='center', fontweight='bold')
plt.show()

## 6. Prediction System

In [12]:
# Use the Random Forest instance already fitted in the training loop as the
# model behind the prediction system (chosen because it is usually robust).
final_model = models["Random Forest"]

def predict_cancer(input_data):
    """Predict the diagnosis for a single sample of feature values.

    Relies on the module-level ``scaler`` (fitted on the training data) and
    ``final_model`` (the trained classifier).

    Args:
        input_data: A list, NumPy array or similar sequence holding the raw
            (unscaled) feature values for one sample.

    Returns:
        A human-readable message: the benign message when the model predicts
        class 0, otherwise the malignant message.

    Raises:
        ValueError: If the values cannot be converted to floats, or if the
            number of values differs from the number of features the scaler
            was fitted on.
    """
    # Coerce to a float array and shape it as a single row (1, n_features)
    features = np.asarray(input_data, dtype=float).reshape(1, -1)

    # Fail early with a clear message instead of an error deep inside sklearn
    expected_count = getattr(scaler, "n_features_in_", None)
    if expected_count is not None and features.shape[1] != expected_count:
        raise ValueError(
            f"Expected {expected_count} feature values, got {features.shape[1]}"
        )

    # If the scaler was fitted on a DataFrame it remembers the column names;
    # passing matching names avoids sklearn's "X does not have valid feature
    # names" warning.
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        features = pd.DataFrame(features, columns=feature_names)

    # Standardize the input with the same scaler that was fitted on training data
    std_data = scaler.transform(features)

    prediction = final_model.predict(std_data)

    if prediction[0] == 0:
        return "The person does not have Breast Cancer (Benign)"
    return "The person has Breast Cancer (Malignant)"


# Demo: run the predictor on the first row of the dataset and compare the
# result with that row's known label.
# Note: in a real scenario you would pass new measurements here instead.
print("--- Prediction Test ---")
first_row = X.iloc[0]
sample_input = first_row.values  # Feature values of the first row
print("Input values:", sample_input)

result = predict_cancer(sample_input)
print("Prediction Result:", result)

actual_label = "Malignant" if y.iloc[0] == 1 else "Benign"
print("Actual Label:", actual_label)

--- Prediction Test ---
Input values: [1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]
Prediction Result: The person has Breast Cancer (Malignant)
Actual Label: Malignant




In [11]:
# Feature values supplied by the user for one sample (30 numbers, ten per line;
# presumably in the same column order as the training features -- confirm)
user_input_values = [
    10.75, 14.97, 68.26, 355.3, 0.07793, 0.05139, 0.02251, 0.007875, 0.1399, 0.05688,
    0.2525, 1.239, 1.806, 17.74, 0.006547, 0.01781, 0.02018, 0.005612, 0.01671, 0.00236,
    11.95, 20.72, 77.79, 441.2, 0.1076, 0.1223, 0.09755, 0.03413, 0.23, 0.06769,
]

print("--- User Input Prediction ---")
try:
    user_result = predict_cancer(user_input_values)
except Exception as e:
    # Best-effort demo cell: report the problem instead of stopping the notebook
    print("Error in prediction:", e)
    print("Please ensure the length of input values matches the number of features trained on (30).")
else:
    print("Prediction Result for User Input:", user_result)

--- User Input Prediction ---
Prediction Result for User Input: The person does not have Breast Cancer (Benign)


