# BREAST CANCER DETECTION USING ADABOOST

## Import necessary libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

## Load the dataset

In [None]:
# Location of the Wisconsin Diagnostic Breast Cancer (WDBC) data file.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/wdbc.data"
)

# The file has no header row: the first two fields are the sample ID and the
# diagnosis label, followed by 30 feature columns that are named generically.
feature_names = [f'Feature_{i}' for i in range(1, 31)]
columns = ['ID', 'Diagnosis', *feature_names]
data = pd.read_csv(url, names=columns, header=None)

data

## Display the first 5 and last 5 rows, shape, and basic info of the dataset

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

In [None]:
 data.tail() 

In [None]:
 {data.shape}

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of `data`.
data.info() 

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Same summary statistics, transposed so that each feature is one row.
data.describe().transpose()

## Check for duplicate rows

In [None]:
# Quantify fully duplicated rows (every column equal to an earlier row).
duplicate_flags = data.duplicated()
num_total_rows = data.shape[0]
num_duplicate_rows = duplicate_flags.sum()
percentage_duplicates = (num_duplicate_rows / num_total_rows) * 100

print(f"Number of Rows with Duplicates: {num_duplicate_rows}")
print(f"Percentage of Duplicate Rows: {percentage_duplicates:.2f}%")

## Count unique values for each column

In [None]:
# Number of distinct values in every column (computed column by column).
unique_counts = data.apply(pd.Series.nunique)
print(unique_counts)

## Drop the identifier column ('ID'), which carries no predictive information

In [None]:
# Remove the sample identifier in place; it is not a feature.
data.drop(columns=['ID'], inplace=True)
print(data.columns)

## Check for Missing Values

In [None]:
# Count the missing entries per column.
missing_per_column = data.isna().sum()
print("Missing values in each column:\n", missing_per_column)

## Handle Missing Values: Filling missing values only in numeric columns

In [None]:
# Impute missing values in the numeric columns with each column's mean.
# Non-numeric columns (such as the diagnosis label) are left untouched.
numeric_cols = data.select_dtypes(include=[np.number]).columns
numeric_part = data[numeric_cols]
column_means = numeric_part.mean()
data[numeric_cols] = numeric_part.fillna(column_means)

## Visualize the count of Benign vs Malignant cases before splitting, outlier removal and scaling

In [None]:
# Plot the class balance of the raw dataset.
# `Diagnosis` holds the string labels 'B' and 'M' (it is never encoded to 0/1),
# so the bar order is pinned explicitly. Without `order`, the bars follow
# whatever order seaborn infers from the data, and the hard-coded tick labels
# below could end up attached to the wrong bars.
diagnosis_order = ['B', 'M']

plt.figure(figsize=(8, 6))
sns.countplot(x='Diagnosis', data=data, order=diagnosis_order)
plt.title('Count of Benign and Malignant Cases')
plt.xlabel('Diagnosis')
plt.ylabel('Count')
# Tick 0 is 'B' and tick 1 is 'M' because of `diagnosis_order` above.
plt.xticks(ticks=[0, 1], labels=['Benign', 'Malignant'])
plt.show()

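Labels: the 'Diagnosis' column is kept as the original strings ('M' for Malignant, 'B' for Benign). No numeric encoding is applied, so the evaluation metrics below refer to the malignant class with `pos_label='M'`.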

## Splitting Data

In [None]:
# Separate the feature matrix from the target labels.
y = data['Diagnosis']
X = data.drop(columns='Diagnosis')

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of every part of the split.
for part_name, part in (
    ("Training features", X_train),
    ("Testing features", X_test),
    ("Training labels", y_train),
    ("Testing labels", y_test),
):
    print(f"{part_name} shape: {part.shape}")

## Outlier Detection: Using IQR

In [None]:
def detect_outliers_iqr(df, columns=None):
    """
    Detects outliers in a dataframe using the IQR method.

    A row is an outlier when, in at least one of the checked columns, its
    value lies strictly below Q1 - 1.5 * IQR or strictly above Q3 + 1.5 * IQR,
    where Q1/Q3 are that column's 25th/75th percentiles and IQR = Q3 - Q1.

    Parameters:
    - df (pd.DataFrame): The dataset to process.
    - columns (list, optional): List of specific columns to check for outliers.
                                If None, checks all numeric columns.

    Returns:
    - pd.DataFrame: Rows from the dataframe that are identified as outliers.
                    Each flagged row appears exactly once, keeps its original
                    index label, and rows are returned in their original order.
    """
    # If no specific columns are provided, select all numeric columns
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    # One flag per row, OR-ed across columns. Tracking rows by position (rather
    # than concatenating frames and calling drop_duplicates) ensures that two
    # distinct rows with identical values are both reported.
    is_outlier = np.zeros(len(df), dtype=bool)

    for column in columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # Flag the rows that fall outside the fences for the current column
        is_outlier |= ((values < lower_bound) | (values > upper_bound)).to_numpy()

    return df[is_outlier]



In [None]:
# Detect outliers on the training features only. The train/test split has
# already been made at this point, so dropping rows from `data` alone (as this
# cell originally did) never reached X_train / y_train and had no effect on the
# model. Computing the IQR fences from the training split also keeps
# information from the held-out test rows out of the preprocessing.
outliers = detect_outliers_iqr(X_train, columns=X_train.columns)
print(f"Number of outliers detected: {len(outliers)}")

# Remove the flagged rows from the training features and labels together so
# that the two stay aligned; the test split is intentionally left untouched.
outlier_index = outliers.index
X_train = X_train.drop(index=outlier_index)
y_train = y_train.drop(index=outlier_index)

# Keep `data` in sync so that later cells that inspect or plot it reflect the removal.
data = data.drop(index=outlier_index)

# Print the size of the cleaned dataset
print(f"Number of rows after removing outliers: {data.shape[0]}")
print(f"Training rows after removing outliers: {X_train.shape[0]}")


In [None]:
# Show the dataset as it stands after the outlier-removal cell above.
print(data)

## Scaling Features

In [None]:
# Standardise the features to zero mean and unit variance.
# The scaler learns its statistics from the training split only...
scaler = StandardScaler()
scaler.fit(X_train)

# ...and the same fitted statistics are then applied to both splits, so the
# test data never influences the scaling parameters.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


### Display the first few rows of the scaled features

In [None]:
# Optional sanity check: wrap the scaled array in a DataFrame (re-using the
# training column names) and print its first rows.
scaled_preview = pd.DataFrame(X_train_scaled, columns=X_train.columns)
print(scaled_preview.head())

### Visualize the count of Benign vs Malignant cases after splitting, outlier removal and scaling

In [None]:
# Plot the class balance of `data` at this point in the notebook.
# `Diagnosis` holds the string labels 'B' and 'M' (it is never encoded to 0/1),
# so the bar order is pinned explicitly. Without `order`, the bars follow
# whatever order seaborn infers from the data, and the hard-coded tick labels
# below could end up attached to the wrong bars.
diagnosis_order = ['B', 'M']

plt.figure(figsize=(8, 6))
sns.countplot(x='Diagnosis', data=data, order=diagnosis_order)
plt.title('Count of Benign and Malignant Cases')
plt.xlabel('Diagnosis')
plt.ylabel('Count')
# Tick 0 is 'B' and tick 1 is 'M' because of `diagnosis_order` above.
plt.xticks(ticks=[0, 1], labels=['Benign', 'Malignant'])
plt.show()

Note on the IQR method used above: for each feature, Q1 is the 25th percentile and Q3 is the 75th percentile. The interquartile range (IQR) is the difference Q3 − Q1. A value is flagged as an outlier when it lies below Q1 − 1.5·IQR or above Q3 + 1.5·IQR.


## SMOTE (Synthetic Minority Over-sampling Technique)

In [None]:
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the training split with SMOTE (seeded for reproducibility).
# Only the scaled training data is resampled; the test split is left as is.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Report how many samples each class has after resampling.
class_counts = pd.Series(y_train_smote).value_counts()
print("Class distribution after SMOTE:", class_counts, sep="\n")

## Implementing Adaboost Algorithm

In [None]:
# Weak learner: a depth-1 decision tree (a "decision stump").
base_estimator = DecisionTreeClassifier(max_depth=1)

# Baseline AdaBoost ensemble of 50 stumps, seeded for reproducibility.
adaboost = AdaBoostClassifier(
    estimator=base_estimator,
    n_estimators=50,
    random_state=42,
)


### Fit the AdaBoost classifier on the training data and Make predictions on the test set

In [None]:
# Train the baseline AdaBoost model on the SMOTE-augmented (and scaled)
# training data produced in the previous section.
adaboost.fit(X_train_smote, y_train_smote)

In [None]:
# Predict labels for the scaled test split; the test data was not resampled.
y_pred = adaboost.predict(X_test_scaled)

## Evaluating Model Performance

### Accuracy

In [None]:
# Share of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

### Precision

In [None]:
# Precision with malignant ('M') as the positive class.
precision = precision_score(y_true=y_test, y_pred=y_pred, pos_label='M')
print(f"Precision (Malignant): {precision:.4f}")

### Recall

In [None]:
# Recall with malignant ('M') as the positive class.
recall = recall_score(y_true=y_test, y_pred=y_pred, pos_label='M')
print(f"Recall (Malignant): {recall:.4f}")

### F1-score

In [None]:
# F1 score with malignant ('M') as the positive class.
f1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label='M')
print(f"F1 Score (Malignant): {f1:.4f}")

### Confusion Matrix

In [None]:
# Confusion matrix with a fixed label order: row/column 0 is 'B', row/column 1 is 'M'.
class_labels = ['B', 'M']
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
print("Confusion Matrix:", conf_matrix, sep="\n")


## Hyperparameter Tuning 

### Define the Hyperparameter Grid:

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the tuning step: every combination of the values below is tried.
param_grid = dict(
    n_estimators=[50, 100, 150, 200],      # number of weak learners in the ensemble
    learning_rate=[0.01, 0.1, 0.5, 1.0],   # learning rate of the boosting procedure
)


### GridSearchCV:

In [None]:
# Fresh AdaBoost model to tune; this rebinds `adaboost`, replacing the baseline model.
adaboost = AdaBoostClassifier(estimator=base_estimator, random_state=42)

# Exhaustive search over `param_grid` with 5-fold cross-validation, scored by
# accuracy and run on all available CPU cores.
# NOTE(review): the folds are drawn from data that was already scaled and
# SMOTE-resampled as a whole, so the validation folds contain synthetic samples
# and the CV scores may be optimistic. Consider moving scaling and SMOTE into
# an imblearn Pipeline that is fitted inside each fold.
grid_search = GridSearchCV(
    estimator=adaboost,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the SMOTE-augmented training data.
grid_search.fit(X_train_smote, y_train_smote)

# Report the winning hyperparameter combination.
best_params = grid_search.best_params_
print("Best hyperparameters found by Grid Search:", best_params)


### Re-train the Model Using Best Hyperparameters:

In [None]:
# Take the model that GridSearchCV selected. The search above does not
# override `refit`, so GridSearchCV has already re-trained the best
# configuration on the complete SMOTE-augmented training set. Calling `.fit`
# again on the same data would only repeat that work, so it is omitted.
best_adaboost = grid_search.best_estimator_

# Predict on the (scaled, non-resampled) test set with the tuned model.
y_pred = best_adaboost.predict(X_test_scaled)


### Evaluate the Tuned Model:

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

# Shared settings for the binary metrics: malignant ('M') is the positive class.
binary_kwargs = dict(average='binary', pos_label='M')

# Score the tuned model's predictions against the held-out test labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **binary_kwargs)
recall = recall_score(y_test, y_pred, **binary_kwargs)
f1 = f1_score(y_test, y_pred, **binary_kwargs)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the scalar metrics to four decimals, then the confusion matrix.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")
print("Confusion Matrix:", conf_matrix, sep="\n")


### Plot the Confusion Matrix:

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the tuned model's confusion matrix as an annotated heatmap.
# Rows are the true classes and columns the predicted classes, both labelled 'B' then 'M'.
tick_labels = ['B', 'M']
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
