Phase 1

Importing Libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
%pip install seaborn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB

Loading Dataset


In [None]:
# Read the breast-cancer table from the working directory and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer='breast-cancer.csv')
print(dataset.head(n=5))

Data Preprocessing

In [None]:
# 1 Removing the ID column.
# The frame is rebound (not mutated in place) and errors='ignore' is passed so that
# re-running this cell after 'id' is already gone no longer raises a KeyError.
dataset = dataset.drop(columns='id', errors='ignore')

# 2 Checking for missing values: prints the number of nulls in every column
print(dataset.isnull().sum())

In [None]:
# 3 Encoding the categorical data: the 'diagnosis' labels are replaced by integer codes.
# The fitted encoder is kept so the original labels remain available via `classes_`.
label_encoder = LabelEncoder()
label_encoder.fit(dataset['diagnosis'])
dataset['diagnosis'] = label_encoder.transform(dataset['diagnosis'])
print(dataset['diagnosis'])

In [None]:
# 4 Feature scaling: standardise every column except the target.
# NOTE(review): the scaler is fitted on all rows, before the train/test split done in
# Phase 2, so test rows contribute to the scaling statistics - consider fitting on the
# training split only.
features = dataset.columns.drop('diagnosis')
scaler = StandardScaler()
scaler.fit(dataset[features])
dataset[features] = scaler.transform(dataset[features])
print(dataset.head())

Detecting Outliers

In [43]:
# Calculating the Interquartile Range of every feature column.
# NOTE: `Q2` holds the 75th percentile (i.e. the third quartile) despite its name; the
# name is kept because the outlier-removal cell refers to it.
Q1, Q2 = (dataset[features].quantile(q) for q in (0.25, 0.75))
IQR = Q2 - Q1

In [None]:
# Removing the outliers (Tukey fences: anything beyond 1.5 * IQR outside the quartiles).
# `Q2` is the 75th percentile computed in the previous cell.
feature_values = dataset[features]
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q2 + 1.5 * IQR

# Reduce the boolean mask itself with .any(axis=1). The previous version indexed the
# frame with the mask first and reduced the *masked values*, so a row was flagged only
# if an out-of-range value also happened to be truthy (non-zero).
outliers = ((feature_values < lower_fence) | (feature_values > upper_fence)).any(axis=1)
cleaned_data = dataset[~outliers]
print(f'Original data size: {len(dataset)}, Cleaned data size: {len(cleaned_data)}')
print('no of outliers:', len(dataset) - len(cleaned_data))

Visualization

In [None]:
# Distribution of every feature in the cleaned data: one 30-bin histogram with a KDE
# overlay per feature, each in its own figure.
for feature in features:
    plt.figure(figsize=(10, 4))
    sns.histplot(cleaned_data[feature], kde=True, bins=30)
    plt.gca().set(title=f'Distribution of {feature}', xlabel=feature, ylabel='Frequency')
    plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of the cleaned data
correlation_matrix = cleaned_data.corr()
plt.figure(figsize=(20, 15))
sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm')
plt.gca().set_title('Correlation Matrix')
plt.show()

In [None]:
# Scatter plot to visualize relationships between two variables in the cleaned dataset.
# The encoded target is mapped to readable names *before* plotting so that seaborn builds
# the legend (entries, colours and the 'Diagnosis' title) itself. Relabelling afterwards
# with plt.legend(labels=[...]) does not reliably keep labels and colours matched.
# NOTE(review): assumes the encoder produced 0 = benign and 1 = malignant - confirm
# against label_encoder.classes_.
scatter_data = cleaned_data.assign(
    Diagnosis=cleaned_data['diagnosis'].map({0: 'Benign', 1: 'Malignant'})
)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=scatter_data, x='area_mean', y='texture_mean',
                hue='Diagnosis', hue_order=['Benign', 'Malignant'])
plt.title('Relationship Between Area Mean and Texture Mean (Cleaned)')
plt.xlabel('Area Mean')
plt.ylabel('Texture Mean')
plt.show()


Phase 2

Importing Libraries

In [66]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import auc, confusion_matrix, roc_curve
from sklearn.metrics import silhouette_score
from sklearn.metrics._plot.confusion_matrix import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn_extra.cluster import KMedoids


Splitting the Dataset

In [53]:
# Separate the target from the predictors, then hold out 20% of the rows for testing
# (fixed seed so the split is reproducible).
y = cleaned_data['diagnosis']
X = cleaned_data.drop(columns=['diagnosis'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Decision Tree

In [54]:


def train_decision_tree(X_train, X_test, y_train, y_test):
    """Tune, evaluate and visualise an entropy-based decision tree.

    A 5-fold, accuracy-scored grid search over ``max_depth``, ``min_samples_split``
    and ``min_samples_leaf`` is fitted on the training split. The best estimator is
    then evaluated on the test split: accuracy, precision, recall and F1 are printed,
    and four figures are shown (ROC curve, confusion matrix, the fitted tree and the
    feature importances).

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables; ``X_train.columns`` provides the feature names used in plots.
    y_train, y_test : array-like
        Targets for the two splits. NOTE(review): the class names passed to
        ``plot_tree`` presume class 0 is benign and class 1 is malignant - confirm.

    Returns
    -------
    DecisionTreeClassifier
        The best estimator found by the grid search.
    """
    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'max_depth': [None, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Initialize Decision Tree classifier
    clf = DecisionTreeClassifier(random_state=42, criterion='entropy')

    # Initialize GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

    # Perform hyperparameter tuning
    grid_search.fit(X_train, y_train)

    # Get the best parameters and best estimator
    best_params = grid_search.best_params_
    best_clf = grid_search.best_estimator_

    # Print best parameters
    print("Best Parameters:", best_params)

    # Predict on the test set
    y_pred = best_clf.predict(X_test)

    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Print performance metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    # Plot ROC curve from the predicted probability of the second class
    y_pred_proba = best_clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # Plot decision tree. The column Index is converted to a plain list because some
    # scikit-learn releases validate `feature_names` strictly and reject a pandas Index.
    plt.figure(figsize=(15, 10))
    plot_tree(best_clf, filled=True, feature_names=list(X_train.columns),
              class_names=['Benign', 'Malignant'])
    plt.show()

    # Feature importance, most important feature first
    feature_importance = best_clf.feature_importances_
    sorted_indices = np.argsort(feature_importance)[::-1]
    sorted_features = list(X_train.columns[sorted_indices])
    sorted_importance = feature_importance[sorted_indices]

    plt.figure(figsize=(10, 8))
    # `palette` without `hue` is deprecated in recent seaborn. Colouring by the feature
    # name gives the same one-colour-per-bar result; dodge=False keeps full-width bars.
    ax = sns.barplot(x=sorted_importance, y=sorted_features, hue=sorted_features,
                     dodge=False, palette='viridis')
    # Older seaborn adds a (redundant) legend for the hue levels; drop it if present.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    plt.xlabel('Feature Importance')
    plt.ylabel('Features')
    plt.title('Feature Importance Plot')
    plt.show()

    # Return best classifier
    return best_clf

In [None]:
# Tune, evaluate and plot the decision tree; keep the tuned estimator for later use.
best_clf = train_decision_tree(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

KNN Classifier

In [57]:
def knn(X_train, X_test, y_train, y_test):
    """Tune a k-nearest-neighbours classifier and report how it does on the test split.

    Steps, in order: 5-fold accuracy-scored grid search on the training split, print of
    the winning parameters, confusion-matrix figure, ROC-curve figure, learning-curve
    figure (5-fold, accuracy) for the winning estimator, and finally a print of
    accuracy, precision, recall and F1 on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for evaluation.
    y_train, y_test
        Matching targets. NOTE(review): the 'B'/'M' tick labels and the use of
        ``predict_proba(...)[:, 1]`` presume exactly two classes - confirm.

    Returns
    -------
    tuple
        ``(best_knn, roc_auc)``: the best estimator from the search and the area under
        its test-set ROC curve.
    """
    # Candidate hyperparameters explored by the grid search
    search_space = {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2],
        'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
    }

    tuner = GridSearchCV(KNeighborsClassifier(), search_space, cv=5, scoring='accuracy')
    tuner.fit(X_train, y_train)
    best_knn = tuner.best_estimator_
    print("Best Parameters:", tuner.best_params_)

    # Test-set predictions drive both the confusion matrix and the printed metrics
    predictions = best_knn.predict(X_test)

    # Confusion matrix rendered as an annotated image
    matrix = confusion_matrix(y_test, predictions)
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    class_positions = np.arange(len(np.unique(y_test)))
    for place_ticks in (plt.xticks, plt.yticks):
        place_ticks(class_positions, ['B', 'M'])

    # Cell counts: white text on the darker half of the colour range, black otherwise
    cutoff = matrix.max() / 2.
    for (row, col), count in np.ndenumerate(matrix):
        text_colour = "black" if count <= cutoff else "white"
        plt.text(col, row, format(count, 'd'), ha="center", va="center", color=text_colour)

    plt.tight_layout()
    plt.show()

    # ROC curve and its area, from the predicted probability of the second class
    positive_scores = best_knn.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Learning curve of the tuned estimator (mean +/- one standard deviation over folds)
    sizes, fit_scores, cv_scores = learning_curve(best_knn, X_train, y_train, cv=5, scoring='accuracy')
    fit_mean, fit_std = fit_scores.mean(axis=1), fit_scores.std(axis=1)
    cv_mean, cv_std = cv_scores.mean(axis=1), cv_scores.std(axis=1)

    plt.figure()
    plt.title("Learning Curve")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std, alpha=0.1, color="r")
    plt.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std, alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, cv_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()

    # Test-set metrics: all four are computed first, then printed
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    return best_knn, roc_auc

In [None]:
# knn() returns a (best estimator, ROC AUC) tuple; unpack it so that `knn_classifier`
# is the fitted estimator itself rather than the whole tuple.
knn_classifier, knn_roc_auc = knn(X_train, X_test, y_train, y_test)

Naive Bayes


In [60]:


def train_naive_bayes(X_train, X_test, y_train, y_test):
    """Fit a Gaussian Naive Bayes classifier and report its test-set performance.

    The classifier is fitted on the training split with default settings. Its test-set
    confusion matrix and a bar chart of accuracy, precision, recall and F1 are shown,
    after which the same four metrics are printed.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for evaluation.
    y_train, y_test
        Matching targets. NOTE(review): the 'Benign'/'Malignant' tick labels presume
        two classes ordered benign, malignant - confirm.

    Returns
    -------
    GaussianNB
        The fitted classifier.
    """
    # Fit on the training split (fit() returns the estimator), then predict the test split
    nb_classifier = GaussianNB().fit(X_train, y_train)
    predictions = nb_classifier.predict(X_test)

    # Test-set metrics, reused by the bar chart and the final printout
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    # Confusion matrix rendered as an annotated image
    matrix = confusion_matrix(y_test, predictions)
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    class_positions = np.arange(len(np.unique(y_test)))
    for place_ticks in (plt.xticks, plt.yticks):
        place_ticks(class_positions, ['Benign', 'Malignant'])

    # Cell counts: white text on the darker half of the colour range, black otherwise
    cutoff = matrix.max() / 2.
    for (row, col), count in np.ndenumerate(matrix):
        text_colour = "black" if count <= cutoff else "white"
        plt.text(col, row, format(count, 'd'), ha="center", va="center", color=text_colour)

    plt.tight_layout()
    plt.show()

    # Bar chart of the four metrics on a fixed 0-1 axis
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    metric_values = [accuracy, precision, recall, f1]
    plt.figure(figsize=(8, 6))
    plt.bar(metric_names, metric_values, color=['blue', 'green', 'red', 'purple'])
    plt.title('Performance Metrics')
    plt.xlabel('Metrics')
    plt.ylabel('Score')
    plt.ylim(0, 1)
    plt.show()

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    return nb_classifier



In [None]:
# Fit and evaluate Gaussian Naive Bayes; keep the fitted classifier for later use.
nb_classifier = train_naive_bayes(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)


SVM

In [62]:


def svm(X_train, X_test, y_train, y_test):
    """Tune a support-vector classifier and report its test-set performance.

    A 5-fold, accuracy-scored grid search over ``C``, ``gamma`` and ``kernel`` is fitted
    on the training split. The winning parameters are printed, followed by accuracy,
    precision, recall and F1 on the test split, and the confusion matrix is shown.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for evaluation.
    y_train, y_test
        Matching targets. NOTE(review): the 'B'/'M' tick labels presume two classes -
        confirm.

    Returns
    -------
    SVC
        The best estimator found by the grid search.
    """
    # Candidate hyperparameters explored by the grid search
    candidate_params = dict(
        C=[0.1, 1, 10, 100],
        gamma=[1, 0.1, 0.01, 0.001],
        kernel=['rbf', 'linear', 'poly', 'sigmoid'],
    )

    searcher = GridSearchCV(SVC(), candidate_params, cv=5, scoring='accuracy')
    searcher.fit(X_train, y_train)
    best_svm = searcher.best_estimator_
    print("Best Parameters:", searcher.best_params_)

    # Test-set predictions and metrics: all four are computed first, then printed
    predictions = best_svm.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    # Confusion matrix rendered as an annotated image
    matrix = confusion_matrix(y_test, predictions)
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    class_positions = np.arange(len(np.unique(y_test)))
    for place_ticks in (plt.xticks, plt.yticks):
        place_ticks(class_positions, ['B', 'M'])

    # Cell counts: white text on the darker half of the colour range, black otherwise
    cutoff = matrix.max() / 2.
    for (row, col), count in np.ndenumerate(matrix):
        text_colour = "black" if count <= cutoff else "white"
        plt.text(col, row, format(count, 'd'), ha="center", va="center", color=text_colour)

    plt.tight_layout()
    plt.show()

    return best_svm



In [None]:
# Tune and evaluate the SVM; keep the tuned estimator for later use.
svm_classifier = svm(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)


Logistic Regression

In [64]:



def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a logistic-regression classifier, print its test metrics and plot diagnostics.

    The model is fitted on the training split with ``max_iter=10000``. Accuracy,
    precision, recall and F1 on the test split are printed, treating "Malignant"
    (every label other than 0) as the positive class. Two figures follow: a
    side-by-side confusion matrix and ROC curve, then a bar chart of the fitted
    coefficients labelled with the training column names.

    Earlier revisions drew the confusion matrix and ROC curve twice (standalone and
    again side by side) and used a hard-coded list of feature names whenever the model
    had exactly five coefficients; both are removed here.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables; ``X_train.columns`` labels the coefficient bars.
    y_train, y_test : array-like
        Targets for the two splits, where 0 is reported as "Benign".

    Returns
    -------
    LogisticRegression
        The fitted model.
    """
    model = LogisticRegression(random_state=0, max_iter=10000)

    # Train the classifier on the training data
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Readable class names for the report: 0 -> "Benign", anything else -> "Malignant"
    y_true_str = ["Benign" if label == 0 else "Malignant" for label in y_test]
    y_pred_str = ["Benign" if label == 0 else "Malignant" for label in y_pred]

    # Evaluate the performance of the classifier
    print("Accuracy:", accuracy_score(y_true_str, y_pred_str))
    print("Precision:", precision_score(y_true_str, y_pred_str, pos_label="Malignant"))
    print("Recall:", recall_score(y_true_str, y_pred_str, pos_label="Malignant"))
    print("F1 Score:", f1_score(y_true_str, y_pred_str, pos_label="Malignant"))

    # Confusion matrix with a fixed class order so rows/columns match the tick labels
    class_names = ["Benign", "Malignant"]
    cm = confusion_matrix(y_true_str, y_pred_str, labels=class_names)

    # ROC curve from the signed distance to the decision boundary
    y_scores = model.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)

    # Confusion matrix and ROC curve side by side, each drawn exactly once
    fig, (ax_cm, ax_roc) = plt.subplots(1, 2, figsize=(12, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names,
                cbar=False, annot_kws={"fontsize": 12}, linewidths=.5, linecolor='lightgray', ax=ax_cm)
    ax_cm.set_title('Confusion Matrix', fontsize=14)
    ax_cm.set_xlabel('Predicted label', fontsize=12)
    ax_cm.set_ylabel('True label', fontsize=12)
    ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax_roc.set_xlim([0.0, 1.0])
    ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax_roc.legend(loc="lower right")
    plt.show()

    # Coefficient plot: bars are always labelled with the columns the model was fitted on
    coefs = model.coef_.ravel()
    plt.figure(figsize=(10, 6))
    plt.bar(list(X_train.columns), coefs)
    plt.title('Feature Importance')
    plt.ylabel('Coefficient Value')
    plt.xticks(rotation=45)
    plt.show()

    return model


In [None]:
# Keep the fitted model, like the other model cells do, instead of discarding it
# (binding it also stops the notebook from echoing the estimator's repr).
logreg_classifier = logistic_regression(X_train, X_test, y_train, y_test)


Random Forest

In [None]:

def random_forest(X_train, X_test, y_train, y_test, n_jobs=-1):
    """Tune a random-forest classifier and report its test-set performance.

    A 5-fold, accuracy-scored grid search over ``n_estimators``, ``max_depth``,
    ``min_samples_split`` and ``min_samples_leaf`` (108 candidates) is fitted on the
    training split. The winning parameters are printed, followed by accuracy,
    precision, recall and F1 on the test split; the confusion matrix and the sorted
    feature importances are then plotted.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables; ``X_train.columns`` labels the feature-importance bars.
    y_train, y_test : array-like
        Targets for the two splits. NOTE(review): the 'B'/'M' tick labels presume
        two classes - confirm.
    n_jobs : int or None, default -1
        Forwarded to ``GridSearchCV``. ``-1`` runs the candidate fits on all available
        cores; ``None`` restores the previous serial search. The forest uses a fixed
        ``random_state``, so the chosen model does not depend on this value.

    Returns
    -------
    RandomForestClassifier
        The best estimator found by the grid search.
    """
    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Initialize Random Forest classifier
    rf = RandomForestClassifier(random_state=42)

    # Initialize GridSearchCV for hyperparameter tuning (candidate fits run in parallel)
    grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=n_jobs)

    # Perform hyperparameter tuning
    grid_search.fit(X_train, y_train)

    # Get the best parameters and best estimator
    best_params = grid_search.best_params_
    best_rf = grid_search.best_estimator_

    # Print best parameters
    print("Best Parameters:", best_params)

    # Predict on the test set
    y_pred = best_rf.predict(X_test)

    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Print performance metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    # Generate confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Plot confusion matrix as an annotated image
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    tick_marks = np.arange(len(np.unique(y_test)))
    plt.xticks(tick_marks, ['B', 'M'])
    plt.yticks(tick_marks, ['B', 'M'])

    # Cell counts: white text on the darker half of the colour range, black otherwise
    fmt = 'd'
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 ha="center", va="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.show()

    # Plot feature importance, most important feature first
    importances = best_rf.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(10, 6))
    plt.bar(range(X_train.shape[1]), importances[indices])
    plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
    plt.title("Feature Importance")
    plt.show()

    # Return best classifier
    return best_rf

In [None]:
# Keep the tuned forest, like the other model cells do, instead of discarding it
# (binding it also stops the notebook from echoing the estimator's repr).
rf_classifier = random_forest(X_train, X_test, y_train, y_test)