In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
label_encoder = LabelEncoder()
file_path = r"D:\Coding_Dev\College\SEM_6\ML\Assets\breast_cancer_wisconsin_original\breast-cancer-wisconsin.data"
columns = ["Feature0","Feature1", "Feature2", "Feature3", "Feature4", "Feature5", "Feature6", "Feature7", "Feature8", "Feature9", "Class"]
data = pd.read_csv(file_path, header=None, sep=",", names=columns)
data.drop("Feature0", axis=1, inplace=True)
data.replace('?', pd.NA, inplace=True)
data.dropna(inplace=True)

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [58]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

In [59]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors as needed
knn_classifier.fit(train_data.drop("Class", axis=1), train_data["Class"])

In [60]:
from sklearn.tree import DecisionTreeClassifier

tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(train_data.drop("Class", axis=1), train_data["Class"])

In [61]:
from sklearn.naive_bayes import GaussianNB

nb_classifier = GaussianNB()
nb_classifier.fit(train_data.drop("Class", axis=1), train_data["Class"])


In [62]:
from sklearn.metrics import confusion_matrix

# Define a function to calculate and print confusion matrix
def evaluate_classifier(classifier, test_data):
    X_test = test_data.drop("Class", axis=1)
    y_test = test_data["Class"]
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    return cm

# Evaluate each classifier
knn_cm = evaluate_classifier(knn_classifier, test_data)
tree_cm = evaluate_classifier(tree_classifier, test_data)
nb_cm = evaluate_classifier(nb_classifier, test_data)

# Print confusion matrices
print("Confusion Matrix for K Nearest Neighbor:")
print(knn_cm)
print("\nConfusion Matrix for Univariate Decision Tree:")
print(tree_cm)
print("\nConfusion Matrix for Naive Bayes Gaussian Classifier:")
print(nb_cm)

Confusion Matrix for K Nearest Neighbor:
[[125   2]
 [  7  71]]

Confusion Matrix for Univariate Decision Tree:
[[124   3]
 [  9  69]]

Confusion Matrix for Naive Bayes Gaussian Classifier:
[[123   4]
 [  3  75]]


In [63]:
# Function to calculate and print evaluation metrics
def evaluate_metrics(classifier_name, y_true, y_pred, positive_label):
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label=positive_label)
    recall = recall_score(y_true, y_pred, pos_label=positive_label)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    f1 = f1_score(y_true, y_pred, pos_label=positive_label)

    print(f"\nMetrics for {classifier_name}:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("Specificity:", specificity)
    print("F1-Score:", f1)

#Accuracy: (TP + TN) / (TP + TN + FP + FN) - Measures overall correctness of predictions.

# Precision (Positive Predictive Value): TP / (TP + FP) - Measures how many of the predicted positive cases were actually positive.

# Recall (Sensitivity, True Positive Rate): TP / (TP + FN) - Measures how many of the actual positive cases were correctly predicted by the classifier.

# Specificity (True Negative Rate): TN / (TN + FP) - Measures how many of the actual negative cases were correctly predicted by the classifier.

# F1-Score: 2 * (Precision * Recall) / (Precision + Recall) - A harmonic mean of precision and recall, useful for balancing the trade-off between false positives and false negatives.

positive_label = 2

evaluate_metrics("K Nearest Neighbor", y_true_knn, y_pred_knn, positive_label)
evaluate_metrics("Univariate Decision Tree", y_true_tree, y_pred_tree, positive_label)
evaluate_metrics("Naïve Bayes Gaussian Classifier", y_true_nb, y_pred_nb, positive_label)



Metrics for K Nearest Neighbor:
Accuracy: 0.9560975609756097
Precision: 0.946969696969697
Recall: 0.984251968503937
Specificity: 0.984251968503937
F1-Score: 0.9652509652509653

Metrics for Univariate Decision Tree:
Accuracy: 0.9317073170731708
Precision: 0.9124087591240876
Recall: 0.984251968503937
Specificity: 0.984251968503937
F1-Score: 0.9469696969696969

Metrics for Naïve Bayes Gaussian Classifier:
Accuracy: 0.9658536585365853
Precision: 0.9761904761904762
Recall: 0.968503937007874
Specificity: 0.968503937007874
F1-Score: 0.9723320158102767


In [64]:
misclassified_samples = []

def find_misclassified_samples(classifier, test_data):
    X_test = test_data.drop("Class", axis=1)
    y_test = test_data["Class"]
    y_pred = classifier.predict(X_test)
    print(f"Total indices: {len(y_test)}")
    for i in range(len(y_test)):
        if y_test.iloc[i] != y_pred[i]:
            misclassified_samples.append({
                "Index": i,
                "True Class": y_test.iloc[i],
                "Predicted Class": y_pred[i]
            })
    return len(misclassified_samples)

# Find misclassified samples for each classifier
print("KNN Classifier:")
knn_mis_samples = find_misclassified_samples(knn_classifier, test_data)
print(f"Misclassified samples in KNN Classifier: {knn_mis_samples}\n")
print("Univariate decision tree Classifier:")
tree_mis_samples = find_misclassified_samples(tree_classifier, test_data)
print(f"Misclassified samples in Univariate Decision Tree Classifier: {tree_mis_samples - knn_mis_samples}\n")
print("Naive Bayes Classifier:")
nb_mis_samples = find_misclassified_samples(nb_classifier, test_data)
print(f"Misclassified samples in Naive Bayes Classifier: {nb_mis_samples - tree_mis_samples}\n")

# Print misclassified samples
print("\nMisclassified Samples:")
for sample in misclassified_samples:
    print(sample)

KNN Classifier:
Total indices: 205
Misclassified samples in KNN Classifier: 9

Univariate decision tree Classifier:
Total indices: 205
Misclassified samples in Univariate Decision Tree Classifier: 12

Naive Bayes Classifier:
Total indices: 205
Misclassified samples in Naive Bayes Classifier: 7


Misclassified Samples:
{'Index': 14, 'True Class': 4, 'Predicted Class': 2}
{'Index': 56, 'True Class': 4, 'Predicted Class': 2}
{'Index': 67, 'True Class': 4, 'Predicted Class': 2}
{'Index': 93, 'True Class': 4, 'Predicted Class': 2}
{'Index': 97, 'True Class': 2, 'Predicted Class': 4}
{'Index': 105, 'True Class': 4, 'Predicted Class': 2}
{'Index': 145, 'True Class': 4, 'Predicted Class': 2}
{'Index': 170, 'True Class': 4, 'Predicted Class': 2}
{'Index': 186, 'True Class': 2, 'Predicted Class': 4}
{'Index': 14, 'True Class': 4, 'Predicted Class': 2}
{'Index': 33, 'True Class': 4, 'Predicted Class': 2}
{'Index': 56, 'True Class': 4, 'Predicted Class': 2}
{'Index': 78, 'True Class': 4, 'Predicte

In [65]:
# Create sets to store indices of misclassified samples by each classifier
knn_misclassified_indices = set(sample["Index"] for sample in misclassified_samples if sample["Predicted Class"] != sample["True Class"] and sample["Predicted Class"] == 2)
tree_misclassified_indices = set(sample["Index"] for sample in misclassified_samples if sample["Predicted Class"] != sample["True Class"] and sample["Predicted Class"] == 2)
nb_misclassified_indices = set(sample["Index"] for sample in misclassified_samples if sample["Predicted Class"] != sample["True Class"] and sample["Predicted Class"] == 2)


# Find the common misclassified indices
common_misclassified_indices = knn_misclassified_indices.intersection(tree_misclassified_indices, nb_misclassified_indices)

common_misclassified_list = []
for sample in misclassified_samples:
    if sample["Index"] in common_misclassified_indices:
        common_misclassified_list.append(sample)

# Print common misclassified samples
print("Common Misclassified Samples by All Classifiers:")
for sample in common_misclassified_list:
    print(f"Index: {sample['Index']}, True Class: {sample['True Class']}, Predicted Class: {sample['Predicted Class']}")


Common Misclassified Samples by All Classifiers:
Index: 14, True Class: 4, Predicted Class: 2
Index: 56, True Class: 4, Predicted Class: 2
Index: 67, True Class: 4, Predicted Class: 2
Index: 93, True Class: 4, Predicted Class: 2
Index: 105, True Class: 4, Predicted Class: 2
Index: 145, True Class: 4, Predicted Class: 2
Index: 170, True Class: 4, Predicted Class: 2
Index: 14, True Class: 4, Predicted Class: 2
Index: 33, True Class: 4, Predicted Class: 2
Index: 56, True Class: 4, Predicted Class: 2
Index: 78, True Class: 4, Predicted Class: 2
Index: 81, True Class: 4, Predicted Class: 2
Index: 93, True Class: 4, Predicted Class: 2
Index: 105, True Class: 4, Predicted Class: 2
Index: 112, True Class: 4, Predicted Class: 2
Index: 145, True Class: 4, Predicted Class: 2
Index: 14, True Class: 4, Predicted Class: 2
Index: 93, True Class: 4, Predicted Class: 2
Index: 105, True Class: 4, Predicted Class: 2


In [73]:
filtered_data = data[data['Class'].isin([2])]
num_rows_with_class_2 = len(filtered_data)
print("Number of Rows with Class Label 2:", num_rows_with_class_2)

filtered_data = data[data['Class'].isin([4])]
num_rows_with_class_4 = len(filtered_data)
print("Number of Rows with Class Label 4:", num_rows_with_class_4)

Number of Rows with Class Label 2: 444
Number of Rows with Class Label 4: 239
