In [1]:
import warnings
import random
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
pd.set_option('future.no_silent_downcasting', True)
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Column labels for the WDBC file: a record ID, the diagnosis label, and thirty
# measurement columns (ten measurement names, each with suffix 1, 2 and 3).
column_names = [
    "ID", "Diagnosis", "radius1", "texture1", "perimeter1", "area1", "smoothness1", "compactness1", 
    "concavity1", "concave_points1", "symmetry1", "fractal_dimension1", 
    "radius2", "texture2", "perimeter2", "area2", "smoothness2", "compactness2", 
    "concavity2", "concave_points2", "symmetry2", "fractal_dimension2", 
    "radius3", "texture3", "perimeter3", "area3", "smoothness3", "compactness3", 
    "concavity3", "concave_points3", "symmetry3", "fractal_dimension3"
]

# The data file carries no header row (the labels are supplied here). Without
# header=None pandas would consume the first record as column labels and silently
# drop one sample; names= assigns the labels at read time instead of afterwards.
df = pd.read_csv("wdbc.data", header=None, names=column_names)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'wdbc.data'

In [None]:
# The record ID carries no predictive information, so remove it.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=["ID"], errors="ignore")

In [None]:
# Report the size of the frame after dropping the ID column.
n_rows, n_cols = df.shape
print(f"The dataframe consists of {n_rows} rows and {n_cols} columns.")

In [None]:
# Column dtypes and non-null counts: a quick check for missing or non-numeric fields.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Encode the target numerically: malignant tumour ("M") -> 1, benign tumour ("B") -> -1.
# Because `future.no_silent_downcasting` is enabled in the setup cell, `replace` leaves
# the column as object dtype; the explicit cast yields a real integer column and fails
# loudly if any value other than "M"/"B" is present.
df["Diagnosis"] = df["Diagnosis"].replace({"M": 1, "B": -1}).astype("int64")

In [None]:
# Class balance of the encoded target (1 = malignant, -1 = benign).
df["Diagnosis"].value_counts()

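##### As the counts above show, the dataset is imbalanced: 211 malignant cases (label 1) and 357 benign cases (label -1) in the run recorded here. Note that the published WDBC data has 212 malignant records, so one row appears to be lost when the file is loaded.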

In [None]:
import seaborn as sns

# Pairwise correlation between every column, including the encoded Diagnosis label.
corr_matrix = df.corr()

plt.figure(figsize = (20, 8))
sns.heatmap(corr_matrix, annot = True)
# plt.show() renders the figure; the previous plt.plot() call drew nothing and only
# echoed an empty list as the cell output.
plt.show()

In [None]:
def remove_high_corr(corr_matrix, threshold):
    """List the column pairs whose absolute correlation is at least `threshold`.

    Despite the name, nothing is removed: the function only reports candidates.
    Only cells above the main diagonal are inspected, so each pair appears once.

    Args:
        corr_matrix: DataFrame of pairwise correlations, indexed positionally.
        threshold: minimum absolute correlation for a pair to be reported.

    Returns:
        A list of ``(column_i, column_j, correlation)`` tuples in row-major order.
    """
    names = corr_matrix.columns
    n_rows, n_cols = corr_matrix.shape
    return [
        (names[row], names[col], corr_matrix.iloc[row, col])
        for row in range(n_rows)
        for col in range(row + 1, n_cols)
        if np.abs(corr_matrix.iloc[row, col]) >= threshold
    ]

# Collect the feature pairs whose absolute correlation is 0.95 or higher.
high_corr_col = remove_high_corr(corr_matrix = corr_matrix, threshold = 0.95)

In [None]:
# Print one line per highly correlated pair found above.
for pair in high_corr_col:
    c1, c2, val = pair
    print(f"The columns {c1} and {c2} has a high correlation of : {round(val, 3)}.")

In [None]:
def label_corr(corr_matrix, threshold, label="Diagnosis"):
    """List the columns whose absolute correlation with the label reaches `threshold`.

    Args:
        corr_matrix: square DataFrame of pairwise correlations that contains a
            column named `label`.
        threshold: minimum absolute correlation for a column to be reported.
        label: name of the target column; defaults to "Diagnosis" so existing
            calls behave as before.

    Returns:
        A list of ``(column_name, correlation_with_label)`` tuples in column order.
        The label column itself is never included.

    Raises:
        KeyError: if `label` is not a column of `corr_matrix`.
    """
    columns = corr_matrix.columns
    label_col = columns.get_loc(label)
    pos = []
    for i in range(corr_matrix.shape[0]):
        if i == label_col:
            # The label's correlation with itself is always 1; skip it.
            continue
        value = corr_matrix.iloc[i, label_col]
        if np.abs(value) >= threshold:
            pos.append((columns[i], value))
    return pos

# Features whose absolute correlation with the Diagnosis label is 0.75 or higher.
label_high = label_corr(corr_matrix=corr_matrix, threshold=0.75)

for entry in label_high:
    c1, val = entry
    print(f"The column {c1} has a big correlation with the label of {round(val, 3)}")

In [None]:
# Shuffle the rows of the dataset (the data, not the model). A fixed random_state keeps
# the row order, and therefore everything derived from it, reproducible across runs.
df = df.sample(frac = 1, random_state = 0).reset_index(drop = True)

# Preview only the first rows instead of dumping the whole frame.
df.head()

# SVM classifier using scikit-learn

In [None]:
# Separate the feature matrix from the target vector (both as NumPy arrays).
features = np.array(df.drop(columns="Diagnosis"))
labels = np.array(df["Diagnosis"])

In [None]:
# Scale each feature with MinMaxScaler (default output range is [0, 1]).
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()
# Alternative scaler kept for reference: scaler = StandardScaler()

# NOTE(review): the scaler is fitted on the full feature matrix, before the train/test
# split performed in a later cell, so test-set minima/maxima influence the scaling
# (data leakage). Consider fitting on the training split only and transforming the test split.
features = scaler.fit_transform(features)

### Now I will build a classifier:

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. The fixed random_state makes the split, and
# every metric reported from it, reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state = 0)

# Make sure the label vectors are numeric (values that cannot be parsed become NaN).
y_train = pd.to_numeric(y_train, errors='coerce')
y_test = pd.to_numeric(y_test, errors='coerce')

In [None]:
# Count the training samples per class.
labels_train, counts_train = np.unique(y_train, return_counts=True)

for key, value in zip(labels_train, counts_train):
    # Keep the per-class counts: freq1 for label 1, freq2 for any other label.
    # They are passed to the hand-written KNN classifier further down.
    if key != 1:
        freq2 = value
    else:
        freq1 = value
    print(f"In class {key} has {value} samples.")

In [None]:
# Count the test samples per class.
labels_test, counts_test = np.unique(y_test, return_counts=True)

for idx in range(len(labels_test)):
    key, value = labels_test[idx], counts_test[idx]
    print(f"In class {key} has {value} samples.")

In [None]:
import time
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the SVM: C from 0.01 up to (but excluding) 2 in steps of
# 0.01, crossed with four kernel types.
svm = SVC()
c_grid = np.arange(0.01, 2, 0.01)
kern_type = ["linear", "poly", "rbf", "sigmoid"]
parameters = dict(C=c_grid, kernel=kern_type)

# Time the exhaustive search; every candidate is scored with 10-fold cross-validation.
start_time = time.time()
GS = GridSearchCV(estimator=svm, param_grid=parameters, cv=10)
GS.fit(X_train, y_train)
end_time = time.time()

print(f"The best accuracy was {GS.best_score_} with the parameters {GS.best_params_}")

In [None]:
# Pull the winning hyper-parameters out of the grid-search result.
best_params = GS.best_params_
best_C, best_Kernel = best_params["C"], best_params["kernel"]

In [None]:
# Wall-clock duration of the grid search measured in the cell that ran it.
print(f"Execution time: {end_time - start_time} seconds")

In [None]:
# Refit an SVM with the best hyper-parameters. probability=True enables predict_proba,
# which the ROC analysis below needs; the probability calibration shuffles the data
# internally, so random_state is fixed to keep those probabilities reproducible.
best_svm = SVC(C = best_C, kernel = best_Kernel, probability = True, random_state = 0)
best_svm.fit(X_train, y_train)
pred_val_svm = best_svm.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Pin the class order to [-1 (benign), 1 (malignant)] and label the axes accordingly.
# Without display_labels the plot ticks read 0 and 1, which do not match the -1/1
# encoding used in this notebook.
cm_svm = confusion_matrix(y_test, pred_val_svm, labels = [-1, 1])
display = ConfusionMatrixDisplay(confusion_matrix = cm_svm,
                                 display_labels = ["Benign (-1)", "Malignant (1)"])
display.plot()
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the tuned SVM on the held-out test split.
report = classification_report(y_true=y_test, y_pred=pred_val_svm)
print(report)


In [None]:
from sklearn.metrics import roc_curve, auc

# Take the second predict_proba column as the score for the positive class
# (presumably label 1, given the -1/1 encoding used here; confirm via best_svm.classes_).
proba_matrix = best_svm.predict_proba(X_test)
pred_proba = proba_matrix[:, 1]

# ROC operating points and the area under the resulting curve.
fpr, tpr, thresholds = roc_curve(y_test, pred_proba)
roc_auc = auc(fpr, tpr)

In [None]:
# Inspect the predicted scores (second predict_proba column) for the test samples.
pred_proba

In [None]:
# Draw the ROC curve of the tuned SVM together with the chance-level diagonal.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label='ROC curve (area = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], color='red', linestyle='--')  # Diagonal reference line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
ax.grid()
plt.show()


# KNN implemented by hand

In [None]:
def gaussian_kernel(distance, bandwidth=1.0):
    """Evaluate a Gaussian kernel weight for one or more distances.

    Computes ``exp(-d**2 / (2 * bandwidth**2)) / sqrt(2 * pi)`` element-wise.

    Args:
        distance: scalar or array-like of distances.
        bandwidth: kernel width; larger values make the weights decay more slowly.

    Returns:
        A NumPy scalar or array with the same shape as `distance`.
    """
    d = np.asarray(distance)  # accept lists and scalars as well as arrays
    exponent = -(d ** 2) / (2 * bandwidth ** 2)
    normaliser = np.sqrt(2 * np.pi)
    return np.exp(exponent) / normaliser

In [None]:
def knn_altered_classify(matrix, freq1, freq2, labels, new_points, k, h):
    """Classify points with a frequency-weighted, Gaussian-kernel k-nearest-neighbour vote.

    For every query point, the Euclidean distance to each training row is multiplied by
    the frequency assigned to that row's class (`freq1` for label 1, `freq2` for any
    other label). The `k` rows with the smallest scaled distance then vote with their
    label value, each vote weighted by `gaussian_kernel` of its scaled distance.

    Args:
        matrix: training features, one row per sample.
        freq1: weight applied to distances of training rows labelled 1.
        freq2: weight applied to distances of all other training rows.
        labels: training labels (expected to be 1 / -1), one per row of `matrix`.
        new_points: iterable of query points, each with the same width as `matrix`.
        k: number of neighbours that vote; must be at least 1.
        h: bandwidth forwarded to `gaussian_kernel`.

    Returns:
        A list with one prediction per query point: 1 when the weighted vote is
        non-negative, otherwise -1.

    Raises:
        ValueError: if `matrix` and `labels` disagree on the number of samples, or
            if `k` is smaller than 1.
    """
    matrix = np.asarray(matrix, dtype=float)
    labels = np.asarray(labels)
    new_points = np.asarray(new_points, dtype=float)

    # Check input shapes
    if matrix.shape[0] != labels.shape[0]:
        raise ValueError("The number of samples in the matrix and labels must match.")
    # k = 0 selects no neighbours, which used to yield a vote of 0 and therefore the
    # prediction 1 for every point; reject it instead of returning garbage.
    if k < 1:
        raise ValueError("k must be at least 1.")

    # The per-row class weights do not depend on the query point, so build them once
    # rather than inside the loop.
    class_weights = np.where(labels == 1, freq1, freq2).astype(float)
    # Numeric copy of the labels so that object-dtype label arrays vote correctly.
    vote_values = labels.astype(float)

    ret = []
    for new_point in new_points:
        # Frequency-scaled distance from the query point to every training row.
        distances = np.linalg.norm(matrix - new_point, axis=1) * class_weights

        # Indices of the k rows with the smallest scaled distance.
        nearest_indices = np.argsort(distances)[:k]

        # Kernel weight of each selected neighbour.
        kernel_weights = gaussian_kernel(distance=distances[nearest_indices], bandwidth=h)

        # Signed, kernel-weighted vote of the selected neighbours.
        weighted_sum = np.sum(vote_values[nearest_indices] * kernel_weights)

        # Ties (a vote of exactly 0) go to class 1.
        ret.append(1 if weighted_sum >= 0 else -1)

    return ret

In [None]:
def CV_knn(df, CV, k, h):
    """Estimate the accuracy of `knn_altered_classify` with CV-fold cross-validation.

    The rows are shuffled with a fixed seed and partitioned into `CV` folds of nearly
    equal size, so every sample is used for testing exactly once. For each fold the
    class counts of the training part are passed to the classifier as `freq1`
    (label 1) and `freq2` (label -1).

    Args:
        df: DataFrame with a "Diagnosis" label column (1 / -1) and feature columns.
            The caller's frame is not modified.
        CV: number of folds; must satisfy 2 <= CV <= number of rows.
        k: number of neighbours forwarded to the classifier.
        h: kernel bandwidth forwarded to the classifier.

    Returns:
        The mean of the per-fold accuracies.

    Raises:
        ValueError: if `CV` is smaller than 2 or larger than the number of rows.
    """
    n_samples = df.shape[0]
    if CV < 2 or CV > n_samples:
        raise ValueError("CV must be between 2 and the number of rows in df.")

    shuffled = df.sample(frac=1, random_state=0).reset_index(drop=True)

    # Select the label by name and treat every other column as a feature. The previous
    # positional slice `iloc[:, :-1]` only matched the features when the label happened
    # to be the last column.
    y = pd.to_numeric(shuffled["Diagnosis"]).to_numpy().astype(int)
    X = (
        shuffled.drop(columns=["Diagnosis"])
        .apply(pd.to_numeric, errors='coerce')
        .to_numpy(dtype=float)
    )

    # array_split spreads the remainder over the first folds, so no sample is left
    # out of the evaluation when n_samples is not a multiple of CV.
    folds = np.array_split(np.arange(n_samples), CV)

    acc = 0.0
    for test_indices in folds:
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[test_indices] = False

        X_train, y_train = X[train_mask], y[train_mask]
        X_test, y_test = X[test_indices], y[test_indices]

        # Class counts of the training part, used as distance weights by the classifier.
        freq1 = int(np.sum(y_train == 1))
        freq2 = int(np.sum(y_train == -1))

        pred = np.asarray(
            knn_altered_classify(matrix=X_train, freq1=freq1, freq2=freq2, labels=y_train,
                                 new_points=X_test, k=k, h=h)
        ).astype(int)
        acc += accuracy_score(y_test, pred)

    return acc / CV


In [None]:
start = time.time()
k = 25    # largest neighbourhood size to evaluate
h = 1000  # kernel bandwidth passed to the classifier

# NOTE(review): the search runs on `df`, which is unscaled and still contains the rows
# later used as X_test, while the final evaluation uses the scaled train/test split.
# Consider tuning on the training split only.

# Cross-validated accuracy for k = 1..25; arr_GS[i] holds the score for k = i + 1.
arr_GS = []
for i in range(1, k + 1):
    arr_GS.append(CV_knn(df = df, CV = 10, k = i, h = h))

# List position of the best score; on ties the larger k wins because of ">=".
max_pos = 0
for i in range(len(arr_GS)):
    if arr_GS[i] >= arr_GS[max_pos]:
        max_pos = i

finish = time.time()
# max_pos is a 0-based list position, so the neighbour count it stands for is
# max_pos + 1 (reporting max_pos itself was off by one).
print(f"Using k = {max_pos + 1} I had the best accuracy that was: {arr_GS[max_pos]}")

In [None]:
# Wall-clock duration of the KNN neighbour-count search measured in the previous cell.
print(f"The time was {finish - start}")

In [None]:
# arr_GS was filled for k = 1, 2, ..., so list position max_pos corresponds to the
# neighbour count max_pos + 1. Passing max_pos directly was off by one and could even
# request k = 0.
pred_knn = knn_altered_classify(matrix = X_train, freq1 = freq1, freq2 = freq2, labels = y_train, new_points = X_test,
                                k = max_pos + 1, h = h)

In [None]:
# Accuracy of the hand-written KNN on the held-out test split
# (the message is in Portuguese: "The accuracy of the knn is ...").
print(f"A acurácia do knn é {accuracy_score(y_test, pred_knn)}")

In [None]:
# Confusion matrix of the hand-written KNN. The class order is pinned to
# [-1 (benign), 1 (malignant)] and the axes are labelled accordingly; without
# display_labels the plot ticks read 0 and 1 instead of the -1/1 encoding.
cm_knn = confusion_matrix(y_test, pred_knn, labels = [-1, 1])
display = ConfusionMatrixDisplay(confusion_matrix = cm_knn,
                                 display_labels = ["Benign (-1)", "Malignant (1)"])
display.plot()
plt.show()

In [None]:
# Per-class precision, recall and F1 of the hand-written KNN on the test split.
report = classification_report(y_true=y_test, y_pred=pred_knn)
print(report)