In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [16]:
df = pd.read_csv('cancer.csv')

df = df.drop("STDs: Time since first diagnosis", axis=1)
df = df.drop("STDs: Time since last diagnosis", axis=1)

# Replace "?" with actual values
df.replace("?", pd.NA, inplace=True)
# df.dropna(inplace=True)
df = df.fillna(df.median())

for column in df.columns:
    df[column] = pd.to_numeric(df[column])

In [3]:
def calculate_accuracy(inputted_predictions, actual):
    correct = sum(inputted_predictions == actual)
    total = len(actual)
    accuracy = correct / total
    return accuracy

In [17]:
y = df['Biopsy']

# Extract numeric columns
numeric_cols = df.select_dtypes(include=['int', 'float']).columns

# Initialize the scaler
scaler = StandardScaler()  # Or MinMaxScaler()

# Scale the numeric columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [18]:
# Separate features and target variable
X = df.drop('Biopsy', axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
class KNN:
    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        self.predictions = []
        for x_test in X:
            distances = np.sqrt(np.sum((self.X_train - x_test) ** 2, axis=1))
            nearest_neighbors = np.argsort(distances)[:self.k]
            knn_labels = self.y_train[nearest_neighbors].astype(int)
            most_common = np.argmax(np.bincount(knn_labels))
            if most_common > 0.5:
                self.predictions.append(1)
            else:
                self.predictions.append(most_common)
        return self.predictions

In [21]:
# Instantiate the KNN classifier
knn = KNN(10)

# Fit the model to the training data
knn.fit(X_train.values, y_train.values)

# Calculate test accuracy
knn_test_predictions = knn.predict(X_test.values)
knn_test_accuracy = accuracy_score(knn_test_predictions, y_test.values)
print("K-Nearest Neighbors Test Accuracy:", knn_test_accuracy)

# Calculate train accuracy
knn_train_predictions = knn.predict(X_train.values)
knn_train_accuracy = accuracy_score(knn_train_predictions, y_train.values)
print("K-Nearest Neighbors Train Accuracy:", knn_train_accuracy)

knn_test_accuracy = calculate_accuracy(knn_test_predictions, y_test.values)
print("K-Nearest Neighbors Test Accuracy (Alternate Method):", knn_test_accuracy)
knn_train_accuracy = calculate_accuracy(knn_train_predictions, y_train.values)
print("K-Nearest Neighbors Train Accuracy (Alternate Method):", knn_train_accuracy)

K-Nearest Neighbors Test Accuracy: 0.9418604651162791
K-Nearest Neighbors Train Accuracy: 0.9431486880466472
K-Nearest Neighbors Test Accuracy (Alternate Method): 0.9418604651162791
K-Nearest Neighbors Train Accuracy (Alternate Method): 0.9431486880466472


In [8]:
class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value  # Value if the node is a leaf node

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_samples_per_class = [np.sum(y == i) for i in range(self.n_classes)]
        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or np.max(n_samples_per_class) == n_samples:
            leaf_value = np.argmax(n_samples_per_class)
            return Node(value=leaf_value)

        # Find the best split
        best_gini = np.inf
        best_criteria = None
        best_sets = None
        for feature_index in range(n_features):
            feature_values = np.unique(X[:, feature_index])
            for threshold in feature_values:
                left_indices = np.where(X[:, feature_index] <= threshold)[0]
                right_indices = np.where(X[:, feature_index] > threshold)[0]
                gini = self._gini(y[left_indices], y[right_indices])
                if gini < best_gini:
                    best_gini = gini
                    best_criteria = (feature_index, threshold)
                    best_sets = (left_indices, right_indices)

        # Create subtrees
        left = self._grow_tree(X[best_sets[0]], y[best_sets[0]], depth + 1)
        right = self._grow_tree(X[best_sets[1]], y[best_sets[1]], depth + 1)
        return Node(feature_index=best_criteria[0], threshold=best_criteria[1], left=left, right=right)

    def _gini(self, *groups):
        total_samples = sum(len(group) for group in groups)
        gini = 0.0
        for group in groups:
            size = float(len(group))
            if size == 0:
                continue
            score = 0.0
            for class_val in range(self.n_classes):
                p = [np.sum(group == class_val) / size for group in groups]
                score += p[class_val] ** 2
            gini += (1.0 - score) * (size / total_samples)
        return gini

    def _predict(self, x, tree):
        if tree.value is not None:
            return tree.value
        feature_value = x[tree.feature_index]
        subtree = tree.right
        if feature_value <= tree.threshold:
            subtree = tree.left
        return self._predict(x, subtree)

    def predict(self, X):
        return [self._predict(x, self.tree) for x in X]

In [9]:
# Instantiate the DecisionTree classifier
decision_tree = DecisionTree(max_depth=2)

# Fit the model to the training data
decision_tree.fit(X_train.values, y_train.values)

In [10]:
# Calculate test accuracy
dtc_test_predictions = decision_tree.predict(X_test.values)
dtc_test_accuracy = accuracy_score(dtc_test_predictions, y_test.values)
print("Decision Tree Classification Test Accuracy:", dtc_test_accuracy)

# Calculate train accuracy
dtc_train_predictions = decision_tree.predict(X_train.values)
dtc_train_accuracy = accuracy_score(dtc_train_predictions, y_train.values)
print("Decision Tree Classification Train Accuracy:", dtc_train_accuracy)

dtc_test_accuracy = calculate_accuracy(dtc_test_predictions, y_test.values)
print("Decision Tree Classification Test Accuracy (Alternate Method):", dtc_test_accuracy)
dtc_train_accuracy = calculate_accuracy(dtc_train_predictions, y_train.values)
print("Decision Tree Classification Train Accuracy (Alternate Method):", dtc_train_accuracy)

Decision Tree Classification Test Accuracy: 0.9534883720930233
Decision Tree Classification Train Accuracy: 0.9329446064139941
Decision Tree Classification Test Accuracy (Alternate Method): 0.9534883720930233
Decision Tree Classification Train Accuracy (Alternate Method): 0.9329446064139941


In [11]:
class GaussianNaiveBayesClassifier:
    def __init__(self):
        self.class_probabilities = {}
        self.feature_parameters = {}

    def fit(self, X, y):
        self.class_probabilities = dict(y.value_counts(normalize=True))

        for label in self.class_probabilities:
            label_data = X[y == label]
            self.feature_parameters[label] = {}
            for feature in X.columns:
                mean = label_data[feature].mean()
                std = label_data[feature].std()
                self.feature_parameters[label][feature] = (mean, std)

    def _calculate_probability(self, x, mean, std):
        exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
        return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

    def predict(self, X):
        predictions = []
        for _, row in X.iterrows():
            probabilities = {}
            for label in self.class_probabilities:
                probabilities[label] = self.class_probabilities[label]
                for feature in self.feature_parameters[label]:
                    mean, std = self.feature_parameters[label][feature]
                    probabilities[label] *= self._calculate_probability(row[feature], mean, std)
            predictions.append(max(probabilities, key=probabilities.get))
        return predictions

In [12]:
# Instantiate and train the classifier
classifier = GaussianNaiveBayesClassifier()
classifier.fit(X_train, y_train)

# Calculate test accuracy
gnb_test_predictions = classifier.predict(X_test)
gnb_test_accuracy = accuracy_score(gnb_test_predictions, y_test.values)

# Calculate train accuracy
gnb_train_predictions = classifier.predict(X_train)
gnb_train_accuracy = accuracy_score(gnb_train_predictions, y_train.values)


  exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
  return (1 / (np.sqrt(2 * np.pi) * std)) * exponent
  exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
  return (1 / (np.sqrt(2 * np.pi) * std)) * exponent
  exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
  return (1 / (np.sqrt(2 * np.pi) * std)) * exponent
  exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
  return (1 / (np.sqrt(2 * np.pi) * std)) * exponent


In [13]:
print("Gaussian Naive Bayes Test Accuracy:", gnb_test_accuracy)
print("Gaussian Naive Bayes Train Accuracy:", gnb_train_accuracy)

gnb_test_accuracy = calculate_accuracy(gnb_test_predictions, y_test.values)
gnb_train_accuracy = calculate_accuracy(gnb_train_predictions, y_train.values)
print("Gaussian Naive Bayes Test Accuracy (Alternate Method):", gnb_test_accuracy)
print("Gaussian Naive Bayes Train Accuracy (Alternate Method):", gnb_train_accuracy)

Gaussian Naive Bayes Test Accuracy: 0.9534883720930233
Gaussian Naive Bayes Train Accuracy: 0.9314868804664723
Gaussian Naive Bayes Test Accuracy (Alternate Method): 0.9534883720930233
Gaussian Naive Bayes Train Accuracy (Alternate Method): 0.9314868804664723


In [14]:
# K-Nearest Neighbors:
knn_test_predictions = knn.predict(X_test.values)
knn_test_accuracy = accuracy_score(knn_test_predictions, y_test.values)
print("K-Nearest Neighbors Test Accuracy:", knn_test_accuracy)

# Calculate train accuracy
knn_train_predictions = knn.predict(X_train.values)
knn_train_accuracy = accuracy_score(knn_train_predictions, y_train.values)
print("K-Nearest Neighbors Train Accuracy:", knn_train_accuracy)

# knn_test_accuracy = calculate_accuracy(knn_test_predictions, y_test.values)
# print("K-Nearest Neighbors Test Accuracy (Alternate Method):", knn_test_accuracy)
# knn_train_accuracy = calculate_accuracy(knn_train_predictions, y_train.values)
# print("K-Nearest Neighbors Train Accuracy (Alternate Method):", knn_train_accuracy)


# Decision Tree Classification:
# Calculate test accuracy
dtc_test_predictions = decision_tree.predict(X_test.values)
dtc_test_accuracy = accuracy_score(dtc_test_predictions, y_test.values)
print("Decision Tree Classification Test Accuracy:", dtc_test_accuracy)

# Calculate train accuracy
dtc_train_predictions = decision_tree.predict(X_train.values)
dtc_train_accuracy = accuracy_score(dtc_train_predictions, y_train.values)
print("Decision Tree Classification Train Accuracy:", dtc_train_accuracy)

# dtc_test_accuracy = calculate_accuracy(dtc_test_predictions, y_test.values)
# print("Decision Tree Classification Test Accuracy (Alternate Method):", dtc_test_accuracy)
# dtc_train_accuracy = calculate_accuracy(dtc_train_predictions, y_train.values)
# print("Decision Tree Classification Train Accuracy (Alternate Method):", dtc_train_accuracy)


# Gaussian Naive Bayes:
# Calculate test accuracy
gnb_test_predictions = classifier.predict(X_test)
gnb_test_accuracy = accuracy_score(gnb_test_predictions, y_test.values)

# Calculate train accuracy
gnb_train_predictions = classifier.predict(X_train)
gnb_train_accuracy = accuracy_score(gnb_train_predictions, y_train.values)
print("Gaussian Naive Bayes Test Accuracy:", gnb_test_accuracy)
print("Gaussian Naive Bayes Train Accuracy:", gnb_train_accuracy)

# gnb_test_accuracy = calculate_accuracy(gnb_test_predictions, y_test.values)
# gnb_train_accuracy = calculate_accuracy(gnb_train_predictions, y_train.values)
# print("Gaussian Naive Bayes Test Accuracy (Alternate Method):", gnb_test_accuracy)
# print("Gaussian Naive Bayes Train Accuracy (Alternate Method):", gnb_train_accuracy)

K-Nearest Neighbors Test Accuracy: 0.9476744186046512
K-Nearest Neighbors Train Accuracy: 0.9460641399416909
Decision Tree Classification Test Accuracy: 0.9534883720930233
Decision Tree Classification Train Accuracy: 0.9329446064139941


  exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
  return (1 / (np.sqrt(2 * np.pi) * std)) * exponent
  exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
  return (1 / (np.sqrt(2 * np.pi) * std)) * exponent
  exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
  return (1 / (np.sqrt(2 * np.pi) * std)) * exponent
  exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
  return (1 / (np.sqrt(2 * np.pi) * std)) * exponent


Gaussian Naive Bayes Test Accuracy: 0.9534883720930233
Gaussian Naive Bayes Train Accuracy: 0.9314868804664723
