In [15]:
# KNN

In [2]:
import numpy as np
from collections import Counter

In [6]:
# This function calculates the straight-line distance between two points
def euclidean_distance(x1, x2):
    """Return the straight-line (Euclidean) distance between two points.

    Args:
        x1: First point; any array-like of numbers (list, tuple, ndarray).
        x2: Second point; must be broadcast-compatible with ``x1``.

    Returns:
        The distance as a NumPy float.
    """
    # Work in floating point: unsigned integer inputs (e.g. uint8 pixel
    # values) would otherwise wrap around on subtraction, and plain Python
    # lists do not support element-wise ``-`` at all.
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    # Subtract, square, sum, then take the square root.
    return np.sqrt(np.sum((a - b) ** 2))

class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Args:
        k: Number of neighbours consulted for each prediction. Must be >= 1.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        # k <= 0 would either select no neighbours (IndexError later) or,
        # for negative values, silently drop the farthest points instead.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store the training data; KNN does no real training.

        Args:
            X: Training samples, array-like with one sample per row.
            y: Labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        features = np.asarray(X, dtype=float)
        if features.ndim == 0 or features.shape[0] == 0:
            raise ValueError("X must contain at least one training sample")
        if len(y) != features.shape[0]:
            raise ValueError(
                f"X and y have different lengths: {features.shape[0]} vs {len(y)}"
            )
        # One flat float row per sample, so distances can be computed for the
        # whole training set at once.
        self.X_train = features.reshape(features.shape[0], -1)
        self.y_train = y

    def predict(self, X):
        """Predict a label for every sample (row) in ``X``.

        Returns:
            A NumPy array with one predicted label per sample.
        """
        queries = np.asarray(X, dtype=float)
        return np.array([self._predict(x) for x in queries])

    def _predict(self, x):
        """Return the majority label among the ``k`` nearest training points."""
        point = np.asarray(x, dtype=float).ravel()
        if point.size != self.X_train.shape[1]:
            # Without this check a wrong-sized sample would be broadcast
            # against the training matrix and give meaningless distances.
            raise ValueError(
                f"sample has {point.size} feature(s), "
                f"expected {self.X_train.shape[1]}"
            )

        # 1. Distance from the query to every training point in one
        #    vectorised step (subtract, square, sum per row, square root).
        distances = np.sqrt(np.sum((self.X_train - point) ** 2, axis=1))

        # 2. Indices of the k smallest distances. A stable sort makes ties
        #    resolve deterministically in favour of the earlier sample.
        k_indices = np.argsort(distances, kind="stable")[:self.k]

        # 3. Labels of those neighbours.
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # 4. Majority vote; most_common(1) returns [(label, count)].
        return Counter(k_nearest_labels).most_common(1)[0][0]

In [8]:
# Linear Regression

In [7]:
class LinearRegression:
    """Linear regression fitted with batch gradient descent.

    Args:
        lr: Learning rate, i.e. the step size of each parameter update.
        n_iters: Number of gradient-descent passes over the data.
    """

    def __init__(self, lr=0.001, n_iters=1000):
        # Parameters stay unset until fit() sees the number of features.
        self.weights = None
        self.bias = None
        # Optimisation hyperparameters.
        self.lr = lr
        self.n_iters = n_iters

    def fit(self, X, y):
        """Learn weights and bias from ``X`` (rows = samples) and targets ``y``."""
        row_count, column_count = X.shape

        # Start from the all-zero model: one weight per column, zero intercept.
        self.weights, self.bias = np.zeros(column_count), 0

        for _ in range(self.n_iters):
            # Residual of the current linear model: (X . w + b) - y
            residual = np.dot(X, self.weights) + self.bias - y

            # Mean-scaled gradients with respect to the weights and the bias.
            grad_w = (1 / row_count) * np.dot(X.T, residual)
            grad_b = (1 / row_count) * np.sum(residual)

            # Step against the gradient to reduce the error.
            self.weights -= self.lr * grad_w
            self.bias -= self.lr * grad_b

    def predict(self, X):
        """Return the model output ``X . weights + bias`` for new data."""
        return np.dot(X, self.weights) + self.bias

In [10]:
# Logistic Regression

In [11]:
class LogisticReg:
    """Binary logistic-regression classifier trained with batch gradient descent.

    Args:
        logreg: Learning rate (step size). The parameter name is kept for
            backward compatibility.
        n_iter: Number of gradient-descent iterations.
    """

    def __init__(self, logreg=0.001, n_iter=1000):
        self.logreg = logreg  # learning rate (step size)
        self.n_iter = n_iter  # number of gradient-descent iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn weights and bias from samples ``X`` and 0/1 labels ``y``.

        Args:
            X: Array-like of shape (samples, features).
            y: Array-like of 0/1 labels, one per sample. A column vector is
                flattened so it cannot broadcast against the predictions.

        Raises:
            ValueError: If ``X`` and ``y`` contain different numbers of samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have different lengths: {n_samples} vs {y.shape[0]}"
            )

        # Start with zero weights (one per feature) and a zero bias.
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iter):
            # Linear combination z = X . w + b, squashed into a probability.
            y_predicted = self._sigmoid(np.dot(X, self.weights) + self.bias)

            # Gradients of the mean log-loss with respect to weights and bias.
            error = y_predicted - y
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            # Step against the gradient.
            self.weights -= self.logreg * dw
            self.bias -= self.logreg * db

    def predict(self, X):
        """Return class 1 where the predicted probability exceeds 0.5, else 0."""
        linear_model = np.dot(np.asarray(X, dtype=float), self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        # Vectorised threshold instead of a per-element Python loop.
        return np.where(y_predicted > 0.5, 1, 0)

    def _sigmoid(self, x):
        """Numerically stable logistic function mapping values into [0, 1]."""
        z = np.asarray(x, dtype=float)
        # exp(-|z|) always lies in (0, 1], so it can never overflow, unlike
        # exp(-z) for large negative z.
        e = np.exp(-np.abs(z))
        # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
        return np.where(z >= 0, 1 / (1 + e), e / (1 + e))



In [12]:
# Naive Bayes

In [14]:
import numpy as np

class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled per class as an independent normal distribution.

    Args:
        var_smoothing: Fraction of the largest feature variance that is added
            to every per-class variance. It keeps the model well defined when
            a feature is constant within a class (variance of exactly zero).
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Args:
            X: Array-like of shape (samples, features).
            y: Array-like of class labels, one per sample.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X, dtype=np.float64)
        # An ndarray is required so that ``y == c`` yields a boolean mask.
        y = np.asarray(y).ravel()
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one training sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have different lengths: {n_samples} vs {y.shape[0]}"
            )

        # Unique labels (e.g. [0, 1] or ['cat', 'dog']).
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # One mean and one variance per (class, feature); one prior per class.
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._prior = np.zeros(n_classes, dtype=np.float64)

        # Small variance floor; falls back to a unit scale when every feature
        # is constant across the whole data set.
        largest_var = X.var(axis=0).max() if n_features else 0.0
        epsilon = self.var_smoothing * (largest_var if largest_var > 0 else 1.0)

        for idx, c in enumerate(self._classes):
            # Rows that belong to class c.
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0) + epsilon
            # Prior = (samples in class c) / (total samples).
            self._prior[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        """Return the most probable class for every row of ``X``."""
        samples = np.asarray(X, dtype=np.float64)
        return np.array([self._predict(x) for x in samples])

    def _predict(self, x):
        """Return the class with the highest log-posterior for one sample."""
        x = np.asarray(x, dtype=np.float64)
        posteriors = []
        for idx in range(len(self._classes)):
            # Summing log terms is equivalent to multiplying probabilities
            # but avoids numerical underflow.
            prior = np.log(self._prior[idx])
            class_conditional = np.sum(self._log_pdf(idx, x))
            posteriors.append(prior + class_conditional)
        return self._classes[np.argmax(posteriors)]

    def _log_pdf(self, class_idx, x):
        """Log of the Gaussian density of ``x`` under class ``class_idx``.

        Computed directly in log space: taking ``log`` of the density would
        return ``-inf`` for every class once the density underflows to zero.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

    def _pdf(self, class_idx, x):
        """Gaussian density of ``x`` under class ``class_idx``.

        Formula: (1 / sqrt(2 * pi * var)) * exp(-(x - mean)^2 / (2 * var)).
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

In [16]:
# K-Means

In [17]:
# A helper function to find the distance between two points
def euclidean_distance(x1, x2):
    """Return the Euclidean distance between two points.

    Args:
        x1: First point; any array-like of numbers (list, tuple, ndarray).
        x2: Second point; must be broadcast-compatible with ``x1``.

    Returns:
        The distance as a NumPy float.
    """
    # Convert to float first so unsigned integer inputs cannot wrap around on
    # subtraction and plain Python lists are accepted.
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    # Standard formula: square root of the sum of squared differences.
    return np.sqrt(np.sum((a - b) ** 2))

class KMeans:
    """K-Means clustering (Lloyd's algorithm) with random-sample initialisation.

    Args:
        k: Number of clusters to find. Must be >= 1.
        max_iters: Upper bound on assignment/update rounds. Must be >= 1.
        random_state: Optional seed for the initial centroid choice. When
            ``None`` the global NumPy random state is used, so seeding with
            ``np.random.seed`` keeps working.

    Raises:
        ValueError: If ``k`` or ``max_iters`` is smaller than 1.
    """

    def __init__(self, k=5, max_iters=100, random_state=None):
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        if max_iters < 1:
            # With zero rounds no point is ever assigned to a cluster.
            raise ValueError(f"max_iters must be at least 1, got {max_iters!r}")
        self.k = k
        self.max_iters = max_iters
        self.random_state = random_state
        self.centroids = []  # centre point of each cluster
        self.clusters = [[] for _ in range(self.k)]  # sample indices per cluster

    def predict(self, X):
        """Cluster ``X`` and return the cluster index of every sample.

        Args:
            X: Array-like of shape (samples, features).

        Returns:
            Float array of length ``n_samples`` holding cluster indices in
            ``0 .. k-1`` (float dtype is kept for backward compatibility).

        Raises:
            ValueError: If ``X`` is not 2-D or has fewer than ``k`` samples.
        """
        self.X = np.asarray(X, dtype=float)
        if self.X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (samples, features)")
        self.n_samples, self.n_features = self.X.shape
        if self.k > self.n_samples:
            raise ValueError(
                f"k={self.k} clusters requested but only "
                f"{self.n_samples} sample(s) were given"
            )

        # 1. Initialisation: k distinct random samples become the first centres.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        start_idxs = rng.choice(self.n_samples, self.k, replace=False)
        self.centroids = self.X[start_idxs]

        # 2. Optimisation loop.
        for _ in range(self.max_iters):
            # A. Assign every point to its nearest centroid.
            self.clusters = self._create_clusters(self.centroids)

            # B. Move each centroid to the mean of its cluster.
            centroids_old = self.centroids
            self.centroids = self._get_centroids(self.clusters)

            # C. Stop as soon as no centroid moved.
            if self._is_converged(centroids_old, self.centroids):
                break

        # 3. Cluster index (0 to k-1) for each sample.
        return self._get_cluster_labels(self.clusters)

    def _get_cluster_labels(self, clusters):
        """Turn per-cluster index lists into one label per sample."""
        labels = np.empty(self.n_samples)
        for cluster_idx, cluster in enumerate(clusters):
            labels[np.asarray(cluster, dtype=int)] = cluster_idx
        return labels

    def _create_clusters(self, centroids):
        """Group sample indices by nearest centroid (one list per cluster)."""
        centres = np.asarray(centroids, dtype=float)
        # Squared distances of every sample to every centre via broadcasting,
        # shape (n_samples, k). The square root is skipped because it does
        # not change which centre is nearest.
        gaps = self.X[:, np.newaxis, :] - centres[np.newaxis, :, :]
        nearest = np.argmin(np.sum(gaps ** 2, axis=2), axis=1)
        return [np.flatnonzero(nearest == idx).tolist() for idx in range(self.k)]

    def _closest_centroid(self, sample, centroids):
        """Return the index of the centroid nearest to ``sample``."""
        centres = np.asarray(centroids, dtype=float)
        gaps = centres - np.asarray(sample, dtype=float)
        return np.argmin(np.sum(gaps ** 2, axis=1))

    def _get_centroids(self, clusters):
        """Return the mean position of each cluster.

        An empty cluster keeps its previous centre (read from
        ``self.centroids``); averaging zero points would produce NaN and
        poison every later distance computation.
        """
        centroids = np.zeros((self.k, self.n_features))
        for cluster_idx, cluster in enumerate(clusters):
            if cluster:
                centroids[cluster_idx] = np.mean(self.X[cluster], axis=0)
            else:
                centroids[cluster_idx] = self.centroids[cluster_idx]
        return centroids

    def _is_converged(self, centroids_old, centroids):
        """Return True when no centroid moved between two iterations."""
        return np.array_equal(np.asarray(centroids_old), np.asarray(centroids))