### Logistic Regression

In [2]:
import numpy as np
from typing import Optional

# Implementation of Logistic Regression from scratch
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Attributes:
        lr: Step size applied to every gradient update.
        n_iter: Number of gradient steps performed by ``fit``.
        weights: Coefficient vector of shape (n_features,); empty until
            ``fit`` has been called.
        bias: Intercept term.
    """

    def __init__(self, lr: float = 0.1, n_iter: int = 1000) -> None:
        self.lr: float = lr
        self.n_iter: int = n_iter
        # Placeholders; the real values are learned in ``fit``.
        self.weights: np.ndarray = np.array([])
        self.bias: float = 0.0

    def sigmoid(self, z: np.ndarray) -> np.ndarray:
        """Map logits to probabilities in [0, 1] without overflowing.

        ``exp`` is only ever evaluated on non-positive values, so very
        negative logits no longer trigger an overflow in ``exp(-z)``:
            z >= 0:  1 / (1 + exp(-z))
            z <  0:  exp(z) / (1 + exp(z))
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0, decay) / (1.0 + decay)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Learn ``weights`` and ``bias`` from the training data.

        Args:
            X: Feature matrix, shape (n_samples, n_features).
            y: Binary targets (0/1), one per sample. A column vector of
                shape (n_samples, 1) is flattened before use.

        Raises:
            ValueError: If ``X`` is not 2-D or ``y`` does not hold exactly
                one target per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so that (n, 1) targets cannot broadcast against the (n,)
        # predictions into an (n, n) matrix.
        y = np.asarray(y, dtype=float).reshape(-1)
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} targets"
            )

        # Every call to fit restarts from zero-initialised parameters.
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iter):
            # Prediction error of the current model, shared by both gradients.
            residual = self.sigmoid(np.dot(X, self.weights) + self.bias) - y

            grad_w = (1 / n_samples) * np.dot(X.T, residual)
            grad_b = (1 / n_samples) * np.sum(residual)

            self.weights -= self.lr * grad_w
            self.bias -= self.lr * grad_b

    def predict_probs(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted probability of class 1 for each row of ``X``."""
        linear_model = np.dot(X, self.weights) + self.bias
        return self.sigmoid(linear_model)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return 0/1 labels by thresholding the predicted probability at 0.5."""
        y_predicted_probs = self.predict_probs(X)
        return (y_predicted_probs >= 0.5).astype(int)


# Demo: train and evaluate LogisticRegression on a small synthetic problem
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# 100 samples with two informative features; the fixed seed keeps runs reproducible
X, y = make_classification(
    n_samples=100,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the from-scratch model on the training portion
clf = LogisticRegression(lr=0.1, n_iter=1000)
clf.fit(X_train, y_train)

# Accuracy is the fraction of held-out labels predicted correctly
preds = clf.predict(X_test)
accuracy = (preds == y_test).mean()
print(f"Test accuracy: {accuracy:.2f}")

Test accuracy: 0.95


### KNN

In [3]:
import numpy as np
from collections import Counter
from typing import Any

class KNNClassifier:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Attributes:
        k: Number of neighbours that vote on each prediction.
        X_train: Stored training features, shape (n_samples, n_features).
        y_train: Stored training labels, shape (n_samples,).
    """

    def __init__(self, k: int = 3) -> None:
        """
        k: number of neighbours consulted per prediction; must be >= 1.
        Raises:
            ValueError: if k is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")
        self.k: int = k
        # Empty placeholders until fit() stores the real training data.
        self.X_train: np.ndarray = np.empty((0, 0))
        self.y_train: np.ndarray = np.empty(0)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """
        X shape (n_samples, n_features)
        y shape (n_samples,)
        Array-likes such as nested lists are accepted and converted.
        Raises:
            ValueError: if X and y do not contain the same number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.shape[0]} labels"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        X shape (n_samples, n_features)
        Returns:
            Predicted class labels for each sample in X. shape (n_samples,)
        Raises:
            RuntimeError: if no training data has been stored via fit().
        """
        if self.X_train.shape[0] == 0:
            raise RuntimeError(
                "KNNClassifier has no training data; call fit() with at "
                "least one sample before predict()"
            )
        predictions = [self._predict(x) for x in np.asarray(X)]
        return np.array(predictions)

    def _predict(self, x: np.ndarray) -> Any:
        """
        x: np.ndarray, shape (n_features,)
            Single sample feature vector.
        Returns:
            Predicted class label for the sample.
        """
        distances = np.linalg.norm(self.X_train - x, axis=1)
        # A stable sort resolves equal distances by training order, so the
        # chosen neighbours are deterministic. If k exceeds the number of
        # training samples, every sample votes.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        votes = Counter(self.y_train[k_indices])
        # On a tied vote, most_common keeps first-seen order, so the label of
        # the closest tied neighbour wins.
        return votes.most_common(1)[0][0]

# Demo: classify the held-out split with a 3-nearest-neighbour vote
knn = KNNClassifier(k=3)
knn.fit(X_train, y_train)

# Accuracy is the fraction of test labels predicted correctly
knn_preds = knn.predict(X_test)
knn_accuracy = (knn_preds == y_test).mean()
print(f"KNN Test accuracy: {knn_accuracy:.2f}")

KNN Test accuracy: 0.95


### K-Means

In [None]:
import numpy as np

class KMeans:
    """K-means clustering: alternate nearest-centroid assignment and mean updates.

    Attributes:
        n_clusters: Number of clusters to form.
        n_iter: Maximum number of assignment/update rounds.
        random_state: Seed used to pick the initial centroids.
        centroids: Cluster centres, shape (n_clusters, n_features); empty
            until ``fit`` has been called.
    """

    def __init__(self, n_clusters: int = 2, n_iter: int = 100, random_state: int = 42) -> None:
        self.n_clusters = n_clusters
        self.n_iter = n_iter
        self.random_state = random_state
        # np.array([]) is a genuinely empty array; np.ndarray([]) would
        # allocate an uninitialised 0-d array instead.
        self.centroids: np.ndarray = np.array([])

    def fit(self, X: np.ndarray) -> None:
        """
        X shape (n_samples, n_features)
        Raises:
            ValueError: if X is not 2-D, or n_clusters is not between 1 and
                the number of samples.
        """
        X = np.asarray(X, dtype=float)
        n_samples, _ = X.shape
        if not 1 <= self.n_clusters <= n_samples:
            raise ValueError(
                f"n_clusters={self.n_clusters} must be between 1 and "
                f"n_samples={n_samples}"
            )

        # A private generator yields the same draws as seeding the global
        # NumPy RNG with this seed, but leaves the global state untouched.
        rng = np.random.RandomState(self.random_state)
        # Initialise the centroids with distinct, randomly chosen samples.
        random_idxs = rng.choice(n_samples, self.n_clusters, replace=False)
        self.centroids = X[random_idxs]

        for _ in range(self.n_iter):
            # Assign each sample to the nearest centroid
            labels = self.predict(X)
            # Start from the old centroids so a cluster that received no
            # points simply keeps its previous position.
            new_centroids = self.centroids.copy()
            for i in range(self.n_clusters):
                cluster_points = X[labels == i]
                if len(cluster_points):
                    new_centroids[i] = cluster_points.mean(axis=0)
            # Stop as soon as an update no longer moves the centroids.
            if np.allclose(self.centroids, new_centroids):
                break
            self.centroids = new_centroids

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        X: np.ndarray, shape (n_samples, n_features)
            Data to assign to clusters.
            Assigns each sample to the nearest centroid.
        Returns:
            np.ndarray, shape (n_samples,)
            Cluster index for each sample.
        Raises:
            RuntimeError: if called before fit() has set the centroids.
        """
        if self.centroids.ndim != 2:
            raise RuntimeError("KMeans.predict called before fit()")
        X = np.asarray(X, dtype=float)
        # Broadcast (n_samples, 1, n_features) against (n_clusters, n_features)
        # to get the full sample-to-centroid distance table.
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)


# Demo: partition the full synthetic dataset into four clusters
kmeans = KMeans(
    n_clusters=4,
    n_iter=100,
    random_state=42,
)
kmeans.fit(X)

# Show the cluster index assigned to the first ten samples
cluster_labels = kmeans.predict(X)
print("First 10 cluster assignments:", cluster_labels[0:10])

First 10 cluster assignments: [2 1 0 3 1 3 0 2 0 1]
