# Assignment 2
## CS6140 Machine Learning
## Zhiruo Zhao

## Task 2: Classification Task
### Dataset: Breast Cancer Wisconsin (Diagnostic)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, Perceptron as SKPerceptron
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1. Data Preprocessing:
- Load and clean the data.
- Normalize the features if necessary.
- Apply appropriate preprocessing suitable for classification problem.
- Split the data into training and test sets.

The dataset contains 569 entries with 32 columns. The key observations are:

The ID column is not useful for classification and should be dropped.
The Diagnosis column is the target variable, with values "M" (Malignant) and "B" (Benign).
The remaining 30 columns are numerical features.

In [2]:
# Load the dataset
data_path = '../breast+cancer+wisconsin+diagnostic/wdbc.data'
names_path = '../breast+cancer+wisconsin+diagnostic/wdbc.names'

# Based on the Breast Cancer Wisconsin (Diagnostic) dataset structure:
# The dataset has the following structure:
# ID, Diagnosis (M = malignant, B = benign), followed by 30 real-valued features
column_names = [
    'ID', 'Diagnosis', 'Radius_mean', 'Texture_mean', 'Perimeter_mean', 'Area_mean',
    'Smoothness_mean', 'Compactness_mean', 'Concavity_mean', 'Concave_points_mean',
    'Symmetry_mean', 'Fractal_dimension_mean', 'Radius_se', 'Texture_se',
    'Perimeter_se', 'Area_se', 'Smoothness_se', 'Compactness_se', 'Concavity_se',
    'Concave_points_se', 'Symmetry_se', 'Fractal_dimension_se', 'Radius_worst',
    'Texture_worst', 'Perimeter_worst', 'Area_worst', 'Smoothness_worst',
    'Compactness_worst', 'Concavity_worst', 'Concave_points_worst', 'Symmetry_worst',
    'Fractal_dimension_worst'
]

# Load the data
df = pd.read_csv(data_path, names=column_names)

# Display the first few rows of the dataset
df.head()

Unnamed: 0,ID,Diagnosis,Radius_mean,Texture_mean,Perimeter_mean,Area_mean,Smoothness_mean,Compactness_mean,Concavity_mean,Concave_points_mean,...,Radius_worst,Texture_worst,Perimeter_worst,Area_worst,Smoothness_worst,Compactness_worst,Concavity_worst,Concave_points_worst,Symmetry_worst,Fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Drop the ID column
df = df.drop(columns=["ID"])

# Encode the Diagnosis column
df["Diagnosis"] = LabelEncoder().fit_transform(df["Diagnosis"])  # M -> 1, B -> 0

# Separate features and target variable
X = df.drop(columns=["Diagnosis"])
y = df["Diagnosis"]

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Display the shapes of the datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((455, 30), (114, 30), (455,), (114,))

# 2. Implement Gaussian NaiveBayes (GNB) and Gaussian Discriminant Analysis (GDA):
- Use shared co-variance as well as class specific co-variance for GDA
- Implement the fit and predict functions.
- Train the model on the training set.
- Evaluate the model on the test set

In [4]:
class GaussianNaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.mean = {}
        self.var = {}
        self.priors = {}

        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = X_c.mean(axis=0)
            self.var[c] = X_c.var(axis=0) + 1e-9  # 避免除零
            self.priors[c] = X_c.shape[0] / X.shape[0]

    def predict(self, X):
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        posteriors = []

        for c in self.classes:
            prior = np.log(self.priors[c])
            likelihood = -0.5 * np.sum(np.log(2 * np.pi * self.var[c]))
            likelihood -= 0.5 * np.sum(((x - self.mean[c]) ** 2) / self.var[c])
            posterior = prior + likelihood
            posteriors.append(posterior)

        return self.classes[np.argmax(posteriors)]

# Train GNB Model
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)

# Predict
y_pred_gnb = gnb.predict(X_test)

# Accuracy
from sklearn.metrics import accuracy_score
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
print(f"GNB Accuracy: ",accuracy_gnb)

GNB Accuracy:  0.9210526315789473


In [5]:
class GaussianDiscriminantAnalysis:
    def fit(self, X, y, shared_covariance=True):
        self.classes = np.unique(y)
        self.means = {c: np.mean(X[y == c], axis=0) for c in self.classes}
        self.priors = {c: len(X[y == c]) / len(X) for c in self.classes}
        
        if shared_covariance:
            # Shared covariance matrix for all classes
            self.covariance = np.cov(X.T)
        else:
            # Separate covariance matrix for each class
            self.covariances = {c: np.cov(X[y == c].T) for c in self.classes}
        
        self.shared_covariance = shared_covariance

    def predict(self, X):
        posteriors = np.zeros((X.shape[0], len(self.classes)))

        for i, c in enumerate(self.classes):
            mean = self.means[c]
            prior = np.log(self.priors[c])
            
            if self.shared_covariance:
                cov_inv = np.linalg.inv(self.covariance)
                determinant = np.linalg.det(self.covariance)
            else:
                cov_inv = np.linalg.inv(self.covariances[c])
                determinant = np.linalg.det(self.covariances[c])
            
            diff = X - mean
            likelihood = -0.5 * np.log(determinant) - 0.5 * np.sum(diff @ cov_inv * diff, axis=1)
            
            posteriors[:, i] = prior + likelihood

        return self.classes[np.argmax(posteriors, axis=1)]

# Train and evaluate GDA with shared covariance
gda_shared = GaussianDiscriminantAnalysis()
gda_shared.fit(X_train, y_train, shared_covariance=True)
y_pred_gda_shared = gda_shared.predict(X_test)
gda_shared_accuracy = np.mean(y_pred_gda_shared == y_test)

# Train and evaluate GDA with class-specific covariance
gda_class_specific = GaussianDiscriminantAnalysis()
gda_class_specific.fit(X_train, y_train, shared_covariance=False)
y_pred_gda_class_specific = gda_class_specific.predict(X_test)
gda_class_specific_accuracy = np.mean(y_pred_gda_class_specific == y_test)

print(f"GDA (Shared Covariance) Accuracy: {gda_shared_accuracy:.4f}")
print(f"GDA (Class-Specific Covariance) Accuracy: {gda_class_specific_accuracy:.4f}")

GDA (Shared Covariance) Accuracy: 0.9035
GDA (Class-Specific Covariance) Accuracy: 0.9474


# 3. Implement Logistic Regression:
- Derive the Logistic Regression equations and implement the fit function using gradient
- descent.
- Train the model on the training set.
- Evaluate the model on the test set.

In [6]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.theta = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        m, n = X.shape
        self.theta = np.zeros(n)  # Initialize weights
        
        for _ in range(self.epochs):
            linear_model = np.dot(X, self.theta)
            predictions = self.sigmoid(linear_model)
            gradient = np.dot(X.T, (predictions - y)) / m
            self.theta -= self.learning_rate * gradient  # Gradient descent step

    def predict(self, X):
        linear_model = np.dot(X, self.theta)
        probabilities = self.sigmoid(linear_model)
        return (probabilities >= 0.5).astype(int)  # Convert probabilities to class labels

# Train the Logistic Regression model
log_reg = LogisticRegression(learning_rate=0.01, epochs=1000)
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate accuracy
from sklearn.metrics import accuracy_score
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)

print(f"Logistic Regression Accuracy: {log_reg_accuracy:.4f}")

Logistic Regression Accuracy: 0.9825


# 4. Implement Perceptron:
- Derive the Perceptron learning rule and implement the algorithm.
- Train the model on the training set.
- Evaluate the model on the test set.

In [7]:
class Perceptron:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = 0

    def fit(self, X, y):
        m, n = X.shape
        self.weights = np.zeros(n)
        y_transformed = np.where(y == 0, -1, 1)  # Convert labels {0,1} → {-1,1}

        for _ in range(self.epochs):
            for i in range(m):
                prediction = np.sign(np.dot(X[i], self.weights) + self.bias)
                if prediction != y_transformed[i]:  # Update if misclassified
                    self.weights += self.learning_rate * y_transformed[i] * X[i]
                    self.bias += self.learning_rate * y_transformed[i]

    def predict(self, X):
        return np.where(np.dot(X, self.weights) + self.bias >= 0, 1, 0)

# Train the Perceptron model
perceptron = Perceptron(learning_rate=0.01, epochs=1000)
perceptron.fit(X_train, y_train)

# Predict on the test set
y_pred_perceptron = perceptron.predict(X_test)

# Evaluate accuracy
from sklearn.metrics import accuracy_score
perceptron_accuracy = accuracy_score(y_test, y_pred_perceptron)

print(f"Perceptron Accuracy: {perceptron_accuracy:.4f}")

Perceptron Accuracy: 0.9123


# 5. Comparison and Analysis:
Compare your solutions with standard APIs in terms of model parameters and appropriate metrics (e.g., accuracy, precision, recall, F1 score).

In [12]:
from sklearn.linear_model import LogisticRegression, Perceptron as SKPerceptron

# Train Scikit-Learn models
gnb_sklearn = GaussianNB().fit(X_train, y_train)
log_reg_sklearn = LogisticRegression(max_iter=1000).fit(X_train, y_train)
perceptron_sklearn = SKPerceptron(max_iter=1000, eta0=0.01).fit(X_train, y_train)
lda_sklearn = LDA().fit(X_train, y_train)

# Predictions
y_pred_gnb_sklearn = gnb_sklearn.predict(X_test)
y_pred_log_reg_sklearn = log_reg_sklearn.predict(X_test)
y_pred_perceptron_sklearn = perceptron_sklearn.predict(X_test)
y_pred_lda_sklearn = lda_sklearn.predict(X_test)

# Compute evaluation metrics
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"{model_name} Results:")
    print(f"  - Accuracy:  {accuracy:.4f}")
    print(f"  - Precision: {precision:.4f}")
    print(f"  - Recall:    {recall:.4f}")
    print(f"  - F1 Score:  {f1:.4f}")
    print("-" * 40)

# Evaluate all models
evaluate_model(y_test, y_pred_gnb, "Custom Gaussian Naïve Bayes")
evaluate_model(y_test, y_pred_gnb_sklearn, "Sklearn Gaussian Naïve Bayes")
evaluate_model(y_test, y_pred_gda_shared, "Custom GDA (Shared Covariance)")
evaluate_model(y_test, y_pred_gda_class_specific, "Custom GDA (Class-Specific Covariance)")
evaluate_model(y_test, y_pred_lda_sklearn, "Sklearn LDA")
evaluate_model(y_test, y_pred_log_reg, "Custom Logistic Regression")
evaluate_model(y_test, y_pred_log_reg_sklearn, "Sklearn Logistic Regression")
evaluate_model(y_test, y_pred_perceptron, "Custom Perceptron")
evaluate_model(y_test, y_pred_perceptron_sklearn, "Sklearn Perceptron")


Custom Gaussian Naïve Bayes Results:
  - Accuracy:  0.9211
  - Precision: 0.9231
  - Recall:    0.8571
  - F1 Score:  0.8889
----------------------------------------
Sklearn Gaussian Naïve Bayes Results:
  - Accuracy:  0.9211
  - Precision: 0.9231
  - Recall:    0.8571
  - F1 Score:  0.8889
----------------------------------------
Custom GDA (Shared Covariance) Results:
  - Accuracy:  0.9035
  - Precision: 1.0000
  - Recall:    0.7381
  - F1 Score:  0.8493
----------------------------------------
Custom GDA (Class-Specific Covariance) Results:
  - Accuracy:  0.9474
  - Precision: 0.9286
  - Recall:    0.9286
  - F1 Score:  0.9286
----------------------------------------
Sklearn LDA Results:
  - Accuracy:  0.9649
  - Precision: 1.0000
  - Recall:    0.9048
  - F1 Score:  0.9500
----------------------------------------
Custom Logistic Regression Results:
  - Accuracy:  0.9825
  - Precision: 0.9762
  - Recall:    0.9762
  - F1 Score:  0.9762
----------------------------------------
Sklear

## **comparison table** 

| **Model**                           | **Accuracy** | **Precision** | **Recall** | **F1 Score** | **Notes** |
|-------------------------------------|------------|------------|--------|------------|-----------------------------|
| **Custom Gaussian Naïve Bayes**     | 0.9211     | 0.9231     | 0.8571 | 0.8889     | Assumes feature independence |
| **Sklearn Gaussian Naïve Bayes**    | 0.9211     | 0.9231     | 0.8571 | 0.8889     | Matches custom implementation |
| **Custom GDA (Shared Covariance)**  | 0.9035     | 1.0000     | 0.7381 | 0.8493     | Assumes equal covariance across classes |
| **Custom GDA (Class-Specific)**     | 0.9474     | 0.9286     | 0.9286 | 0.9286     | More flexible than shared covariance |
| **Sklearn LDA**                     | 0.9649     | 1.0000     | 0.9048 | 0.9500     | Optimized version of GDA |
| **Custom Logistic Regression**      | 0.9825     | 0.9762     | 0.9762 | 0.9762     | Strong performance, well-tuned gradient descent |
| **Sklearn Logistic Regression**     | 0.9737     | 0.9756     | 0.9524 | 0.9639     | Slightly lower than custom but still strong |
| **Custom Perceptron**               | 0.9123     | 0.8810     | 0.8810 | 0.8810     | Struggles with non-linearly separable data |
| **Sklearn Perceptron**              | 0.9561     | 0.9744     | 0.9048 | 0.9383     | Better optimization & convergence |

---

### Key Observations
**Gaussian Naïve Bayes (GNB)** performs well but assumes feature independence.  
**Custom and Sklearn GDA results vary**: class-specific covariance improves accuracy significantly.  
**Logistic Regression outperforms GDA**, proving its robustness in classification tasks.  
**Sklearn Perceptron outperforms the custom version**, likely due to improved optimization methods.  
**Perceptron is weaker overall**, reinforcing its limitations on linearly separable data. 

### Strengths and Weaknesses of Each Algorithm

1. **Custom Gaussian Naïve Bayes**  
   - **Strengths:** Simple, interpretable.  
   - **Weaknesses:** Assumes feature independence, which may not hold in many cases.

2. **Sklearn Gaussian Naïve Bayes**  
   - **Strengths:** Fast and reliable, matches custom version.  
   - **Weaknesses:** Same feature independence assumption.

3. **Custom GDA (Shared Covariance)**  
   - **Strengths:** Flexible, models class distributions.  
   - **Weaknesses:** Assumes equal covariance across classes, limiting flexibility.

4. **Custom GDA (Class-Specific)**  
   - **Strengths:** More flexible, different covariance for each class.  
   - **Weaknesses:** Risk of overfitting with limited data.

5. **Sklearn LDA**  
   - **Strengths:** Optimized, good balance for class distribution.  
   - **Weaknesses:** Assumes equal covariance across classes.

6. **Custom Logistic Regression**  
   - **Strengths:** Excellent performance, well-tuned.  
   - **Weaknesses:** Struggles with high-dimensional data without regularization.

7. **Sklearn Logistic Regression**  
   - **Strengths:** Optimized, good results for linear separability.  
   - **Weaknesses:** Slightly less powerful than custom version.

8. **Custom Perceptron**  
   - **Strengths:** Simple, effective for linearly separable data.  
   - **Weaknesses:** Struggles with non-linearly separable data.

9. **Sklearn Perceptron**  
   - **Strengths:** Faster convergence, better optimization.  
   - **Weaknesses:** Same issues with non-linearly separable data.

### Linear Separability

**Linear Separability**: Data is linearly separable if a straight line (or hyperplane in higher dimensions) can separate classes.

**Checking Linearly Separable Data**:
1. **Visual Inspection:** Plot data to check for a separable line.
2. **Model Performance:** If linear classifiers (like Logistic Regression, Perceptron) perform well, data is likely separable.
3. **Decision Boundaries:** Plot boundaries to see if they separate classes effectively.

**Conclusion**: Custom Perceptron struggled, indicating non-linear separability, while Logistic Regression and Sklearn Perceptron performed well, suggesting linear separability.
