# Assignment 2
## CS6140 Machine Learning
## Zhiruo Zhao

## Task 2: Classification Task
### Dataset: Breast Cancer Wisconsin (Diagnostic)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# 1. Data Preprocessing:
- Load and clean the data.
- Normalize the features if necessary.
- Apply appropriate preprocessing suitable for classification problem.
- Split the data into training and test sets.

The dataset contains 569 entries with 32 columns. The key observations are:

The ID column is not useful for classification and should be dropped.
The Diagnosis column is the target variable, with values "M" (Malignant) and "B" (Benign).
The remaining 30 columns are numerical features.

In [8]:
# Load the dataset
data_path = 'breast+cancer+wisconsin+diagnostic/wdbc.data'
names_path = 'breast+cancer+wisconsin+diagnostic/wdbc.names'

# Based on the Breast Cancer Wisconsin (Diagnostic) dataset structure:
# The dataset has the following structure:
# ID, Diagnosis (M = malignant, B = benign), followed by 30 real-valued features
column_names = [
    'ID', 'Diagnosis', 'Radius_mean', 'Texture_mean', 'Perimeter_mean', 'Area_mean',
    'Smoothness_mean', 'Compactness_mean', 'Concavity_mean', 'Concave_points_mean',
    'Symmetry_mean', 'Fractal_dimension_mean', 'Radius_se', 'Texture_se',
    'Perimeter_se', 'Area_se', 'Smoothness_se', 'Compactness_se', 'Concavity_se',
    'Concave_points_se', 'Symmetry_se', 'Fractal_dimension_se', 'Radius_worst',
    'Texture_worst', 'Perimeter_worst', 'Area_worst', 'Smoothness_worst',
    'Compactness_worst', 'Concavity_worst', 'Concave_points_worst', 'Symmetry_worst',
    'Fractal_dimension_worst'
]

# Load the data
df = pd.read_csv(data_path, names=column_names)

# Display the first few rows of the dataset
df.head()

Unnamed: 0,ID,Diagnosis,Radius_mean,Texture_mean,Perimeter_mean,Area_mean,Smoothness_mean,Compactness_mean,Concavity_mean,Concave_points_mean,...,Radius_worst,Texture_worst,Perimeter_worst,Area_worst,Smoothness_worst,Compactness_worst,Concavity_worst,Concave_points_worst,Symmetry_worst,Fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [9]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Drop the ID column
df = df.drop(columns=["ID"])

# Encode the Diagnosis column
df["Diagnosis"] = LabelEncoder().fit_transform(df["Diagnosis"])  # M -> 1, B -> 0

# Separate features and target variable
X = df.drop(columns=["Diagnosis"])
y = df["Diagnosis"]

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Display the shapes of the datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((455, 30), (114, 30), (455,), (114,))

# 2. Implement Gaussian NaiveBayes (GNB) and Gaussian Discriminant Analysis (GDA):
- Use shared co-variance as well as class specific co-variance for GDA
- Implement the fit and predict functions.
- Train the model on the training set.
- Evaluate the model on the test set

In [7]:

class GaussianNaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.mean = {}
        self.var = {}
        self.priors = {}

        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = X_c.mean(axis=0)
            self.var[c] = X_c.var(axis=0) + 1e-9  # 避免除零
            self.priors[c] = X_c.shape[0] / X.shape[0]

    def predict(self, X):
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        posteriors = []

        for c in self.classes:
            prior = np.log(self.priors[c])
            likelihood = -0.5 * np.sum(np.log(2 * np.pi * self.var[c]))
            likelihood -= 0.5 * np.sum(((x - self.mean[c]) ** 2) / self.var[c])
            posterior = prior + likelihood
            posteriors.append(posterior)

        return self.classes[np.argmax(posteriors)]

# Train GNB Model
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)

# Predict
y_pred_gnb = gnb.predict(X_test)

# Accuracy
from sklearn.metrics import accuracy_score

accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
print(accuracy_gnb)

0.9649122807017544


In [10]:
import numpy as np

class GaussianDiscriminantAnalysis:
    def fit(self, X, y, shared_covariance=True):
        self.classes = np.unique(y)
        self.means = {c: np.mean(X[y == c], axis=0) for c in self.classes}
        self.priors = {c: len(X[y == c]) / len(X) for c in self.classes}
        
        if shared_covariance:
            # Shared covariance matrix for all classes
            self.covariance = np.cov(X.T)
        else:
            # Separate covariance matrix for each class
            self.covariances = {c: np.cov(X[y == c].T) for c in self.classes}
        
        self.shared_covariance = shared_covariance

    def predict(self, X):
        posteriors = np.zeros((X.shape[0], len(self.classes)))

        for i, c in enumerate(self.classes):
            mean = self.means[c]
            prior = np.log(self.priors[c])
            
            if self.shared_covariance:
                cov_inv = np.linalg.inv(self.covariance)
                determinant = np.linalg.det(self.covariance)
            else:
                cov_inv = np.linalg.inv(self.covariances[c])
                determinant = np.linalg.det(self.covariances[c])
            
            diff = X - mean
            likelihood = -0.5 * np.log(determinant) - 0.5 * np.sum(diff @ cov_inv * diff, axis=1)
            
            posteriors[:, i] = prior + likelihood

        return self.classes[np.argmax(posteriors, axis=1)]

# Train and evaluate GDA with shared covariance
gda_shared = GaussianDiscriminantAnalysis()
gda_shared.fit(X_train, y_train, shared_covariance=True)
y_pred_gda_shared = gda_shared.predict(X_test)
gda_shared_accuracy = np.mean(y_pred_gda_shared == y_test)

# Train and evaluate GDA with class-specific covariance
gda_class_specific = GaussianDiscriminantAnalysis()
gda_class_specific.fit(X_train, y_train, shared_covariance=False)
y_pred_gda_class_specific = gda_class_specific.predict(X_test)
gda_class_specific_accuracy = np.mean(y_pred_gda_class_specific == y_test)

print(f"GDA (Shared Covariance) Accuracy: {gda_shared_accuracy:.4f}")
print(f"GDA (Class-Specific Covariance) Accuracy: {gda_class_specific_accuracy:.4f}")

GDA (Shared Covariance) Accuracy: 0.9035
GDA (Class-Specific Covariance) Accuracy: 0.9474
