<a href="https://colab.research.google.com/github/alfredqbit/NU-DDS-8515/blob/main/sepulvedaADDS-8515-3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PCA and Logistic Regression on the UCI Wine Dataset

This notebook implements the steps described in the LaTeX report:

- Load and explore the UCI Wine dataset
- Standardize features
- Compute PCA (covariance, eigenvalues, eigenvectors)
- Visualize eigenvalues (scree plot) and PC scores
- Train logistic regression on original features
- Train logistic regression on PCA-transformed features
- Compare performance and simple training times

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)
from sklearn.pipeline import Pipeline
import time


# Use one fixed default figure size (width, height) for every plot in this notebook.
plt.rcParams.update({"figure.figsize": (6, 4)})

## Step 1: Dataset Selection, Loading, and EDA

In [None]:
# Load the Wine dataset bundled with scikit-learn and keep the pieces that
# later cells rely on: feature matrix, labels, and the human-readable names.
wine = load_wine()
X = wine.data
y = wine.target
feature_names = wine.feature_names
target_names = wine.target_names

# Tabular view used only for exploration; modelling below works on X / y directly.
df = pd.DataFrame(X, columns=feature_names)
df["target"] = y

print("Shape:", df.shape)
print("\nHead:")
display(df.head())

print("\nInfo:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()

print("\nSummary statistics:")
display(df.describe())

print("\nClass distribution:")
print(df["target"].value_counts().sort_index(), " (classes:", list(target_names), ")")

print("\nMissing values per column:")
print(df.isnull().sum())


# For this dataset, there are no missing values. We will still standardize
# all numerical features before performing PCA and training the classifier.

## Step 2: Standardization (for PCA exploration)


In [None]:
# Fit the scaler on the complete feature matrix. This standardized copy is
# used for the PCA exploration/visualization cells; the classifiers further
# down do their own scaling inside their pipelines.
scaler_full = StandardScaler().fit(X)
Z = scaler_full.transform(X)

# Wrap in a DataFrame purely to get a labelled preview of the first rows.
Z_df = pd.DataFrame(data=Z, columns=feature_names)
display(Z_df.head())

## Step 3: PCA Computation (Covariance, Eigenvalues, Scree Plot)

In [None]:
# Covariance matrix of the standardized data (each column of Z is one variable).
S = np.cov(Z, rowvar=False)
print("Covariance matrix shape:", S.shape)

# S is symmetric, so eigh is the appropriate solver. Reorder its output so the
# largest eigenvalue (and its eigenvector column) comes first.
eigvals, eigvecs = np.linalg.eigh(S)
idx = np.flip(np.argsort(eigvals))
eigvals, eigvecs = eigvals[idx], eigvecs[:, idx]

print("Eigenvalues (descending):")
print(eigvals)

# Share of the total variance carried by each component, and the running total.
explained_var = eigvals / np.sum(eigvals)
cum_explained_var = explained_var.cumsum()

print("\nProportion of variance explained:")
print(explained_var)
print("\nCumulative proportion:")
print(cum_explained_var)

# Scree plot built from the manually computed eigenvalues (components numbered from 1).
plt.figure()
plt.plot(np.arange(1, eigvals.size + 1), eigvals, marker="o")
plt.title("Scree Plot (Wine Data)")
plt.xlabel("Principal Component")
plt.ylabel("Eigenvalue")
plt.grid(True)
plt.tight_layout()
plt.savefig("scree_plot.png", dpi=300)
plt.show()

# Compute PCA with scikit-learn to obtain scores and compare explained variance.


In [None]:
# Fit a PCA that keeps every component, on the standardized full data set.
pca_full = PCA().fit(Z)

# Running total of the explained-variance ratios; reused for the printout
# and for picking the number of components below.
cum_ratio = np.cumsum(pca_full.explained_variance_ratio_)

print("Explained variance ratio (sklearn):")
print(pca_full.explained_variance_ratio_)
print("Cumulative:")
print(cum_ratio)

# Smallest number of components whose cumulative ratio reaches 95%.
# flatnonzero yields 0-based positions, hence the +1.
M_95 = np.flatnonzero(cum_ratio >= 0.95)[0] + 1
print(f"\nNumber of components to reach >= 95% variance: M = {M_95}")


## Step 4: Visualizing PC Scores (PC1 vs PC2)

In [None]:
# Project every sample onto the principal axes and pull out the first two score columns.
scores = pca_full.transform(Z)
pc1, pc2 = scores[:, 0], scores[:, 1]

plt.figure()
# One scatter call per class so each class gets its own colour and legend entry.
for class_index, class_name in enumerate(target_names):
    mask = y == class_index
    plt.scatter(pc1[mask], pc2[mask], label=class_name, alpha=0.7)
plt.title("PCA Scores: PC1 vs PC2 (Wine Data)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.savefig("pc1_pc2_scatter.png", dpi=300)
plt.show()

## Step 5: Baseline Model on Original Features

Train a multiclass logistic regression model on standardized original features.

In [None]:
# Stratified 80/20 split with a fixed seed so the split is reproducible.
# X_train must be bound here: this cell and the PCA cells that follow fit on it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling is a pipeline step, so it is fitted on the training split only.
# `multi_class` is left at its default: "auto" already was the default, and the
# parameter is deprecated in recent scikit-learn releases.
baseline_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000)),
])

# perf_counter is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can jump.
t0 = time.perf_counter()
baseline_pipeline.fit(X_train, y_train)
t1 = time.perf_counter()

y_pred_baseline = baseline_pipeline.predict(X_test)

acc_baseline = accuracy_score(y_test, y_pred_baseline)
cm_baseline = confusion_matrix(y_test, y_pred_baseline)

print("Baseline logistic regression on original features")
print("Accuracy:", acc_baseline)
print("Classification report:")
print(classification_report(y_test, y_pred_baseline, target_names=target_names))
print("Confusion matrix:")
print(cm_baseline)
print("Training time (s):", t1 - t0)

## Step 6: PCA Transformation (Train and Test)

In [None]:
# We'll use M = 10 components (around 95% variance in this dataset).
M = 10

# Scale -> project onto the first M principal components -> classify.
# Both the scaler and the PCA are fitted on the training split only, because
# they are pipeline steps. `multi_class` is omitted: "auto" is its default and
# the parameter is deprecated in recent scikit-learn releases.
pca_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=M)),
    ("clf", LogisticRegression(max_iter=1000)),
])

# perf_counter is the monotonic clock meant for timing short intervals.
t0 = time.perf_counter()
pca_pipeline.fit(X_train, y_train)
t1 = time.perf_counter()

y_pred_pca = pca_pipeline.predict(X_test)

acc_pca = accuracy_score(y_test, y_pred_pca)
cm_pca = confusion_matrix(y_test, y_pred_pca)

print(f"PCA + logistic regression (M = {M} PCs)")
print("Accuracy:", acc_pca)
print("Classification report:")
print(classification_report(y_test, y_pred_pca, target_names=target_names))
print("Confusion matrix:")
print(cm_pca)
print("Training time (s):", t1 - t0)

# For comparison, we can also examine a very aggressive reduction to only 2 PCs.


In [None]:
# Same pipeline shape as the M-component model, but keeping only the first two
# principal components. `multi_class` is omitted: "auto" is its default and the
# parameter is deprecated in recent scikit-learn releases.
pca2_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("clf", LogisticRegression(max_iter=1000)),
])

# perf_counter is the monotonic clock meant for timing short intervals.
t0 = time.perf_counter()
pca2_pipeline.fit(X_train, y_train)
t1 = time.perf_counter()

y_pred_pca2 = pca2_pipeline.predict(X_test)

acc_pca2 = accuracy_score(y_test, y_pred_pca2)
cm_pca2 = confusion_matrix(y_test, y_pred_pca2)

print("PCA + logistic regression (M = 2 PCs)")
print("Accuracy:", acc_pca2)
print("Classification report:")
print(classification_report(y_test, y_pred_pca2, target_names=target_names))
print("Confusion matrix:")
print(cm_pca2)
print("Training time (s):", t1 - t0)

## Step 8: Compare Model Performance

In [None]:
# Collect the three test accuracies in one table. The labels are derived from
# the actual feature count and from M so they cannot drift out of sync with
# the models they describe.
results = pd.DataFrame({
    "Model": [
        f"Baseline ({X.shape[1]} features)",
        f"PCA ({M} PCs)",
        "PCA (2 PCs)",
    ],
    "Accuracy": [acc_baseline, acc_pca, acc_pca2],
})

display(results)

# Zoom the y-axis to 0.8-1.0 when every accuracy fits in that window;
# otherwise lower the floor (never below 0) so no bar is hidden.
acc_floor = max(0.0, min(0.8, results["Accuracy"].min() - 0.05))

plt.figure()
plt.bar(results["Model"], results["Accuracy"])
plt.ylim(acc_floor, 1.0)
plt.ylabel("Accuracy")
plt.title("Accuracy Comparison: Original vs PCA-Transformed Features")
plt.xticks(rotation=20, ha="right")
plt.tight_layout()
plt.savefig("accuracy_comparison.png", dpi=300)
plt.show()