# Part 1: Numbers

In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score


In [75]:
# Load the digits dataset that ships with scikit-learn and split the
# returned bunch into the feature matrix (X) and the label vector (Y).
digits = datasets.load_digits()
X, Y = digits['data'], digits['target']


In [76]:
# Train-test split: hold out 30% of the samples for evaluation.
# The fixed random_state makes the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [77]:
# 1. Baseline model: decision tree fitted on the full, unreduced feature set.
#    (fit() returns the estimator itself, so construction and fitting are chained.)
dt_full = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Accuracy on the data the tree was trained on.
y_pred_full_train = dt_full.predict(X_train)
acc_full_train = accuracy_score(y_train, y_pred_full_train)

# Accuracy on the held-out split.
y_pred_full_test = dt_full.predict(X_test)
acc_full_test = accuracy_score(y_test, y_pred_full_test)

print("Accuracy with all 64 features - Train:", acc_full_train)
print("Accuracy with all 64 features - Test:", acc_full_test)

Accuracy with all 64 features - Train: 1.0
Accuracy with all 64 features - Test: 0.8425925925925926


In [78]:
# 2. Feature selection: keep the 25 features with the highest chi-square score.
#    The selector is fitted on the training split only and then applied to
#    both splits, so the test data never influences which features are kept.
kbest = SelectKBest(score_func=chi2, k=25).fit(X_train, y_train)
X_train_kbest = kbest.transform(X_train)
X_test_kbest = kbest.transform(X_test)

# Same classifier configuration as the baseline, trained on the selected features.
dt_kbest = DecisionTreeClassifier(random_state=42).fit(X_train_kbest, y_train)

# Accuracy on the training split.
y_pred_kbest_train = dt_kbest.predict(X_train_kbest)
acc_kbest_train = accuracy_score(y_train, y_pred_kbest_train)

# Accuracy on the held-out split.
y_pred_kbest_test = dt_kbest.predict(X_test_kbest)
acc_kbest_test = accuracy_score(y_test, y_pred_kbest_test)

print("Accuracy with 25 selected features (Chi-Square) - Train:", acc_kbest_train)
print("Accuracy with 25 selected features (Chi-Square) - Test:", acc_kbest_test)

Accuracy with 25 selected features (Chi-Square) - Train: 1.0
Accuracy with 25 selected features (Chi-Square) - Test: 0.8629629629629629


In [79]:
# 3. Feature Reduction using PCA (Top 25 components)
# The projection is learned from the training split only and then applied to
# both splits, so the test data does not leak into the components.
#
# random_state is fixed because svd_solver='auto' may pick the randomized SVD
# solver depending on the scikit-learn version and the data shape; without a
# seed the components (and therefore the accuracies below) could vary between
# runs. The seed matches the one used for the split and the classifiers.
pca = PCA(n_components=25, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Same classifier configuration as the other models, trained on the components.
dt_pca = DecisionTreeClassifier(random_state=42)
dt_pca.fit(X_train_pca, y_train)

# Predictions and accuracy on the training and held-out splits.
y_pred_pca_train = dt_pca.predict(X_train_pca)
y_pred_pca_test = dt_pca.predict(X_test_pca)
acc_pca_train = accuracy_score(y_train, y_pred_pca_train)
acc_pca_test = accuracy_score(y_test, y_pred_pca_test)
print("Accuracy with 25 PCA components - Train:", acc_pca_train)
print("Accuracy with 25 PCA components - Test:", acc_pca_test)


Accuracy with 25 PCA components - Train: 1.0
Accuracy with 25 PCA components - Test: 0.8407407407407408


In [80]:
# Compare models: one summary line per model, accuracies rounded to 4 decimals.
# A single print with a newline separator emits the same text as four prints.
print(
    "\nModel Comparisons:",
    f"Full feature model accuracy - Train: {acc_full_train:.4f}, Test: {acc_full_test:.4f}",
    f"Chi-Square selected feature model accuracy - Train: {acc_kbest_train:.4f}, Test: {acc_kbest_test:.4f}",
    f"PCA reduced feature model accuracy - Train: {acc_pca_train:.4f}, Test: {acc_pca_test:.4f}",
    sep="\n",
)


Model Comparisons:
Full feature model accuracy - Train: 1.0000, Test: 0.8426
Chi-Square selected feature model accuracy - Train: 1.0000, Test: 0.8630
PCA reduced feature model accuracy - Train: 1.0000, Test: 0.8407


## Which Model is More Likely to Overfit or Underfit?
- **Full feature model (64 features):** This model is prone to overfitting because, with all 64 features available, the unpruned tree can fit noise along with genuine patterns. Its perfect training accuracy (1.00) against a test accuracy of 0.8426 confirms that it overfits.
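- **Chi-Square selection (25 features):** This model achieves the highest test accuracy (0.8630), suggesting it keeps the most relevant features while discarding noisy ones. Its gap between training and test accuracy is the smallest of the three, indicating somewhat better generalization, although a training accuracy of 1.00 shows that it still overfits.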
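- **PCA (25 components):** PCA keeps the directions of highest variance, which are not necessarily the most useful for separating the classes, so some discriminative information may be lost. Because the training accuracy is still 1.00, the model is overfitting rather than underfitting; its test accuracy (0.8407) is marginally below that of the full model.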

## Model Comparison
| Model | Features | Train Accuracy | Test Accuracy |
|--------|----------|----------------|--------------|
| Full Model | 64 | 1.00 | 0.8426 |
| Chi-Square | 25 | 1.00| 0.8630 |
| PCA | 25 | 1.00 | 0.8407 |

- The **full model** is likely overfitting, as the train accuracy is significantly higher than the test accuracy.
- The **Chi-Square model is the best performer**, achieving the highest test accuracy and the smallest train–test gap, although it still overfits (train accuracy 1.00 vs. test accuracy 0.8630).
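- The **PCA model has the lowest test accuracy**, likely due to loss of useful information in the projection. Its training accuracy is still 1.00, so this reflects overfitting with slightly less informative inputs rather than underfitting.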