### Libraries

In [1]:
import datetime
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from tqdm import tqdm
from sklearn.base import clone
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score

### Seed Configuration

In [2]:
# Single seed shared by NumPy's global RNG and by every call in this notebook
# that is given random_state=SEED, so a fresh Restart & Run All is repeatable.
# (tqdm is already imported in the Libraries cell, so it is not re-imported here.)
SEED = 385433  # my G number
np.random.seed(SEED)


### Read in Dataset

In [3]:

# Define dataset path (one sub-folder per class, relative to the notebook)
DATASET_PATH = "fashion"
# Classes loaded in this run; a class's position in this list is its integer label.
# Left out of this run: 'sandal', 'shirt', 'sneaker', 'trouser', 'tshirt-top'
CATEGORIES = ['ankleboot', 'bag', 'coat', 'dress', 'pullover']

# Load images and labels with progress bar
X, y = [], []
for label, category in enumerate(CATEGORIES):
    folder_path = os.path.join(DATASET_PATH, category)
    # os.listdir returns entries in arbitrary order. Sorting fixes the sample
    # order, so the seeded splits further down pick the same rows on every
    # machine and filesystem.
    files = sorted(os.listdir(folder_path))
    for file in tqdm(files, desc=f"Loading {category}"):
        img_path = os.path.join(folder_path, file)
        # The context manager releases the file handle as soon as the pixels
        # have been copied out, instead of leaving it to garbage collection.
        with Image.open(img_path) as img:
            img_resized = img.convert('L').resize((28, 28))  # grayscale, 28x28
            X.append(np.array(img_resized).flatten())  # 784-element pixel vector
        y.append(label)

# Stack into arrays: one row of X per image, one label in y per row
X = np.array(X)
y = np.array(y)

print('Read in dataset')

Loading ankleboot: 100%|█████████████████████████████████████████████████████████| 7000/7000 [00:01<00:00, 4708.65it/s]
Loading bag: 100%|███████████████████████████████████████████████████████████████| 7000/7000 [00:01<00:00, 4394.36it/s]
Loading coat: 100%|██████████████████████████████████████████████████████████████| 7000/7000 [00:01<00:00, 3842.03it/s]
Loading dress: 100%|█████████████████████████████████████████████████████████████| 7000/7000 [00:01<00:00, 3881.49it/s]
Loading pullover: 100%|██████████████████████████████████████████████████████████| 7000/7000 [00:01<00:00, 3992.99it/s]

Read in dataset





### Split Dataset into Train, Validation, and Test Sets

In [4]:
# Two-stage stratified split: hold out 20% of all samples for testing, then
# hold out 20% of what remains for validation. Both stages use the same seed.
print("Splitting dataset...")
split_options = dict(test_size=0.2, random_state=SEED, shuffle=True)
with tqdm(total=2, desc="Splitting Data") as split_bar:
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_options)
    split_bar.update(1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, **split_options)
    split_bar.update(1)

# Report the size of each partition
partitions = (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
)
for set_name, features, targets in partitions:
    print(f"{set_name} Set Shape:", features.shape, targets.shape)

# Fraction of training samples per class label
train_class_share = pd.Series(y_train).value_counts(normalize=True)
print("Class Distribution in Training Set:")
print(train_class_share)


Splitting dataset...


Splitting Data: 100%|████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 44.52it/s]

Training Set Shape: (22400, 784) (22400,)
Validation Set Shape: (5600, 784) (5600,)
Test Set Shape: (7000, 784) (7000,)
Class Distribution in Training Set:
0    0.2
2    0.2
1    0.2
3    0.2
4    0.2
Name: proportion, dtype: float64





### Define Models to Train

In [5]:
# Candidate classifiers. Estimators that accept random_state are given SEED.
svm_clf = SVC(kernel='rbf', random_state=SEED)
logreg_clf = LogisticRegression(max_iter=1000, random_state=SEED)
knn_clf = KNeighborsClassifier(n_neighbors=5)
forest_clf = RandomForestClassifier(n_estimators=100, random_state=SEED)

# Display name -> estimator; insertion order is the order they are trained in.
models = dict([
    ("SVM", svm_clf),
    ("Logistic Regression", logreg_clf),
    ("kNN", knn_clf),
    ("Random Forest", forest_clf),
])

### Train Models and Evaluate Performance

In [6]:
# Display the (name, estimator) pairs as a quick check of what will be trained
models.items()

dict_items([('SVM', SVC(random_state=385433)), ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=385433)), ('kNN', KNeighborsClassifier()), ('Random Forest', RandomForestClassifier(random_state=385433))])

In [None]:
# Timestamp the run so the durations logged below can be tied to a session.
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Cost probe: time each model on the first 500 training rows before committing
# to the full fit. The probe fits a clone (same hyperparameters, no fitted
# state), so the estimators in `models` are never left trained on the subset
# if the cell is interrupted.
print("Checking model training times...")
for name, model in models.items():
    probe = clone(model)
    start_time = time.perf_counter()  # monotonic clock, suited to measuring intervals
    probe.fit(X_train[:500], y_train[:500])  # Train on small subset
    end_time = time.perf_counter()
    print(f"{name} took {end_time - start_time:.2f} seconds on a small subset.")

# Train models and evaluate performance
results = {}            # model name -> accuracy on the validation set
cross_val_results = {}  # model name -> mean 3-fold CV accuracy on the training set

print("Training Models...")

# One progress-bar tick per model. fit() and cross_val_score() are single
# blocking calls, so wrapping them in their own bars would show no real progress.
# Messages go through tqdm.write so they do not break up the live bar.
for name, model in tqdm(models.items(), desc="Training Progress", leave=True):
    tqdm.write(f"\nTraining {name}...")

    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    end_time = time.perf_counter()
    tqdm.write(f"{name} training took {end_time - start_time:.2f} seconds.")

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

    # 3-fold cross-validation on the training set; cross_val_score fits its
    # own copies of the estimator, so `model` stays fitted on all of X_train.
    cross_val_scores = list(cross_val_score(model, X_train, y_train, cv=3))
    for score in cross_val_scores:
        tqdm.write(str(score))

    cross_val_score_avg = np.mean(cross_val_scores)

    results[name] = accuracy_score(y_val, y_pred)
    cross_val_results[name] = cross_val_score_avg

    tqdm.write(f"{name} Classification Report:\n {classification_report(y_val, y_pred)}")

print("\nModel training complete!")


2025-03-21 16:20:50
Checking model training times...
SVM took 0.04 seconds on a small subset.
Logistic Regression took 0.72 seconds on a small subset.
kNN took 0.00 seconds on a small subset.
Random Forest took 0.84 seconds on a small subset.
Training Models...


Training Progress:   0%|                                                                         | 0/4 [00:00<?, ?it/s]


Training SVM...



Fitting SVM:   0%|                                                                               | 0/1 [00:00<?, ?it/s][A
Fitting SVM: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:42<00:00, 42.32s/it][A
                                                                                                                       [A

SVM training took 42.33 seconds.


### Compare Model Performance (Accuracy and Cross-Validation)

In [None]:
# Tabulate the accuracies collected during training (one row per model)
results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy'])
cross_val_df = pd.DataFrame.from_dict(cross_val_results, orient='index', columns=['Cross-Val Accuracy'])

for heading, table in (("\nModel Performance:", results_df),
                       ("\nCross-Validation Performance:", cross_val_df)):
    print(heading)
    print(table)

# One bar chart per table: (table, column to plot, colour palette, title).
# The plotted column name doubles as the y-axis label.
chart_specs = [
    (results_df, "Accuracy", "viridis", "Model Accuracy Comparison"),
    (cross_val_df, "Cross-Val Accuracy", "coolwarm", "Cross-Validation Accuracy Comparison"),
]
for table, column, palette, title in chart_specs:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(x=table.index, y=table[column], palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(column)
    plt.show()


### Evaluate the Best Model on Test Set

In [None]:
# Pick the model with the highest validation accuracy and score it once on the
# held-out test set.
best_model_name = results_df["Accuracy"].idxmax()
best_model = models[best_model_name]
y_test_pred = best_model.predict(X_test)

test_report = classification_report(y_test, y_test_pred)
print(
    f"\nBest Model: {best_model_name}",
    "Test Set Classification Report:",
    test_report,
    sep="\n",
)
