**Import Libraries**

In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image

from skimage.io import imread
from skimage.transform import resize
from sklearn.dummy import DummyClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, f1_score

import matplotlib.pyplot as plt

**Load and Preprocess Data**

In [2]:
def load_images_from_csv(data_dir, csv_path, image_size=(50, 50), limit=None):
    """Load the images listed in a CSV file as RGB arrays with their labels.

    The CSV must provide an ``image_id`` column (a file name resolved against
    ``<data_dir>/train_images``) and a ``label`` column. Every image is
    converted to RGB and resized with bilinear interpolation. The number of
    samples per class is printed as a side effect.

    Parameters
    ----------
    data_dir : str
        Directory that contains the ``train_images`` folder.
    csv_path : str
        Path of the CSV file listing image file names and labels.
    image_size : tuple of int, default (50, 50)
        Target size handed to ``PIL.Image.resize``, i.e. ``(width, height)``.
    limit : int or None, default None
        When given, only the first ``limit`` rows of the CSV are loaded.

    Returns
    -------
    X : numpy.ndarray
        Pixel data of shape ``(n_samples, height, width, 3)``. When no rows
        are loaded, an empty ``uint8`` array with that same trailing shape.
    y : numpy.ndarray
        Labels of shape ``(n_samples,)``, in CSV row order.
    """
    df = pd.read_csv(csv_path)
    if limit is not None:
        df = df.head(limit)

    # The image folder is the same for every row, so build it once.
    image_dir = os.path.join(data_dir, 'train_images')

    all_images = []
    all_labels = []

    # Walk the two needed columns directly; iterrows() would build a full
    # Series for every row only to read these two values back out of it.
    for image_id, label in zip(df['image_id'], df['label']):
        with Image.open(os.path.join(image_dir, image_id)) as pil_img:
            rgb_img = pil_img.convert('RGB').resize(image_size, Image.BILINEAR)
            # Materialize the pixels while the source file is still open.
            all_images.append(np.array(rgb_img))
        all_labels.append(label)

    if all_images:
        X = np.array(all_images)
    else:
        # Keep the 4-D image layout even with zero samples. PIL sizes are
        # (width, height) whereas the arrays are (height, width, channels).
        width, height = image_size
        X = np.empty((0, height, width, 3), dtype=np.uint8)
    y = np.array(all_labels)

    class_counts = pd.Series(y).value_counts()

    print("Class distribution (number of samples per class):")
    for class_label, count in class_counts.items():
        print(f"  Class '{class_label}': {count} samples")

    return X, y

# Dataset location and preprocessing parameters
data_dir = '.'
csv_path = os.path.join(data_dir, 'train.csv')
IMAGE_SIZE = (32, 32)

# Read every image listed in the CSV, resized to IMAGE_SIZE
X, y = load_images_from_csv(
    data_dir=data_dir,
    csv_path=csv_path,
    image_size=IMAGE_SIZE,
)

Class distribution (number of samples per class):
  Class '3': 13158 samples
  Class '4': 2577 samples
  Class '2': 2386 samples
  Class '1': 2189 samples
  Class '0': 1087 samples


**Encode Labels and Normalize Pixel Values**

In [3]:
# Map the raw labels to integer codes; class_names[i] is the original label
# that code i stands for.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
class_names = label_encoder.classes_

# Normalize pixel values to [0, 1].
# Scale only while X still holds integer pixel data: the division turns X into
# a float array, so re-running this cell will not divide by 255 a second time.
if np.issubdtype(X.dtype, np.integer):
    X = X / 255.0

**Split Data**

In [4]:
# Keep 60% of the samples for training and hold out the remaining 40%
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y_encoded,
    test_size=0.4,
    random_state=42,
)

# Halve the held-out part into validation and test sets (20% of the data each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

**Define the Model and Set Strategy (most frequent)**

In [5]:
# Baseline model: always predicts the most frequent label seen during fit
dummy_clf = DummyClassifier(
    strategy='most_frequent',
)

**Train the Model**

In [6]:
# Fit the baseline on the training split
dummy_clf.fit(X=X_train, y=y_train)

**Evaluate the Model**

In [8]:
# Make predictions on the training and test sets
y_pred_train = dummy_clf.predict(X_train)  # Predictions on training set
y_pred_test = dummy_clf.predict(X_test)    # Predictions on test set

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
# The baseline predicts a single class, so the per-class scores of all other
# classes are undefined. zero_division=0 scores them as 0, the same setting
# classification_report uses below, and keeps the metric from emitting an
# undefined-metric warning (whether it would warn depends on the
# scikit-learn version).
f1_test = f1_score(y_test, y_pred_test, average='weighted', zero_division=0)

print(f'Training Accuracy: {accuracy_train:.3f}')
print(f'Test Accuracy: {accuracy_test:.3f}')
print(f'F1 Score: {f1_test:.3f}')
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))


Training Accuracy: 0.607
Test Accuracy: 0.634
F1 Score: 0.492
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       212
           1       0.00      0.00      0.00       424
           2       0.00      0.00      0.00       437
           3       0.63      1.00      0.78      2715
           4       0.00      0.00      0.00       492

    accuracy                           0.63      4280
   macro avg       0.13      0.20      0.16      4280
weighted avg       0.40      0.63      0.49      4280

