In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingClassifier
from skimage.feature import graycomatrix, graycoprops, canny, corner_harris, hog
import cv2
import time

# Utility Functions
def load_image(image_path):
    """Read the image at ``image_path`` with matplotlib.

    Returns the array produced by ``plt.imread``. If the file does not
    exist, an error line is printed and ``None`` is returned instead of
    raising, so callers must be prepared for a ``None`` result.
    """
    try:
        pixels = plt.imread(image_path)
    except FileNotFoundError:
        # Best-effort: report the missing file and signal failure with None.
        print(f"Error: {image_path} not found.")
        return None
    return pixels

def flatten_image(image):
    """Return a one-dimensional copy of ``image``.

    The elements are laid out in row-major (C) order; the input array is
    left untouched because ``flatten`` always produces a new array.
    """
    flat_pixels = image.flatten(order="C")
    return flat_pixels

def _load_and_flatten(image_path):
    """Load one image and return it flattened, failing loudly if unreadable."""
    image = load_image(image_path)
    if image is None:
        # load_image signals a missing file by returning None; flattening
        # that would raise an opaque AttributeError inside a worker, so
        # raise an error that names the offending path instead.
        raise FileNotFoundError(f"Image could not be loaded: {image_path}")
    return flatten_image(image)


def load_images(image_names, directory):
    """Load and flatten multiple images in parallel.

    Args:
        image_names: Iterable of image file names (e.g. a DataFrame column).
        directory: Directory that contains the files; each path is built
            as ``"{directory}/{name}"``.

    Returns:
        A list with one flattened image array per entry of ``image_names``,
        in the same order.

    Raises:
        FileNotFoundError: If any image could not be loaded. Failing here
            keeps the result aligned one-to-one with ``image_names``.
    """
    return Parallel(n_jobs=-1)(
        delayed(_load_and_flatten)(f"{directory}/{img_name}")
        for img_name in image_names
    )

# Feature Extraction
def extract_features(image, image_shape=(32, 32, 3)):
    """Extract texture, edge, corner and HOG features from one flattened image.

    Args:
        image: Flattened RGB image array; it is reshaped to ``image_shape``.
        image_shape: ``(height, width, 3)`` shape of the original image.
            Defaults to ``(32, 32, 3)``.

    Returns:
        A 1-D array that concatenates, in order:
        the six GLCM properties (contrast, dissimilarity, homogeneity, ASM,
        energy, correlation), each evaluated at four angles; the flattened
        Canny edge map; the flattened Harris corner response; and the HOG
        descriptor.
    """
    gray = cv2.cvtColor(image.reshape(image_shape), cv2.COLOR_RGB2GRAY)
    # Stretch intensities to 0..255 and force an 8-bit result. Without the
    # explicit dtype a float input stays float after normalisation, and
    # graycomatrix only accepts integer images. For uint8 input this is the
    # same output as before.
    gray = cv2.normalize(
        gray, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U
    )

    # Grey-level co-occurrence matrix at distance 1 for 0, 45, 90 and 135 degrees.
    glcm = graycomatrix(gray, distances=[1], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4])
    glcm_props = [
        graycoprops(glcm, prop).flatten()
        for prop in ['contrast', 'dissimilarity', 'homogeneity', 'ASM', 'energy', 'correlation']
    ]

    # Per-pixel edge map and corner response, plus a block-normalised HOG descriptor.
    edges = canny(gray, sigma=1).flatten()
    corners = corner_harris(gray).flatten()
    hog_feature = hog(gray, pixels_per_cell=(8, 8), cells_per_block=(2, 2), block_norm='L2-Hys').flatten()

    return np.concatenate(glcm_props + [edges, corners, hog_feature])

# Preprocessing
def preprocess_data(train_csv, test_csv, train_dir='train_ims', test_dir='test_ims'):
    """Load the train/test images and build the combined feature matrix.

    Args:
        train_csv: Path to the training CSV; must contain an ``im_name`` column.
        test_csv: Path to the test CSV; must contain an ``im_name`` column.
        train_dir: Directory holding the training images.
        test_dir: Directory holding the test images.

    Returns:
        A tuple ``(concat_feature, train_data, test_data)``.
        ``concat_feature`` has one row per image, training rows first and
        test rows after them; each row is the extracted features followed by
        the flattened pixels. ``train_data`` and ``test_data`` are the loaded
        DataFrames with an added ``image`` column holding the flattened
        image arrays.
    """
    train_data = pd.read_csv(train_csv)
    test_data = pd.read_csv(test_csv)

    # Load and flatten images
    train_images = load_images(train_data['im_name'], train_dir)
    test_images = load_images(test_data['im_name'], test_dir)
    train_data['image'] = train_images
    test_data['image'] = test_images

    # Training images first, then test images; callers split the returned
    # matrix at train_data.shape[0].
    all_images = list(train_images) + list(test_images)

    # Extract features for all images (iterate directly, no positional lookups)
    sec_feature_list = Parallel(n_jobs=-1)(
        delayed(extract_features)(image) for image in tqdm(all_images)
    )

    # Combine secondary and image features
    concat_sec_feature = np.vstack(sec_feature_list)
    concat_image_feature = np.vstack(all_images)
    concat_feature = np.hstack([concat_sec_feature, concat_image_feature])

    return concat_feature, train_data, test_data

# Standardization
def standardize_features(concat_feature):
    """Scale every feature column with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``concat_feature`` itself and the transformed
    matrix is returned; the fitted scaler is not kept.
    """
    scaler = StandardScaler()
    scaler.fit(concat_feature)
    return scaler.transform(concat_feature)

# Dimensionality Reduction
def apply_dimensionality_reduction(concat_feature_sc, n_components=10, random_state=42):
    """Project the standardized features onto their leading principal components.

    Args:
        concat_feature_sc: 2-D feature matrix (rows are samples).
        n_components: Number of principal components to keep.
        random_state: Seed passed to PCA. Depending on the input size PCA
            may use a randomized SVD solver; fixing the seed keeps the
            embedding reproducible from run to run, in line with the seeded
            classifier used later in the pipeline. Pass ``None`` for an
            unseeded run.

    Returns:
        Array with one row per sample and ``n_components`` columns.
    """
    reducer = PCA(n_components=n_components, random_state=random_state)
    return reducer.fit_transform(concat_feature_sc)

# Model Training
def train_model(X_train, y_train):
    """Fit a ``HistGradientBoostingClassifier`` and return the fitted model.

    ``X_train`` is the training feature matrix and ``y_train`` the matching
    labels. The hyperparameters are fixed below.
    """
    hyperparams = {
        "learning_rate": 0.1,     # shrinkage applied to each boosting step
        "max_iter": 200,          # number of boosting iterations
        "max_depth": 10,          # upper bound on the depth of each tree
        "min_samples_leaf": 5,    # minimum number of samples per leaf
        "random_state": 42,       # fixed seed for repeatable training
    }
    classifier = HistGradientBoostingClassifier(**hyperparams)
    classifier.fit(X_train, y_train)
    return classifier

# Save Predictions
def save_predictions(model, X_test, test_data, output_file):
    """Predict labels for ``X_test`` and write them to ``output_file`` as CSV.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix with one row per row of ``test_data``.
        test_data: DataFrame describing the test samples; must contain an
            ``image`` column, which is left out of the written file.
        output_file: Destination path of the CSV (written without the index).

    The caller's ``test_data`` is not modified: the ``label`` column is
    added to the copy returned by ``drop`` rather than to the input frame.
    """
    preds = model.predict(X_test)
    submission = test_data.drop(columns=['image'])
    submission['label'] = preds
    submission.to_csv(output_file, index=False)

# Main Workflow
def main():
    """Run the whole pipeline and write the predictions to ``submit.csv``.

    Steps: preprocess the images listed in ``train.csv`` and ``test.csv``,
    standardize the features, compute a 10-component PCA embedding, append
    it to the standardized features, train the classifier on the training
    rows, predict the test rows and save them. Progress and the total
    elapsed time are printed along the way.
    """
    started_at = time.time()

    print("Loading and preprocessing data...")
    raw_features, train_data, test_data = preprocess_data("train.csv", "test.csv")
    print("Done preprocessing")

    print("Standardizing features...")
    scaled_features = standardize_features(raw_features)
    print("Done standardizing")

    print("Applying dimensionality reduction...")
    embedding = apply_dimensionality_reduction(scaled_features, n_components=10)
    print("Done dimensionality reduction")

    # Standardized features with the reduced embedding appended as extra columns.
    combined = np.hstack([scaled_features, embedding])

    # Training rows come first in the combined matrix; the rest are test rows.
    n_train = train_data.shape[0]

    print("Training the model...")
    model = train_model(combined[:n_train], train_data['label'])
    print("Model training complete")

    print("Saving predictions to submit.csv...")
    test_matrix = np.hstack([scaled_features[n_train:], embedding[n_train:]])
    save_predictions(model, test_matrix, test_data, "submit.csv")
    print("Predictions saved successfully.")

    finished_at = time.time()
    print("Total time taken: {:.2f} seconds".format(finished_at - started_at))

# Run the workflow only when this file is executed directly
# (``__name__`` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading and preprocessing data...


100%|██████████| 60000/60000 [05:03<00:00, 198.00it/s]


Done preprocessing
Standardizing features...
Done standardizing
Applying dimensionality reduction...
Done dimensionality reduction
Training the model...
Model training complete
Saving predictions to submit.csv...
Predictions saved successfully.
Total time taken: 1449.74 seconds
