In [5]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import _name_estimators
import numpy as np
import operator
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights
        
    def fit(self, X, y):
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel' ; got (vote=%r)" % self.vote)
        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers' % (len(self.weights), len(self.classifiers)))
    
        self.classifiers_ = []
        for clf in self.classifiers:
            binary_labels = (y == clf['label']).astype(int)
            fitted_clf = clone(clf['model']).fit(X, binary_labels)
            self.classifiers_.append({'model':fitted_clf, 'label': clf['label']})
        return self
    

    def predict(self, X):
        # print(X[0])
        # Using decision function for class predictions
        predictions = np.asarray([clf['model'].predict(X) for clf in self.classifiers_]).T
        maj_vote = np.apply_along_axis(lambda x: np.argmax(np.bincount(x, weights=self.weights)), axis=1, arr=predictions)
        print(predictions)
        print(maj_vote)
        return maj_vote
        print("Sample Predictions:", predictions)

        # Perform majority voting
        maj_vote.append(np.bincount(predictions).argmax())
        return maj_vote
    
    def evaluate(self, X_test, Y_test):
        y_predict = self.predict(X_test)
        return accuracy_score(Y_test, y_predict)


In [6]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import accuracy_score
import numpy.core.defchararray as np_f
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys
from skopt import BayesSearchCV
from sklearn.multiclass import OneVsRestClassifier


def mix_aug_data(X_train, Y_train, AUG_NAME_MODIFIER):
    # print("\nMIXING AUGMENT\n")
    image_paths = X_train.flatten() 
    augmented_image_paths =    augmented_image_paths = np.array([path.replace('.jpg', AUG_NAME_MODIFIER) for path in image_paths])
    augmented_image_paths = augmented_image_paths.reshape(X_train.shape)
    result_X_train = np.concatenate((augmented_image_paths, X_train), axis=0)
    result_Y_train = np.concatenate((Y_train, Y_train), axis=0)
    return result_X_train, result_Y_train




def train_and_evaluate(X_train, Y_train, X_eval, Y_eval, model):
    """Train the KNN model and evaluate it on the validation set."""
    model.fit(X_train, Y_train)
    y_pred_validation = model.predict(X_eval)
    return accuracy_score(Y_eval, y_pred_validation)

def gridSearch(X_train, Y_train, X_eval, Y_eval):
    #grid search
    best_score = 0
    best_params = None
    # print(Y_eval)
    # print(binary_labels_eval)


    # Iterate over each parameter combination
    for params in param_combinations:
        # Create a new KNN model with the current parameters
        model = OneVsRestClassifier(SVC(**params))
        # Train the model
        model.fit(X_train, Y_train)
        # Validate the model
        y_pred_eval = model.predict(X_eval)
    
        validation_accuracy = accuracy_score(Y_eval, y_pred_eval)

        # Check if this is the best score
        if validation_accuracy > best_score:
            best_score = validation_accuracy
            best_params = params

    return best_score, best_params


def get_data_dict(X):

    print("\nFETCHING IMAGES FROM DIRECTORY\n")

    image_paths = [f"train_new_ims/{img[0]}" for img in X]
    img_dict = {}
    total_images = len(image_paths)

    def progress_callback(future):
        """Callback function to update progress."""
        nonlocal loaded_images
        loaded_images += 1
        percentage = (loaded_images / total_images) * 100
        print('\r\033[K', end='')
        print(f"\rProgress: {percentage:.2f}%", end='')

    loaded_images = 0

    # Use ThreadPoolExecutor to load images in parallel
    with ThreadPoolExecutor() as executor:
        # Submit tasks to load images
        futures = {executor.submit(load_image, path): path for path in image_paths}
        
        # Attach a callback to each future to update progress
        for future in futures:
            future.add_done_callback(progress_callback)

        # Collect results
        for future in as_completed(futures):
            img_path = futures[future]
            img_dict[img_path.split('/')[-1]] = future.result()  # Store the result in the dictionary
        
    print(len(list(img_dict.keys())))
    return img_dict




def load_image(path):
    img_ = tf.keras.preprocessing.image.load_img(path)  # Load image
    img_array = tf.keras.preprocessing.image.img_to_array(img_)  # Convert to array
    mean = np.mean(img_array)
    std_dev = np.std(img_array)
    img_array = (img_array - mean) / std_dev
    return img_array.flatten()




def get_data_from_dict(X_image_paths, all_images):
    # print("\nFETCHING IMAGES FROM DICTIONARY\n")
    # Extract the image keys (filenames) from X_image_paths
    img_keys = [img[0] for img in X_image_paths]
    # Fetch the corresponding pixel data from the dictionary
    img_arrays = [all_images[key] for key in img_keys if key in all_images]
    return np.array(img_arrays)








df = pd.read_csv('./train.csv')
xs = np.array(df.iloc[:, :-1])
ys = np.array(df.iloc[:, -1])
outer_cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=22)


param_grid = {
    'C': [0.01, 0.1, 0.2],  # Regularization parameter
    'kernel': ['rbf'],   # SVM kernel types
    'gamma': ['auto']      # Kernel coefficient
}

# Create a list of parameter combinations
param_combinations = [
    
    {'C': c, 'kernel': k, 'gamma': g}
    for c in param_grid['C']
    for k in param_grid['kernel']
    for g in param_grid['gamma']
]

#BAYESIAN
# param_space = {
#     'n_neighbors': (450, 500),  # Range of neighbors
#     'metric': ['minkowski'],
#     'p': (2, 5)  # 1 for Manhattan, 2 for Euclidean
# }




#########LOAD ALL IMAGES INTO MEMORY#############
#load all images into a dictionary for future access. Should be faster since images are loaded only once into memory.
all_images = get_data_dict(xs)

print(f"TOTAL SAMPLES: {len(xs)}")
total_accuracy = 0
number_of_test_rounds = 0
######### NESTED K FOLD ##############
######### TRAINING + TEST FOLDS ######
for i, (train_index, test_index) in enumerate(outer_cv.split(xs,ys)):
    #not pixels at this point, just image names.
    X_train, X_test = xs[train_index], xs[test_index]
    Y_train, Y_test = ys[train_index], ys[test_index]
    total_validation_accuracy = 0
    number_of_validation_rounds = 0

    print(f"OUTER SPLIT: training[{len(X_train)}]\ttesting[{len(X_test)}]\ttotal[{len(X_train) + len(X_test)}]")

    best_across_validation = {'best_score':None, 'best_params':None}

    ######### TRAINING + VALIDATION FOLDS ######
    for j, (inner_train_index, inner_val_index) in enumerate(inner_cv.split(X_train,Y_train)):

        X_inner_train, X_inner_val = X_train[inner_train_index], X_train[inner_val_index]
        y_inner_train, y_inner_val = Y_train[inner_train_index], Y_train[inner_val_index]
        print(f"\tINNER SPLIT: training[{len(X_inner_train)}]\tvalidation[{len(X_inner_val)}]\ttotal[{len(X_inner_train) + len(X_inner_val)}]")
        X_inner_train, y_inner_train = mix_aug_data(X_inner_train, y_inner_train, "_augmented.jpg")
        X_inner_train = get_data_from_dict(X_inner_train, all_images)
        X_inner_val = get_data_from_dict(X_inner_val, all_images)

        best_score, best_params = gridSearch(X_inner_train, y_inner_train, X_inner_val, y_inner_val)
        print(f"\tOuter Fold {i + 1}, Inner Fold {j + 1}, Best Accuracy: {best_score:.4f}")
        if (best_across_validation['best_score'] == None or best_across_validation['best_score'] < best_score):
            best_across_validation['best_score'] = best_score
            best_across_validation['best_params'] = best_params

        total_validation_accuracy += best_score
        number_of_validation_rounds += 1
    
    average_validation_acc = total_validation_accuracy/number_of_validation_rounds
    print(f"Average Validation Accuracy: {average_validation_acc:.4f}")

    X_train, Y_train = mix_aug_data(X_train, Y_train, "_augmented.jpg")
    X_train = get_data_from_dict(X_train, all_images)
    X_test = get_data_from_dict(X_test,all_images)
    best_model = OneVsRestClassifier(SVC(**best_across_validation['best_params']))
    best_model.fit(X_inner_train,y_inner_train)

    test_accuracy = train_and_evaluate(X_train, Y_train, X_test, Y_test, best_model)
    print(f"Outer Fold {i+1} Accuracy: {test_accuracy:.4f}\n\n")
    total_accuracy += test_accuracy
    number_of_test_rounds +=1




average_test_acc = total_accuracy/number_of_test_rounds
print(f"Average Test Accuracy: {average_test_acc:.4f}") 





FETCHING IMAGES FROM DIRECTORY

Progress: 100.00%50000
TOTAL SAMPLES: 50000
OUTER SPLIT: training[43750]	testing[6250]	total[50000]
	INNER SPLIT: training[32812]	validation[10938]	total[43750]


ValueError: Found input variables with inconsistent numbers of samples: [32812, 65624]