In [None]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import numpy.core.defchararray as np_f

def mix_aug_data(X_train, Y_train, AUG_NAME_MODIFIER):
    """Extend a training split with the paths of its augmented images.

    For every path in ``X_train`` an "augmented" path is derived by swapping
    the ``.jpg`` extension for ``AUG_NAME_MODIFIER`` (e.g. ``"_augmented.jpg"``
    turns ``"abc.jpg"`` into ``"abc_augmented.jpg"``).

    Args:
        X_train: NumPy array of image path strings, any shape; the derived
            array is reshaped back to ``X_train.shape``.
        Y_train: NumPy array of labels aligned with ``X_train`` along axis 0.
        AUG_NAME_MODIFIER: Replacement text for the ``.jpg`` extension.

    Returns:
        Tuple ``(result_X_train, result_Y_train)``. The augmented paths come
        first, followed by the original paths; the labels are ``Y_train``
        repeated twice so both halves stay aligned with their labels.
    """
    extension = '.jpg'

    def _augmented_name(path):
        # Rewrite only the last '.jpg' so a name that happens to contain
        # '.jpg' earlier on (e.g. 'a.jpg.jpg') is not mangled twice.
        # Paths without '.jpg' are passed through unchanged.
        stem, found, rest = path.rpartition(extension)
        return f"{stem}{AUG_NAME_MODIFIER}{rest}" if found else path

    image_paths = X_train.flatten()
    augmented_image_paths = np.array([_augmented_name(path) for path in image_paths])
    augmented_image_paths = augmented_image_paths.reshape(X_train.shape)

    result_X_train = np.concatenate((augmented_image_paths, X_train), axis=0)
    result_Y_train = np.concatenate((Y_train, Y_train), axis=0)
    return result_X_train, result_Y_train

def get_data(X):
    """Load the images named in ``X`` and return them as flattened vectors.

    Args:
        X: Iterable of rows whose first element is an image file name,
            resolved relative to the ``train_new_ims/`` directory.

    Returns:
        NumPy array with one flattened pixel vector per row of ``X``
        (``[[r,g,b],[r,g,b],...] => [r,g,b,r,g,b,...]``). An empty ``X``
        yields an empty array.

    Side effects:
        Prints the length of the first flattened vector when at least one
        image was loaded.
    """
    # Collect one flattened pixel vector per image.
    img_arrays = []
    for row in X:
        path = f"train_new_ims/{row[0]}"
        image = tf.keras.preprocessing.image.load_img(path)  # Load image
        pixels = tf.keras.preprocessing.image.img_to_array(image)  # Convert to array
        img_arrays.append(pixels.flatten())

    # Convert the list of arrays to a NumPy array.
    # NOTE(review): this presumes every image has the same size, so the rows
    # stack into a regular 2-D array - confirm for the dataset.
    img_arrays = np.array(img_arrays)

    # Only report the feature length when there is a first row to inspect;
    # indexing [0] on an empty result would raise IndexError.
    if len(img_arrays):
        print(len(img_arrays[0]))
    return img_arrays

# Load the training manifest: every column but the last is treated as the
# feature matrix and the last column as the label vector.
df = pd.read_csv('./train.csv')
xs = np.array(df.iloc[:, :-1])
ys = np.array(df.iloc[:, -1])


# print(xs,ys)

# Splitters for nested cross-validation: 8 stratified outer folds and
# 4 stratified inner folds, both shuffled with a fixed seed for repeatability.
outer_cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=0)
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)

# Minkowski distance with p=2 is the Euclidean distance.
knn = KNeighborsClassifier(p=2, metric='minkowski')
# make_pipeline expects estimator *instances*; StandardScaler must be
# instantiated, otherwise the first step is the bare class and cannot be fit.
# A float n_components asks PCA to keep enough components to explain that
# fraction (95%) of the variance.
pipeline = make_pipeline(StandardScaler(), PCA(n_components=0.95), knn)

# param_range = [1,5,10,25,50,100,200]
# param_grid = [{'kneighborsclassifier__n_neighbors': param_range}]
# gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=inner_cv, n_jobs=-1)
# scores = cross_val_score(gs, X=None, Y=None, scoring='accuracy', cv=outer_cv, n_jobs=-1)


# Exploratory walk through the nested cross-validation splits. Both loops
# `break` after their first iteration, so only outer fold 0 / inner fold 0 is
# actually processed.
for i, (train_index, test_index) in enumerate(outer_cv.split(xs, ys)):
    print(f"Fold {i}:")
    print(f"  Train: index={len(train_index)}")
    print(f"  Test:  index={len(test_index)}")
    X_train, X_test = xs[train_index], xs[test_index]
    Y_train, Y_test = ys[train_index], ys[test_index]
    print(X_train, Y_train)

    # The inner fold number is not needed, so no enumerate here; this also
    # keeps the outer fold index `i` from being overwritten.
    for inner_train_index, inner_val_index in inner_cv.split(X_train, Y_train):
        X_inner_train, X_inner_val = X_train[inner_train_index], X_train[inner_val_index]
        y_inner_train, y_inner_val = Y_train[inner_train_index], Y_train[inner_val_index]
        print(len(X_inner_train), len(y_inner_train))

        # Only the inner training split is augmented; the validation split is
        # left as is.
        X_inner_train, y_inner_train = mix_aug_data(X_inner_train, y_inner_train, "_augmented.jpg")
        # The returned array is discarded: this call only exercises image loading.
        get_data(X_inner_train)
        break
    break



# print(f"Generalisation accuracy on average: %.3f +/- %.3f" % (np.mean(scores), np.std(scores)))


Fold 0:
  Train: index=43750
  Test:  index=6250
[['00016cd.jpg']
 ['0001808.jpg']
 ['0002399.jpg']
 ...
 ['d599cf6.jpg']
 ['d59a4c9.jpg']
 ['d59c7f5.jpg']] [6 2 1 ... 9 5 9]
32812 32812
65624 65624
3072
