In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def load_data_from_file(file='preprocessed_data_cropped.npz'):
    """Read the feature and label arrays out of an ``.npz`` archive.

    Args:
        file: Path to an ``.npz`` archive that stores arrays under the keys
            ``'X'`` and ``'y'``.

    Returns:
        Tuple ``(X, y)`` holding the arrays stored under those two keys.

    Raises:
        KeyError: If the archive has no ``'X'`` or no ``'y'`` entry.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the underlying
    # file open.  Both arrays are materialised inside the `with` block, so
    # the handle is released deterministically before returning.
    with np.load(file) as data:
        return data['X'], data['y']

# Load the data
X, y = load_data_from_file()

# Flatten the image data: one row per sample, all remaining axes merged
X = X.reshape(X.shape[0], -1)

# Convert one-hot encoded labels back to their original class numbers
y = np.argmax(y, axis=1)

# Split the data into training and testing sets.
# Rows before N_TRAIN are used for training, the remainder for testing.
# NOTE(review): 39209 presumably equals the number of training rows stored
# in the archive -- confirm against the preprocessing step.
N_TRAIN = 39209
X_train_flat, X_test_flat = X[:N_TRAIN], X[N_TRAIN:]
y_train, y_test = y[:N_TRAIN], y[N_TRAIN:]

# Apply PCA.
# The projection is fit on the training rows only and then applied to the
# test rows; fitting on all rows would let test data shape the features the
# classifier is evaluated on.  random_state pins the result in case the
# randomized SVD solver is selected by the default svd_solver='auto'.
pca = PCA(n_components=100, random_state=42)
X_train = pca.fit_transform(X_train_flat)
X_test = pca.transform(X_test_flat)


# Train logistic regression model
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train, y_train)
print("Iterations:", lr.n_iter_)
# Predict and evaluate accuracy
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

y_pred_train = lr.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Train Accuracy:", accuracy_train)
# Reported as train minus test, so a positive value is the overfitting gap
# (same sign convention as the k-fold cell's train-minus-validation print).
print("difference between Train Accuracy and Test:", accuracy_train-accuracy)


# The figures in the string below appear to be pasted from an earlier run of
# this cell, in which PCA was fit on all rows and the difference was printed
# as test minus train.  Re-run the cell to refresh them.
"""
pca100
Iterations: [1481]
Test Accuracy: 0.8802058590657166
Train Accuracy: 0.9700578948710755
difference between Train Accuracy and Test: -0.08985203580535894
"""



In [4]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score

def load_data_from_file(file='preprocessed_data_cropped.npz'):
    """Load the arrays saved under ``'X'`` and ``'y'`` in an ``.npz`` file.

    Args:
        file: Path of the ``.npz`` archive to read.

    Returns:
        Tuple ``(X, y)`` with the two arrays read from the archive.

    Raises:
        KeyError: If ``'X'`` or ``'y'`` is not present in the archive.
    """
    # The NpzFile returned by np.load holds an open file handle; using it as
    # a context manager closes that handle as soon as both arrays are read.
    with np.load(file) as data:
        return data['X'], data['y']

# Read the image array and its one-hot label array from disk
X, y = load_data_from_file()

# Flatten every sample to a single row, then keep only the first 39209 rows
# (the training portion) for cross-validation
X = X.reshape(len(X), -1)[:39209]
# Collapse the one-hot labels to class indices and keep the same rows
y = y.argmax(axis=1)[:39209]
def logistic_regression_kfold(X, y, n_splits=5, max_iter=2000, random_state=42):
    """Fit a logistic regression on each shuffled K-fold split and score it.

    For every fold a fresh ``LogisticRegression`` is fit on the training
    indices and scored with ``accuracy_score`` on both the held-out and the
    training indices.  The per-fold train/validation accuracy gap is printed
    as the loop runs.

    Args:
        X: Array indexable by integer index arrays along its first axis;
            one row per sample.
        y: Label array indexable the same way as ``X``.
        n_splits: Number of folds passed to ``KFold``.
        max_iter: Iteration cap passed to ``LogisticRegression``.
        random_state: Seed passed to ``KFold`` for the shuffled split.

    Returns:
        Tuple ``(accuracies, iteration_counts, accuracies_train)`` of three
        lists with one entry per fold: validation accuracy, the model's
        ``n_iter_`` attribute, and training accuracy.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    accuracies = []
    accuracies_train = []
    iteration_counts = []
    for train_index, validation_index in kfold.split(X):
        X_train, X_validation = X[train_index], X[validation_index]
        y_train, y_validation = y[train_index], y[validation_index]

        model = LogisticRegression(max_iter=max_iter)
        model.fit(X_train, y_train)

        # Accuracy on the held-out fold
        y_pred = model.predict(X_validation)
        accuracy = accuracy_score(y_validation, y_pred)
        accuracies.append(accuracy)

        # Check overfitting: score the same model on the rows it was fit on
        y_pred_train = model.predict(X_train)
        accuracy_train = accuracy_score(y_train, y_pred_train)
        accuracies_train.append(accuracy_train)
        # n_iter_ is stored as the estimator exposes it (not converted)
        iteration_counts.append(model.n_iter_)
        print("difference between Train Accuracy and validation:",  accuracy_train-accuracy )

    return accuracies, iteration_counts, accuracies_train

# Train and evaluate the logistic regression model using k-fold cross-validation
accuracies, iteration_counts, accuracies_train = logistic_regression_kfold(X, y)


# Print the fold-averaged validation accuracy, training accuracy and solver
# iteration count, followed by the mean train-minus-validation gap.

print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average train Accuracy : {np.mean(accuracies_train):.4f}")
print(f"Average Iterations: {np.mean(iteration_counts):.1f}")
# Four decimals: the per-fold gaps printed during training are on the order
# of 0.03, which a one-decimal format would round to 0.0.
print(f"difference between Train Accuracy and validation: {np.mean(accuracies_train)-np.mean(accuracies):.4f}")


difference between Train Accuracy and validation: 0.0273205692231564
difference between Train Accuracy and validation: 0.02767126438166645


KeyboardInterrupt: 

cropped
Fold 1: F1 Score = 0.8107, Accuracy = 0.7741, Iterations = 970
Fold 2: F1 Score = 0.9139, Accuracy = 0.8615, Iterations = 1220
Fold 3: F1 Score = 0.8027, Accuracy = 0.7412, Iterations = 1877
Fold 4: F1 Score = 0.7743, Accuracy = 0.7682, Iterations = 1672
Fold 5: F1 Score = 0.9278, Accuracy = 0.9279, Iterations = 1123
Average F1 Score: 0.8459
Average Accuracy: 0.8146
Average Iterations: 1372.4
uncropped
Fold 1: F1 Score = 0.7336, Accuracy = 0.6851, Iterations = 1047
Fold 2: F1 Score = 0.8791, Accuracy = 0.8132, Iterations = 1416
Fold 3: F1 Score = 0.7441, Accuracy = 0.6760, Iterations = 1974
Fold 4: F1 Score = 0.6987, Accuracy = 0.6974, Iterations = 1773
Fold 5: F1 Score = 0.9023, Accuracy = 0.9026, Iterations = 1332
Average F1 Score: 0.7915
Average Accuracy: 0.7549
Average Iterations: 1508.4

In [3]:
import torch
# Print whether PyTorch reports CUDA as available in this environment.
print(torch.cuda.is_available())


False
