In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
import tensorflow as tf

# Load the training and test splits from the CSV files next to this notebook.
trainingArr = pd.read_csv('./train_midterm.csv')
testArr = pd.read_csv('./test_midterm.csv')

# Split each frame into a feature matrix (every column except 'label') and a
# target vector, as plain NumPy arrays.
x_train = trainingArr.drop(columns='label').values
y_train = trainingArr['label'].values
x_test = testArr.drop(columns='label').values
y_test = testArr['label'].values

# Hyperparameter grid: every learning rate is paired with every epoch count.
allPossibleLearning_rate = [0.1, 0.01, 0.001]
allPossibleEpochs = [50, 200, 500]

# Note: perform_Logistic_Regression only needs to return the final-epoch training and validation loss.

def findLossForDataSet(xData, yData, weights):
    """Return the mean binary cross-entropy of a logistic model on a data set.

    The prediction for each row is ``sigmoid(row . weights)`` (no bias term).
    A small epsilon (1e-10) is added inside each logarithm so that saturated
    predictions of exactly 0 or 1 yield a large finite loss instead of ``inf``.

    Args:
        xData: 2-D array-like of shape (n_samples, n_features).
        yData: 1-D array-like of n_samples binary labels (0 or 1).
        weights: 1-D array-like with one coefficient per feature column.

    Returns:
        The average per-sample loss as a float.

    Raises:
        ValueError: if ``xData`` contains no samples, or if the number of
            feature columns does not match the number of weights.
    """
    features = np.asarray(xData, dtype=float)
    labels = np.asarray(yData, dtype=float)
    coefs = np.asarray(weights, dtype=float)

    if features.shape[0] == 0:
        raise ValueError("cannot compute the loss of an empty data set")

    # One matrix-vector product replaces the nested per-sample/per-feature
    # Python loops; it is the same arithmetic executed in native code.
    logits = features @ coefs

    # For very negative logits exp(-logit) overflows to inf, and 1 / inf is
    # exactly the limit we want (0.0), so the overflow warning is just noise.
    with np.errstate(over="ignore"):
        yHat = 1 / (1 + np.exp(-logits))

    eps = 1e-10
    perSample = -(labels * np.log(yHat + eps)
                  + (1 - labels) * np.log(1 - yHat + eps))
    return perSample.mean()

def perform_Logistic_Regression(xTrain, yTrain, xVal, yVal, epoch, learningRate):
    """Fit a logistic regression with per-sample SGD and report the final losses.

    Weights start at zero and are updated once per training row, for ``epoch``
    full passes over the training data. The model has no bias term.

    Args:
        xTrain: 2-D array-like of training features, shape (n_samples, n_features).
        yTrain: 1-D array-like of binary training labels (0 or 1).
        xVal: 2-D array-like of validation features with the same column count.
        yVal: 1-D array-like of binary validation labels.
        epoch: number of passes over the training data; with 0 the losses of
            the untrained (all-zero) weights are returned.
        learningRate: step size of each weight update.

    Returns:
        A ``(train_loss, val_loss)`` tuple: the mean binary cross-entropy on
        the training and validation sets after the last epoch.
    """
    features = np.asarray(xTrain, dtype=float)
    labels = np.asarray(yTrain, dtype=float)

    # Size the weight vector from the data being trained on, not from the
    # global test set, so the function works on any feature matrix.
    weights = np.zeros(features.shape[1])

    # exp() overflowing to inf for very negative logits still yields the
    # correct sigmoid limit (0.0), so silence that warning while training.
    with np.errstate(over="ignore"):
        for _ in range(epoch):
            for xRow, target in zip(features, labels):
                # The sigmoid maps the raw score from (-inf, inf) to a probability.
                yHat = 1 / (1 + np.exp(-np.dot(weights, xRow)))

                # The cross-entropy gradient w.r.t. the weights is
                # (yHat - target) * xRow, so descending it means *adding*
                # learningRate * (target - yHat) * xRow.
                weights += learningRate * (target - yHat) * xRow

    # Only the final losses are returned, so evaluate them once after training
    # instead of after every epoch.
    train_loss = findLossForDataSet(features, labels, weights)
    val_loss = findLossForDataSet(xVal, yVal, weights)
    return train_loss, val_loss




def perform_cross_validation(x_train, y_train, epoch, learningRate):
    """Run 5-fold cross-validation for a single hyperparameter pair.

    For each fold the model is trained on the other four folds and evaluated
    on the held-out one via ``perform_Logistic_Regression``.

    Args:
        x_train: feature matrix, indexable by an array of row indices.
        y_train: label vector, indexable by an array of row indices.
        epoch: number of training epochs per fold.
        learningRate: step size passed through to training.

    Returns:
        A ``(meanTrainLoss, meanValLoss)`` tuple averaged over the five folds.
    """
    print(f"\nLOOKING AT EPOCH = {epoch}, LEARNING_RATE = {learningRate} \n")

    splitter = KFold(n_splits=5)

    # One (train_loss, val_loss) pair per fold.
    per_fold = [
        perform_Logistic_Regression(
            x_train[fit_rows],
            y_train[fit_rows],
            x_train[holdout_rows],
            y_train[holdout_rows],
            epoch,
            learningRate,
        )
        for fit_rows, holdout_rows in splitter.split(x_train)
    ]

    train_losses = [pair[0] for pair in per_fold]
    val_losses = [pair[1] for pair in per_fold]
    return np.mean(train_losses), np.mean(val_losses)


# Grid search: cross-validate every (learning rate, epoch) combination and
# record the mean losses for each one.
results = []

for learningRate in allPossibleLearning_rate:
    for epoch in allPossibleEpochs:
        curMeanTrainLoss, curMeanValLoss = perform_cross_validation(
            x_train, y_train, epoch, learningRate
        )
        results.append(dict(
            learningRate=learningRate,
            epoch=epoch,
            meanTrainLoss=curMeanTrainLoss,
            meanValLoss=curMeanValLoss,
        ))


# Print the mean train/validation loss for every hyperparameter pair.
print("\nFINAL RESULTS:\n")
for result in results:
    # The header previously opened a '[' that was never closed.
    print(f"\n[Epoch: {result['epoch']}, Learning Rate: {result['learningRate']}]")
    print(f"Average Train Loss: {result['meanTrainLoss']}")
    print(f"Average Validation Loss: {result['meanValLoss']}")

# Pick the hyperparameter pair with the lowest mean validation loss.
# Entries whose loss is NaN or +inf are skipped (NaN compares False against
# everything), and ties keep the earliest entry in `results`.
curBestObj = min(
    (result for result in results if result["meanValLoss"] < float('inf')),
    key=lambda result: result["meanValLoss"],
    default=None,
)
curMinVal = curBestObj["meanValLoss"] if curBestObj is not None else float('inf')

print("\nBEST HYPERPARAMETERS")
if curBestObj is None:
    # Without this guard, subscripting None below would raise a TypeError.
    print("No hyperparameter combination produced a finite validation loss.")
else:
    print(f"Epoch: {curBestObj['epoch']}")
    print(f"Learning Rate: {curBestObj['learningRate']}")
    print(f"Average Validation Loss: {curBestObj['meanValLoss']}")
    print(f"Average Training Loss: {curBestObj['meanTrainLoss']}")






        


LOOKING AT EPOCH = 50, LEARNING_RATE = 0.1 



KeyboardInterrupt: 