# DS 6050 Final Project 10 Fold CV Code
### Connie Cui

## Set up

The following files must be present in the working directory:
- All_folds.csv
- test_data.csv
- test_labels.csv
- train_data.csv
- train_labels.csv

In [1]:
### Load necessary libraries ###
import glob
import os
import librosa
import librosa.display
import skimage
import numpy as np
import pandas as pd

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Dropout
from tensorflow.keras.utils import to_categorical 

In [8]:
# Load the feature matrices and label vectors. None of the CSVs has a header
# row, so header=None keeps the first line as data.
train_data, test_data, train_lab, test_lab = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "train_data.csv",
        "test_data.csv",
        "train_labels.csv",
        "test_labels.csv",
    )
)

In [None]:
# Sanity check: the combined row count should be 8732. A smaller value would
# suggest a data row was consumed as a header when the former NumPy arrays
# were read back from CSV into DataFrames.
train_data.shape[0] + test_data.shape[0]

8730

In [13]:
# Stack the train and test partitions back into one feature frame and one
# label frame; conversion to NumPy and reshaping happen per fold in the
# cross-validation cell.
feature_frames = [train_data, test_data]
label_frames = [train_lab, test_lab]
X = pd.concat(feature_frames)
Y = pd.concat(label_frames)

In [14]:
# Fold assignment for every sample: a single headerless column, stored as
# integers in a column named 'folds'.
folds = pd.read_csv("All_folds.csv", header=None)
folds.columns = ['folds']
folds = folds.astype({'folds': 'int'})

In [15]:
### Train and evaluate via 10-Folds cross-validation ###


def _build_model(input_shape, num_classes, kernel_size=(3, 3), pool_size=(2, 2)):
    """Return a freshly initialised, compiled CNN for a single CV fold.

    This is the reference architecture; swap the layers here when comparing
    other models so the cross-validation loop itself stays unchanged.

    Args:
        input_shape: Shape of one sample, e.g. (40, 5, 1).
        num_classes: Number of output classes (size of the softmax layer).
        kernel_size: Kernel size shared by both convolution layers.
        pool_size: Pool size shared by both max-pooling layers.

    Returns:
        A compiled ``Sequential`` model that reports accuracy as its metric.
    """
    model = Sequential()
    model.add(Conv2D(64, kernel_size, padding="same", activation="tanh",
                     input_shape=input_shape))
    model.add(MaxPool2D(pool_size=pool_size))
    model.add(Dropout(0.1))
    model.add(Conv2D(128, kernel_size, padding="same", activation="tanh"))
    model.add(MaxPool2D(pool_size=pool_size))
    model.add(Dropout(0.1))
    model.add(Flatten())
    model.add(Dense(1024, activation="tanh"))
    model.add(Dense(num_classes, activation="softmax"))

    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer = keras.optimizers.Adam(learning_rate=1e-4)
    # Labels are integer class ids, hence the sparse loss.
    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


pool_size = (2, 2)
kernel_size = (3, 3)
input_shape = (40, 5, 1)
num_classes = 10

# Rows are matched to folds by position, so a length mismatch would silently
# misalign (or drop) samples; fail loudly instead.
if not (len(X) == len(Y) == len(folds)):
    raise ValueError(
        "X, Y and folds must have the same number of rows, got "
        f"{len(X)}, {len(Y)} and {len(folds)}"
    )

# Convert once, outside the loop; each fold then only needs a boolean mask.
fold_ids = folds['folds'].to_numpy()
x_all = X.to_numpy().reshape(len(X), *input_shape)
y_all = Y.to_numpy().reshape(len(Y),)

accuracies = []

for i in range(1, 11):  # 1-10 to match values in folds df
    # Fold i is held out as the test set; every other row is used for training.
    test_mask = fold_ids == i
    test_idx = np.flatnonzero(test_mask)
    train_idx = np.flatnonzero(~test_mask)

    x_train, y_train = x_all[train_idx], y_all[train_idx]
    x_test, y_test = x_all[test_idx], y_all[test_idx]

    # Drop graph state left over from the previous fold so ten models do not
    # pile up in memory, then start from newly initialised weights.
    keras.backend.clear_session()
    model = _build_model(input_shape, num_classes,
                         kernel_size=kernel_size, pool_size=pool_size)

    # The held-out fold is passed as validation data purely for per-epoch
    # monitoring; it is never used to fit the weights.
    model.fit(x_train, y_train, epochs=50, batch_size=50,
              validation_data=(x_test, y_test))

    # evaluate() returns [loss, accuracy] because 'accuracy' is the only
    # compiled metric: index 0 is the held-out loss, index 1 the held-out
    # accuracy. Other metrics would sit at other indices and need their own
    # list.
    accuracies.append(model.evaluate(x_test, y_test)[1])



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

In [16]:
# One row per fold: the fold number paired with its held-out accuracy.
# When comparing several models, keep one accuracy list per model and add each
# as an extra column here for a side-by-side, per-fold comparison.
fold_numbers = range(1, 11)
fold_acc = pd.DataFrame(
    data=list(zip(fold_numbers, accuracies)),
    columns=['Folds', 'Validation Accuracy'],
)

fold_acc

Unnamed: 0,Folds,Validation Accuracy
0,1,0.562428
1,2,0.609234
2,3,0.552432
3,4,0.580808
4,5,0.641026
5,6,0.578372
6,7,0.584726
7,8,0.566998
8,9,0.689951
9,10,0.671446


In [17]:
# Average the per-fold accuracies into a single cross-validated score.
fold_acc.loc[:, 'Validation Accuracy'].mean()

0.6037420392036438