# Important notes: 
1. Some cells should not be used together; these are clearly marked. If a cell says to use it for the first test data set, it uses the data set we had before the 7th of June.
2. If a cell says to use it for the second test data set, it uses the data set given to us on the 7th of June.


3. Our best model, named `my_model 7th june.h5`, is saved on Kaggle and uploaded to SUCourse.
4. Running both the model training cell and the model loading cell will overwrite the model from whichever cell was run first.

# Imports

In [None]:
# Run this first for all the necessary imports
import numpy as np
import pandas as pd
import os
import random

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Convolution2D, MaxPool2D, Dropout, Flatten, Activation
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# File Paths

In [None]:
# Run this cell first: it defines every input path used by the notebook.
# my_model7thjune.h5 is used in the best submission with 0.99954 score

# Folder holding the labelled training images; its entries are used as the
# class names.
training_path = '../input/cs412-spring-2021/train_data/train/'

# Root of the first test data set.
test_path = '../input/cs412-spring-2021/test_data/'
# Parent folder of the second test data set (the images sit directly in its
# 'nonlabeled-test2' sub-folder).
test_path2 = '../input/cs412-spring-2021/'

# Saved model file. No trailing slash: this is an .h5 file, not a directory,
# and a trailing separator makes the path unusable as a file path.
model_path = '../input/my-model-7thjune/my_model7thjune.h5'


# Label Names

In [None]:
# Run this so the labels in predictions.csv are named properly.
# labelDict maps the integer class index predicted by the network to its
# label name (label names should be A, B, space, del etc).
#
# Only sub-directories are kept: flow_from_directory derives its class
# indices from the sorted sub-folders of the training directory, so a stray
# file in that folder would otherwise shift every index that follows it.
class_folders = sorted(
    entry
    for entry in os.listdir(training_path)
    if os.path.isdir(os.path.join(training_path, entry))
)
labelDict = dict(enumerate(class_folders))
print(labelDict)


# Hyperparameters and some other variables

In [None]:
# Tunable hyperparameters and other shared settings.

# --- Data ---------------------------------------------------------------

batch_size = 64

# Share of the training images held out for validation.
# 0.15 clearly beat 0.1, 0.2 and 0.25.
validation_split = 0.15

# Going above 64x64 makes training take really long, so keep as is.
target_size = (64, 64)
# Same spatial size plus three colour channels.
input_shape = (*target_size, 3)

# --- Architecture -------------------------------------------------------

# Filter counts of the three convolution stages. The first values tried
# worked out fine, so they were barely touched.
filters1, filters2, filters3 = 32, 64, 128

# Accuracy improved after moving from (3, 3) / (2, 2) to these sizes.
kernel_size1 = (5, 5)  # previously (3, 3)
kernel_size2 = (3, 3)  # previously (2, 2)

# The initial pool size; it did not need much changing.
pool_size = (2, 2)

# 0.5 was tried first, but 0.4 gave better results after some testing.
dropout_probability = 0.4

# --- Training -----------------------------------------------------------

# Upper bound on epochs. 10, 20, 25 etc were tried and 100 seems best;
# early stopping usually ends the run around 40 - 50 epochs.
epochs = 100

# Early-stopping patience: 3, 5 and 7 were not as good as 10.
es_patience = 10

# Many values were tried: 0.001 was way too high and 0.0001 too low,
# 0.0006 is a good midpoint. Other values could still be tried.
learning_rate = 0.0006


# Data Generator

In [None]:
# Shared generator behind the training, validation and test iterators.
# Every pixel value is multiplied by 1/255, and validation_split sets the
# fraction of images reserved for the "validation" subset.
data_generator = ImageDataGenerator(rescale=1.0 / 255, validation_split=validation_split)

# Training and Validation Generator

In [None]:
# Needed ONLY WHEN TRAINING THE MODEL.
# Skip this cell when working with the saved model.

# Arguments common to both directory iterators; only the subset differs.
split_flow_args = dict(
    directory=training_path,
    target_size=target_size,
    batch_size=batch_size,
    shuffle=True,
    class_mode='categorical',
)

# Images used to fit the network.
training_generator = data_generator.flow_from_directory(subset="training", **split_flow_args)

# Held-out images used to monitor val_loss during training.
validation_generator = data_generator.flow_from_directory(subset="validation", **split_flow_args)

# **USE EITHER MODEL TRAINING CELL OR MODEL LOADING CELL.**

# Model Training

In [None]:
# No need to run this cell anymore: a model with great validation and test
# accuracy is already saved. Load the "my_model 7th june" model instead.

# Stop once val_loss has not improved for es_patience epochs and restore the
# weights of the best epoch.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                   restore_best_weights=True, patience=es_patience)

# Width of the fully connected layer. This used to be written as batch_size,
# which only happened to hold the same value (64); keeping it separate means
# changing the batch size no longer silently changes the architecture.
dense_units = 64

# Size of the softmax output layer (one unit per class).
num_classes = 29

# Three convolution stages (two 'same'-padded ReLU convolutions followed by
# max pooling), then a dense classifier head. Dropout follows the first two
# stages and the dense layer.
model = Sequential([
    Convolution2D(filters1, kernel_size1, padding='same',
                  input_shape=input_shape, activation='relu'),
    Convolution2D(filters1, kernel_size1, padding='same', activation='relu'),
    MaxPool2D(pool_size=pool_size),
    Dropout(dropout_probability),

    Convolution2D(filters2, kernel_size2, padding='same', activation='relu'),
    Convolution2D(filters2, kernel_size2, padding='same', activation='relu'),
    MaxPool2D(pool_size=pool_size),
    Dropout(dropout_probability),

    Convolution2D(filters3, kernel_size2, padding='same', activation='relu'),
    Convolution2D(filters3, kernel_size2, padding='same', activation='relu'),
    MaxPool2D(pool_size=pool_size),

    Flatten(),
    Dense(dense_units, activation='relu'),
    Dropout(dropout_probability),
    Dense(num_classes, activation='softmax'),
])

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.Adam(learning_rate=learning_rate),
              metrics=['accuracy'])

history = model.fit(training_generator,
                    epochs=epochs,
                    validation_data=validation_generator,
                    use_multiprocessing=True,
                    callbacks=[es])


# Model loading

In [None]:
# Run this cell only when NOT training a new model.
# Restores the previously saved network from disk.
# NOTE(review): model_path in the file-paths cell names a different file
# (my_model7thjune.h5) and is not used here - confirm which name is correct.
saved_model_file = '../input/my-model-7thjune/my_model 7th june.h5'
model = tf.keras.models.load_model(saved_model_file)


# Test generator for the FIRST TEST DATA SET

In [None]:
# FIRST TEST DATA SET ONLY.
# For the second test data set run the next cell instead.

# shuffle is off so the predictions can later be paired with
# test_generator.filenames.
test_generator = data_generator.flow_from_directory(
    directory=test_path,
    shuffle=False,
    class_mode='categorical',
    batch_size=batch_size,
    target_size=target_size,
)


# Test generator for the SECOND TEST DATA SET

In [None]:
# SECOND TEST DATA SET ONLY.
# For the first test data set run the previous cell instead.

# flow_from_directory worked really well with the initial test and training
# data, but the second test set has no sub-folder of its own. The workaround
# is to point at the parent folder and name 'nonlabeled-test2' as the only
# class directory.
second_test_folders = ['nonlabeled-test2']

test_generator = data_generator.flow_from_directory(
    directory=test_path2,
    classes=second_test_folders,
    shuffle=False,
    class_mode='categorical',
    batch_size=batch_size,
    target_size=target_size,
)


# Calculate our predictions on the test data set

In [None]:
# Reset the iterator before predicting over the whole test set.
test_generator.reset()

# Run the model over every batch of the test generator.
n_batches = len(test_generator)
pred = model.predict(test_generator, steps=n_batches, verbose=1)

# Index of the highest-scoring class for each prediction row.
class_index = np.argmax(pred, axis=1)


# Create an output csv for the FIRST TEST DATA SET

In [None]:
# FIRST TEST DATA SET ONLY.
# For the second test data set run the next cell instead: with that data set
# every id would come out as 2, because the first run of digits in each path
# is the "2" of the folder name "nonlabeled-test2".

# Pair every test file name with its predicted class index.
results = pd.DataFrame(
    {
        'id': pd.Series(test_generator.filenames),
        'Prediction': pd.Series(class_index),
    })

# The id is the first run of digits in the file path. The pattern is a raw
# string so that \d is not treated as an (invalid) string escape, and
# expand=False makes extract return a Series rather than a one-column frame.
results['id'] = results['id'].str.extract(r'(\d+)', expand=False)

# Convert to numbers so the rows sort numerically; unparsable ids become NaN.
results['id'] = pd.to_numeric(results['id'], errors='coerce')
results.sort_values(by='id', inplace=True)
results = results.rename(columns={'id': 'Id'})

# Intermediate file holding numeric class indices; it is read back into
# `data`, which a later cell converts to label names.
results.to_csv('submissionInt.csv', index=False)

data = pd.read_csv('submissionInt.csv', header=0)


# Create an output csv for the SECOND TEST DATA SET

In [None]:
# SECOND TEST DATA SET ONLY.
# For the first test data set run the previous cell instead.

# Pair every test file name with its predicted class index.
results = pd.DataFrame(
    {
        'id': pd.Series(test_generator.filenames),
        'Prediction': pd.Series(class_index),
    })

# A plain (\d+) finds the first run of one or more digits. With the folder
# named nonlabeled-test2 that first run is always "2", so every id would end
# up being 2. This pattern skips the first run of digits and captures the
# second one instead. It is a raw string so that \d is not treated as an
# (invalid) string escape, and expand=False makes extract return a Series
# rather than a one-column frame.
results['id'] = results['id'].str.extract(r'[^\d]*[\d]+[^\d]+([\d]+)', expand=False)

# Convert to numbers so the rows sort numerically; unparsable ids become NaN.
results['id'] = pd.to_numeric(results['id'], errors='coerce')
results.sort_values(by='id', inplace=True)
results = results.rename(columns={'id': 'Id'})

# Intermediate file holding numeric class indices; it is read back into
# `data`, which a later cell converts to label names.
results.to_csv('submissionInt.csv', index=False)

data = pd.read_csv('submissionInt.csv', header=0)


In [None]:
# Preview the first five rows of the numeric submission table.
results.head(n=5)


In [None]:
# predictions.csv is the file submitted to the leaderboard.
# Swap every numeric class index for its label name before writing it out.
data['Prediction'] = [labelDict[class_idx] for class_idx in data['Prediction']]
data.to_csv('predictions.csv', index=False)
data.head()


This is for saving a model

In [None]:
# Run only if the freshly trained model turns out to be worth keeping.
output_model_file = 'my_model.h5'
model.save(output_model_file)
