In [0]:
# INSTALL REQUIREMENTS
!pip install -r requirements.txt

## **References**: 

Most of the IMDB-related code is sourced from [here](https://appliedmachinelearning.blog/2018/02/01/setting-up-deep-learning-in-windows-installing-keras-with-tensorflow-gpu/).

The CIFAR-10 related code is sourced from [here](https://github.com/abhijeet3922/Object-recognition-CIFAR-10/blob/master/cifar10.py)

** IMPORTANT NOTE** For whatever reason, the first time you run this code it will fail, complaining about some shape error. Simply run it again and it will function

In [2]:
# Imports
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
import keras.layers as layers
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils 
from keras.datasets import cifar10
from keras import backend as K
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import time
try:
    from google.colab import files
except ImportError:
  print("Not running on colab")
import scipy

Using TensorFlow backend.


# ** Choosing Dataset ** 

As in the other jupyter notebook, run one of the following two cells depending on which dataset you want to train on. Change the hyperparameter_configs variable if you would like to change the hyperparameters being tested

**CIFAR CELL**: Run this cell if you would like to train a neural network on the CIFAR dataset

In [11]:
# CIFAR CELL

##########################################
# Tweakable parameters (USE THESE A LOT) #
##########################################

VAL_OVER_TRAIN = .2
TEST_OVER_TOTAL = .4

PERCENT_DATASET = [.02, .1, .2, .3, .4, .5, .6, .7, .8]

hyperparam_configs = [
    #{"epochs": 10, "lrate": .01, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 20, "lrate": .01, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 50, "lrate": .01, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 100, "lrate": .01, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 10, "lrate": .01, "batch_size": 32, "momentum": 0.0},
    #{"epochs": 20, "lrate": .01, "batch_size": 32, "momentum": 0.1},
    #{"epochs": 50, "lrate": .01, "batch_size": 32, "momentum": 0.3},
    #{"epochs": 100, "lrate": .01, "batch_size": 32, "momentum": 0.5},
    #{"epochs": 10, "lrate": .02, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 20, "lrate": .02, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 50, "lrate": .02, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 100, "lrate": .02, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 10, "lrate": .05, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 20, "lrate": .05, "batch_size": 32, "momentum": 0.8},
    #{"epochs": 50, "lrate": .05, "batch_size": 32, "momentum": 0.9},
    {"epochs": 100, "lrate": .05, "batch_size": 32, "momentum": 0.9},
]

################################################
# Loading Data, split into test/train/val sets #
################################################

(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
x_all, labels = np.concatenate([x_train, x_test]), np.concatenate([y_train, y_test])
labels = labels.ravel()

X_train_all, X_test, y_train_all, y_test = train_test_split(x_all, labels, 
                                                    test_size=TEST_OVER_TOTAL)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all, 
                                                     test_size=VAL_OVER_TRAIN)

####################
# Image Processing #
####################

K.set_image_dim_ordering('th')
seed = 7
np.random.seed(seed)

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_valid = X_valid.astype('float32')
X_train_all = X_train_all.astype('float32')

X_train = X_train / 255.0
X_test = X_test / 255.0
X_valid = X_valid / 255.0
X_train_all = X_train_all / 255.0

y_train = np_utils.to_categorical(y_train)
y_train_all = np_utils.to_categorical(y_train_all)
y_test = np_utils.to_categorical(y_test)
y_valid = np_utils.to_categorical(y_valid)
num_classes = y_test.shape[1]

print("Shape of training data: ", X_train.shape)
print("Number of classes: ", y_test.shape[1])

##################
# MODEL CREATION #
##################

def create_model(lrate, momentum, batch_size, epochs):
  
    decay = lrate / epochs
    sgd = SGD(lr=lrate, momentum=momentum, decay=decay, nesterov=False)
    
    model = Sequential()
    model.add(Conv2D(32,(3,3), input_shape = (3,32,32), padding = 'same', activation = 'relu'))
    model.add(Dropout(0.2))
    model.add(Conv2D(32,(3,3), padding = 'same', activation = 'relu'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Conv2D(64,(3,3), padding = 'same', activation = 'relu'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Dense(512,activation='relu',kernel_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    
    return model, {"epochs": epochs, "batch_size": batch_size}
  


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Shape of training data:  (28800, 32, 32, 3)
Number of classes:  10


** IMDB Cell** Run this cell if you would like to train on the IMDB dataset

In [6]:
# IMDB CELL
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk
import os.path
ONEHOTS_FILENAME = "imdb-onehots.gz"
LABELS_FILENAME = "imdb-labels.gz"

# Changeable parameters
MAX_WORD_FEATURES = 10000

VAL_OVER_TRAIN = .2
TEST_OVER_TOTAL = .4

PERCENT_DATASET = [.02, .1, .2, .3, .4, .5, .6, .7, .8]

hyperparam_configs = [
    #{"epochs": 10, "lrate": .01, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 20, "lrate": .01, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 50, "lrate": .01, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 100, "lrate": .01, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 10, "lrate": .01, "batch_size": 32, "momentum": 0.0},
    #{"epochs": 20, "lrate": .01, "batch_size": 32, "momentum": 0.1},
    #{"epochs": 50, "lrate": .01, "batch_size": 32, "momentum": 0.3},
    #{"epochs": 100, "lrate": .01, "batch_size": 32, "momentum": 0.5},
    #{"epochs": 10, "lrate": .02, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 20, "lrate": .02, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 50, "lrate": .02, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 100, "lrate": .02, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 10, "lrate": .05, "batch_size": 32, "momentum": 0.9},
    #{"epochs": 20, "lrate": .05, "batch_size": 32, "momentum": 0.8},
    #{"epochs": 50, "lrate": .05, "batch_size": 32, "momentum": 0.9},
    {"epochs": 100, "lrate": .05, "batch_size": 32, "momentum": 0.9},
]

def clean_text(raw_review):
    # Function to convert a raw review to a string of words
    
    # Import modules
    from bs4 import BeautifulSoup
    import re
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text() # Remove HTML
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) # Remove non-letters 
    words = letters_only.lower().split() # Convert to lower case, split into individual words
    stops = set(stopwords.words("english")) # Remove stop words (use of sets makes this faster)               
    meaningful_words = [w for w in words if not w in stops]                             
    porter = PorterStemmer() # Reduce word to stem of word
    stemmed_words = [porter.stem(w) for w in meaningful_words]
    joined_words = ( " ".join( stemmed_words )) # Join the words back into one string separated by space
    return joined_words 

def apply_cleaning_function_to_series(X):
    print('Cleaning data')
    start_time = time.time()
    cleaned_X = []
    for element in X:
        cleaned_X.append(clean_text(element))
    print ('Finished in ', str((time.time() - start_time)/60), " minutes")
    return cleaned_X

if not os.path.isfile(ONEHOTS_FILENAME + ".npz"):
    nltk.download('stopwords')
    print("one-hots not created yet: cleaning and saving to file")
    print("Expect this to take about 10-15 minutes")
    data = pd.read_csv('https://gitlab.com/michaelallen1966/00_python_snippets_and_recipes/raw/master/machine_learning/data/IMDb.csv')
    
    x_cleaned = apply_cleaning_function_to_series(data["review"])
    labels = np.array(data["sentiment"]).ravel()
    
    # Free up memory!
    data = None
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 ngram_range=(1,1),
                                 max_features=MAX_WORD_FEATURES)
    vectorizer.fit(x_cleaned)
    x_all = vectorizer.transform(x_cleaned)
    x_cleaned = None
    scipy.sparse.save_npz(ONEHOTS_FILENAME, x_all)
    np.savetxt(LABELS_FILENAME, labels)
else:
    print("loading one-hots from file")
    start_time = time.time()
    x_all = scipy.sparse.load_npz(ONEHOTS_FILENAME + ".npz")
    labels = np.loadtxt(LABELS_FILENAME)
    end_time = time.time()
    print("Finished loading one-hots in ", (end_time - start_time)/60, " minutes")
    

labels = labels.ravel()
X_train_all, X_test, y_train_all, y_test = train_test_split(x_all, labels, 
                                                    test_size=TEST_OVER_TOTAL)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all, 
                                                     test_size=VAL_OVER_TRAIN)
    

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_valid = X_valid.astype('float32')
X_train_all = X_train_all.astype('float32')

y_train = np_utils.to_categorical(y_train)
y_train_all = np_utils.to_categorical(y_train_all)
y_test = np_utils.to_categorical(y_test)
y_valid = np_utils.to_categorical(y_valid)
num_classes = y_test.shape[1]
    
def create_model(lrate, momentum, batch_size, epochs):
    model = Sequential()
    model.add(layers.Dense(50, activation = "relu", input_shape=(10000,)))
    # Hidden - Layers
    model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
    model.add(layers.Dense(50, activation = "relu"))
    model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
    model.add(layers.Dense(50, activation = "relu"))
    # Output- Layer
    model.add(layers.Dense(num_classes, activation = "sigmoid"))
    #model.summary()
              
    decay = lrate / epochs
    sgd = SGD(lr=lrate, momentum=momentum, decay=decay, nesterov=False)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model, {"epochs": epochs, "batch_size": batch_size}    

print("Full Data, Label shapes = ", x_all.shape, ", ", labels.shape)

# files.download(ONEHOTS_FILENAME + ".npz")
# files.download(LABELS_FILENAME)
# files.upload()

loading one-hots from file
Finished loading one-hots in  0.006836116313934326  minutes
Full Data, Label shapes =  (50000, 10000) ,  (50000,)


** Neural Network Training ** After having run the cell to select the relevant dataset, run the following cells to train neural networks on the selected


In [0]:
##################################
# EVALUATION OF MODELS AND STUFF #
##################################

print("TEST_OVER_TOTAL", TEST_OVER_TOTAL)
print("VAL_OVER_TRAIN", VAL_OVER_TRAIN)

#############################################
# MODEL CREATION FUNCTION, INTEGRATED LATER #
#############################################

def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = np.floor(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

model_list = [None] * len(hyperparam_configs)
model_history_list = [None] * len(hyperparam_configs)
val_score_list = [None] * len(hyperparam_configs)

for index, hyperparams in enumerate(hyperparam_configs):
  
    print("\n==============================================")
    print("Beginning to consider model ", index)
    print("Hyperparams: ", hyperparams)
    model, fit_args = create_model(**hyperparams)
    
    batch_size = fit_args["batch_size"]
    num_batches_per_epoch = np.floor(X_train.shape[0] / batch_size)
    
    start_time = time.time()
    #history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), 
    #                    verbose=1, **fit_args)
    history = model.fit_generator(batch_generator(X_train, y_train, batch_size, True),
                                  validation_data=(X_valid, y_valid),
                                  steps_per_epoch=num_batches_per_epoch,
                                  epochs=fit_args["epochs"], 
                                  verbose=0)
    end_time = time.time()
    print("Finished in ", (end_time - start_time)/60, " mins")
    
    scores = model.evaluate(X_valid, y_valid,verbose=0)
    print("Final train accuracy of ", history.history['acc'][-1])
    print("Final validation accuracy of ", scores)
    
    model_list[index] = model
    val_score_list[index] = scores[1]
    model_history_list[index] = history
    
print("Final Validation accuracies")
print(val_score_list)

**Runtime Diagnostics**

In [0]:
best_model_index = np.argmax(val_score_list)
history = model_history_list[best_model_index]
best_model = model_list[best_model_index]
best_hyperparams = hyperparam_configs[best_model_index]

print("Best model is index ", best_model_index)
print("Hyperparams for the best model were ", hyperparam_configs[best_model_index])

start_time = time.time()
test_score = model.evaluate(X_test, y_test,verbose=0)
end_time = time.time()
print("This model achieves a test accuracy of ", test_score)
print("Evaluation on the test set ran in ", (end_time - start_time)/60, " mins")

In [0]:
# Graph accuracy of best model over time
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Best model accuracy vs epochs')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.ylim(ymin=0)
plt.show()

In [0]:
# Graph loss history over time
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Best model loss vs epochs')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.ylim(ymin=0)
plt.show()

**Accuracy vs Dataset Size evaluation**: Finally, we train the model on different fractions of the (train) dataset and observe performance on the test dataset

In [0]:
# PERCENT_DATASET = [.02, .05]

percent_models = [None] * len(PERCENT_DATASET)
percent_model_histories = [None] * len(PERCENT_DATASET)

pm_final_train_accs = [None] * len(PERCENT_DATASET)
pm_final_test_accs = [None] * len(PERCENT_DATASET)

for index, percent_of_data in enumerate(PERCENT_DATASET):

    print("\n==============================================")
    print("Dataset percentage: ", percent_of_data)
    model, fit_args = create_model(**best_hyperparams)
    
    X_train_fractional, t_val, y_train_fractional, t_valor = train_test_split(X_train_all, y_train_all, 
                                                                             test_size=1-percent_of_data)
    
    print(X_train.shape)
    print(X_train_fractional.shape)

    start_time = time.time()
    model, fit_args = create_model(**best_hyperparams)

    history = model.fit(X_train_fractional, y_train_fractional, 
                        verbose=0,
                        validation_data=(X_valid, y_valid), **fit_args)
    end_time = time.time()
    print("Finished in ", (end_time - start_time)/60, " mins")

    train_acc = history.history['acc'][-1]
    test_score = model.evaluate(X_test, y_test, verbose=0)
    pm_final_train_accs[index] = train_acc
    pm_final_test_accs[index] = test_score[1]
    
    print("Final Train accuracy of ", train_acc)
    print("FInal test score of ", test_score)

    percent_models[index] = model
    percent_model_histories[index] = history
    
print("Final train accs: ", pm_final_train_accs)
print("Final test accs: ", pm_final_test_accs)

In [0]:
# Graph the performance over fraction of dataset used
plt.title("Accuracy vs Fraction of dataset")
plt.xlabel('Fraction of dataset used')
plt.ylabel('Accuracy (%)')
# plt.ylim(ymin=0)
plt.plot(PERCENT_DATASET, pm_final_train_accs, label="Train data")
plt.plot(PERCENT_DATASET, pm_final_test_accs, label="Test data")
plt.ylim(ymin=0)
plt.show()