# Run3

Importing the packages

In [None]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn; sn.set(font_scale=1.4)
import tensorflow as tf
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing import image
from sklearn import decomposition
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm import tqdm

In [None]:
# Scene categories. The position of a name in this list is its integer label.
class_names = [
    'Mountain', 'Street', 'Coast', 'Forest', 'Highway',
    'Office', 'OpenCountry', "bedroom", 'industrial', 'kitchen',
    'livingroom', 'Insidecity', 'store', 'Suburb', 'TallBuilding',
]
# Reverse lookup: class name -> integer label.
class_names_label = dict(zip(class_names, range(len(class_names))))

nb_classes = len(class_names)

# Target size every image is resized to when loaded, so differently sized
# inputs cannot break the array stacking.
IMAGE_SIZE = (256, 256)

Loading the training data

In [None]:
def load_data(datasets=None):
    """Load every labelled training image into memory.

    Each dataset directory is expected to contain one sub-folder per class,
    named exactly like an entry of ``class_names``. Images are converted from
    BGR to RGB and resized to ``IMAGE_SIZE``.

    Args:
        datasets: Iterable of dataset root directories. Defaults to the two
            training folders used by this notebook.

    Returns:
        Tuple ``(images, labels)``: a float32 array of the loaded images and
        an int32 array holding the class index of each image.

    Raises:
        KeyError: If a sub-folder name is not a known class name.
    """
    if datasets is None:
        datasets = ['../input/training1/training', '../input/training2/training']

    images = []
    labels = []

    for dataset in datasets:
        print("Loading {}".format(dataset))

        # Sorted so the sample order does not depend on the file system.
        for folder in sorted(os.listdir(dataset)):
            folder_path = os.path.join(dataset, folder)

            # Stray files next to the class folders are not categories.
            if not os.path.isdir(folder_path):
                continue

            label = class_names_label[folder]

            for file in tqdm(sorted(os.listdir(folder_path))):
                img_path = os.path.join(folder_path, file)

                # cv2.imread returns None instead of raising when a file
                # cannot be decoded; skip it rather than crash in cvtColor.
                img = cv2.imread(img_path)
                if img is None:
                    print("Skipping unreadable file {}".format(img_path))
                    continue

                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, IMAGE_SIZE)

                images.append(img)
                labels.append(label)

    images = np.array(images, dtype='float32')
    labels = np.array(labels, dtype='int32')

    return (images, labels)

In [None]:
# Read the whole labelled training set into memory.
images, labels = load_data()

Splitting the data into training set and test set and scaling the data

In [None]:
# Hold out 20% of the labelled images for evaluation. The seed is fixed so
# the split, and every accuracy reported below, is the same on each run.
train_images, test_images, train_labels, test_labels = train_test_split(
    images, labels, test_size=0.2, random_state=42
)
train_images, train_labels = shuffle(train_images, train_labels, random_state=42)

# Rescale pixel values from [0, 255] to [0, 1].
train_images = train_images / 255.0
test_images = test_images / 255.0

Extracting features from VGG16.

In [None]:
 vgg = VGG16(weights='imagenet', include_top=False)

Get features from VGG16

In [None]:
# Run both image sets through the VGG16 base to get their feature maps.
# NOTE(review): the images were only divided by 255 in the split cell;
# vgg16.preprocess_input is not applied here. Confirm that this is intended.
train_features, test_features = (
    vgg.predict(image_batch) for image_batch in (train_images, test_images)
)

In [None]:
# Sample counts plus the three feature-map dimensions. x, y and z end up
# holding the values read from test_features, as they are assigned last.
(n_train, x, y, z), (n_test, x, y, z) = train_features.shape, test_features.shape

# Length of one flattened feature map.
numFeatures = z * y * x

Visualising the data with principal component analysis

In [None]:
# Two components so the projection can be drawn on a 2D plot. The seed keeps
# the result reproducible if scikit-learn picks a randomized SVD solver.
pca = decomposition.PCA(n_components=2, random_state=42)

# One flattened feature vector per training image.
X = train_features.reshape((n_train, x * y * z))

C = pca.fit_transform(X)
C1 = C[:, 0]
C2 = C[:, 1]

fig, ax = plt.subplots(figsize=(10, 10))

for i, class_name in enumerate(class_names):
    in_class = train_labels == i
    # Plot at most 1000 points per class.
    ax.scatter(C1[in_class][:1000], C2[in_class][:1000], label=class_name, alpha=0.4)

ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
ax.set_title("PCA Projection")
ax.legend()
plt.show()

Seems like clustering may not be enough.
Training a single neural network on top of the obtained features
 

In [None]:
# Small classifier head on top of the VGG16 feature maps: flatten, one
# hidden layer, then a softmax with one output per class.
model2 = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(x, y, z)),
    tf.keras.layers.Dense(50, activation=tf.nn.relu),
    # Output width follows the class list instead of a hard-coded 15.
    tf.keras.layers.Dense(nb_classes, activation=tf.nn.softmax)
])

# The labels are integer class indices, hence the sparse loss.
model2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# batch size was 128 before
history = model2.fit(train_features, train_labels, batch_size=64, epochs=20, validation_split=0.2)

We should get approximately 0.844 accuracy, an improvement of about 0.1 over the simple ConvNet.

In [None]:
# Score the single network on the held-out split.
# NOTE(review): model2 is compiled with metrics=['accuracy'], so this
# presumably holds both loss and accuracy, not the loss alone. Confirm.
test_loss = model2.evaluate(x=test_features, y=test_labels)

The cell above reported an accuracy of about 0.59 on the held-out test split.

# Ensemble Neural Networks

In [None]:
# Seed NumPy's global generator, which the ensemble cells below draw from.
np.random.seed(42)

# Number of estimators
n_estimators = 10

# Number of samples each estimator is trained on: 80% of the training
# set, truncated to an integer.
max_samples = int(0.8 * n_train)

We define n_estimators Neural Networks. 

Each Neural Network will be trained on random subsets of the training dataset. Each subset contains max_samples samples.

In [None]:
models = list()
# One random base width per estimator, drawn from [50, 100).
random = np.random.randint(50, 100, size=n_estimators)

for i in range(n_estimators):
    # Plain Python int rather than a numpy integer for the layer sizes.
    units = int(random[i])

    # Three hidden layers sized around the random base width, then a softmax
    # with one output per class.
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(x, y, z)),
        tf.keras.layers.Dense(units, activation=tf.nn.relu),
        tf.keras.layers.Dense(units + 10, activation=tf.nn.relu),
        tf.keras.layers.Dense(units - 10, activation=tf.nn.relu),
        tf.keras.layers.Dense(nb_classes, activation=tf.nn.softmax)
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Store model
    models.append(model)

In [None]:
# Bagging: each estimator is fitted on its own random draw of max_samples
# training rows. One fit history is kept per estimator.
histories = []

for i in range(n_estimators):
    # Row indices for this estimator's bag. np.random.choice samples with
    # replacement unless told otherwise.
    train_idx = np.random.choice(len(train_features), size=max_samples)
    bag_features = train_features[train_idx]
    bag_labels = train_labels[train_idx]

    fit_history = models[i].fit(
        bag_features, bag_labels, batch_size=10, epochs=30, validation_split=0.1
    )
    histories.append(fit_history)

We aggregate each model's individual predictions to form a final prediction.

In [None]:
# Stack the class probabilities predicted by every estimator, add them up
# across estimators, and take the class with the largest total.
predictions = np.array([models[i].predict(test_features) for i in range(n_estimators)])
predictions = np.sum(predictions, axis=0)
pred_labels = np.argmax(predictions, axis=1)

We should improve our result as we have a lower variance.

In [None]:
# Accuracy of the ensemble on the held-out split (accuracy_score is imported
# in the notebook's import cell).
print("Accuracy : {}".format(accuracy_score(test_labels, pred_labels)))

Loading the test data

In [None]:
folder = '../input/testing/testing'
testing_images = []
image_names = []

# Iterate through each image in the testing folder. Sorted so the order of
# the predictions does not depend on the file system.
for file in tqdm(sorted(os.listdir(folder))):

    # Get the path name of the image
    img_path = os.path.join(folder, file)

    # cv2.imread returns None instead of raising when a file cannot be
    # decoded; skip it rather than crash in cvtColor.
    test_image = cv2.imread(img_path)
    if test_image is None:
        print("Skipping unreadable file {}".format(img_path))
        continue

    # Convert to RGB and resize
    test_image = cv2.cvtColor(test_image, cv2.COLOR_BGR2RGB)
    test_image = cv2.resize(test_image, IMAGE_SIZE)

    # Keep the image and its file name at the same position in both lists
    testing_images.append(test_image)
    image_names.append(file)

testing_images = np.array(testing_images, dtype='float32')

Extracting features from testing images and making predictions

In [None]:
# The ensemble was trained on features of images divided by 255, so the
# testing images must be rescaled the same way before feature extraction.
features = vgg.predict(testing_images / 255.0)

# Sum the class probabilities of all estimators and pick the largest total.
predictions = []
for i in range(n_estimators):
    predictions.append(models[i].predict(features))

predictions = np.array(predictions)
predictions = predictions.sum(axis=0)
pred_labels = predictions.argmax(axis=1)

Writing predictions to the .txt file

In [None]:
# image_names and pred_labels are in the same order, so zip pairs each file
# name with its predicted label.
result = zip(image_names, pred_labels)

# One "<file name> <class name>" line per image. The context manager closes
# the file even if a write fails.
with open('./run3.txt', 'w') as f:
    for name, label in result:
        f.write(name + ' ' + class_names[label])
        f.write('\n')

In [None]:
# Sanity check: show the first few file names with their predicted class.
# (The previous version printed an undefined name and raised NameError.)
for name, label in list(zip(image_names, pred_labels))[:10]:
    print(name, class_names[label])