# Set up the environment

In [None]:
# Directory layout used throughout the notebook:
#   INPUT  - where the competition files live (download target below)
#   SYNIMG - where the image archive is unpacked
#   OUTPUT - where the submission file is written
INPUT = '../input'
SYNIMG = f'{INPUT}/synimg'
OUTPUT = '.'

**Run the next cell only when not running in the competition's Kaggle kernel** (it downloads and unpacks the data with the Kaggle CLI)

In [None]:
# Download the competition files into INPUT, then unpack the image archive
# into SYNIMG (`unzip -u` only extracts files that are missing or newer).
!kaggle competitions download -p $INPUT -c synthetic-image-classification
!unzip -u -d $SYNIMG $INPUT/synimg.zip

# Check out files

In [None]:
# List the contents of the input directory.
!ls $INPUT

In [None]:
# List the contents of the unpacked dataset directory.
!ls $SYNIMG/synimg

# Check out styles

In [None]:
import pandas as pd

# The column name is supplied explicitly via `names`, so every line of
# styles.txt becomes a row of the `style_name` column.
styles = pd.read_csv(f'{SYNIMG}/synimg/styles.txt', names=['style_name'])

In [None]:
# Display the style table.
styles

# Check out training data

In [None]:
# List the contents of the training directory.
!ls $SYNIMG/synimg/train

In [None]:
import pandas as pd

# Load the training index; later cells read its 'style_name' and 'filepath'
# columns.
train = pd.read_csv(f'{SYNIMG}/synimg/train/data.csv')

In [None]:
# Preview the first rows of the training index.
train.head()

In [None]:
# (rows, columns) of the training index.
train.shape

In [None]:
# Number of training rows for each style.
train.groupby('style_name')[['style_name']].count()

In [None]:
from IPython.display import Image

# For every style, show its name followed by its first three training images.
for style in styles['style_name']:
    display(style)
    is_style = train['style_name'] == style
    for filepath in train.loc[is_style, 'filepath'].head(3):
        display(Image(f'{SYNIMG}/{filepath}'))

# Check out test data

In [None]:
# List the contents of the test directory.
!ls $SYNIMG/synimg/test

In [None]:
import pandas as pd

# Load the test index; later cells read its 'filepath' and 'id' columns.
test = pd.read_csv(f'{SYNIMG}/synimg/test/data_nostyle.csv')

In [None]:
# Preview the first rows of the test index.
test.head()

In [None]:
# (rows, columns) of the test index.
test.shape

In [None]:
from IPython.display import Image

# Show the first three test images.
for filepath in test['filepath'].head(3):
    display(Image(f'{SYNIMG}/{filepath}'))

# Get image size

In [None]:
import keras

# Load one sample test image; the following cells inspect its height, width
# and bands to derive the model's input shape.
image = keras.preprocessing.image.load_img(f'{SYNIMG}/synimg/test/A/test-A-9000000.jpg')

In [None]:
# Height and width of the sample image.
image.height, image.width

In [None]:
# Bands of the sample image; their count is used as the channel dimension.
image.getbands()

In [None]:
# Shape tuple in (height, width, channels) order; reused as the model's
# input_shape and by the unit tests.
image_shape = (image.height, image.width, len(image.getbands()))

In [None]:
# Display the derived shape.
image_shape

# Create label encoder

In [None]:
import sklearn.preprocessing

# Encoder between style names and binary indicator rows, fitted on the full
# list of styles so that every style gets its own column.
label_encoder = sklearn.preprocessing.LabelBinarizer()
label_encoder.fit(styles['style_name'])

In [None]:
import numpy as np

# Sanity-check the encoder: show the learned classes, encode a few style
# names, and decode two one-hot rows back to names.
# NOTE: the names passed to transform() must match entries of styles.txt
# exactly; 'Sydney' is assumed to be one of them - confirm against the
# styles table displayed earlier.
display(label_encoder.classes_)
display(label_encoder.transform(['HongKong', 'Zurich', 'Sydney', 'Zurich']))
display(label_encoder.inverse_transform(np.array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])))

# Data processing functions

In [None]:
import numpy as np
import keras
import sklearn.model_selection

def load_image(filepath):
    """Load one image and return its pixels as an array divided by 255.0.

    `filepath` is resolved relative to the SYNIMG directory.
    """
    pil_image = keras.preprocessing.image.load_img(f'{SYNIMG}/{filepath}')
    pixels = np.asarray(pil_image)
    return pixels / 255.0

def load_images(filepaths):
    """Load every path in `filepaths` with load_image and stack the results.

    Returns a single array whose first axis indexes the images, in the same
    order as `filepaths`.
    """
    loaded = []
    for filepath in filepaths:
        loaded.append(load_image(filepath))
    return np.asarray(loaded)

def load_data(df, test_size=0.25, random_state=None):
    """Load the images listed in `df`, encode their labels, and split them.

    Parameters
    ----------
    df : DataFrame
        Must provide a 'filepath' column (paths relative to SYNIMG) and a
        'style_name' column.
    test_size : float or int, optional
        Forwarded to train_test_split; the default of 0.25 holds out a
        quarter of the rows.
    random_state : int or None, optional
        Forwarded to train_test_split. The default (None) leaves the split
        unseeded; pass an integer to get the same split on every run.

    Returns
    -------
    list
        [images_train, images_test, labels_train, labels_test], exactly as
        returned by train_test_split.
    """
    images = load_images(df['filepath'])
    labels = label_encoder.transform(df['style_name'])
    return sklearn.model_selection.train_test_split(
        images, labels, test_size=test_size, random_state=random_state)

In [None]:
import unittest

class TestLoadImages(unittest.TestCase):
    """Checks load_image / load_images against files from the test set."""

    def test_load_image(self):
        """A single image loads as a float64 array of image_shape in [0, 1]."""
        result = load_image('synimg/test/A/test-A-9000000.jpg')
        self.assertIsInstance(result, np.ndarray)
        self.assertEqual(result.dtype, 'float64')
        self.assertEqual(result.shape, image_shape)
        # load_image divides the pixels by 255.0, so values must stay in [0, 1].
        # Separate bound checks report the offending value on failure.
        self.assertGreaterEqual(result.min(), 0.0)
        self.assertLessEqual(result.max(), 1.0)

    def test_load_images(self):
        """Two paths load into one array with a leading axis of length 2."""
        result = load_images(['synimg/test/A/test-A-9000000.jpg', 'synimg/test/B/test-B-9000001.jpg'])
        self.assertEqual(result.shape, (2, *image_shape))

class TestLoadData(unittest.TestCase):
    """Checks the shapes load_data returns for a four-row sample frame."""

    def test_load_data(self):
        """Four rows split 3/1, with one label column per known style."""
        sample_rows = [
            ('Luanda', 'synimg/train/Luanda/train-Luanda-1000000.jpg'),
            ('Luanda', 'synimg/train/Luanda/train-Luanda-1000001.jpg'),
            ('Brisbane', 'synimg/train/Brisbane/train-Brisbane-1090000.jpg'),
            ('Brisbane', 'synimg/train/Brisbane/train-Brisbane-1090001.jpg'),
        ]
        df = pd.DataFrame(sample_rows, columns=['style_name', 'filepath'])

        X_train, X_test, y_train, y_test = load_data(df)

        n_styles = len(styles)
        expected_train = ((3, *image_shape), (3, n_styles))
        expected_test = ((1, *image_shape), (1, n_styles))
        self.assertEqual((X_train.shape, y_train.shape), expected_train)
        self.assertEqual((X_test.shape, y_test.shape), expected_test)

# Run the tests inside the notebook: argv=[''] keeps unittest from parsing the
# kernel's own command-line arguments, and exit=False stops it from calling
# sys.exit() once the run finishes.
unittest.main(argv=[''], exit=False)

# Load data

In [None]:
# Note: test_images / test_labels are the held-out part of the *training*
# data returned by load_data, not the competition test set.
train_images, test_images, train_labels, test_labels = load_data(train)

# Create model

In [None]:
from keras import layers, models

# Convolutional classifier: three Conv2D layers (the first two each followed
# by 2x2 max pooling), dropout, then a dense head with one softmax output per
# style.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=image_shape),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(styles), activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

# Train model

In [None]:
# categorical_crossentropy pairs with the softmax output layer and the
# indicator-row labels produced by label_encoder.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(train_images, train_labels, epochs=20)

# Evaluate model

In [None]:
# Evaluate on the held-out split of the training data (not the competition
# test set).
test_loss, test_acc = model.evaluate(test_images, test_labels)

In [None]:
# Accuracy on the held-out split.
test_acc

# Investigate performance

In [None]:
# Model outputs for the held-out split: one row of per-style scores per image.
test_predictions = model.predict(test_images)

In [None]:
import pandas as pd

# Decode both the true labels and the model outputs back to style names, then
# tabulate, per true style, the fraction of held-out images that were
# predicted correctly (True column) or incorrectly (False column).
test_predictions_df = pd.DataFrame({
    'expected': label_encoder.inverse_transform(test_labels),
    'actual': label_encoder.inverse_transform(test_predictions),
})
is_correct = test_predictions_df['expected'] == test_predictions_df['actual']
pd.crosstab(test_predictions_df['expected'], is_correct, normalize='index')

# Run predictions

In [None]:
# Predict on the competition test set: load its images, run the model, and
# map each prediction row back to a style name.
real_test_images = load_images(test['filepath'])
predictions = model.predict(real_test_images)
prediction_labels = label_encoder.inverse_transform(predictions)
prediction_labels

# Submit

In [None]:
# Peek at the sample submission to see the expected file format.
!head $INPUT/sample_submission.csv

In [None]:
# Pair each test id with its predicted style. This relies on
# prediction_labels being in the same row order as `test`, which holds
# because the images were loaded from test['filepath'] in order.
submission = test[['id']].assign(style_name = prediction_labels)
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(f'{OUTPUT}/submission.csv', index=False)

In [None]:
# Check the first lines of the file that was just written.
!head $OUTPUT/submission.csv

In [None]:
# Kept commented out, so running the notebook does not submit; uncomment to
# upload the submission file with the Kaggle CLI.
#!kaggle competitions submit -c synthetic-image-classification -f $OUTPUT/submission.csv -m ''