In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
import os
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics import hamming_loss
import shutil

posters_dir = os.path.join('.', 'posters')
model_save_path = os.path.join('.', 'best_keras_model.h5py')

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [10]:
movies = pd.read_csv('extended_movie_data_with_local_files.csv', sep=';')
movies.head()
movies.shape

(26585, 9)

In [17]:
import importlib
import helpers
importlib.reload(helpers)

movies = helpers.filter_movies(movies)
movies.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  filtered_movies['release_year'] = pd.to_numeric(filtered_movies.release_year)


(10874, 9)

In [110]:
genres = movies.genres.tolist()
genres = list(map(lambda x: x.split('|'), genres))
label_binarizer = MultiLabelBinarizer()
label_binarizer.fit(genres)
print(genres[:2])
print(label_binarizer.transform(genres[:2]))
len(label_binarizer.classes_)

[['Comedy', 'Horror'], ['Drama', 'Sci-Fi']]
[[0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0]]


18

In [111]:
X_train, X_validation = train_test_split(movies, test_size=0.1, random_state=1)
X_train, X_test = train_test_split(X_train, test_size=0.1, random_state=1)
X_train.head()

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,imdbId,tmdbId,poster_img
131,131,1629,"MatchMaker, The (1997)",Comedy|Romance,119632,20457,1629.jpg
870,870,30808,It Happens Every Spring (1949),Comedy|Sci-Fi,41514,88288,30808.jpg
1692,1692,91697,Pitfall (1948),Film-Noir,40695,25688,91697.jpg
1984,1984,104625,Apartment for Peggy (1948),Drama,40104,218212,104625.jpg
1286,1286,71573,Whiteout (2009),Action|Crime|Drama|Mystery|Thriller,365929,22787,71573.jpg


In [112]:
length_of_genres = list(map(lambda x: len(x), genres))
genres_count_average = np.mean(length_of_genres)
print('Genre count average {}'.format(genres_count_average))
genres_count_variance = np.var(length_of_genres)
print('Genre count variance {}'.format(genres_count_variance))

Genre count average 1.9856
Genre count variance 1.07499264


In [113]:
genres_test = X_test.genres.tolist()
genres_test = list(map(lambda x: x.split('|'), genres_test))
labels_test = label_binarizer.transform(genres_test)
hamming_loss(labels_test, np.zeros(labels_test.shape))

0.10518518518518519

In [114]:
class RollingAverage:
    def __init__(self):
        self._values = []
    
    def put(self, value):
        self._values.append(value)
        
    def average(self):
        return np.mean(self._values)

In [115]:
batch_size = 20
img_width, img_height = 224, 224
#img_width, img_height = 299, 299

def batch_generator(dataframe, distortion=False):
    train_datagen = ImageDataGenerator(
        rescale=1./255,
#         shear_range=0.2,
#         zoom_range=0.2,
#         horizontal_flip=False
    )   

    row_generator = dataframe.sample(frac=1).iterrows()
    
    for batch_count in range(int(dataframe.shape[0]/batch_size)):
        
        batch_img = []
        batch_labels = []
        batch_img_files = []
        
        for img_count in range(batch_size):
            _, row = next(row_generator)

            img_path = os.path.join(posters_dir, row['poster_img'])
            img = load_img(img_path, target_size=(img_width, img_height))
            img = img_to_array(img)
            if hasattr(img, 'close'):
                img.close()
            if distortion:
                img = train_datagen.random_transform(img)
            img = train_datagen.standardize(img)

            label_list = row['genres'].split('|')
            label = label_binarizer.transform([label_list])[0]

            batch_img.append(img)
            batch_labels.append(label)
            batch_img_files.append(row['poster_img'])

        yield [np.array(batch_img), np.array(batch_labels), batch_img_files]
            
    return

#print(X_train.shape)
#next(batch_generator(X_train))

In [116]:
from keras.models import Sequential, Model, Input
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras import applications
from keras import optimizers
from keras.layers import concatenate

base_model = applications.VGG16(include_top=False, weights='imagenet', input_shape=(img_width, img_height, 3))
#base_model = applications.InceptionV3(include_top=False, weights='imagenet', input_shape=(img_width, img_height, 3))

print(base_model.input.name)

for layer in base_model.layers:
    layer.trainable = False

x = base_model.output

x = GlobalAveragePooling2D()(x)
x = Dense(256, activation='relu')(x)

#fg_distribution = Input(shape=(len(label_binarizer.classes_),), name='fg_distribution')
#densities = Input(shape=(1,) , name='densities')
#x = concatenate([x, fg_distribution, densities])
#x = Flatten()(x)

predictions = Dense(len(label_binarizer.classes_), activation='sigmoid')(x)
model = Model(inputs=[base_model.input], outputs=predictions)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

input_5:0


In [117]:
def calc_hamming_loss(labels, batch):
    predictions_sigmoid = model.predict_on_batch(batch_inputs)
    predictions = np.where(predictions_sigmoid > 0.5, 1, 0)
    return hamming_loss(batch_labels, predictions)

x, labels, _ = next(batch_generator(X_train))
calc_hamming_loss(labels, x)

0.525

In [119]:
best_hammond_loss = -1
for epoch in range(5):
    
    count = 0
  
    hamming_loss_rolling = RollingAverage()
    loss_rolling = RollingAverage()

    for batch_inputs, batch_labels, _ in batch_generator(X_train, distortion=True):
        hamming = calc_hamming_loss(batch_labels, batch_inputs)
        hamming_loss_rolling.put(hamming)
        loss = model.train_on_batch(batch_inputs, batch_labels)
        loss_rolling.put(loss)
        count += 1
        if count % 10 == 0:
            print('Batches {}, Train Loss: {:.3f}, Hamming Loss: {:.3f}' \
                  .format(count, loss_rolling.average(), hamming_loss_rolling.average()))
    
    test_hamming_loss_rolling = RollingAverage()
    for test_batch_inputs, test_batch_labels, _ in batch_generator(X_validation, distortion=False):
        test_hamming_loss_rolling.put(calc_hamming_loss(test_batch_labels, test_batch_inputs))
        
    if test_hamming_loss_rolling.average() > best_hammond_loss:
        best_hammond_loss = test_hamming_loss_rolling.average()
        model.save(model_save_path)

    print('Epoch: {}, Hamming Loss: {:.3f}'
          .format(epoch, test_hamming_loss_rolling.average()))

Batches 10, Train Loss: 4.741, Hamming Loss: 0.124
Batches 20, Train Loss: 4.955, Hamming Loss: 0.125
Batches 30, Train Loss: 4.846, Hamming Loss: 0.128
Batches 40, Train Loss: 4.836, Hamming Loss: 0.131
Batches 50, Train Loss: 4.848, Hamming Loss: 0.130
Batches 60, Train Loss: 4.886, Hamming Loss: 0.130
Batches 70, Train Loss: 4.933, Hamming Loss: 0.131
Batches 80, Train Loss: 4.903, Hamming Loss: 0.130
Batches 90, Train Loss: 4.890, Hamming Loss: 0.130
Batches 100, Train Loss: 4.849, Hamming Loss: 0.127
Epoch: 0, Validation Loss: 0.094
Batches 10, Train Loss: 4.742, Hamming Loss: 0.123
Batches 20, Train Loss: 4.781, Hamming Loss: 0.125
Batches 30, Train Loss: 4.784, Hamming Loss: 0.123
Batches 40, Train Loss: 4.815, Hamming Loss: 0.123
Batches 50, Train Loss: 4.739, Hamming Loss: 0.122
Batches 60, Train Loss: 4.734, Hamming Loss: 0.121
Batches 70, Train Loss: 4.763, Hamming Loss: 0.121
Batches 80, Train Loss: 4.711, Hamming Loss: 0.120
Batches 90, Train Loss: 4.690, Hamming Loss: 0.1

In [120]:
from keras.models import load_model
loaded_model = load_model(model_save_path)

In [121]:
hamming = RollingAverage()
for batch_inputs, batch_labels, img_files in batch_generator(X_test, distortion=False):
    hamming.put(calc_hamming_loss(batch_labels, batch_inputs))
    
hamming.average()

0.10151515151515152

In [122]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

def plot_img(img_file, teached_target, labels, result):
    plt.figure(figsize=(6,8))
    img=mpimg.imread(os.path.join(posters_dir, img_file))

    ax1 = plt.subplot(2, 1, 1)
    plt.imshow(img)


    ax = plt.subplot(2, 1, 2)
    plt.barh(range(len(labels)), list(result))
    plt.yticks(range(len(labels)), list(labels), fontsize=12)
    ax.set_xlim(right=1.0)

    plt.gcf().text(0, 1.05, 'Teached as {}'.format(teached_target), fontsize=18)
    plt.gcf().text(0, 1, os.path.basename(img_file), fontsize=10)
    plt.tight_layout()
    #plt.show()
    
    return plt

In [123]:
example_classifications_path = os.path.join('.', 'classification_examples')

if os.path.exists(example_classifications_path):
    shutil.rmtree(example_classifications_path)

if not os.path.exists(example_classifications_path):
    os.makedirs(example_classifications_path)

count = 0
for batch_inputs, batch_labels, img_files  in batch_generator(X_test, distortion=False):
    predictions = loaded_model.predict(batch_inputs)
    for i in range(batch_inputs.shape[0]):
        count += 1
        encoded_label = np.expand_dims(batch_labels[i], axis=0)
        teached_target = label_binarizer.inverse_transform(encoded_label)[0]
        prediction = predictions[i]
        plt = plot_img(img_files[i], teached_target, label_binarizer.classes_, prediction)
        plt.savefig(os.path.join(example_classifications_path, img_files[i]), bbox_inches='tight')
        plt.close()