In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
import os
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics import hamming_loss
import shutil
from IPython.display import display
import importlib
import time
import datetime

posters_dir = os.path.join('.', 'posters')

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

gpu_devices = [device for device in device_lib.list_local_devices() if device ]

if len(gpu_devices) == 0:
    raise ValueError()

Using TensorFlow backend.


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 18027868259578936892
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3211018240
locality {
  bus_id: 1
  links {
  }
}
incarnation: 5357067110326638216
physical_device_desc: "device: 0, name: Quadro M2200, pci bus id: 0000:01:00.0, compute capability: 5.2"
]


In [2]:
import helpers
importlib.reload(helpers)

def split_and_sort(x):
    l = x.split('|')
    l.sort()
    return l

def read_movies_data():
    return pd.read_csv('extended_movie_data_with_local_files.csv', sep=';')

def filter_and_enrich_movies_data(movies):
    movies = helpers.filter_movies(movies)
    movies['release_year'] = pd.to_numeric(movies.release_year)
    movies.dropna(inplace=True)
    movies['genres'] = movies['genres'].apply(split_and_sort)
    return movies

movies = read_movies_data()
print('Shape original: {}'.format(movies.shape))
movies = filter_and_enrich_movies_data(movies)
print('Shape filtered: {}'.format(movies.shape))

movies.head()

Shape original: (26585, 9)
Shape filtered: (10813, 9)


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,release_year,poster_url,language,local_poster_file
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",114709,862,1995,https://image.tmdb.org/t/p/w300/rhIRbceoE9lR4v...,en,1.jpg
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",113497,8844,1995,https://image.tmdb.org/t/p/w300/vgpXmVaVyUL7GG...,en,2.jpg
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",113228,15602,1995,https://image.tmdb.org/t/p/w300/6ksm1sjKMFLbO7...,en,3.jpg
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",114885,31357,1995,https://image.tmdb.org/t/p/w300/16XOMpEaLWkrcP...,en,4.jpg
4,5,Father of the Bride Part II (1995),[Comedy],113041,11862,1995,https://image.tmdb.org/t/p/w300/e64sOI48hQXyru...,en,5.jpg


In [3]:
from collections import Counter
def calc_genre_counts(genres_per_movie):
    flat_list = [genre for movie_genres in genres_per_movie for genre in movie_genres]
    genre_counts = Counter(flat_list)
    df = pd.DataFrame(list(genre_counts.items()), columns=['genre', 'count'])
    df = df.sort_values(['count'], ascending=False)
    df['distribution'] = df['count']/df['count'].sum()
    return df

genre_counts = calc_genre_counts(movies.genres)
genre_counts

Unnamed: 0,genre,count,distribution
6,Drama,4784,0.222109
3,Comedy,3482,0.16166
9,Thriller,2170,0.100747
15,Documentary,1695,0.078694
5,Romance,1467,0.068109
7,Action,1417,0.065788
10,Horror,1151,0.053438
8,Crime,1141,0.052974
0,Adventure,823,0.03821
12,Sci-Fi,661,0.030689


In [4]:
def drop_rare_genres(df):
    movies = df.copy(deep=True)
    to_remove = set(genre_counts[genre_counts['count'] < 1000]['genre'])
    print('Genres to drop: {}'.format(to_remove))
    movies['relevant_genres'] = movies['genres'].apply(lambda x: [genre for genre in x if genre not in to_remove])
    cleaned_genre_movies = movies[movies.relevant_genres.map(len) > 0]
    return cleaned_genre_movies

drop_rare_genres(movies).shape

Genres to drop: {'Sci-Fi', 'Musical', 'Western', 'War', 'Adventure', 'Mystery', 'Film-Noir', 'Animation', 'Children', 'Fantasy'}


(10502, 10)

In [5]:
movies = drop_rare_genres(movies)
calc_genre_counts(movies.relevant_genres)

Genres to drop: {'Sci-Fi', 'Musical', 'Western', 'War', 'Adventure', 'Mystery', 'Film-Noir', 'Animation', 'Children', 'Fantasy'}


Unnamed: 0,genre,count,distribution
2,Drama,4784,0.27642
0,Comedy,3482,0.20119
5,Thriller,2170,0.125383
7,Documentary,1695,0.097937
1,Romance,1467,0.084763
3,Action,1417,0.081874
6,Horror,1151,0.066505
4,Crime,1141,0.065927


In [6]:
label_binarizer = MultiLabelBinarizer()
label_binarizer.fit(movies['relevant_genres'])

print('Count of classes: {}'.format(len(label_binarizer.classes_)))

def print_label_bin_example():
    print('\nExample: ')
    example = [movies.iloc[2]['relevant_genres']]
    print(example)
    print(label_binarizer.transform(example))

print_label_bin_example()

Count of classes: 8

Example: 
[['Comedy', 'Drama', 'Romance']]
[[0 1 0 0 1 0 1 0]]


In [7]:
def print_dataset_statistics(df):
    lengths_of_genres = df['relevant_genres'].apply(lambda x: len(x))
    genres_count_average = np.mean(lengths_of_genres)
    print('Genre count average {}'.format(genres_count_average))
    genres_count_variance = np.var(lengths_of_genres)
    print('Genre count variance {}'.format(genres_count_variance))
    
print_dataset_statistics(movies)

Genre count average 1.6479718148924014
Genre count variance 0.6053658160024004


In [8]:
X_train, X_validation = train_test_split(movies, test_size=0.1, random_state=1)
X_train, X_test = train_test_split(X_train, test_size=0.1, random_state=1)

In [9]:
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score

def calc_performance_metric(y_true, y_predicted):
    return roc_auc_score(y_true, y_predicted, average='macro')

def calc_optimal_thresholds(y_true, y_predicted):
    optimal_thresholds = []
    
    for i in range(y_true.shape[1]):
        column_true = y_true[:, i]
        column_predicted = y_predicted[:, i]
        fpr, tpr, thresholds = roc_curve(column_true, column_predicted)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        optimal_thresholds.append(optimal_threshold)
        
    return optimal_thresholds

def test_optimal_thresholds_calc():
    optimal_thresholds = calc_optimal_thresholds(np.array([[1], [0]]), np.array([[0.61], [0.59]]))
    assert len(optimal_thresholds) == 1, 'Len: {}'.format(len(optimal_thresholds))
    assert 0.59 < optimal_thresholds[0] <= 0.61 , str(optimal_thresholds)
    
test_optimal_thresholds_calc()

In [10]:
def calc_mean_accuracy(y_true, y_predicted_values, thresholds):
    y_predicted = np.zeros(y_true.shape)
    for column in range(y_true.shape[1]):
        y_predicted[:, column] = (y_predicted_values[:, column] > thresholds[column]).astype(int)
       
    return np.mean([accuracy_score(y_true[:, col], y_predicted[:, col]) for col in range(y_true.shape[1])])

assert calc_mean_accuracy(np.array([[1, 1], [0, 1]]), np.array([[1, 1], [1, 1]]), [.5, .5]) == 0.75

In [11]:
def calc_model_performance(labels, predictions):
    performance_metric = calc_performance_metric(labels, predictions)
    optimal_thresholds = calc_optimal_thresholds(labels, predictions)
    mean_accuracy = calc_mean_accuracy(labels, predictions, optimal_thresholds)
    return performance_metric, optimal_thresholds, mean_accuracy

In [12]:
img_width, img_height = 224, 224
#img_width, img_height = 299, 299

def batch_generator(dataframe, distortion=False, batch_size=20):
    train_datagen = ImageDataGenerator(
        rescale=1./255,
#         shear_range=0.2,
#         zoom_range=0.2,
#         horizontal_flip=False
    )   
    
    row_generator = dataframe.sample(frac=1).iterrows()
    
    for batch_count in range(int(dataframe.shape[0]/batch_size)):
        
        batch_img = []
        batch_labels = []
        batch_img_files = []
        
        #start_time = time.time()
        
        for img_count in range(batch_size):
            _, row = next(row_generator)

            img_path = os.path.join(posters_dir, row['local_poster_file'])
            img = load_img(img_path, target_size=(img_width, img_height))
            img = img_to_array(img)
            if hasattr(img, 'close'):
                img.close()
            if distortion:
                img = train_datagen.random_transform(img)
            img = train_datagen.standardize(img)

            label = label_binarizer.transform([row['relevant_genres']])[0]

            batch_img.append(img)
            batch_labels.append(label)
            batch_img_files.append(row['local_poster_file'])

        yield [np.array(batch_img), np.array(batch_labels), batch_img_files]
            
    return

#print(X_train.shape)
#next(batch_generator(X_train))

In [23]:
from keras.models import Sequential, Model, Input
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras import applications
from keras import optimizers
from keras.layers import concatenate

from bp_mll_keras import bp_mll_loss

def create_model():
    base_model = applications.VGG16(include_top=False, weights='imagenet', input_shape=(img_width, img_height, 3))
    #base_model = applications.InceptionV3(include_top=False, weights='imagenet', input_shape=(img_width, img_height, 3))

    #print(base_model.input.name)

    for layer in base_model.layers:
        layer.trainable = False

    x = base_model.output

    x = GlobalAveragePooling2D()(x)
    x = Dense(256, activation='relu')(x)

    #tags = Input(shape=(1,) , name='tags')
    #x = concatenate([x, tags])
    #x = Flatten()(x)

    predictions = Dense(len(label_binarizer.classes_), activation='sigmoid')(x)
    model = Model(inputs=[base_model.input], outputs=predictions)

    model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    #model.compile(optimizer='rmsprop', loss=bp_mll_loss)
    
    return model
    
model = create_model()

In [24]:
def calc_model_predictions(model, data):
    inputs, y_true, _ = next(batch_generator(data, batch_size=data.shape[0], distortion=False))
    return y_true, model.predict(inputs)

def print_model_performance(model, data):
    y_true, predictions = calc_model_predictions(model, X_validation)
    calc_model_performance(y_true, predictions)
    performance_metric, optimal_thresholds, mean_accuracy = calc_model_performance(y_true, predictions)
    print("Performance: {}, Mean Accuracy: {}".format(performance_metric, mean_accuracy))
    
print_model_performance(model, X_test)

Performance: 0.5062896768965802, Mean Accuracy: 0.3554947668886774


In [25]:
best_performance_metric = -1
model_save_path = os.path.join('.', 'models', str(int(time.time())))
os.makedirs(model_save_path, exist_ok=True)
best_model_path = os.path.join(model_save_path, 'best_keras_model.h5py')

start_time = time.time()

for epoch in range(5):
    
    count = 0
  
    performance_metrics = []
    losses = []

    for batch_inputs, batch_labels, _ in batch_generator(X_train, distortion=True):
        loss = model.train_on_batch(batch_inputs, batch_labels)
        losses.append(loss)

        count += 1
        if count % 10 == 0:
            y_true_train, y_pred_train = calc_model_predictions(model, X_train.sample(frac=0.02, random_state=2))
            performance_train, optimal_thresholds, mean_accuracy_train = calc_model_performance(y_true_train, y_pred_train)
            
            runtime = str(datetime.timedelta(seconds=int(time.time()-start_time)))
            print('{} Batches {}, Train Loss: {:.3f}, Performance metric: {:.3f}, Mean Accuracy: {}' \
                  .format(runtime, count, np.mean(losses), performance_train, mean_accuracy_train))
            
        if count % 100 == 0:
            y_true_val, y_pred_val = calc_model_predictions(model, X_validation)
            performance_val, optimal_thresholds, mean_accuracy_val = calc_model_performance(y_true_val, y_pred_val)
            
            print('VALIDATION: Performance metric: {:.3f}, Mean Accuracy: {}'.format(performance_val, mean_accuracy_val))
            
            interim_model_path = os.path.join(model_save_path, \
                                              'batch-{}-performance-{}'.format(count, int(performance_val*1000)))
            
        
            if performance_val > best_performance_metric:
                best_performance_metric = performance_val
                model.save(best_model_path)

0:00:11 Batches 10, Train Loss: 3.124, Performance metric: 0.627, Mean Accuracy: 0.6176470588235294
0:00:21 Batches 20, Train Loss: 2.998, Performance metric: 0.684, Mean Accuracy: 0.6529411764705882
0:00:31 Batches 30, Train Loss: 2.985, Performance metric: 0.694, Mean Accuracy: 0.6323529411764706
0:00:39 Batches 40, Train Loss: 2.991, Performance metric: 0.701, Mean Accuracy: 0.7036764705882353
0:00:47 Batches 50, Train Loss: 3.009, Performance metric: 0.700, Mean Accuracy: 0.6279411764705882
0:00:55 Batches 60, Train Loss: 2.996, Performance metric: 0.706, Mean Accuracy: 0.6580882352941176
0:01:03 Batches 70, Train Loss: 3.016, Performance metric: 0.710, Mean Accuracy: 0.6419117647058823
0:01:11 Batches 80, Train Loss: 3.044, Performance metric: 0.715, Mean Accuracy: 0.6845588235294118
0:01:19 Batches 90, Train Loss: 3.051, Performance metric: 0.719, Mean Accuracy: 0.6977941176470588
0:01:27 Batches 100, Train Loss: 3.048, Performance metric: 0.720, Mean Accuracy: 0.7213235294117648

0:15:18 Batches 360, Train Loss: 2.796, Performance metric: 0.778, Mean Accuracy: 0.7147058823529412
0:15:27 Batches 370, Train Loss: 2.802, Performance metric: 0.780, Mean Accuracy: 0.7080882352941177
0:15:35 Batches 380, Train Loss: 2.799, Performance metric: 0.779, Mean Accuracy: 0.6970588235294117
0:15:43 Batches 390, Train Loss: 2.800, Performance metric: 0.781, Mean Accuracy: 0.7220588235294118
0:15:51 Batches 400, Train Loss: 2.796, Performance metric: 0.780, Mean Accuracy: 0.7198529411764707
VALIDATION: Performance metric: 0.768, Mean Accuracy: 0.7263320647002853
0:16:20 Batches 410, Train Loss: 2.794, Performance metric: 0.784, Mean Accuracy: 0.711764705882353
0:16:30 Batches 420, Train Loss: 2.796, Performance metric: 0.784, Mean Accuracy: 0.7205882352941175
0:16:55 Batches 10, Train Loss: 3.141, Performance metric: 0.783, Mean Accuracy: 0.7301470588235294
0:17:05 Batches 20, Train Loss: 2.916, Performance metric: 0.780, Mean Accuracy: 0.7139705882352941
0:17:14 Batches 30, T

0:31:35 Batches 290, Train Loss: 2.723, Performance metric: 0.792, Mean Accuracy: 0.7426470588235294
0:31:45 Batches 300, Train Loss: 2.722, Performance metric: 0.792, Mean Accuracy: 0.7360294117647059


KeyboardInterrupt: 

In [26]:
from keras import backend as K
K.clear_session()

In [27]:
from keras.models import load_model
loaded_model = load_model(best_model_path, custom_objects={'bp_mll_loss': bp_mll_loss})

In [28]:
print_model_performance(loaded_model, X_test)

Performance: 0.775249709457181, Mean Accuracy: 0.7165794481446242


In [29]:
def calc_opt_th(model, data):
    y_true, y_pred = calc_model_predictions(model, data)
    return calc_optimal_thresholds(y_true, y_pred)

optimal_thresholds = calc_opt_th(loaded_model, X_validation)
optimal_thresholds

[0.066745654,
 0.13608873,
 0.11436961,
 0.099687755,
 0.44896445,
 0.22705011,
 0.10063748,
 0.27127913]

In [20]:
def calc_predicted_labels(label_binarizer, y_pred_values, thresholds):
    y_predicted = (y_pred_values > thresholds).astype(int)
    return label_binarizer.inverse_transform(np.array([y_predicted]))[0]

def test_predicted_labels():
    lb = MultiLabelBinarizer()
    labels1 = ['class1', 'class2']
    labels2 = ['class1']
    lb.fit([labels1, labels2])
    predicted_labels = calc_predicted_labels(lb, np.array([0.2, 0.5]), np.array([0.1, 0.6]))
    assert len(predicted_labels) == len(labels2) == 1
    assert predicted_labels[0] == labels2[0]
    
test_predicted_labels()

In [21]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

def plot_img(img_file, teached_target, labels, y_pred_values, predicted_values):
    plt.figure(figsize=(6,8))
    img=mpimg.imread(os.path.join(posters_dir, img_file))

    ax1 = plt.subplot(2, 1, 1)
    plt.imshow(img)

    ax = plt.subplot(2, 1, 2)
    plt.barh(range(len(labels)), list(y_pred_values))
    plt.yticks(range(len(labels)), list(labels), fontsize=12)
    ax.set_xlim(right=1.0)

    plt.gcf().text(0, 1.05, 'Labeled as {}'.format(teached_target), fontsize=18)
    plt.gcf().text(0, 1, 'Predicted {}'.format(predicted_values), fontsize=18)
    plt.gcf().text(0, 0.95, os.path.basename(img_file), fontsize=10)
    plt.tight_layout()
    #plt.show()
    
    return plt

In [22]:
example_classifications_path = os.path.join('.', 'classification_examples')

if os.path.exists(example_classifications_path):
    shutil.rmtree(example_classifications_path)

if not os.path.exists(example_classifications_path):
    os.makedirs(example_classifications_path)

count = 0
for batch_inputs, batch_labels, img_files  in batch_generator(X_test, distortion=False):
    predictions = loaded_model.predict(batch_inputs)
    for i in range(batch_inputs.shape[0]):
        count += 1
        encoded_label = np.expand_dims(batch_labels[i], axis=0)
        teached_labels = label_binarizer.inverse_transform(encoded_label)[0]
        prediction = predictions[i]
        predicted_labels = calc_predicted_labels(label_binarizer, prediction, optimal_thresholds)
        plt = plot_img(img_files[i], teached_labels, label_binarizer.classes_, prediction, predicted_labels)
        plt.savefig(os.path.join(example_classifications_path, img_files[i]), bbox_inches='tight')
        plt.close()