In [222]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [228]:
data = pd.read_csv('movie_metadata.csv')
data_useful = data[['budget', 'duration','director_facebook_likes','actor_1_facebook_likes','actor_2_facebook_likes','actor_3_facebook_likes','gross','imdb_score','title_year','aspect_ratio','cast_total_facebook_likes','facenumber_in_poster']]
data_useful = data_useful.drop(['gross'], axis = 1)
data_useful = data_useful.dropna()

In [224]:
def shuffle_together(a, b):
    assert len(a) == len(b)
    shuffled_a = np.empty(a.shape, dtype=a.dtype)
    shuffled_b = np.empty(b.shape, dtype=b.dtype)
    permutation = np.random.permutation(len(a))
    for old_index, new_index in enumerate(permutation):
        shuffled_a[new_index] = a[old_index]
        shuffled_b[new_index] = b[old_index]
    return shuffled_a, shuffled_b

def order_score(a,b):
    assert len(a)==len(b)
    new_a = np.empty(a.shape, dtype=a.dtype)
    new_b = np.empty(b.shape, dtype=b.dtype)
    maxim = max(b)
    for i in range(len(b)):
        ind = np.argmin(b)
        new_b[i] = b[ind]
        new_a[i] = a[ind]
        b[ind] = maxim + 1
    return new_a, new_b

def prep_data(num_classes, test_data_perc, data_useful, verbose = 1):
    x_not_norm = np.array(data_useful.drop(['imdb_score'], axis=1))
    y_not_norm = np.array(data_useful['imdb_score'])
    [x_not_norm, y_not_norm] = order_score(x_not_norm, y_not_norm)

    # Network Parameters
    num_films = np.shape(x_not_norm)[0]
    num_input = np.shape(x_not_norm)[1] # number of inputs

    # Normalise data
    x = np.zeros(np.shape(x_not_norm))
    for i in range(num_input):
        maxim = max(x_not_norm[:, i]) 
        minim = min(x_not_norm[:, i]) 
        x[:,i] = (x_not_norm[:, i] - minim) / (maxim - minim)
    if verbose:
        print('x = ', np.shape(x))
    
    films_per_class = int(num_films / num_classes) + 1
    y = np.zeros(len(y_not_norm))
    y = y.astype(int)
    for i in range(len(y)):
        y[i] = int(i / films_per_class)
    if verbose:
        print('y = ', np.shape(y))
    
    score_separation = np.zeros(num_classes + 1)
    score_separation[num_classes] = 10
    for i in range(num_classes):
        score_separation[i] = y_not_norm[i*films_per_class+1]
    
#     score_separation = 10 / num_classes
#     y = np.zeros(len(y_not_norm))
#     y = y.astype(int)
#     for i in range(len(y)):
#         y[i] = int(y_not_norm[i] / score_separation)
#     print('y = ', np.shape(y))


        
    [x, y] = shuffle_together(x, y)

    x_train = x[:int( (1-test_data_perc) * len(x))]
    x_test  = x[len(x_train):]
    y_train = y[:len(x_train)]
    y_test  = y[len(x_train):]

    if verbose:
        print('x_train = ', np.shape(x_train))
        print('x_test = ', np.shape(x_test))

        counts = np.bincount(y_train)
        for i in range(num_classes):
            print('films ', score_separation[i], 'to', score_separation[i+1],
                  ' -> ', counts[i]/len(y_train)*100)

    return [num_input, x_train, y_train, x_test, y_test]

In [230]:
reps = 5
acc = 0
loss = 0
num_classes = 2 # How many score divisions we want
ep = 5 # Epochs per repetition
neu = 20 # Neurons in mid layers
for i in range(reps):
    [num_input, x_train, y_train, x_test, y_test] = prep_data(num_classes, .1, data_useful, verbose = 1)
    model = keras.Sequential([
        keras.layers.Dense(num_input, activation=tf.nn.relu),
        keras.layers.Dense(neu, activation=tf.nn.relu),
        keras.layers.Dense(num_classes, activation=tf.nn.sigmoid)
    ])
    model.compile(optimizer='adam',
                           loss='sparse_categorical_crossentropy',
                           metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=ep, verbose = 0)
    test_loss, test_acc = model.evaluate(x_test, y_test)
    acc += test_acc
    loss += test_loss
    print(i + 1, 'REPS DONE')
    
print('accuracy = ', acc / reps)
print('loss = ', loss / reps)
print('percentage of films per category = ', 1/num_classes)
    

x =  (4293, 10)
y =  (4293,)
x_train =  (3863, 10)
x_test =  (430, 10)
films  1.9 to 6.6  ->  49.469324359306235
films  6.6 to 10.0  ->  50.53067564069376
1 REPS DONE
x =  (4293, 10)
y =  (4293,)
x_train =  (3863, 10)
x_test =  (430, 10)
films  1.9 to 6.6  ->  50.064716541548016
films  6.6 to 10.0  ->  49.93528345845198
2 REPS DONE
x =  (4293, 10)
y =  (4293,)
x_train =  (3863, 10)
x_test =  (430, 10)
films  1.9 to 6.6  ->  49.90939684183277
films  6.6 to 10.0  ->  50.09060315816723
3 REPS DONE
x =  (4293, 10)
y =  (4293,)
x_train =  (3863, 10)
x_test =  (430, 10)
films  1.9 to 6.6  ->  50.349469324359305
films  6.6 to 10.0  ->  49.650530675640695
4 REPS DONE
x =  (4293, 10)
y =  (4293,)
x_train =  (3863, 10)
x_test =  (430, 10)
films  1.9 to 6.6  ->  50.220036241263266
films  6.6 to 10.0  ->  49.779963758736734
5 REPS DONE
accuracy =  0.6218604683876038
loss =  0.645280154250389
percentage of films per category =  0.5
