# Movies and Models
### Predicting Movie Genre and Popularity


#### Import Dataset and Clean

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from keras.callbacks import Callback
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Embedding, BatchNormalization
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.utils import to_categorical

In [34]:
#import dataset
data = pd.read_csv("./the-movies-dataset/movies_metadata.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [35]:
#extract columns that we need: genres, imdb_id, popularity, overview, vote_average, vote_count, original title
dataset = data[['original_title', 'imdb_id', 'overview', 'popularity', 'genres', 'vote_average', 'vote_count']]

In [36]:
#dataset overview
print(dataset.shape)
dataset

(45466, 7)


Unnamed: 0,original_title,imdb_id,overview,popularity,genres,vote_average,vote_count
0,Toy Story,tt0114709,"Led by Woody, Andy's toys live happily in his ...",21.9469,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",7.7,5415.0
1,Jumanji,tt0113497,When siblings Judy and Peter discover an encha...,17.0155,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.9,2413.0
2,Grumpier Old Men,tt0113228,A family wedding reignites the ancient feud be...,11.7129,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",6.5,92.0
3,Waiting to Exhale,tt0114885,"Cheated on, mistreated and stepped on, the wom...",3.85949,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",6.1,34.0
4,Father of the Bride Part II,tt0113041,Just when George Banks has recovered from his ...,8.38752,"[{'id': 35, 'name': 'Comedy'}]",5.7,173.0
5,Heat,tt0113277,"Obsessive master thief, Neil McCauley leads a ...",17.9249,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",7.7,1886.0
6,Sabrina,tt0114319,An ugly duckling having undergone a remarkable...,6.67728,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",6.2,141.0
7,Tom and Huck,tt0112302,"A mischievous young boy, Tom Sawyer, witnesses...",2.56116,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",5.4,45.0
8,Sudden Death,tt0114576,International action superstar Jean Claude Van...,5.23158,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",5.5,174.0
9,GoldenEye,tt0113189,James Bond must unmask the mysterious head of ...,14.686,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",6.6,1194.0


In [37]:
#clean up dataset bad values
dataset = dataset[dataset['popularity'] != 0.0]
dataset = dataset[dataset['genres'] != '[]']
dataset = dataset.dropna()

In [38]:
#create series for each column we're interested in
overviews = dataset['overview']
popularities = dataset['popularity']
genres = dataset['genres']

In [39]:
#callback function to get accuracy, precision, recall and f1score - taken and adapted from: https://github.com/keras-team/keras/issues/5794
class Metrics(Callback):

    def on_train_begin(self, logs={}):
     self.val_f1s = []
     self.val_recalls = []
     self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
     val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
     val_targ = self.validation_data[1]
     _val_f1 = f1_score(val_targ, val_predict, average='weighted')
     _val_recall = recall_score(val_targ, val_predict, average='weighted')
     _val_precision = precision_score(val_targ, val_predict, average='weighted')
     self.val_f1s.append(_val_f1)
     self.val_recalls.append(_val_recall)
     self.val_precisions.append(_val_precision)
     print("val_f1: " + str(_val_f1) + " val_precision: " + str(_val_precision) + " val_recall " + str(_val_recall))
     return
 
metrics = Metrics()

In [40]:
#function to pick random genre
def find_genre(x):
    lst = []
    for elt in x.split():
        if elt[1:-3].istitle():
            lst.append(elt[1:-3])
    if not lst:
        return 'Drama'
    return lst[0]

In [41]:
#select random genre for each datapoint
genres = genres.apply(lambda x: find_genre(x))
print(set(genres))

{'Action', 'Thriller', 'War', 'Horror', 'Western', 'Documentary', 'History', 'Crime', 'Animation', 'Adventure', 'Fantasy', 'Music', 'Drama', 'Scie', 'Comedy', 'Foreign', 'Family', 'Mystery', 'Romance'}


#### Using Models to Predict Genre Based on Overview

In [42]:
#split dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(overviews, genres, test_size=0.4, random_state=0)

In [43]:
#create Tokenizer of at most 10000 words and train on all overviews
max_idx = 10000
t = Tokenizer(num_words=max_idx)

t.fit_on_texts(overviews)

In [44]:
#transform overviews to integer sequences using tokenizer
X_train = t.texts_to_sequences(X_train)
X_test = t.texts_to_sequences(X_test)

In [45]:
# Turn all training examples to length of 600 using padding
max_len = 600

X_train = sequence.pad_sequences(X_train, max_len)
X_test = sequence.pad_sequences(X_test, max_len)

print("new x_train shape is: ", X_train.shape)
print("new x_test shape is: ", X_test.shape)

new x_train shape is:  (25377, 600)
new x_test shape is:  (16918, 600)


In [46]:
#one-hot vectorize genre labels
lb = preprocessing.LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

In [47]:
# Create Recurrent Model for genres
model = Sequential()

# Add embedding to words
model.add(Embedding(input_dim=max_idx+1, output_dim=64,
                    input_length=max_len))

# Add 1D Convolutional layer
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(Dropout(.15))
model.add(MaxPooling1D(pool_size=2))

# Add LSTM layer
model.add((LSTM(16, return_sequences=False, recurrent_dropout=.15)))
model.add(Dropout(.15))

# Add dense layer
model.add(Dense(32))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(.2))

# Add final softmax layer
model.add(Dense(19))
model.add(Activation('softmax'))

# Compile model using categorical crossentropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [48]:
#fit model to data
model.fit(X_train, y_train, batch_size=512,
          epochs=7, verbose=1, validation_data=(X_test, y_test), callbacks=[metrics])

Train on 25377 samples, validate on 16918 samples
Epoch 1/7


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


val_f1: 0.0 val_precision: 0.0 val_recall 0.0
Epoch 2/7
val_f1: 0.11925088245903313 val_precision: 0.3277275701073897 val_recall 0.08665326870788509
Epoch 3/7
val_f1: 0.17431635525700165 val_precision: 0.379013692544012 val_recall 0.11792174015841116
Epoch 4/7
val_f1: 0.20488739674634285 val_precision: 0.37800789320520706 val_recall 0.144875280766048
Epoch 5/7
val_f1: 0.23110508899045115 val_precision: 0.39400610459110197 val_recall 0.1746069275328053
Epoch 6/7
val_f1: 0.2629134619468008 val_precision: 0.3608069129267453 val_recall 0.22674074949757655
Epoch 7/7
val_f1: 0.2827304459481845 val_precision: 0.36108002432307107 val_recall 0.2695945147180518


<keras.callbacks.History at 0x10877ea90>

In [49]:
# Create Feed-Forward Model for genre
model = Sequential()

# Create first dense layer
model.add(Dense(512, input_shape=(600,)))
model.add(Activation('relu'))

# Create second dense layer
model.add(Dense(512))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

# Create third dense layer
model.add(Dense(256))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

# Create final sigmoid layer
model.add(Dense(19))
model.add(Activation('softmax'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [50]:
#fit model to data
model.fit(X_train, y_train, batch_size=512,
          epochs=20, verbose=1, validation_data=(X_test, y_test), callbacks=[metrics])

Train on 25377 samples, validate on 16918 samples
Epoch 1/20
val_f1: 0.01058679239925467 val_precision: 0.09844624400000836 val_recall 0.007152145643693108
Epoch 2/20
 2560/25377 [==>...........................] - ETA: 1s - loss: 2.2854 - acc: 0.2895

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


val_f1: 0.012069994552770176 val_precision: 0.18824214557592162 val_recall 0.006501950585175552
Epoch 3/20
val_f1: 0.013757428829395527 val_precision: 0.15923273470696866 val_recall 0.007388580210426764
Epoch 4/20
val_f1: 0.004974300907856484 val_precision: 0.14954684660079037 val_recall 0.002541671592386807
Epoch 5/20
val_f1: 0.012466199366624901 val_precision: 0.18627395878864872 val_recall 0.006501950585175552
Epoch 6/20
val_f1: 0.028358398162374465 val_precision: 0.18135099026543042 val_recall 0.015900224612838396
Epoch 7/20
val_f1: 0.0531807071722588 val_precision: 0.18713726329656818 val_recall 0.03605627142688261
Epoch 8/20
val_f1: 0.0445765896096374 val_precision: 0.2199921045340259 val_recall 0.026066910982385626
Epoch 9/20
val_f1: 0.043723281900070826 val_precision: 0.17071372859032496 val_recall 0.025889585057335383
Epoch 10/20
val_f1: 0.062067679285002286 val_precision: 0.1693481828395766 val_recall 0.04031209362808843
Epoch 11/20
val_f1: 0.05682149816813186 val_precision: 

<keras.callbacks.History at 0x1a21dab9b0>

In [51]:
#create logistic regression classifier
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [52]:
#fit classifier on training data
y_train = np.argmax(y_train, axis=1)
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [53]:
#test and evaluate classifier on test data
y_test = np.argmax(y_test, axis=1)

print("Accuracy Score: " + str(accuracy_score(y_test, clf.predict(X_test))))
print("Precision: " + str(precision_score(y_test, clf.predict(X_test), average='weighted')))
print("Recall: " + str(recall_score(y_test, clf.predict(X_test), average='weighted')))
print("F1 Score: " + str(f1_score(y_test, clf.predict(X_test), average='weighted')))

Accuracy Score: 0.2688852110178508
Precision: 0.1471210261457681
Recall: 0.2688852110178508
F1 Score: 0.15291740359424152


In [54]:
#create naive bayes classifier
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

In [55]:
#fit classifier on training data
nb.fit(X_train, y_train)

GaussianNB(priors=None)

In [71]:
#test and evaluate classifier on test data
print("Accuracy Score: " + str(accuracy_score(y_test, nb.predict(X_test))))
print("Precision: " + str(precision_score(y_test, nb.predict(X_test), average='weighted')))
print("Recall: " + str(recall_score(y_test, nb.predict(X_test), average='weighted')))
print("F1 Score: " + str(f1_score(y_test, nb.predict(X_test), average='weighted')))

Accuracy Score: 0.0074476888521101785
Precision: 0.17048801066935934
Recall: 0.0074476888521101785
F1 Score: 0.00730608205646312


#### Using Models to Predict Popularity Based on Overview

In [61]:
#split data into training and testing sets (60-40)
X_train, X_test, y_train2, y_test2 = train_test_split(overviews, popularities, test_size=0.4, random_state=0)

In [62]:
#create Tokenizer of at most 10000 words and train on all overviews
max_idx = 10000
t = Tokenizer(num_words=max_idx)

t.fit_on_texts(overviews)

In [63]:
#transform overviews to integer sequences using tokenizer
X_train = t.texts_to_sequences(X_train)
X_test = t.texts_to_sequences(X_test)

In [64]:
# Turn all training examples to length of 600 using padding
max_len = 600

X_train = sequence.pad_sequences(X_train, max_len)
X_test = sequence.pad_sequences(X_test, max_len)

print("new x_train shape is: ", X_train.shape)
print("new x_test shape is: ", X_test.shape)

new x_train shape is:  (25377, 600)
new x_test shape is:  (16918, 600)


In [65]:
#clean and adapt popularity scores
y_train2 = y_train2.astype(np.float)
y_test2 = y_test2.astype(np.float)
y_train2 = y_train2.apply(lambda x: np.log(x+1))
y_test2 = y_test2.apply(lambda x: np.log(x+1))

In [66]:
#scale popularity scores to 0-100 range
scaler = MinMaxScaler(feature_range=(0,100))
scaler.fit(y_train2.values.reshape(-1, 1))

y_train2 = scaler.transform(y_train2.values.reshape(-1, 1))
y_test2 = scaler.transform(y_test2.values.reshape(-1, 1))

In [67]:
# Create recurrent model for popularity
model = Sequential()

# Add embedding to words
model.add(Embedding(input_dim=max_idx+1, output_dim=64,
                    input_length=max_len))

# Add 1D Convolutional layer
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(Dropout(.15))
model.add(MaxPooling1D(pool_size=2))

# Add LSTM layer
model.add((LSTM(16, return_sequences=False, recurrent_dropout=.15)))
model.add(Dropout(.15))

# Add dense layer
model.add(Dense(32))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(.2))

# Add final dense layer layer
model.add(Dense(1))
model.add(Activation('linear'))

# Compile model with mse
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mae'])

In [68]:
#fit model to data
model.fit(X_train, y_train2, batch_size=512,
          epochs=7, verbose=1, validation_data=(X_test, y_test2))

Train on 25377 samples, validate on 16918 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x1a29d07c50>

In [69]:
# Create feed-forward model for popularity
model = Sequential()

# Create first dense layer
model.add(Dense(512, input_shape=(600,)))
model.add(Activation('relu'))

# Create second dense layer
model.add(Dense(512))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

# Create third dense layer
model.add(Dense(256))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

# Create final sigmoid layer
model.add(Dense(1))
model.add(Activation('linear'))

# Compile model
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mae'])

In [70]:
#fit model to data
model.fit(X_train, y_train2, batch_size=512,
          epochs=20, verbose=1, validation_data=(X_test, y_test2))

Train on 25377 samples, validate on 16918 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a2c21c5f8>