# Classifying genres of movie synopses

# 1. Set Up Environment

## Required Imports

In [1]:
import pandas as pd

import numpy as np

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.models import Sequential
from keras.layers import Activation, Dense, LSTM, Bidirectional, Embedding, TextVectorization, Input
from keras.optimizers import Adam

from random import shuffle

from time import time

from datetime import timedelta

from tqdm import tqdm




## Constants

In [2]:
# --- Data locations ---------------------------------------------------------
# Directory holding the CSV splits (and the GloVe file read by the helpers).
CWD = "./data"
TRAINING_DATA = 'Training-dataset.csv'
VALIDATION_DATA = 'Validation-dataset-genres.csv'
TEST_DATA = 'Test-dataset-genres.csv'

# --- Column names -----------------------------------------------------------
ID = 'ID'
TITLE = 'title'
PLOT_SYNOPSIS = 'plot_synopsis'

# One binary label column per genre tag.
COMEDY = 'comedy'
CULT = 'cult'
FLASHBACK = 'flashback'
HISTORICAL = 'historical'
MURDER = 'murder'
REVENGE = 'revenge'
ROMANTIC = 'romantic'
SCIFI = 'scifi'
VIOLENCE = 'violence'

# Order matters: prediction columns are written out in this order.
TAGS = [
    COMEDY,
    CULT,
    FLASHBACK,
    HISTORICAL,
    MURDER,
    REVENGE,
    ROMANTIC,
    SCIFI,
    VIOLENCE,
]
# Positional column labels (0..8), one per entry of TAGS.
INDEX_TAGS = list(range(len(TAGS)))

# --- Text vectorisation / embedding -----------------------------------------
OUTPUT_MODE = 'int'

# Minimum synopsis length (in characters) for a training row to be kept.
# The spelling of this name is left untouched because other cells reference it.
SYNOPSIS_LENGHT_THRESHOLD = 1000
MAXIMUM_SEQUENCE_LENGTH = 256
MAXIMUM_FEATURE_COUNT = 90000
EMBEDDING_SIZE = 300

# --- Training hyper-parameters ----------------------------------------------
EPOCHS = 10
LEARNING_RATE = 1e-3

# 2. Load in Data

## Training Data

In [3]:
# Stage 1: read the raw training split.
train_raw = pd.read_csv(f"{CWD}/{TRAINING_DATA}")
print(f'Training movie data count: {len(train_raw)}')

# Stage 2: discard rows with any missing value and renumber the index.
train_complete = train_raw.dropna().reset_index(drop=True)
print(f'Training movie data count after NaN check: {len(train_complete)}')

# Stage 3: keep only synopses that are at least the threshold length
# (measured in characters), then renumber the index again.
meets_threshold = train_complete[PLOT_SYNOPSIS].apply(
    lambda text: len(text) >= SYNOPSIS_LENGHT_THRESHOLD)
train_data = train_complete[meets_threshold].reset_index(drop=True)
print(f'Training movie data count after threshold check: {len(train_data)}')

train_data.head(10)

Training movie data count: 8257
Training movie data count after NaN check: 8257
Training movie data count after threshold check: 8188


Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,8f5203de-b2f8-4c0c-b0c1-835ba92422e9,Si wang ta,"After a recent amount of challenges, Billy Lo ...",0,0,0,0,1,1,0,0,1
1,6416fe15-6f8a-41d4-8a78-3e8f120781c7,Shattered Vengeance,"In the crime-ridden city of Tremont, renowned ...",0,0,0,0,1,1,1,0,1
2,4979fe9a-0518-41cc-b85f-f364c91053ca,L'esorciccio,Lankester Merrin is a veteran Catholic priest ...,0,1,0,0,0,0,0,0,0
3,b672850b-a1d9-44ed-9cff-025ee8b61e6f,Serendipity Through Seasons,"""Serendipity Through Seasons"" is a heartwarmin...",0,0,0,0,0,0,1,0,0
4,b4d8e8cc-a53e-48f8-be6a-6432b928a56d,The Liability,"Young and naive 19-year-old slacker, Adam (Jac...",0,0,1,0,0,0,0,0,0
5,2fbcdf4a-1c10-4958-a175-985d226f5906,Savage Vengance,Katie Carter (Dallender) is an aspiring model ...,0,0,0,0,1,0,0,0,0
6,60298c01-41d0-4e12-a203-b5a1feb78943,The Snake Pit,Virginia Cunningham (Olivia de Havilland) is a...,0,0,1,0,0,0,0,0,0
7,084f6cc3-e4e2-4f1a-bcbb-b26dbdd2762f,Shadows of Betrayal,In the dark and gritty underbelly of a sprawli...,0,0,0,0,1,0,0,0,1
8,a198118a-564e-42f8-8df2-0cbec828aa2f,Kakushi ken oni no tsume,"The film takes place in Japan in the 1860s, a ...",0,0,0,0,1,1,0,0,0
9,541bbc68-5628-43a3-9f83-49c7900c2e57,Intolerable Cruelty,Donovan Donaly (Geoffrey Rush) a TV soap opera...,1,0,0,0,0,1,1,0,0


## Validation Data

In [4]:
# Read the raw validation split and report its size.
val_raw = pd.read_csv(f"{CWD}/{VALIDATION_DATA}")
print(f'Vaidation movie data count: {len(val_raw)}')

# Discard rows with any missing value and renumber the index so that it is
# a clean 0..n-1 range.
val_data = val_raw.dropna().reset_index(drop=True)
print(f'Validation movie data after NaN check count: {len(val_data)}')

val_data.head(10)

Vaidation movie data count: 1188
Validation movie data after NaN check count: 1188


Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,cf32cb00-172d-40f2-a3c1-936e8a0d89d7,Shattered Hearts,"In the enchanting city of Verona, Italy, renow...",0,0,0,0,1,0,1,0,0
1,df7e125e-2d59-40e4-a126-9397e3a0ef21,Point Blank,Walker works with his friend Mal Reese to stea...,0,1,1,0,1,1,0,0,1
2,49bc73f3-9179-41cd-9774-905c7a3ac91b,Le charme discret de la bourgeoisie,The film consists of several thematically link...,1,0,1,0,0,0,0,0,0
3,0ed4822b-87af-44bc-a677-7f7abfdaccf3,A Gentleman's Dignity,A Gentleman's Dignity is about the careers and...,0,0,0,0,0,0,1,0,0
4,0b1b0fa4-43bc-41ba-9598-b3401894b96d,Carmen: A Hip Hopera,"Carmen Brown (Beyoncé) is a seductive, aspirin...",0,0,0,0,1,0,0,0,0
5,206f3326-2e4a-4003-b55b-1b8b364d05a2,Vals Im Bashir,"In 1982, Ari Folman was a 19-year-old IDF infa...",0,0,1,1,0,0,0,0,0
6,f4355adb-f934-40eb-a71c-60a5cef42f30,Blitz,The movie begins with Detective Sergeant (DS) ...,0,0,1,0,1,1,0,0,0
7,5768f628-38bc-44b4-9523-4f1dfbe19ead,Logan's Run,An opening title sets the scene:Sometime in th...,0,1,1,0,1,0,0,1,1
8,3295f678-a1c6-4a26-9c85-ecd792e6c16b,The Skulls,Luke McNamara (Joshua Jackson) is a student wi...,0,0,1,0,1,0,0,0,0
9,5c99d135-cd86-4e38-9d52-2924c50548ed,Flawless,The 1999 film features De Niro as Walter Koont...,0,0,0,0,1,0,1,0,1


## Test Data

In [5]:
# Read the raw (unlabelled) test split and report its size.
test_raw = pd.read_csv(f"{CWD}/{TEST_DATA}")
print(f'Test movie data count: {len(test_raw)}')

# Discard rows with any missing value and renumber the index so that it is
# a clean 0..n-1 range.
test_data = test_raw.dropna().reset_index(drop=True)
print(f'Test movie data after NaN check count: {len(test_data)}')

test_data.head(10)

Test movie data count: 1200
Test movie data after NaN check count: 1200


Unnamed: 0,ID,title,plot_synopsis
0,9484ac61-0e30-4799-9998-6f74f4cbb204,Standoff,"A young girl, Bird with her aunt's boyfriend w..."
1,55942d28-b6a2-4662-ab55-a66783a86a56,On Our Merry Way,Oliver Pease (Burgess Meredith) has deceived h...
2,b71ed317-04cd-42f5-a380-d21dfea2bd36,My Life Without Me,Ann (Sarah Polley) is a hard-working 23-year-o...
3,5689b1b2-88cd-4c22-9114-0850ba539280,Ben & Arthur,Ben (Jamie Brett Gabel) and Arthur (Sam Mraovi...
4,a0d9062e-f539-4043-bc9e-2a2ed589477b,Where the Sidewalk Ends,New York City 16th Precinct Police Detective D...
5,79da254c-b13a-4f82-8a0a-732c4d1bca5f,Estratto dagli archivi segreti della polizia d...,"After a day of sailing, a group of hippies—Jan..."
6,9787f4bc-96d4-4f9a-9fed-753434d23d6e,The Honeymoon Killers,The film opens by introducing Martha Beck (Shi...
7,4209b506-2b47-4430-a5f8-04c9218d5728,Quick Change,"Grimm, dressed as a clown, robs a bank in midt..."
8,bb4f8bc5-6058-45a7-baa3-5c6b7bc6d386,Halo: The Fall of Reach,=== Setting and characters ===\nThe Fall of Re...
9,10185a2f-dac9-4db7-8430-45f43dfdf8f0,Jab Tak Hai Jaan,"Samar Anand (Shahrukh Khan), a major in the In..."


## Extract relevant data




In [6]:
# Split every frame into model inputs (the synopsis text) and, where labels
# exist, the per-tag target columns. The test frame has no label columns.
X_train, y_train = train_data[PLOT_SYNOPSIS], train_data[TAGS]
X_val, y_val = val_data[PLOT_SYNOPSIS], val_data[TAGS]
X_test = test_data[PLOT_SYNOPSIS]

## Helper Functions

In [7]:
def get_vectorization_layer(texts):
  """
  Build a TextVectorization layer whose vocabulary is learned from texts.

  The layer is configured from the notebook constants: the vocabulary is
  capped at MAXIMUM_FEATURE_COUNT tokens, the output mode is OUTPUT_MODE and
  each output sequence has length MAXIMUM_SEQUENCE_LENGTH.

  Parameters
  ----------
  texts : collection of str
      The documents the vocabulary is adapted on (this notebook passes a
      plain list of synopsis strings).

  Returns
  -------
  keras.layers.TextVectorization
      The adapted layer, ready to turn raw strings into index sequences.
  """
  layer_options = dict(
    max_tokens=MAXIMUM_FEATURE_COUNT,
    output_mode=OUTPUT_MODE,
    output_sequence_length=MAXIMUM_SEQUENCE_LENGTH,
  )
  text_to_indices = TextVectorization(**layer_options)

  # Learn the token -> index mapping from the supplied documents.
  text_to_indices.adapt(texts)
  return text_to_indices

In [8]:
def get_embedding_dictionary(glove_path=None):
  """
  Create an embedding dictionary from a pre-trained GloVe text file.

  Each non-empty line of the file is split on whitespace: the first field is
  the word and the remaining fields are parsed as its float32 vector.

  Parameters
  ----------
  glove_path : str or path-like, optional
      Location of the embedding file. When omitted, the 300-dimensional
      glove.6B file inside the notebook's data directory (CWD) is used.

  Returns
  -------
  dict of str to np.ndarray
      A mapping for words to their pre-trained embeddings.

  Raises
  ------
  OSError
      If the embedding file cannot be opened.
  ValueError
      If a vector component cannot be parsed as a float.
  """
  if glove_path is None:
    # Embeddings can be found here https://nlp.stanford.edu/projects/glove/
    # under the glove.6B embeddings
    glove_path = f"{CWD}/glove.6B.300d.txt"

  embeddings_dictionary = {}
  # The context manager guarantees the handle is closed even when a line
  # fails to parse.
  with open(glove_path, encoding="utf8") as glove_file:
    for line in glove_file:
      records = line.split()
      if not records:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      embeddings_dictionary[records[0]] = np.asarray(records[1:],
                                                     dtype='float32')
  return embeddings_dictionary


def get_embedding_matrix(text_vectorizer):
  """
  Create an embedding matrix using a vectorisation layer and an embedding
  dictionary. Row i of the matrix holds the pre-trained vector of the i-th
  vocabulary token of the vectorisation layer; tokens that have no
  pre-trained vector keep an all-zero row.

  Parameters
  ----------
  text_vectorizer : keras.layers.TextVectorization
      A text vectorizer that contains information about which tokens map
      to which indices and the vocabulary of the system.

  Returns
  -------
  numpy.ndarray
      Array of shape (num_words, EMBEDDING_SIZE), where num_words is
      min(MAXIMUM_FEATURE_COUNT, vocabulary size + 1).
  """
  embeddings_dictionary = get_embedding_dictionary()
  # Fetch the vocabulary once; it is needed for both sizing and filling.
  vocabulary = text_vectorizer.get_vocabulary()
  num_words = min(MAXIMUM_FEATURE_COUNT, len(vocabulary) + 1)
  embedding_matrix = np.zeros((num_words, EMBEDDING_SIZE))

  # Only the first MAXIMUM_FEATURE_COUNT tokens can have a row in the matrix.
  for idx, word in enumerate(vocabulary[:MAXIMUM_FEATURE_COUNT]):
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
      embedding_matrix[idx] = embedding_vector
    # Otherwise the word has no pre-trained vector: leave the zero row.

  return embedding_matrix

# 3. Implement Methods

## METHOD A) Traditional Classification Method
### SVM (Support Vector Machine with a Linear Kernel on TF-IDF (Term Frequency–Inverse Document Frequency) Features)

### Define Model Architecture

In [9]:
# Feature extraction: word-level unigram TF-IDF over the raw synopsis text.
tfidf_vectorizer = TfidfVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='strict',
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    analyzer='word',
    stop_words=None,
    token_pattern=r"(?u)\b\w\w+\b",
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=80000,
    vocabulary=None,
    binary=False,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

# Classifier: linear SVM with hinge loss and balanced class weights.
linear_svc = LinearSVC(
    penalty='l2',
    loss='hinge',
    dual=True,
    tol=1e-4,
    C=1.0,
    multi_class='ovr',
    fit_intercept=True,
    intercept_scaling=1.0,
    class_weight='balanced',
    verbose=0,
    random_state=None,
    max_iter=1000,
)

# Full pipeline: raw text -> TF-IDF -> one-vs-rest linear SVM.
svm = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('clf', OneVsRestClassifier(linear_svc)),
])

### Train and Run Model

In [None]:
def _format_elapsed(seconds):
    """Render seconds via timedelta and keep the text after the first ':'."""
    rendered = str(timedelta(seconds=seconds))
    return rendered[rendered.find(':') + 1:]


# Each result list starts with the document IDs; one prediction column is
# appended per tag inside the loop.
results_val, results_test = [val_data[ID]], [test_data[ID]]
train_times, val_predict_times, test_predict_times = [], [], []

print('Training on classes...')
for tag in tqdm(TAGS):
    # Re-fit the same pipeline object on this tag's binary labels.
    tick = time()
    svm.fit(X_train, y_train[tag])
    train_times.append(time() - tick)

    # Make a prediction for the validation data
    tick = time()
    result_val = svm.predict(X_val)
    val_predict_times.append(time() - tick)
    results_val.append(pd.Series(result_val))

    # Make a prediction for the test data
    tick = time()
    result_test = svm.predict(X_test)
    test_predict_times.append(time() - tick)
    results_test.append(pd.Series(result_test))

# Stitch the ID column and the per-tag predictions side by side and label the
# prediction columns with their positional tag index.
predictions_val = pd.concat(results_val, axis=1)
predictions_val.columns = [ID, *INDEX_TAGS]

predictions_test = pd.concat(results_test, axis=1)
predictions_test.columns = [ID, *INDEX_TAGS]

# Total time per phase, summed over all tags.
train_time = sum(train_times)
val_time = sum(val_predict_times)
test_time = sum(test_predict_times)

# Calculate elapsed time
print("")
print(f"Training Time: {_format_elapsed(train_time)}")
print(f"Validation Time: {_format_elapsed(val_time)}")
print(f"Testing Time: {_format_elapsed(test_time)}")

### Save Results

In [11]:
# Write the SVM validation predictions without a header row or index column.
# `header=False` is the documented way to omit the header.
predictions_val.to_csv('svm_predictions_validation.csv', header=False, index=False, encoding="utf8")

In [12]:
# Write the SVM test predictions without a header row or index column.
# `header=False` is the documented way to omit the header; the encoding is
# stated explicitly so it does not depend on the default.
predictions_test.to_csv('svm_predictions_test.csv', header=False, index=False, encoding="utf8")

In [None]:
# Run the evaluation script on the saved SVM validation predictions, passing
# the validation dataset file as the second argument.
!python3 scripts/genres_similarity_eval_script.py svm_predictions_validation.csv data/Validation-dataset-genres.csv

## METHOD B) Traditional Deep Learning Method
### Bi-LSTM (Bidirectional Long Short-Term Memory Neural Network)

### Prepare Data

In [14]:
# Pool the synopses of all three splits into one flat list, in the order
# train, validation, test.
all_texts = [*X_train.values, *X_val.values, *X_test.values]

### Create Input Layers

Create vectorization layer

In [None]:
# Adapt the text vectorizer on the pooled synopses.
VECTORIZATION_LAYER = get_vectorization_layer(texts=all_texts)

Create embedding layer

In [None]:
# Look up a pre-trained vector for every token the vectorizer knows.
EMBEDDING_MATRIX = get_embedding_matrix(text_vectorizer=VECTORIZATION_LAYER)

### Define Model Architecture

Design neural network architecture

In [None]:
# Create model
model = Sequential()

# Raw strings -> fixed-length sequences of token indices.
model.add(VECTORIZATION_LAYER)

# Embedding Layer
# The number of rows is read from the matrix itself: the matrix may have
# fewer than MAXIMUM_FEATURE_COUNT rows when the adapted vocabulary is small,
# and the layer's weight shape must match it exactly.
model.add(Embedding(EMBEDDING_MATRIX.shape[0],
                    EMBEDDING_SIZE,
                    weights=[EMBEDDING_MATRIX],
                    trainable=False))

# Bidirectional LSTM Layer
# NOTE(review): the LSTM unit count reuses MAXIMUM_SEQUENCE_LENGTH, so changing
# the sequence length also changes the layer width - confirm this is intended.
model.add(Bidirectional(LSTM(MAXIMUM_SEQUENCE_LENGTH,
                             activation='tanh')))

# Dense connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))

# Final layer: nine sigmoid units giving an independent probability per output.
model.add(Dense(9, activation='sigmoid'))

Compile model

In [None]:
# Binary cross-entropy treats each of the sigmoid outputs as its own
# yes/no decision. The optimizer is built explicitly so that the notebook's
# LEARNING_RATE constant actually controls training.
model.compile(
    loss='BinaryCrossentropy',
    optimizer=Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" line).
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_6 (Text  (None, 256)               0         
 Vectorization)                                                  
                                                                 
 embedding_10 (Embedding)    (None, 256, 300)          27000000  
                                                                 
 bidirectional_10 (Bidirect  (None, 512)               1140736   
 ional)                                                          
                                                                 
 dense_40 (Dense)            (None, 128)               65664     
                                                                 
 dense_41 (Dense)            (None, 256)               33024     
                                                                 
 dense_42 (Dense)            (None, 128)             

### Run model

Train model

In [None]:
# Time the whole training run with wall-clock timestamps.
start_time = time()
history = model.fit(
    x=X_train,
    y=y_train.values,
    validation_split=0.2,
    epochs=8,
    verbose=1,
)
end_time = time()

# Calculate elapsed time: keep the text after the first ':' of the timedelta.
elapsed_time = str(timedelta(seconds=end_time - start_time))
minutes_and_seconds = elapsed_time[elapsed_time.find(':') + 1:]
print(f"Elapsed Time: {minutes_and_seconds}")

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Elapsed Time: 01:14.750939


Make predictions

In [None]:
def _timed_predictions(ids, synopses):
    """
    Predict labels for synopses and attach them to ids.

    Returns the raw model outputs, the 0/1 labels (outputs above 0.5 become
    1), the ID+labels table and the seconds spent producing all three.
    """
    began = time()
    scores = model.predict(synopses)
    classes = pd.DataFrame((scores > 0.5).astype(int))
    table = pd.concat([ids, classes], axis=1)
    return scores, classes, table, time() - began


# Validation split
doc_ids = val_data['ID']
(predicted_values_val,
 predicted_classes_val,
 predictions_val,
 val_seconds) = _timed_predictions(doc_ids, X_val)

# Calculate elapsed time
elapsed_time = str(timedelta(seconds=val_seconds))
print(f"Validation Time: {elapsed_time[elapsed_time.find(':') + 1:]}")

# Test split
doc_ids = test_data['ID']
(predicted_values_test,
 predicted_classes_test,
 predictions_test,
 test_seconds) = _timed_predictions(doc_ids, X_test)

# Calculate elapsed time
elapsed_time = str(timedelta(seconds=test_seconds))
print(f"Test Time: {elapsed_time[elapsed_time.find(':') + 1:]}")

Validation Time: 00:00.907010
Test Time: 00:00.721177


### Save results

In [None]:
# Write the Bi-LSTM validation predictions without a header row or index column.
predictions_val.to_csv(
    'lstm_predictions_validation.csv',
    index=False,
    header=False,
)

In [None]:
# Write the Bi-LSTM test predictions without a header row or index column.
# The file name uses the "lstm_" prefix, matching the validation output file
# (it was previously misspelled "lstn_").
predictions_test.to_csv('lstm_predictions_test.csv', header=False, index=False)

In [None]:
# Run the evaluation script on the saved Bi-LSTM validation predictions,
# passing the validation dataset file as the second argument.
!python3 scripts/genres_similarity_eval_script.py lstm_predictions_validation.csv data/Validation-dataset-genres.csv

Class level: 
Class  1 precision: 0.3171 recall: 0.2229
Class  2 precision: 0.4089 recall: 0.3725
Class  3 precision: 0.3604 recall: 0.3469
Class  4 precision: 0.0000 recall: 0.0000
Class  5 precision: 0.6203 recall: 0.6523
Class  6 precision: 0.2872 recall: 0.1139
Class  7 precision: 0.5361 recall: 0.3586
Class  8 precision: 0.5000 recall: 0.0968
Class  9 precision: 0.5346 recall: 0.5524
----------------------------
Movie (document) level: 
Precision: 0.4853
Recall: 0.4566
