## Task 2 - OPTION A - Naïve Bayes classifier

In [69]:
#a) Developing a “traditional” classification method (Naïve Bayes classifier or SVM)
import time
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
#using "OneVsRest" strategy to extend Naive Bayes to handle multiple labels

train_df = pd.read_csv('./data/Training-dataset.csv')
val_df = pd.read_csv('./data/Task-2-validation-dataset.csv')
test_df = pd.read_csv('./data/Task-2-test-dataset1.csv')
train_df['all_text'] = train_df['title'] + ' ' + train_df['plot_synopsis']
val_df['all_text'] = val_df['title'] + ' ' + val_df['plot_synopsis']
test_df['all_text'] = test_df['title'] + ' ' + test_df['plot_synopsis']

In [70]:
#TF-IDF vectorizer converts all_text -> numerical features
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = vectorizer.fit_transform(train_df['all_text'])
X_val = vectorizer.transform(val_df['all_text'])
X_test = vectorizer.transform(test_df['all_text'])

classifier = OneVsRestClassifier(MultinomialNB())

In [73]:
# label columns -> integers
label_columns = train_df.columns[3:12]
train_df[label_columns] = train_df[label_columns].astype(int)
val_df[label_columns] = val_df[label_columns].astype(int)

y_train = train_df[label_columns].values.astype(int)

start = time.time()
classifier.fit(X_train, y_train)
end = time.time()
print(f"{end-start}s to train data")

# y_pred_test = classifier.predict(X_test)

0.13539481163024902s to train data


# Validation

In [74]:
prediction_df = '10864332-Task2-method-a-validation.csv'
result_df = pd.DataFrame(columns=['doc_id', 'comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence'])
labels = []
start = time.time()
for index, row in val_df.iterrows():
  doc_id = row['ID']
  all_text = row['all_text']
  X_doc = vectorizer.transform([all_text])
  y_pred = classifier.predict(X_doc)

  if all(label == 0 for label in y_pred[0]):
      y_pred[0][random.randint(0, 8)] = 1

  labels.append({
      'doc_id': doc_id,
      'comedy': y_pred[0][0],
      'cult': y_pred[0][1],
      'flashback': y_pred[0][2],
      'historical': y_pred[0][3],
      'murder': y_pred[0][4],
      'revenge': y_pred[0][5],
      'romantic': y_pred[0][6],
      'scifi': y_pred[0][7],
      'violence': y_pred[0][8]
    })
end = time.time()
print(f"{end-start}s to classify development data")
result_df = pd.DataFrame(labels)
result_df.to_csv(prediction_df, index=False, header=False)

3.9126155376434326s to classify development data


In [75]:
!python task2_eval_script_student_version.py 10864332-Task2-method-a-validation.csv ./data/Task-2-validation-dataset.csv

Class level: 
Class  1 precision: 0.2407 recall: 0.0743
Class  2 precision: 0.4118 recall: 0.1134
Class  3 precision: 0.4032 recall: 0.0850
Class  4 precision: 0.0208 recall: 0.0417
Class  5 precision: 0.6667 recall: 0.6919
Class  6 precision: 0.2115 recall: 0.0464
Class  7 precision: 0.5957 recall: 0.2897
Class  8 precision: 0.0857 recall: 0.0968
Class  9 precision: 0.6195 recall: 0.5000
----------------------------
Movie (document) level: 
Precision: 0.5309
Recall: 0.3757


# Test

In [76]:
prediction_df = '10864332-Task2-method-a.csv'
test_result_df = pd.DataFrame(columns=['doc_id', 'comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence'])
labels = []
start = time.time()
for index, row in test_df.iterrows():
  doc_id = row['ID']
  all_text = row['all_text']
  X_doc = vectorizer.transform([all_text])
  y_pred_test = classifier.predict(X_doc)

  if all(label == 0 for label in y_pred_test[0]):
      y_pred_test[0][random.randint(0, 8)] = 1

  labels.append({
      'doc_id': doc_id,
      'comedy': y_pred_test[0][0],
      'cult': y_pred_test[0][1],
      'flashback': y_pred_test[0][2],
      'historical': y_pred_test[0][3],
      'murder': y_pred_test[0][4],
      'revenge': y_pred_test[0][5],
      'romantic': y_pred_test[0][6],
      'scifi': y_pred_test[0][7],
      'violence': y_pred_test[0][8]
    })
end = time.time()
print(f"{end-start}s to classify test data")

test_result_df = pd.DataFrame(labels)
test_result_df.to_csv(prediction_df, index=False, header=False)

4.10150933265686s to classify test data


In [13]:
# !python task2_eval_script_student_version.py 10864332-Task-2-method-a.csv ./data/Task-2-test-dataset1.csv

# OPTION B

In [None]:
#b) develop a “traditional” deep learning method (either LSTM or bi-directional LSTM);
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

train_df = pd.read_csv('./data/Training-dataset.csv')
val_df = pd.read_csv('./data/Task-2-validation-dataset.csv')
test_df = pd.read_csv('./data/Task-2-test-dataset1.csv')

train_df['text'] = train_df['title'] + ' ' + train_df['plot_synopsis']
val_df['text'] = val_df['title'] + ' ' + val_df['plot_synopsis']
test_df['text'] = test_df['title'] + ' ' + test_df['plot_synopsis']

labels = train_df.columns[3:12].values

In [78]:
#tokenize
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['text'])
train_sequences = tokenizer.texts_to_sequences(train_df['text'])
val_sequences = tokenizer.texts_to_sequences(val_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])
#psd
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
val_padded = pad_sequences(val_sequences, maxlen=max_len, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

In [None]:
# Convert labels to multi-label binary format
mlb = MultiLabelBinarizer(classes=labels)
train_labels = mlb.fit_transform(train_df[labels].apply(lambda row: row.astype(str), axis=1).values)
val_labels = mlb.transform(val_df[labels].apply(lambda row: row.astype(str), axis=1).values)

In [None]:
# Build the LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=max_words,
        output_dim=64,
        input_length=max_len,
        mask_zero=True
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(32)
    ),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(len(labels), activation='sigmoid')
])

# model.summary()

compile
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

start = time.time()
#train model
history = model.fit(
    train_padded, train_labels,
    epochs=5,
    validation_data=(val_padded, val_labels),
    verbose=2
)
end = time.time()
print(f"{end-start}s to train")

In [48]:
# train_df.columns[3:12].values == ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']

# Validation

In [None]:
predictions = []
prediction_df = '10864332-Task2-method-b-validation.csv'
start = time.time()
for index, row in val_df.iterrows():
    text = row['text']
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

    #prediction for each label
    prediction = model.predict(padded_sequence)[0]

    #dynamic threshold based on max predicted probabillity
    pred = (prediction > (max(prediction)*0.5)).astype(int)

    if all(label == 0 for label in pred):
      prediction[0][random.randint(0, 8)] = 1

    result = {
        'doc_id': row['ID'],
        'comedy': pred[0],
        'cult': pred[1],
        'flashback': pred[2],
        'historical': pred[3],
        'murder': pred[4],
        'revenge': pred[5],
        'romantic': pred[6],
        'scifi': pred[7],
        'violence': pred[8]
    }
    predictions.append(result)
end = time.time()
print(f"{end-start}s to class")

result_df = pd.DataFrame(predictions)
result_df.to_csv(prediction_df, index=False, header=False)




In [None]:
!python task2_eval_script_student_version.py 10864332-Task2-method-b-validation.csv ./data/Task-2-validation-dataset.csv

# Text


In [None]:
predictions = []
prediction_df = '10864332-Task2-method-b.csv'
start = time.time()
for index, row in test_df.iterrows():
    text = row['text']
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

    #prediction for each label
    prediction = model.predict(padded_sequence)[0]

    pred = (prediction > (max(prediction)*0.5)).astype(int)

    if all(label == 0 for label in pred):
      prediction[0][random.randint(0, 8)] = 1

    result = {
        'doc_id': row['ID'],
        'comedy': pred[0],
        'cult': pred[1],
        'flashback': pred[2],
        'historical': pred[3],
        'murder': pred[4],
        'revenge': pred[5],
        'romantic': pred[6],
        'scifi': pred[7],
        'violence': pred[8]
    }

    predictions.append(result)
end = time.time()
print(end-start + "s = classify all test data")
test_result_df = pd.DataFrame(predictions)
test_result_df.to_csv(prediction_df, index=False, header=False)


In [2]:
#SUBMIT TEST SET &&&&&&&&&&&&&&&&&&&&&&&&&&& VALIDATION SET