In [87]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from keras import Sequential, Input
from keras.layers import TextVectorization, LSTM, Embedding, Dense, SpatialDropout1D
from keras.losses import BinaryCrossentropy
from keras.metrics import F1Score, BinaryAccuracy

In [88]:
dataset = pd.read_csv('data/Training-dataset.csv')

In [89]:
dataset.sum(numeric_only=True)

comedy        1262
cult          1801
flashback     1994
historical     186
murder        4019
revenge       1657
romantic      2006
scifi          204
violence      3064
dtype: int64

In [90]:
dataset[dataset.columns[3:]].to_numpy()

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 1, ..., 1, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [91]:
X_train = dataset['plot_synopsis']
y_train = dataset[dataset.columns[3:]].to_numpy()

###Method A

In [92]:
# create pipeline
NB_clf = Pipeline([('vect', CountVectorizer()),
                   ('NB', MultiOutputClassifier(ComplementNB(alpha=1, norm=False)))])

# train model
NB_clf.fit(X_train, y_train)

In [93]:
def get_results(inputf, outputf, model, run_eval=False, threshold=None):
  # read in validation/test dataset
  data = pd.read_csv(inputf)

  # get predictions
  X_test = data['plot_synopsis']
  predictions = model.predict(X_test)

  # to 0s and 1s
  if threshold:
    predictions = (predictions > threshold).astype(int)

  # create results dataframe and write it to csv
  results = pd.concat([data['ID'], pd.DataFrame(predictions)], axis=1)
  results.to_csv(outputf, index=False, header=False)

  # run evaluation script
  if run_eval:
    !python data/task2_eval_script_student_version.py {outputf} {inputf}

In [94]:
get_results('data/Task-2-validation-dataset.csv', 'data/10491450-Task2-method-a-validation.csv', NB_clf, run_eval=True)

Class level: 
Class  1 precision: 0.3624 recall: 0.3086
Class  2 precision: 0.4341 recall: 0.4534
Class  3 precision: 0.3913 recall: 0.3367
Class  4 precision: 0.2222 recall: 0.0833
Class  5 precision: 0.6875 recall: 0.7194
Class  6 precision: 0.3886 recall: 0.3165
Class  7 precision: 0.5583 recall: 0.6276
Class  8 precision: 0.5000 recall: 0.1290
Class  9 precision: 0.6021 recall: 0.6810
----------------------------
Movie (document) level: 
Precision: 0.5332
Recall: 0.5694


###Method B

In [95]:
# calculate class weights
class_freqs = dataset.sum(numeric_only=True).to_numpy()
class_weights = (1/class_freqs) * (class_freqs.sum()/2)
class_weights_dict = dict(enumerate(class_weights))
class_weights_dict

{0: 6.4156101426307455,
 1: 4.495558023320378,
 2: 4.060431293881645,
 3: 43.52956989247312,
 4: 2.014555859666584,
 5: 4.886240193120097,
 6: 4.036141575274177,
 7: 39.68872549019608,
 8: 2.6424608355091386}

In [96]:
# get bias initialiser
def bias_init(shape, dtype=None):
  output_bias = np.log(class_freqs/(class_freqs.sum() - class_freqs))
  return output_bias

In [97]:
# create vectorise layer for text input
vectorize_layer = TextVectorization(max_tokens=50000, output_mode="int")
vectorize_layer.adapt(X_train)

In [98]:
len(vectorize_layer.get_vocabulary())

50000

In [99]:
# build model
LSTM_clf = Sequential()
LSTM_clf.add(Input(shape=(1), dtype=tf.string))
LSTM_clf.add(vectorize_layer)
LSTM_clf.add(Embedding(input_dim=len(vectorize_layer.get_vocabulary()), output_dim=64))
LSTM_clf.add(SpatialDropout1D(0.2))
LSTM_clf.add(LSTM(64, dropout=0.2))
LSTM_clf.add(Dense(9, activation="sigmoid", bias_initializer=bias_init))

In [100]:
LSTM_clf.compile(loss=BinaryCrossentropy(), optimizer='adam', metrics=[F1Score(threshold=0.25), BinaryAccuracy(threshold=0.25)])

In [101]:
validation_data = pd.read_csv('data/Task-2-validation-dataset.csv')
X_validate = validation_data['plot_synopsis']
y_validate = validation_data[validation_data.columns[3:]].to_numpy()

In [102]:
LSTM_clf.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_4 (Text  (None, None)              0         
 Vectorization)                                                  
                                                                 
 embedding_5 (Embedding)     (None, None, 64)          3200000   
                                                                 
 spatial_dropout1d_2 (Spati  (None, None, 64)          0         
 alDropout1D)                                                    
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_5 (Dense)             (None, 9)                 585       
                                                                 
Total params: 3233609 (12.34 MB)
Trainable params: 323

In [103]:
history = LSTM_clf.fit(X_train, tf.cast(y_train, tf.float32), validation_data=(X_validate, tf.cast(y_validate, tf.float32)), class_weight=class_weights_dict)



In [104]:
get_results('data/Task-2-validation-dataset.csv', 'data/10491450-Task2-method-b-validation.csv', LSTM_clf, run_eval=True, threshold=0.25)

Class level: 
Class  1 precision: 0.0000 recall: 0.0000
Class  2 precision: 0.0000 recall: 0.0000
Class  3 precision: 0.0000 recall: 0.0000
Class  4 precision: 0.0000 recall: 0.0000
Class  5 precision: 0.4891 recall: 1.0000
Class  6 precision: 0.0000 recall: 0.0000
Class  7 precision: 0.0000 recall: 0.0000
Class  8 precision: 0.0000 recall: 0.0000
Class  9 precision: 0.3535 recall: 1.0000
----------------------------
Movie (document) level: 
Precision: 0.4213
Recall: 0.4508


###Results

In [105]:
get_results("data/Task-2-test-dataset2.csv", "data/10491450-Task2-method-a.csv", NB_clf)

In [106]:
get_results("data/Task-2-test-dataset2.csv", "data/10491450-Task2-method-b.csv", LSTM_clf, threshold=0.25)

