# TASK 2

### Import Libraries and Download Packages

In [25]:
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import multilabel_confusion_matrix
from scipy.sparse import csr_matrix

from nltk import word_tokenize, download
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import tensorflow as tf
import pandas as pd
import numpy as np

In [26]:
# Download the "stopwords", "punkt" and "wordnet" NLTK packages
# (the preprocessing function below calls stopwords.words and WordNetLemmatizer)
download("stopwords")
download("punkt")
download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Data Initialisation

In [33]:
# Input locations (edit these constants if the files live elsewhere)
TRAIN_CSV = "./data/Training-dataset.csv"
# Alternative evaluation file: "./data/Task-2-validation-dataset.csv"
TEST_CSV = "./data/Task-2-test-dataset1.csv"

# Load both files into DataFrames
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

# Training inputs: the synopsis text, and every column from the fourth onwards as labels
documents = train_data["plot_synopsis"].to_numpy()
labels = train_data.iloc[:, 3:].to_numpy()

# Test inputs, sliced the same way; the IDs are kept for the output files.
# NOTE: `id` shadows the Python builtin, but later cells read this exact name.
id = test_data["ID"].to_numpy()
test_documents = test_data["plot_synopsis"].to_numpy()
test_labels = test_data.iloc[:, 3:].to_numpy()

### Text Preprocessing Function

In [13]:
def _english_stop_words():
  """Return the English stop-word set, building it only on the first call."""
  cached = getattr(_english_stop_words, "_cache", None)
  if cached is None:
    cached = frozenset(stopwords.words("english"))
    _english_stop_words._cache = cached
  return cached


def text_preprocessing(document,
                       caseFolding=False,
                       removeStopwords=False,
                       useLemmatizer=False,
                       useStemmer=False):
  """Tokenize a document and apply the selected normalization steps.

  The enabled steps run in a fixed order: case folding, stop-word removal,
  lemmatization, stemming.

  Args:
    document: Text to preprocess.
    caseFolding: If True, lowercase every token.
    removeStopwords: If True, drop tokens whose lowercase form is an
      English stop word.
    useLemmatizer: If True, lemmatize each token with WordNetLemmatizer.
    useStemmer: If True, stem each token with PorterStemmer.

  Returns:
    The remaining tokens joined by single spaces.
  """
  # Tokenization
  tokens = word_tokenize(document)

  # Case folding - convert every token to lowercase
  if caseFolding:
    tokens = [t.lower() for t in tokens]

  # Stop-words removal (the set is cached so it is not rebuilt per document)
  if removeStopwords:
    stop_words = _english_stop_words()
    tokens = [t for t in tokens if t.lower() not in stop_words]

  # Lemmatization
  if useLemmatizer:
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

  # Stemming
  if useStemmer:
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens]

  return " ".join(tokens)

## a) Support Vector Machine (SVM)


### Words Preprocessing

In [14]:
# Clean every training synopsis: stop-word removal only, original casing kept
# Estimated time : 40-90s

preprocessed_docs = [
    text_preprocessing(doc, caseFolding=False, removeStopwords=True)
    for doc in documents
]

### Model Implementation

In [15]:
# n-gram order of the bag-of-words features (1 = unigram, changeable)
n = 1
ngram_bounds = (n, n)

# Turn the preprocessed text into a document-term count matrix
vectorizer = CountVectorizer(ngram_range=ngram_bounds)
# vectorizer = TfidfVectorizer(ngram_range=ngram_bounds)
matrixBoW = vectorizer.fit_transform(preprocessed_docs)

matrixBoW

<8257x90193 sparse matrix of type '<class 'numpy.int64'>'
	with 2430560 stored elements in Compressed Sparse Row format>

In [16]:
# Show the dimensions of the training label matrix
labels.shape

(8257, 9)

In [17]:
# Estimated Time : 15-20 minutes
# `vectorizer` and `matrixBoW` are the ones built in the vectorization cell
# above; fitting an identical CountVectorizer again here only repeated that work.

# Every vectorized document needs exactly one row of labels
assert matrixBoW.shape[0] == labels.shape[0], "documents and labels are misaligned"

# Model
svm = SVC(kernel='linear')

# Fit one classifier per target for multi-label classification
classifier = MultiOutputClassifier(svm, n_jobs=-1).fit(matrixBoW, labels)

### Classification Prediction

Test Document Preprocessing

In [21]:
# Same cleaning as the training synopses: stop-word removal only
preprocessed_test_docs = [
    text_preprocessing(doc, caseFolding=False, removeStopwords=True)
    for doc in test_documents
]

Vectorization of Preprocessed Test Documents

In [22]:
# Represent raw data in numerical form, reusing the vectorizer that was fitted
# on the training documents (transform only, no re-fitting here)
test_matrixBoW = vectorizer.transform(preprocessed_test_docs)

# Predicts test documents' labels
# Estimated time : 90 - 120 seconds
prediction = classifier.predict(test_matrixBoW)

In [204]:
# Output column order: the ID followed by one column per label
result_columns = ["ID", "comedy", "cult", "flashback", "historical",
                  "murder", "revenge", "romantic", "scifi", "violence"]

# Put the test IDs in front of the predicted labels; object dtype lets the
# ID values and the predictions share one array
converted_prediction = prediction.astype(object)
combined_result = np.column_stack((id, converted_prediction))

# Build the table, write it without header or index, then preview it
result_df = pd.DataFrame(combined_result, columns=result_columns)
result_df.to_csv("10812451-Task2-method-a.csv", index=False, header=False)
result_df.head()

Unnamed: 0,ID,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,9484ac61-0e30-4799-9998-6f74f4cbb204,0,0,0,0,0,0,0,0,1
1,55942d28-b6a2-4662-ab55-a66783a86a56,0,0,0,0,0,0,0,0,0
2,b71ed317-04cd-42f5-a380-d21dfea2bd36,0,0,0,0,1,0,1,0,0
3,5689b1b2-88cd-4c22-9114-0850ba539280,1,1,1,0,0,0,0,0,0
4,a0d9062e-f539-4043-bc9e-2a2ed589477b,0,0,0,0,1,0,0,0,0


## b) Bi-directional LSTM

### Words Preprocessing and Encoding

In [28]:
# Clean the training synopses: stop-word removal plus lemmatization
# Estimated time : 40-90s

preprocessed_docs = [
    text_preprocessing(doc,
                       caseFolding=False,
                       removeStopwords=True,
                       useLemmatizer=True)
    for doc in documents
]

# Number of output labels, taken from the width of the label matrix
label_size = labels.shape[1]

# Text-to-integer encoder with an uncapped vocabulary, learned from the cleaned text
# NOTE(review): the vocabulary comes from `preprocessed_docs`, but the later
# training and prediction cells pass raw text to the model - confirm this is intended.
encoder = tf.keras.layers.TextVectorization(max_tokens=None,
                                            output_mode='int')
encoder.adapt(preprocessed_docs)

# Vocabulary, its size, and the integer-encoded documents
vocabulary = np.array(encoder.get_vocabulary())
vocab_size = encoder.vocabulary_size()
encoded_docs = encoder(preprocessed_docs)

In [29]:
# Print each inspection item under its caption
inspection_items = (
    ("First 20 word in vocabulary:\n", vocabulary[:20]),
    ("\nVocabulary size:", vocab_size),
    ("\nFirst 20 encoded documents:\n", encoded_docs[:20]),
    ("\nNumber of labels:", label_size),
)
for caption, value in inspection_items:
    print(caption, value)

First 20 word in vocabulary:
 ['' '[UNK]' 's' 'tell' 'go' 'find' 'one' 'get' 'take' 'back' 'see' 'two'
 'man' 'time' 'nt' 'kill' 'house' 'life' 'father' 'make']

Vocabulary size: 120366

First 20 encoded documents:
 tf.Tensor(
[[ 1826  1913  1077 ...     0     0     0]
 [ 7558    90 31367 ...     0     0     0]
 [90584  7605  2825 ...     0     0     0]
 ...
 [77192  9524 13337 ...     0     0     0]
 [ 1192   594     9 ...     0     0     0]
 [   58   329   751 ...     0     0     0]], shape=(20, 2788), dtype=int64)

Number of labels: 9


### Model Implementation

In [30]:
# Layers of the classifier, created in the order the data flows through them

# Token ids -> 64-dimensional vectors; id 0 is treated as padding (mask_zero=True)
embedding_layer = tf.keras.layers.Embedding(
    input_dim=encoder.vocabulary_size(),
    output_dim=64,
    mask_zero=True)

# LSTM with 64 units, wrapped so the sequence is read in both directions
bilstm_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))

# Dense head: one hidden layer, then one sigmoid output per label
hidden_layer = tf.keras.layers.Dense(64, activation='relu', name="hidden_layer")
output_layer = tf.keras.layers.Dense(label_size, activation='sigmoid', name='output')

# The text encoder is the first layer, so the model takes strings as input
model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    bilstm_layer,
    hidden_layer,
    output_layer,
])

# Output summary of the model
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (Text  (None, None)              0         
 Vectorization)                                                  
                                                                 
 embedding_1 (Embedding)     (None, None, 64)          7703424   
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               66048     
 onal)                                                           
                                                                 
 hidden_layer (Dense)        (None, 64)                8256      
                                                                 
 output (Dense)              (None, 9)                 585       
                                                                 
Total params: 7778313 (29.67 MB)
Trainable params: 777

In [31]:
# Model configuration
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    # Binary cross-entropy, referenced from the losses namespace
    # rather than the metrics namespace
    loss=tf.keras.losses.binary_crossentropy,
    # `metrics` is given as a list, the documented form for this argument.
    # The F1 score here uses threshold 0.5, while the prediction cell uses 0.3.
    metrics=[tf.keras.metrics.F1Score(average="macro",
                                      threshold=0.5,
                                      name='f1_score',
                                      dtype=None)],
)

# Cast the label matrix to float32 before training; note that this rebinds
# `labels` from a NumPy array to a TensorFlow tensor
labels = tf.cast(labels, tf.float32)
labels

<tf.Tensor: shape=(8257, 9), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 1., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)>

In [32]:
# Stop training early to prevent overfitting
# Monitors the validation loss with patience=0 and restore_best_weights=True
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=0,
                                                  restore_best_weights=True)

# Train the model
# NOTE(review): the raw `documents` are passed here, while the TextVectorization
# layer inside the model was adapted on `preprocessed_docs` - confirm that
# training on unpreprocessed text is intended.
# validation_split=0.2 holds out 20% of the training data for validation.
# NOTE(review): validation_steps=15 presumably caps validation at 15 batches
# per epoch - confirm this is wanted rather than using the whole split.
history = model.fit(documents,
                    labels,
                    epochs=5,
                    validation_split=0.2,
                    validation_steps=15,
                    callbacks=[early_stopping]
                    )

Epoch 1/5
Epoch 2/5
Epoch 3/5


### Classification Prediction

In [24]:
# Prediction
# Per-label scores from the sigmoid output layer
label_scores = model.predict(test_documents)

# Threshold is fixed at 0.3
# Do not change threshold
prediction = (label_scores > 0.3).astype('int')



In [11]:
# Output column order: the ID followed by one column per label
result_columns = ["ID", "comedy", "cult", "flashback", "historical",
                  "murder", "revenge", "romantic", "scifi", "violence"]

# Put the test IDs in front of the predicted labels; object dtype lets the
# ID values and the predictions share one array
converted_prediction = prediction.astype(object)
combined_result = np.column_stack((id, converted_prediction))

# Build the table, write it without header or index, then display it
result_df = pd.DataFrame(combined_result, columns=result_columns)
result_df.to_csv("10812451-Task2-method-b.csv", index=False, header=False)
result_df

Unnamed: 0,ID,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,9484ac61-0e30-4799-9998-6f74f4cbb204,0,0,0,0,1,0,0,0,1
1,55942d28-b6a2-4662-ab55-a66783a86a56,0,0,0,0,0,0,1,0,0
2,b71ed317-04cd-42f5-a380-d21dfea2bd36,0,0,0,0,0,0,1,0,0
3,5689b1b2-88cd-4c22-9114-0850ba539280,0,0,0,0,1,0,0,0,1
4,a0d9062e-f539-4043-bc9e-2a2ed589477b,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1195,8978047a-ec54-412a-bcee-070fe1fb055c,0,0,1,0,1,0,1,0,1
1196,f1f04933-e298-4f65-bbeb-bc61a567a688,0,0,0,0,1,0,0,0,1
1197,a033955d-12c2-4549-bafd-ca8e84615f1b,0,0,0,0,1,0,0,0,1
1198,9464e84d-36b6-4b69-b0fb-f3c0546a8b10,0,0,0,0,1,0,0,0,0
