# **NOTEBOOK WAS RUN USING THE T4 GPU**, so all timings in the questionnaire is from this GPU

# Import Necessary Libraries

In [1]:
import pandas as pd
import time
import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
import string
import numpy as np

from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow import keras
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPool1D, BatchNormalization
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer
# Load nltk resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

print("Imported all necessary libraries successfully")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Imported all necessary libraries successfully


# Data Processing (Cleaning & Preprocessing)




In [2]:
def preprocess_text(text, remove_stopwords=True, lemmatize=True, stem=False):

   # Set up stop words, lemmatizer, and stemmer
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")

    # Tokenize text
    tokens = word_tokenize(text)
    # Convert to lowercase and remove non-alphanumeric tokens i.e. punctuation
    tokens = [token.lower() for token in tokens if token.isalnum()]
    # Remove stopwords
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize
    if lemmatize:
        # Since the lemmatizer as its default pos is noun, then the lemma of media is medium so it gets removed from the corpus
        tokens = [lemmatizer.lemmatize(token) if token != 'media' else token for token in tokens]
    # Stem
    if stem:
        tokens = [stemmer.stem(token) for token in tokens]

    # Join tokens back into a single string
    processed_text = " ".join(tokens)

    return processed_text


# Data Preparation

### Training Data

In [3]:
# Load the dataset
training_dataset_path = './data/Training-dataset.csv'
training_data = pd.read_csv(training_dataset_path)

# Apply preprocessing to each plot synopsis
training_data['processed_plot_synopsis'] = training_data['plot_synopsis'].apply(preprocess_text, stem=False)

genres_columns = ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']

train_synopses = training_data['processed_plot_synopsis']
train_labels = training_data[genres_columns].values

### Validation Data

In [4]:
validation_dataset_path = './data/Task-2-validation-dataset.csv'
validation_data = pd.read_csv(validation_dataset_path)
# Apply preprocessing to each plot synopsis
validation_data['processed_plot_synopsis'] = validation_data['plot_synopsis'].apply(preprocess_text, stem=False)

val_synopses = validation_data['processed_plot_synopsis']
val_labels = validation_data[genres_columns].values

### Testing Data

In [5]:
testing_dataset_path = './data/Task-2-test-dataset1.csv'
testing_data = pd.read_csv(testing_dataset_path)
# Apply preprocessing to each plot synopsis
testing_data['processed_plot_synopsis'] = testing_data['plot_synopsis'].apply(preprocess_text, stem=False)

test_synopses = testing_data['processed_plot_synopsis']

In [6]:
def create_prediction_csv(predicted_labels, method, dataset):
  if dataset == 'validation':
    prediction_data = validation_data.copy()
    prediction_data.drop(prediction_data.columns[1:], axis=1, inplace=True)
    prediction_data[genres_columns] = predicted_labels
    # Save the new dataset to a CSV file
    prediction_path = f'./data/10726993-Task2-method-{method}-validation.csv'
    prediction_data.to_csv(prediction_path, index=False, header=False)
  if dataset == 'testing':
    prediction_data = testing_data.copy()
    prediction_data.drop(prediction_data.columns[1:], axis=1, inplace=True)
    prediction_data[genres_columns] = predicted_labels
    # Save the new dataset to a CSV file
    prediction_path = f'./data/10726993-Task2-method-{method}.csv'
    prediction_data.to_csv(prediction_path, index=False, header=False)

# Method A - SVM

In [8]:
tfidf_vectorizer = TfidfVectorizer()
train_synopses_tfidf = tfidf_vectorizer.fit_transform(train_synopses)
val_synopses_tfidf = tfidf_vectorizer.transform(val_synopses)
test_synopses_tfidf = tfidf_vectorizer.transform(test_synopses)

In [15]:
test_synopses_tfidf.shape

(1200, 80387)

In [9]:
# Initialize SVM with OneVsRestClassifier
svm_classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True))

In [10]:
# Train
svm_classifier.fit(train_synopses_tfidf, train_labels)

In [26]:
# Predict validation dataset
pred_prob_val = svm_classifier.predict_proba(val_synopses_tfidf)

In [33]:
from sklearn.metrics import precision_recall_curve, f1_score

n_labels = val_labels.shape[1]
precision_dict = {}
recall_dict = {}
f1_scores = []
optimal_thresholds = []
for i in range(n_labels):
    # Calculate precision and recall for each label
    precision_dict[i], recall_dict[i], thresholds = precision_recall_curve(val_labels[:, i], pred_prob_val[:, i])


    # Find optimal threshold for F1 score for each label
    f1_score_values = 2 * (precision_dict[i] * recall_dict[i]) / (precision_dict[i] + recall_dict[i])
    optimal_idx = np.nanargmax(f1_score_values)  # nanargmax handles NaN values
    optimal_threshold = thresholds[optimal_idx] if optimal_idx < len(thresholds) else 1.0
    optimal_thresholds.append(optimal_threshold)
    f1_scores.append(f1_score_values[optimal_idx])

    print(f"Label {i}: Optimal Threshold: {optimal_threshold}, F1 Score: {f1_score_values[optimal_idx]}")

Label 0: Optimal Threshold: 0.1519856575255218, F1 Score: 0.39441535776614306
Label 1: Optimal Threshold: 0.23881594070644435, F1 Score: 0.46504065040650405
Label 2: Optimal Threshold: 0.22301939363474063, F1 Score: 0.45223700120918986
Label 3: Optimal Threshold: 0.24684111920397453, F1 Score: 0.2777777777777778
Label 4: Optimal Threshold: 0.28829850282193564, F1 Score: 0.7407932011331446
Label 5: Optimal Threshold: 0.19466419720471428, F1 Score: 0.43543543543543545
Label 6: Optimal Threshold: 0.3892415826721485, F1 Score: 0.6198198198198198
Label 7: Optimal Threshold: 0.15691063313922315, F1 Score: 0.3859649122807018
Label 8: Optimal Threshold: 0.302798735529674, F1 Score: 0.6647173489278753


In [43]:
optimal_thresholds

[0.1519856575255218,
 0.23881594070644435,
 0.22301939363474063,
 0.24684111920397453,
 0.28829850282193564,
 0.19466419720471428,
 0.3892415826721485,
 0.15691063313922315,
 0.302798735529674]

In [18]:
def apply_thresholds(probabilities, thresholds):
  # Ensure probabilities and thresholds are numpy arrays
    probabilities = np.array(probabilities)
    thresholds = np.array(thresholds)

    # Initialize an array to store the predicted labels
    labels = np.zeros_like(probabilities, dtype=int)

    # Apply each threshold to the corresponding column in the probabilities array
    for i, threshold in enumerate(thresholds):
        labels[:, i] = (probabilities[:, i] >= threshold).astype(int)

    return labels

In [38]:
# Predict validation dataset
pred_labels_val = apply_thresholds(pred_prob_val, optimal_thresholds)

In [42]:
# Predict testing dataset
pred_prob_test = svm_classifier.predict_proba(test_synopses_tfidf)
pred_labels_test = apply_thresholds(pred_prob_test, optimal_thresholds)

In [39]:
create_prediction_csv(pred_labels_val, "a", "validation")

In [44]:
create_prediction_csv(pred_labels_test, "a", "testing")

### Results SVM with linear kernel

In [40]:
from sklearn.metrics import classification_report
print(classification_report(val_labels, pred_labels_val, target_names=genres_columns))

              precision    recall  f1-score   support

      comedy       0.28      0.65      0.39       175
        cult       0.39      0.58      0.47       247
   flashback       0.35      0.64      0.45       294
  historical       0.42      0.21      0.28        24
      murder       0.63      0.90      0.74       581
     revenge       0.34      0.61      0.44       237
    romantic       0.65      0.59      0.62       290
       scifi       0.42      0.35      0.39        31
    violence       0.56      0.81      0.66       420

   micro avg       0.47      0.71      0.57      2299
   macro avg       0.45      0.59      0.49      2299
weighted avg       0.50      0.71      0.58      2299
 samples avg       0.50      0.72      0.54      2299



  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
%run task2_eval_script_student_version.py ./data/10726993-Task2-method-a-validation.csv ./data/Task-2-validation-dataset.csv


Class level: 
Class  1 precision: 0.2839 recall: 0.6457 f1:  0.3944
Class  2 precision: 0.3886 recall: 0.5789 f1:  0.4650
Class  3 precision: 0.3508 recall: 0.6361 f1:  0.4522
Class  4 precision: 0.4167 recall: 0.2083 f1:  0.2778
Class  5 precision: 0.6294 recall: 0.9002 f1:  0.7408
Class  6 precision: 0.3380 recall: 0.6118 f1:  0.4354
Class  7 precision: 0.6491 recall: 0.5931 f1:  0.6198
Class  8 precision: 0.4231 recall: 0.3548 f1:  0.3860
Class  9 precision: 0.5627 recall: 0.8119 f1:  0.6647
----------------------------
Movie (document) level: 
Precision: 0.4984
Recall: 0.7203
F1: nan


  movie_level_f1 = (2*movie_level_precision*movie_level_recall)/(movie_level_precision*movie_level_recall)


# Method B - Bi-direcitonal LSTM

In [7]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer to the text data
vectorizer.fit(train_synopses)

# Retrieve the vocabulary and determine its size
vocabulary = vectorizer.vocabulary_
vocabulary_size = len(vocabulary)

In [8]:
# Tokenize the text
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(train_synopses)
train_synopses_seq = tokenizer.texts_to_sequences(train_synopses)
max_len = max([len(seq) for seq in train_synopses_seq])
train_synopses_pad = pad_sequences(train_synopses_seq, maxlen=max_len)

In [101]:
model = Sequential([
        Embedding(input_dim=vocabulary_size, output_dim=128),
        Bidirectional(LSTM(128, return_sequences=True)),
        GlobalMaxPool1D(),
        Dense(train_labels.shape[1],activation='sigmoid')
    ])

In [102]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [56]:
val_synopses_seq = tokenizer.texts_to_sequences(val_synopses)
max_len = max([len(seq) for seq in val_synopses_seq])
val_synopses_pad = pad_sequences(val_synopses_seq, maxlen=max_len)

In [103]:
# Train the model
model.fit(train_synopses_pad, train_labels, batch_size=32, epochs=4, validation_data=(val_synopses_pad, val_labels) )

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x781e1cd22bc0>

Validation Predict

In [104]:
labels_prob_val = model.predict(val_synopses_pad)



In [105]:
from sklearn.metrics import precision_recall_curve, f1_score

n_labels = val_labels.shape[1]
precision_dict = {}
recall_dict = {}
f1_scores = []
optimal_thresholds = []
for i in range(n_labels):
    # Calculate precision and recall for each label
    precision_dict[i], recall_dict[i], thresholds = precision_recall_curve(val_labels[:, i], labels_prob_val[:, i])


    # Find optimal threshold for F1 score for each label
    f1_score_values = 2 * (precision_dict[i] * recall_dict[i]) / (precision_dict[i] + recall_dict[i])
    optimal_idx = np.nanargmax(f1_score_values)  # nanargmax handles NaN values
    optimal_threshold = thresholds[optimal_idx] if optimal_idx < len(thresholds) else 1.0
    optimal_thresholds.append(optimal_threshold)
    f1_scores.append(f1_score_values[optimal_idx])

    print(f"Label {i}: Optimal Threshold: {optimal_threshold}, F1 Score: {f1_score_values[optimal_idx]}")

Label 0: Optimal Threshold: 0.18948140740394592, F1 Score: 0.3696369636963696
Label 1: Optimal Threshold: 0.2561385929584503, F1 Score: 0.45569620253164556
Label 2: Optimal Threshold: 0.1855451911687851, F1 Score: 0.4705882352941176
Label 3: Optimal Threshold: 0.012305769138038158, F1 Score: 0.07865168539325842
Label 4: Optimal Threshold: 0.06390566378831863, F1 Score: 0.7363957597173145
Label 5: Optimal Threshold: 0.3031166195869446, F1 Score: 0.458259325044405
Label 6: Optimal Threshold: 0.3389400541782379, F1 Score: 0.5949820788530467
Label 7: Optimal Threshold: 0.0513652078807354, F1 Score: 0.12345679012345678
Label 8: Optimal Threshold: 0.7265446782112122, F1 Score: 0.6453089244851259


  f1_score_values = 2 * (precision_dict[i] * recall_dict[i]) / (precision_dict[i] + recall_dict[i])
  f1_score_values = 2 * (precision_dict[i] * recall_dict[i]) / (precision_dict[i] + recall_dict[i])


In [110]:
pred_labels_val = apply_thresholds(labels_prob_val, optimal_thresholds)

In [112]:
create_prediction_csv(pred_labels_val, "b", "validation")

In [119]:
%run task2_eval_script_student_version.py ./data/10726993-Task2-method-b-validation.csv ./data/Task-2-validation-dataset.csv

Class level: 
Class  1 precision: 0.2599 recall: 0.6400 f1:  0.3696
Class  2 precision: 0.3315 recall: 0.7287 f1:  0.4557
Class  3 precision: 0.4069 recall: 0.5578 f1:  0.4706
Class  4 precision: 0.0422 recall: 0.5833 f1:  0.0787
Class  5 precision: 0.6247 recall: 0.8967 f1:  0.7364
Class  6 precision: 0.3957 recall: 0.5443 f1:  0.4583
Class  7 precision: 0.6194 recall: 0.5724 f1:  0.5950
Class  8 precision: 0.0763 recall: 0.3226 f1:  0.1235
Class  9 precision: 0.6211 recall: 0.6714 f1:  0.6453
----------------------------
Movie (document) level: 
Precision: 0.4402
Recall: 0.6992
F1: nan


  movie_level_f1 = (2*movie_level_precision*movie_level_recall)/(movie_level_precision*movie_level_recall)


Testing predict


In [114]:
test_synopses_seq = tokenizer.texts_to_sequences(test_synopses)
max_len = max([len(seq) for seq in test_synopses_seq])
test_synopses_pad = pad_sequences(test_synopses_seq, maxlen=max_len)

In [115]:
labels_prob_test = model.predict(test_synopses_pad)



In [116]:
pred_labels_test = apply_thresholds(labels_prob_test, optimal_thresholds)

In [118]:
create_prediction_csv(pred_labels_test, "b", "testing")