# Task 2: Text classification

## Preprocessing data for both methods

In [41]:
import string

import nltk
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


# Fetch the NLTK data packages that the preprocessing step relies on
for nltk_resource in (
    'punkt',      # tokenizer
    'stopwords',  # stop word lists
    'wordnet',    # lemmatizer data
):
    nltk.download(nltk_resource)

# Shared helpers used by preprocess_text: the English stop words as a set
# and a WordNet lemmatizer instance
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    """Normalise one plot synopsis into a space-separated string of lemmas.

    The text is tokenized, lower-cased, stripped of every token that is not
    purely alphabetic (punctuation, numbers, mixed tokens), filtered against
    the module-level ``stop_words`` set and finally lemmatized with the
    module-level ``lemmatizer``.

    Args:
        text: Raw synopsis. Anything that is not a ``str`` (for example the
            NaN that pandas produces for an empty CSV cell, or ``None``) is
            treated as an empty document.

    Returns:
        str: The surviving lemmas joined by single spaces; ``''`` when the
        input is not a string or when no token survives the filters.
    """
    # word_tokenize expects a string; a missing value would otherwise abort
    # the whole DataFrame.apply() call with an exception.
    if not isinstance(text, str):
        return ''

    # Lower-case first so the stop word lookup matches regardless of casing
    lowered = (token.lower() for token in word_tokenize(text))
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in lowered
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(lemmas)

# Locations of the CSV inputs
train_csv_path = './data/Training-dataset.csv'

# Swap which of the two lines below is commented out to run against the
# development set instead of the test set
# validation_csv_path = './data/Task-2-validation-dataset.csv'
validation_csv_path = './data/Task-2-test-dataset1.csv'

train_data = pd.read_csv(train_csv_path)
validation_data = pd.read_csv(validation_csv_path)

# Add a 'processed_plot' column holding the cleaned version of each 'plot_synopsis'
for frame in (train_data, validation_data):
    frame['processed_plot'] = frame['plot_synopsis'].apply(preprocess_text)

# From here on both frames are ready to be turned into classifier inputs


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 1st method (option A): Developing a traditional classification method with SVM

In [30]:
# Build TF-IDF features from unigrams and bigrams, keeping at most 10,000 terms.
# Bigrams give a richer representation by capturing some of the context in which
# words appear together.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# Fit the vectorizer on the processed training text and transform it in one step
train_text = tfidf_vectorizer.fit_transform(train_data['processed_plot'])

# Target labels: iloc's end bound is exclusive, so 3:12 selects the nine columns at
# positions 3..11 (assumed to be the label columns - confirm against the CSV layout)
train_labels = train_data.iloc[:, 3:12]

# Fixed seed for LinearSVC's internal random number generator so that repeated
# runs of this cell produce the same classifier
svm_random_state = 42

# MultiOutputClassifier fits one LinearSVC (linear-kernel SVM) per label column
svm_multi_output_classifier = MultiOutputClassifier(
    LinearSVC(
        tol=1e-3,        # stopping tolerance: optimisation is considered converged below this
        max_iter=10000,  # upper bound on iterations if the tolerance is not reached earlier
        random_state=svm_random_state,
    )
).fit(train_text, train_labels)  # Fit on the TF-IDF training matrix and its labels

### Predicting on validation/test data for SVM

In [42]:
# Vectorise the validation plots with the already-fitted TF-IDF vectorizer (transform only)
validation_text = tfidf_vectorizer.transform(validation_data['processed_plot'])

# Predicted labels for every validation document
validation_pred_labels = svm_multi_output_classifier.predict(validation_text)

# Assemble the output table: ID first, followed by the predicted label columns
svm_label_names = [
    'comedy', 'cult', 'flashback', 'historical', 'murder',
    'revenge', 'romantic', 'scifi', 'violence',
]
svm_predicted_labels = pd.DataFrame(data=validation_pred_labels, columns=svm_label_names)
svm_predicted_labels.insert(loc=0, column='ID', value=validation_data['ID'])

# Write the table to CSV without the index and without a header row
svm_output_path = './data/10928627-Task2-method-a.csv'
svm_predicted_labels.to_csv(svm_output_path, index=False, header=False)


## 2nd method (option B): Developing a traditional deep learning method with LSTM

In [32]:
# train_data and validation_data were loaded in the first cell, which also added
# the preprocessed 'processed_plot' column used here

# Fix TensorFlow's global random seed so that weight initialisation and other
# seeded random ops repeat across runs (some GPU kernels may still be non-deterministic)
tf.random.set_seed(42)

# Single source of truth for the vocabulary size: the tokenizer only emits word
# indices below num_words, and the embedding's input_dim must cover that range,
# so both are driven by the same value
vocab_size = 10000  # maximum number of words to keep, can be tuned

# Tokenizer converts text into sequences of integers to prepare it for the LSTM model
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data['processed_plot'])

# Convert the training data into integer sequences
train_seq = tokenizer.texts_to_sequences(train_data['processed_plot'])

# Pad/truncate every sequence to the same length so the model gets a fixed input size
# (pad_sequences pads and truncates at the front by default)
maxlen = 500  # can be tuned based on the length distribution of the text data
pad_train_data = pad_sequences(train_seq, maxlen=maxlen)

# Label matrix: iloc's end bound is exclusive, so 3:12 selects the nine columns
# at positions 3..11
train_label = train_data.iloc[:, 3:12].to_numpy()

# LSTM model architecture
lstm_model = Sequential()
# Embedding layer turns integer sequences into dense 128-dimensional vectors
lstm_model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen))
lstm_model.add(LSTM(64))  # LSTM layer with 64 units
# One sigmoid output per label (9 labels), so each label is predicted independently
lstm_model.add(Dense(9, activation='sigmoid'))

# Binary cross-entropy treats every label as its own yes/no decision; Adam optimizer
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on the padded training sequences and binary labels
train_history = lstm_model.fit(
    pad_train_data,
    train_label,
    batch_size=32,  # size of the mini-batch for gradient descent, tuned
    epochs=10       # number of passes over the training data, tuned
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Predicting on validation/test data for LSTM

In [43]:
# Encode and pad the validation plots with the same tokenizer and length as the training plots
validation_seq = tokenizer.texts_to_sequences(validation_data['processed_plot'])
pad_validation_data = pad_sequences(validation_seq, maxlen=maxlen)

# Model outputs for the validation data: one score per label and document
validation_pred_label = lstm_model.predict(pad_validation_data)

# A label is switched on when its score reaches the cut-off
threshold = 0.5  # tuned
binary_label = (validation_pred_label >= threshold).astype(int)

# Assemble the output table: ID first, followed by the predicted label columns
lstm_label_names = [
    'comedy', 'cult', 'flashback', 'historical', 'murder',
    'revenge', 'romantic', 'scifi', 'violence',
]
lstm_pred_labels = pd.DataFrame(data=binary_label, columns=lstm_label_names)
lstm_pred_labels.insert(loc=0, column='ID', value=validation_data['ID'])

# Write the table to CSV without the index and without a header row
lstm_output_path = './data/10928627-Task2-method-b.csv'
lstm_pred_labels.to_csv(lstm_output_path, index=False, header=False)

