# Task 2


## Imports

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.layers import Layer
import keras.backend as K
import numpy as np
import nltk
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.layers import Bidirectional, Attention

# Fetch the NLTK resources used by the preprocessing step below. The captured
# cell output shows these calls simply report "already up-to-date" when the
# packages are present locally.
nltk.download('punkt')  # tokenizer models; presumably what word_tokenize relies on (confirm for the installed NLTK version)
nltk.download('stopwords')  # stop-word corpus read through stopwords.words('english')
nltk.download('wordnet')  # WordNet data; presumably what WordNetLemmatizer relies on

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# from google.colab import drive
# drive.mount('/content/drive')

## Attention Layer Class

In [14]:
class AttentionLayer(Layer):
    """
    Attention pooling: scores every position of the input along axis 1,
    normalises the scores with a softmax over that axis, and returns the
    score-weighted sum of the input.

    NOTE(review): the indexing in `build` (input_shape[1] and input_shape[-1])
    suggests a (batch, timesteps, features) input; confirm against the caller.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable projection vector and the per-position bias."""
        feature_dim = input_shape[-1]
        # The bias has one entry per position on axis 1, so that axis must have
        # a fixed, known length when the layer is built.
        sequence_length = input_shape[1]

        self.W = self.add_weight(
            name='attention_weight',
            shape=(feature_dim, 1),
            initializer='random_normal',
            trainable=True,
        )
        self.b = self.add_weight(
            name='attention_bias',
            shape=(sequence_length, 1),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of `x` over axis 1."""
        # One scalar score per position: tanh(x . W + b).
        scores = K.tanh(K.dot(x, self.W) + self.b)
        # Normalise the scores across axis 1 so they sum to one per sample.
        weights = K.softmax(scores, axis=1)
        # Broadcast the weights over the last axis and collapse axis 1.
        return K.sum(x * weights, axis=1)

##  MultiLabelClassifierPlotSynopsis Class
Handles preprocessing, training, and prediction for both the Naive Bayes and LSTM models.

In [15]:
class MultiLabelClassifierPlotSynopsis:
  """
    Multi-label Classifier for Movie Plot Synopses using both LSTM and Naive Bayes models.

    This class loads a training and a validation CSV, preprocesses the
    'plot_synopsis' text, and trains either an LSTM or a Naive Bayes model to
    predict the labels (genres) for each validation plot synopsis.
  """

  # Training-frame columns that are identifiers/inputs rather than label targets.
  NON_LABEL_COLUMNS = ('ID', 'title', 'plot_synopsis', 'processed_text')

  # Extra words removed on top of NLTK's English stop-word list.
  CUSTOM_STOPWORDS = frozenset({'movie', 'film', 'story', 'character'})

  # Per-label cut-offs (label position -> threshold) that override the global
  # threshold in predict_labels_lstm.
  # NOTE(review): these values were hard-coded in the original code without a
  # recorded rationale; presumably tuned for rarely-predicted labels. Confirm.
  DEFAULT_LABEL_THRESHOLDS = {3: 0.05, 7: 0.1}

  def __init__(self, training_data_path, validation_data_path):
    """
    Initializes the MultiLabelClassifierPlotSynopsis class.

    The constructor only reads the two CSV files; preprocessing and training
    happen later (see run_lstm / train_naive_bayes_classifier).

    input parameters:
    training_data_path - str
        The file path to the CSV file containing the training dataset.
    validation_data_path - str
        The file path to the CSV file containing the validation dataset.
    """
    self.training_df = pd.read_csv(training_data_path)
    self.validation_df = pd.read_csv(validation_data_path)

  def _label_columns(self):
    """Return the label columns of the training frame, in file order."""
    return pd.Index([col for col in self.training_df.columns if col not in self.NON_LABEL_COLUMNS])

  def _text_resources(self):
    """Build the stop-word set and lemmatizer once per instance and reuse them."""
    resources = getattr(self, '_text_resources_cache', None)
    if resources is None:
      stop_words = set(stopwords.words('english'))
      stop_words.update(self.CUSTOM_STOPWORDS)
      resources = (stop_words, WordNetLemmatizer())
      self._text_resources_cache = resources
    return resources

  def _ensure_preprocessed(self):
    """Run preprocess_data only if 'processed_text' is missing from either frame."""
    if ('processed_text' not in self.training_df.columns
        or 'processed_text' not in self.validation_df.columns):
      self.preprocess_data()

  def _labels_frame(self, predicted_labels, labels):
    """Wrap a 0/1 prediction matrix in a DataFrame with the validation 'ID' as first column."""
    frame = pd.DataFrame(predicted_labels, columns=labels)
    # Assign IDs by position so the result does not depend on the validation
    # frame's index matching the fresh 0..n-1 index of the prediction frame.
    frame.insert(0, 'ID', self.validation_df['ID'].to_numpy())
    return frame

  def preprocess_text(self, raw):
    """
    Tokenize, convert to lowercase, remove stopwords, and lemmatize the input text.

    input parameters:
    raw - string
        Non-string values (e.g. NaN for a missing synopsis) yield an empty string.

    output:
    preprocessed_string - string

    """
    if not isinstance(raw, str):
      return ''
    stop_words, lemmatizer = self._text_resources()
    # Keep purely alphabetic tokens only, lower-cased.
    tokens = [token.lower() for token in word_tokenize(raw) if token.isalpha()]
    # Stop words are filtered before lemmatization, as in the original pipeline.
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)

  def preprocess_data(self):
    """
      Apply text preprocessing to the 'plot_synopsis' column in both training and validation datasets.
      The result is stored in a 'processed_text' column on each frame.
    """
    self.training_df['processed_text'] = self.training_df['plot_synopsis'].apply(self.preprocess_text)
    self.validation_df['processed_text'] = self.validation_df['plot_synopsis'].apply(self.preprocess_text)

  def tokenize_and_pad(self, max_sequence_length):
    """
      Tokenize and pad the preprocessed text data for input to the LSTM model.

      The tokenizer is fitted on the training text only and kept on
      self.tokenizer; the padded sequences are stored on self.X_train_pad and
      self.X_val_pad, and the vocabulary size (+1) on self.input_size.

      input parameters:
      max_sequence_length - int
          Maximum sequence length for padding.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(self.training_df['processed_text'])
    # Keep the fitted tokenizer so the same vocabulary can be reused later.
    self.tokenizer = tokenizer
    self.input_size = len(tokenizer.word_index) + 1

    train_sequences = tokenizer.texts_to_sequences(self.training_df['processed_text'])
    val_sequences = tokenizer.texts_to_sequences(self.validation_df['processed_text'])
    self.X_train_pad = pad_sequences(train_sequences, maxlen=max_sequence_length)
    self.X_val_pad = pad_sequences(val_sequences, maxlen=max_sequence_length)

  def build_lstm_model(self, max_sequence_length):
    """
      Build the LSTM model: embedding -> bidirectional LSTM -> attention
      pooling -> one sigmoid output per label. Requires self.input_size and
      self.labels to be set beforehand.

      input parameters:
      max_sequence_length - int
          Maximum sequence length for padding.
    """
    self.model = Sequential()
    self.model.add(Embedding(input_dim=self.input_size, output_dim=100, input_length=max_sequence_length))
    # return_sequences=True keeps one output per position for the attention layer.
    self.model.add(Bidirectional(LSTM(64, return_sequences=True)))
    self.model.add(AttentionLayer())
    self.model.add(Dense(len(self.labels), activation='sigmoid'))
    self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  def train_lstm_model(self, epochs=7, batch_size=128, validation_split=0.1, patience=3):
    """
      Train the LSTM model.
      The hyper-parameters have already been experimented on prior.

      input parameters:
        epochs: int
            Number of epochs for training.

        batch_size: int
            Batch size for training.

        validation_split: float
            Fraction of training data to be used as validation data.

        patience: int
            Number of epochs with no improvement after which training will be stopped.
    """
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience)
    self.model.fit(
        self.X_train_pad,
        self.y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=validation_split,
        callbacks=[early_stopping],
    )

  def predict_labels_lstm(self, threshold=0.3, label_thresholds=None):
    """
      Predict labels for the validation dataset using the trained LSTM model.

        input parameters:
        threshold: float
            Threshold for binary classification.

        label_thresholds: dict or None
            Optional mapping {label position: threshold} overriding `threshold`
            for individual labels. Defaults to DEFAULT_LABEL_THRESHOLDS; pass
            an empty dict to use `threshold` for every label.

        output:
        lstm_predicted_labels_df - DataFrame
            Predicted labels for the validation dataset, 'ID' first.
    """
    if label_thresholds is None:
      label_thresholds = self.DEFAULT_LABEL_THRESHOLDS

    lstm_probabilities = self.model.predict(self.X_val_pad)
    lstm_predicted_labels = (lstm_probabilities >= threshold).astype(int)

    # Re-threshold the labels that have their own cut-off.
    for position, cutoff in label_thresholds.items():
      lstm_predicted_labels[:, position] = (lstm_probabilities[:, position] >= cutoff).astype(int)

    return self._labels_frame(lstm_predicted_labels, self.labels)

  def run_lstm(self, output_path, max_sequence_length=4000, epochs=7, batch_size=128, validation_split=0.1, patience=3, threshold=0.3):
    """
        Run the entire pipeline for the LSTM model.
        The hyper-parameters have already been experimented on prior.

        input parameters:
        output_path: str
            File path to save the predicted labels (written without header or index).

        max_sequence_length: int
            Maximum sequence length for padding.

        epochs: int
            Number of epochs for training.

        batch_size: int
            Batch size for training.

        validation_split: float
            Fraction of training data to be used as validation data.

        patience: int
            Number of epochs with no improvement after which training will be stopped.

        threshold: float
            Threshold for binary classification.
    """
    # Select labels by name rather than by position so that a frame which
    # already carries 'processed_text' (e.g. on a second run) is handled.
    self.labels = self._label_columns()
    self.preprocess_data()
    self.tokenize_and_pad(max_sequence_length)
    self.y_train = self.training_df[self.labels]

    self.build_lstm_model(max_sequence_length)
    self.train_lstm_model(epochs=epochs, batch_size=batch_size, validation_split=validation_split, patience=patience)

    lstm_predicted_labels_df = self.predict_labels_lstm(threshold=threshold)

    lstm_predicted_labels_df.to_csv(output_path, header=False, index=False)

  def train_naive_bayes_classifier(self):
    """
    Train a Naive Bayes classifier using CountVectorizer.

    Preprocesses the text first if that has not happened yet, so the method
    can be called on a fresh instance.

    Returns:
    Pipeline: Trained pipeline containing CountVectorizer and Multinomial Naive Bayes classifier.
    """
    self._ensure_preprocessed()

    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(max_df=0.75, min_df=0.01, ngram_range=(1, 3))),
        ('classifier', OneVsRestClassifier(MultinomialNB(alpha=10)))
    ])

    X = self.training_df['processed_text']
    y = self.training_df[self._label_columns()]
    pipeline.fit(X, y)
    return pipeline

  def predict_naive_bayes(self, pipeline, threshold=0.5):
    """
    Predict labels for the validation dataset using the Naive Bayes classifier.

    Parameters:
    pipeline (Pipeline): Trained pipeline containing CountVectorizer and Multinomial Naive Bayes classifier.
    threshold (float): Threshold for binary classification.

    Returns:
    DataFrame: Predicted labels for the validation dataset, 'ID' first.
    """
    self._ensure_preprocessed()

    # Fall back to the training frame when run_lstm has not set self.labels.
    labels = getattr(self, 'labels', None)
    if labels is None:
      labels = self._label_columns()

    probabilities = pipeline.predict_proba(self.validation_df['processed_text'])
    # Threshold the whole (samples x labels) matrix in a single comparison.
    predicted_labels = (np.asarray(probabilities) >= threshold).astype(int)

    return self._labels_frame(predicted_labels, labels)




## validation

### lstm

In [16]:
# LSTM on the validation split: load the training and Task-2 validation CSVs,
# then run_lstm preprocesses, trains, predicts and writes the predictions
# (no header, no index) to the method-b validation file.
# NOTE(review): the paths point at a mounted Google Drive, but the
# drive.mount cell earlier in this notebook is commented out.
lstm_classifier = MultiLabelClassifierPlotSynopsis('/content/drive/MyDrive/data/Training-dataset.csv', '/content/drive/MyDrive/data/Task-2-validation-dataset.csv')
lstm_classifier.run_lstm('/content/drive/MyDrive/data/10867903-Task2-method-b-validation.csv')


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


### Naive Bayes

In [17]:

# Naive Bayes on the validation split. This reuses the instance created in the
# previous cell (despite its name, `lstm_classifier` holds the data frames
# used by both models), so it must run after that cell.
naive_bayes_classifier = lstm_classifier.train_naive_bayes_classifier()
naive_bayes_predicted_labels_df = lstm_classifier.predict_naive_bayes(naive_bayes_classifier, threshold=0.5)
# Written without header or index to the method-a validation file.
naive_bayes_predicted_labels_df.to_csv('/content/drive/MyDrive/data/10867903-Task2-method-a-validation.csv', header=False, index=False)


## Test
### lstm

In [18]:
# LSTM on the test split: rebinds `lstm_classifier` to a new instance that
# pairs the same training CSV with the Task-2 test CSV, retrains from scratch
# and writes the predictions (no header, no index) to the method-b file.
lstm_classifier = MultiLabelClassifierPlotSynopsis('/content/drive/MyDrive/data/Training-dataset.csv', '/content/drive/MyDrive/data/Task-2-test-dataset1.csv')
lstm_classifier.run_lstm('/content/drive/MyDrive/data/10867903-Task2-method-b.csv')

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


### Naive Bayes


In [19]:

# Naive Bayes on the test split. This reuses the test-split instance created
# in the previous cell, so it must run after that cell.
naive_bayes_classifier = lstm_classifier.train_naive_bayes_classifier()
naive_bayes_predicted_labels_df = lstm_classifier.predict_naive_bayes(naive_bayes_classifier, threshold=0.5)
# Written without header or index to the method-a file.
naive_bayes_predicted_labels_df.to_csv('/content/drive/MyDrive/data/10867903-Task2-method-a.csv', header=False, index=False)