# 1. Import Packages

In [24]:
import os
import string
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.layers import Embedding, Bidirectional, Dense, Dropout, GlobalMaxPool1D, LSTM

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 2. Create Functions & Class

In [26]:
class TextClassification:

  def __init__(self) -> None:
    self.method = 0

  def preprocess_data(self, document: list) -> list:
    """
    Applies the following to the data: tokenization, stop words, stemming, punctuation & lowercase
    :param document: list of data to be cleaned
    :return: list of cleaned data
    """

    # Tokenization
    tokenized = nltk.tokenize.word_tokenize(document)

    # Stop words
    stop_words = set(nltk.corpus.stopwords.words("english"))
    cleaned_words = [word for word in tokenized if word not in stop_words]

    # Stemming
    porter_stemmer = nltk.stem.PorterStemmer()
    cleaned_words = [porter_stemmer.stem(word) for word in cleaned_words]

    # Punctuation
    punctuations = set(string.punctuation)
    cleaned_words = [word for word in cleaned_words if word not in punctuations]

    # Lowercase
    cleaned_words = ' '.join([term.lower() for term in cleaned_words])

    return cleaned_words

  def classification(self, train_data: list, test_data: list) -> pd.DataFrame:
    """
    Traditional classifier using SVM with preprocessing
    :param train_data: training data for the SVM
    :param test_data: testing data for the SVM
    :return: a dataframe with id and predicted scores for categories
    """

    # Preprocess data first
    X = [self.preprocess_data(doc) for doc in train_data['plot_synopsis']]
    y = [self.preprocess_data(doc) for doc in test_data['plot_synopsis']]

    categories = train_data.columns[3:]

    # CSV template
    results = pd.DataFrame({
        'ID': test_data['ID'],
        'comedy': 0,
        'cult': 0,
        'flashback': 0,
        'historical': 0,
        'murder': 0,
        'revenge': 0,
        'romantic': 0,
        'scifi': 0,
        'violence': 0,
    })

    # Pipeline, uses tfidf and then multi-label classifies
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', min_df=5, ngram_range=(1,2), sublinear_tf=True)),
        ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
    ])

    # For each category calculate predictions
    for category in categories:
      pipeline.fit(X, train_data[category])
      prediction = pipeline.predict(y)
      results[category] = prediction

    # Write results
    results.to_csv(f'results_{self.method}.csv', header=False, index=False)

    return results

  def deep_learning(self, train_data: list, test_data: list):
    """
    Deep learning classifier using LSTM with preprocessing
    :param train_data: training data for the LSTM
    :param test_data: testing data for the LSTM
    :return: a dataframe with id and predicted scores for categories
    """

    # Preprocess data first
    X = [self.preprocess_data(doc) for doc in train_data['plot_synopsis']]
    y = [self.preprocess_data(doc) for doc in test_data['plot_synopsis']]

    # Load & save if needed
    # np.save('X.npy', X)
    # np.save('y.npy', y)
    # X = np.load('X.npy')
    # y = np.load('y.npy')

    categories = train_data.columns[3:]
    max_len = 1200

    # Tokenize & vocab
    tokenize = Tokenizer()
    tokenize.fit_on_texts(train_data['plot_synopsis'])
    vocab_size = len(tokenize.word_index) + 1

    train_encoded = tokenize.texts_to_sequences(X)
    train_padded = pad_sequences(train_encoded, padding='post', maxlen=max_len)

    test_encoded = tokenize.texts_to_sequences(y)
    test_padded = pad_sequences(test_encoded, padding='post', maxlen=max_len)

    # LSTM params
    lstm_size = 128
    dense_val = 9
    dropout_val = 0.1

    # Create model
    model = Sequential([
        Embedding(vocab_size, lstm_size, input_length=max_len),
        Bidirectional(LSTM(lstm_size, return_sequences=True)),
        Dropout(dropout_val),
        Bidirectional(LSTM(int(lstm_size/2))),
        Dropout(dropout_val),
        Dense(dense_val, activation='sigmoid'),
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy')
    print(model.summary())

    history = model.fit(train_padded, train_data[categories], batch_size=32, epochs=5)

    # Save & load model if needed
    # model.save('model.keras')
    # model = load_model('model.keras')

    probabilities = model.predict(test_padded)
    predictions = (probabilities > 0.5).astype(int)

    # CSV template
    results = pd.DataFrame(data=predictions, columns=categories)
    results.insert(0, 'ID', test_data['ID'], True)

    # Write results
    results.to_csv(f'results_{self.method}.csv', header=False, index=False)

    return results


# 3. Traditional Classification (SVM)

In [29]:
def main() -> None:
  tc = TextClassification()

  tc.method = 0

  files_path = './data/'
  train_data_name = 'Training-dataset.csv'
  test_data_name = 'Task-2-test-dataset1.csv'

  train_data = pd.read_csv(f'{files_path}/{train_data_name}')
  test_data = pd.read_csv(f'{files_path}/{test_data_name}')

  cl = tc.classification(train_data, test_data)
  print(cl)

test = main()

                                        ID  comedy  cult  flashback  \
0     9484ac61-0e30-4799-9998-6f74f4cbb204       0     0          0   
1     55942d28-b6a2-4662-ab55-a66783a86a56       0     0          0   
2     b71ed317-04cd-42f5-a380-d21dfea2bd36       0     0          0   
3     5689b1b2-88cd-4c22-9114-0850ba539280       0     0          0   
4     a0d9062e-f539-4043-bc9e-2a2ed589477b       0     0          0   
...                                    ...     ...   ...        ...   
1195  8978047a-ec54-412a-bcee-070fe1fb055c       0     1          0   
1196  f1f04933-e298-4f65-bbeb-bc61a567a688       0     0          0   
1197  a033955d-12c2-4549-bafd-ca8e84615f1b       0     0          0   
1198  9464e84d-36b6-4b69-b0fb-f3c0546a8b10       0     0          0   
1199  93ec8a32-0f64-4965-ba02-5b369ed16ca4       0     0          0   

      historical  murder  revenge  romantic  scifi  violence  
0              0       0        0         0      0         0  
1              0     

# 4. Deep Learning (Bi-LSTM)

In [30]:
def main() -> None:
  tc = TextClassification()

  tc.method = 1

  files_path = './data/'
  train_data_name = 'Training-dataset.csv'
  test_data_name = 'Task-2-test-dataset1.csv'

  train_data = pd.read_csv(f'{files_path}/{train_data_name}')
  test_data = pd.read_csv(f'{files_path}/{test_data_name}')

  cl = tc.deep_learning(train_data, test_data)
  print(cl)

test = main()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_18 (Embedding)    (None, 1200, 128)         13876736  
                                                                 
 bidirectional_19 (Bidirect  (None, 1200, 256)         263168    
 ional)                                                          
                                                                 
 dropout_30 (Dropout)        (None, 1200, 256)         0         
                                                                 
 bidirectional_20 (Bidirect  (None, 128)               164352    
 ional)                                                          
                                                                 
 dropout_31 (Dropout)        (None, 128)               0         
                                                                 
 dense_20 (Dense)            (None, 9)               