10699403 - Task 2 methods a and b

In [None]:
# Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

# Fetch the NLTK resources the preprocessing step relies on: WordNet (used by
# the lemmatizer), Punkt (used by word_tokenize) and the stop-word lists.
for nltk_resource in ('wordnet', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)

In [1]:
# Shared NLTK resources for preprocess_data. They are filled in on first use
# so the lemmatizer and the stop-word set are built once, not once per document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, creating it on first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = set(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_data(data):
    """Normalise a single text string for the genre classifiers.

    The text is lowercased, stripped of punctuation, tokenized, filtered
    against the English stop-word list and lemmatized.

    Args:
        data: The raw text to clean. Must be a ``str`` (``.lower()`` is
            called on it directly).

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    lemmatizer, stop_words = _get_preprocess_resources()

    # Lowercase first: the stop-word filter below is an exact string match.
    data = data.lower()
    # Delete every run of characters that is not a word character (letters,
    # digits, underscore), an apostrophe or whitespace.
    data = re.sub(r"[^\w\d'\s]+", '', data)

    tokens = word_tokenize(data)
    cleaned_tokens = [token for token in tokens if token not in stop_words]
    final_tokens = [lemmatizer.lemmatize(token) for token in cleaned_tokens]

    return ' '.join(final_tokens)

In [3]:
# Load the training set from CSV.
training_data = pd.read_csv('./data/Training-dataset.csv')

# Genre label columns, in the order used for every prediction output.
genres_list = ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']
# Label matrix: one row per training example, one column per genre.
genres = training_data[genres_list].values

# One document per row: the raw title, a space, then the preprocessed synopsis.
# Only the synopsis goes through preprocess_data; the title is used as-is.
training_synopses = training_data['plot_synopsis'].apply(preprocess_data)
processed_document_data = training_data['title'] + ' ' + training_synopses

In [None]:
# Method (a): text vectorizer feeding one Multinomial Naive Bayes classifier
# per genre column. With use_idf=False the IDF weighting is switched off, so
# the features are sublinear term frequencies over 1- to 4-grams.
tfidf_features = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 4),
    sublinear_tf=True,
    use_idf=False,
)
per_genre_nb = MultiOutputClassifier(MultinomialNB(alpha=0.01, fit_prior=False))
nb_pipeline = make_pipeline(tfidf_features, per_genre_nb)
nb_pipeline.fit(processed_document_data, genres)

In [None]:
# Method (b) input: turn each document into a sequence of integer word
# indices and pad/truncate every sequence to the same length.
vocabulary_limit = 25000
sequence_length = 450

tokenizer = Tokenizer(num_words=vocabulary_limit)
tokenizer.fit_on_texts(processed_document_data)
X_train_tokenized = tokenizer.texts_to_sequences(processed_document_data)
X_train_padded = pad_sequences(X_train_tokenized, maxlen=sequence_length)

# Embedding -> bidirectional LSTM -> one sigmoid unit per genre, so each genre
# gets its own independent probability.
model = Sequential([
    Embedding(vocabulary_limit + 1, 128, input_length=sequence_length),
    Bidirectional(LSTM(128)),
    Dense(genres.shape[1], activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for two epochs, holding out 10% of the training rows for validation.
model.fit(
    X_train_padded,
    genres,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
)

In [14]:
# Load the validation set. The path is relative to the working directory,
# matching the training/test dataset paths and the output CSV paths.
validation_data = pd.read_csv('./data/Task-2-validation-dataset.csv')
# Build the documents exactly as for training (raw title + ' ' + preprocessed
# synopsis) so the fitted vectorizer/tokenizer see text in the same format.
validation_document_data = validation_data['title'] + ' ' + validation_data['plot_synopsis'].apply(preprocess_data)

In [15]:
# Method (a) on the validation set: predict every genre with the Naive Bayes
# pipeline, then write "ID, genre flags..." rows to CSV without header or index.
validation_nb_prediction = nb_pipeline.predict(validation_document_data)
validation_nb_prediction_df = pd.DataFrame(data=validation_nb_prediction, columns=genres_list)
validation_nb_frames = [validation_data[['ID']], validation_nb_prediction_df]
validation_nb_output = pd.concat(validation_nb_frames, axis='columns')
validation_nb_output.to_csv(
    "./data/10699403-Task2-method-a-validation.csv",
    index=False,
    header=False,
)

In [None]:
# Method (b) on the validation set: encode the documents with the tokenizer
# fitted on the training data, pad to the training sequence length (450) and
# get one probability per genre from the LSTM model.
validation_lstm_tokenized = tokenizer.texts_to_sequences(validation_document_data)
validation_lstm_pad_tokenized = pad_sequences(validation_lstm_tokenized, maxlen=450)
validation_lstm_prediction = model.predict(validation_lstm_pad_tokenized)

# Turn probabilities into 0/1 flags: a genre is predicted when its probability
# is at least 0.3.
validation_lstm_binary_predictions = (validation_lstm_prediction >= 0.3).astype(int)
# Label the columns with genres_list (the same list that defines the training
# label matrix) rather than relying on the column positions of the training CSV.
validation_lstm_predicted_labels = pd.DataFrame(validation_lstm_binary_predictions, columns=genres_list)
validation_lstm_output = pd.concat([validation_data[['ID']], validation_lstm_predicted_labels], axis=1)
# Written without header or index: each row is the ID followed by the genre flags.
validation_lstm_output.to_csv("./data/10699403-Task2-method-b-validation.csv", header=False, index=False)

In [17]:
# Load the test set from CSV.
test_data = pd.read_csv('./data/Task-2-test-dataset1.csv')

# Build the documents the same way as for training: the raw title, a space,
# then the preprocessed synopsis.
test_synopses = test_data['plot_synopsis'].apply(preprocess_data)
test_document_data = test_data['title'] + ' ' + test_synopses

In [18]:
# Method (a) on the test set: predict every genre with the Naive Bayes
# pipeline, then write "ID, genre flags..." rows to CSV without header or index.
test_nb_prediction = nb_pipeline.predict(test_document_data)
test_nb_prediction_df = pd.DataFrame(data=test_nb_prediction, columns=genres_list)
test_nb_frames = [test_data[['ID']], test_nb_prediction_df]
test_nb_output = pd.concat(test_nb_frames, axis='columns')
test_nb_output.to_csv(
    "./data/10699403-Task2-method-a.csv",
    index=False,
    header=False,
)

In [None]:
# Method (b) on the test set: encode the documents with the tokenizer fitted
# on the training data and pad to 450 tokens. The length must match the
# sequence length the model was built and trained with (and that the
# validation predictions use); a shorter length would feed the model inputs
# of a different shape and truncate documents differently.
test_lstm_tokenized = tokenizer.texts_to_sequences(test_document_data)
test_lstm_pad_tokenized = pad_sequences(test_lstm_tokenized, maxlen=450)
test_lstm_prediction = model.predict(test_lstm_pad_tokenized)

# Turn probabilities into 0/1 flags: a genre is predicted when its probability
# is at least 0.3.
test_lstm_binary_predictions = (test_lstm_prediction >= 0.3).astype(int)
# Label the columns with genres_list (the same list that defines the training
# label matrix) rather than relying on the column positions of the training CSV.
test_lstm_predicted_labels = pd.DataFrame(test_lstm_binary_predictions, columns=genres_list)
test_lstm_output = pd.concat([test_data[['ID']], test_lstm_predicted_labels], axis=1)
# Written without header or index: each row is the ID followed by the genre flags.
test_lstm_output.to_csv("./data/10699403-Task2-method-b.csv", header=False, index=False)