In [2]:
import csv
# Initialize empty lists to store the data
plots = []
categories_list = []

# Initialize an empty list to store the category names
categories_str_list = []

# Open the CSV file for reading
with open('/content/Training-dataset.csv', mode='r', newline='', encoding='utf-8') as file:
    csv_reader = csv.reader(file)

    # Read the header row to get the category names
    header_row = next(csv_reader)
    categories_str_list = header_row[3:12]

    # Iterate through each row in the CSV file
    for row in csv_reader:
        # Extract the plot_synopsis and add it to the plots list
        plot_synopsis = row[2]
        plots.append(plot_synopsis)

        # Extract the categories (column indices 3 to 11) and add them as a list to the categories_list
        categories = [int(row[i]) for i in range(3, 12)]
        categories_list.append(categories)

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
def clean_plot(plot):
    # Remove all characters that are not alphabetic
    plot = re.sub(r'[^a-zA-Z ]', '', plot)

    return plot

# Initialize NLTK
nltk.download('punkt')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # Download the WordNet database

lemmatizer = WordNetLemmatizer()

# Define a function to lemmatize a list of tokens
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

# Remove stop words
stop_words = stopwords.words('english')

def process_plot(plot):
  cleaned_plot = clean_plot(plot)
  tokenized_plot = word_tokenize(cleaned_plot.lower())
  lemmatized_plot = lemmatize_tokens(tokenized_plot)
  filtered_plot = [word for word in lemmatized_plot if word not in stop_words]
  return filtered_plot

processed_plots = [process_plot(plot) for plot in plots]

# join list of words back into a single string
training_plots = [' '.join(tokens) for tokens in processed_plots]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

# training data preparation
vectorizer = TfidfVectorizer()  # Initialize a TF-IDF Vectorizer
X = vectorizer.fit_transform(training_plots)  # Transform plots to TF-IDF feature vectors
Y = np.array(categories_list)  # Convert categories to a numpy array for labels

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)  # 20% of data for testing, 80% for training

# Train the multi-label SVM model
svm = SVC(probability=True)  # Initialize the Support Vector Classifier with probability estimates
multi_label_model = OneVsRestClassifier(svm)  # Wrap SVC in a OneVsRestClassifier for multi-label classification
multi_label_model.fit(X_train, Y_train)  # Fit the model to the training data

In [91]:
import csv

csv_file_path = '/content/Task-2-validation-dataset(1).csv'


validation_ids = []
validation_titles = []
validation_plots = []


with open(csv_file_path, 'r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        validation_ids.append(row['ID'])
        validation_titles.append(row['title'])
        validation_plots.append(row['plot_synopsis'])

In [99]:
import csv

csv_file_path = '/content/Task-2-test-dataset1.csv'


test_ids = []
test_titles = []
test_plots = []


with open(csv_file_path, 'r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        test_ids.append(row['ID'])
        test_titles.append(row['title'])
        test_plots.append(row['plot_synopsis'])

In [92]:
processed_validation_plots = [process_plot(plot) for plot in validation_plots]

# join list of words back into a single string
validation_plots_str = [' '.join(ready_plot) for ready_plot in processed_validation_plots]

In [93]:
converted_validation_plots = vectorizer.transform(validation_plots_str)
validation_predictions = multi_label_model.predict(converted_validation_plots)

In [94]:
import csv

output_file_path = '10749545-Task2-method-a-validation.csv'

# Open the file in write mode
with open(output_file_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Iterate over zipped IDs and predictions
    for id, prediction in zip(validation_ids, validation_predictions):
        # Ensure that prediction is in list format and write to the file
        row = [id] + list(prediction)
        writer.writerow(row)

In [100]:
processed_test_plots = [process_plot(plot) for plot in test_plots]

# join list of words back into a single string
test_plots_str = [' '.join(plot) for plot in processed_test_plots]

In [101]:
converted_test_plots = vectorizer.transform(test_plots_str)
test_predictions = multi_label_model.predict(converted_test_plots)

In [102]:
import csv

output_file_path = '10749545-Task2-method-a.csv'

# Open the file in write mode
with open(output_file_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Iterate over zipped IDs and predictions
    for id, prediction in zip(test_ids, test_predictions):
        # Ensure that prediction is in list format and write to the file
        row = [id] + list(prediction)
        writer.writerow(row)

In [95]:
%run '/content/task2_eval_script_student_version(1).py' /content/10749545-Task2-method-a-validation.csv '/content/Task-2-validation-dataset(1).csv'

Class level: 
Class  1 precision: 0.9091 recall: 0.0571
Class  2 precision: 0.8571 recall: 0.0972
Class  3 precision: 0.7586 recall: 0.0748
Class  4 precision: 0.0000 recall: 0.0000
Class  5 precision: 0.7409 recall: 0.6644
Class  6 precision: 0.4545 recall: 0.0211
Class  7 precision: 0.8525 recall: 0.1793
Class  8 precision: 0.0000 recall: 0.0000
Class  9 precision: 0.7019 recall: 0.4429
----------------------------
Movie (document) level: 
Precision: 0.4384
Recall: 0.3199


In [81]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

# Initialize a tokenizer for text data with a vocabulary of 80000 most frequent words
tokenizer = Tokenizer(num_words=80000)
# Fit the tokenizer on the training plots to create a word index
tokenizer.fit_on_texts(training_plots)
# Convert the list of training plots into a list of sequences of integers
sequences = tokenizer.texts_to_sequences(training_plots)

import numpy as np
# Convert the category list into a numpy array for use in model training
categories_list_np = np.array(categories_list)

# Calculate the length of each plot to find the average plot
plot_lengths = [len(plot) for plot in training_plots]
average_plot_length = int(sum(plot_lengths) / len(plot_lengths))

# Pad the sequences so that all have the same length for model input
padded_sequences = pad_sequences(sequences, maxlen=average_plot_length)

# Define the model architecture using the Sequential API
model = Sequential()
# Add an Embedding layer to transform indices into dense vectors of a fixed size
model.add(Embedding(input_dim=80000, output_dim=128, input_length=average_plot_length))
# First LSTM layer
model.add(LSTM(64, return_sequences=True))
# Add an LSTM layer with 16 units
model.add(LSTM(64, return_sequences=False))
# Add a Dense output layer with a number of neurons equal to the number of categories, using sigmoid activation
model.add(Dense(categories_list_np.shape[1], activation='sigmoid'))

# Compile the model with binary crossentropy loss and adam optimizer, tracking accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create an EarlyStopping callback to stop training when the validation loss stops improving
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Train the model on the padded sequences, using early stopping and a validation split of 20%
model.fit(padded_sequences, categories_list_np, batch_size=132, epochs=5, callbacks=[early_stopping], validation_split=0.2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f1a2aad8ee0>

In [104]:
sequenced_validation = tokenizer.texts_to_sequences(validation_plots_str)
padded_validation= pad_sequences(sequenced_validation, maxlen=average_plot_length)
predict = model.predict(padded_validation)



In [105]:
def convert_predictions(predictions):
    binary_predictions = []
    for pred in predictions:
        if np.any(pred > 0.6):
            # If any probability is greater than 0.6, convert to 1, else 0
            binary_pred = np.where(pred > 0.6, 1, 0)
        else:
            # If no probability exceeds 0.6, take the index of the max value
            max_index = np.argmax(pred)
            binary_pred = np.zeros_like(pred)
            binary_pred[max_index] = 1
        binary_predictions.append(binary_pred.astype(int))  # Cast to integer
    return np.array(binary_predictions)

validation_predictions_deep = convert_predictions(predict)

In [106]:
import csv

# Path where you want to save the CSV file
output_file_path = '10749545-Task2-method-b-validation.csv'

# Open the file in write mode
with open(output_file_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Iterate over zipped IDs and predictions
    for id, prediction in zip(validation_ids, validation_predictions_deep):
        # Ensure that prediction is in list format and write to the file
        row = [id] + list(prediction)
        writer.writerow(row)

In [108]:
sequenced_test = tokenizer.texts_to_sequences(test_plots_str)
padded_test= pad_sequences(sequenced_test, maxlen=average_plot_length)
predict = model.predict(padded_test)



In [110]:
test_predictions_deep = convert_predictions(predict)

output_file_path = '10749545-Task2-method-b.csv'

# Open the file in write mode
with open(output_file_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Iterate over zipped IDs and predictions
    for id, prediction in zip(test_ids, test_predictions_deep):
        # Ensure that prediction is in list format and write to the file
        row = [id] + list(prediction)
        writer.writerow(row)

In [111]:
%run '/content/task2_eval_script_student_version(1).py' /content/10749545-Task2-method-b-validation.csv '/content/Task-2-validation-dataset(1).csv'

Class level: 
Class  1 precision: 0.0000 recall: 0.0000
Class  2 precision: 0.0000 recall: 0.0000
Class  3 precision: 0.0000 recall: 0.0000
Class  4 precision: 0.0000 recall: 0.0000
Class  5 precision: 0.4911 recall: 0.9966
Class  6 precision: 0.0000 recall: 0.0000
Class  7 precision: 0.0000 recall: 0.0000
Class  8 precision: 0.0000 recall: 0.0000
Class  9 precision: 0.4444 recall: 0.0095
----------------------------
Movie (document) level: 
Precision: 0.4907
Recall: 0.2727
