# The validation data for method b (LSTM)

In [6]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

nltk.download('punkt')
nltk.download('stopwords')

# Function to preprocess text data
# Lazily filled holder for the stop-word set and the stemmer, so they are
# built once instead of once per synopsis.
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _CLEAN_TEXT_CACHE:
        # Build both before storing either, so a failure (e.g. the stopwords
        # corpus is not downloaded yet) never leaves a half-filled cache.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = SnowballStemmer('english')
        _CLEAN_TEXT_CACHE.update(stop_words=stop_words, stemmer=stemmer)
    return _CLEAN_TEXT_CACHE['stop_words'], _CLEAN_TEXT_CACHE['stemmer']


def clean_text(input_text):
    """Normalise one plot synopsis into a space-separated string of stems.

    Steps, in order: tokenise with ``word_tokenize``, lowercase every token,
    keep only purely alphabetic tokens, drop English stop words, stem the
    survivors with the English Snowball stemmer, and join them with single
    spaces.

    Args:
        input_text: The raw synopsis. Anything that is not a ``str`` (for
            example ``None``, or the float NaN that pandas yields for an
            empty CSV cell) is treated as a missing synopsis.

    Returns:
        The cleaned text; ``''`` when the input is missing or when no token
        survives the filters.
    """
    if not isinstance(input_text, str):
        # word_tokenize would raise on a non-string; an empty document keeps
        # the row in the dataset so every ID still gets a prediction.
        return ''

    stop_words, stemmer = _clean_text_resources()

    # Lowercase first: both the alphabetic test and the stop-word lookup are
    # applied to the lowercased form.
    lowered = (token.lower() for token in word_tokenize(input_text))
    return ' '.join(
        stemmer.stem(word)
        for word in lowered
        if word.isalpha() and word not in stop_words
    )


# ---------------------------------------------------------------------------
# Settings for the LSTM pipeline. Each value is defined once because several
# steps below must agree on it.
# ---------------------------------------------------------------------------
VOCAB_SIZE = 10000           # passed to both Tokenizer(num_words=) and Embedding()
MAX_SEQUENCE_LENGTH = 200    # passed to Embedding(input_length=) and every pad_sequences()
EMBEDDING_DIM = 128
LSTM_UNITS = 64
EPOCHS = 5
BATCH_SIZE = 32
DECISION_THRESHOLD = 0.29    # best value for the threshold
# Column names given to the prediction frame. They are not written to the
# output file (header=False); presumably they mirror training columns 3..11,
# in order -- TODO confirm against the training CSV.
LABEL_COLUMNS = ['comedy', 'cult', 'flashback', 'historical', 'murder',
                 'revenge', 'romantic', 'scifi', 'violence']

# --- Training data -> padded integer sequences + label matrix ---------------
training_data = pd.read_csv('./data/Training-dataset.csv')
training_data['cleaned_plot'] = training_data['plot_synopsis'].apply(clean_text)
text_tokenizer = Tokenizer(num_words=VOCAB_SIZE)
text_tokenizer.fit_on_texts(training_data['cleaned_plot'])
train_sequences = text_tokenizer.texts_to_sequences(training_data['cleaned_plot'])
train_data_padded = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
# Labels are selected by position (columns 3 to 11 inclusive), not by name.
target_labels = training_data.iloc[:, 3:12].values

# --- Model: embedding -> LSTM -> one sigmoid output per label ---------------
lstm_model = Sequential()
lstm_model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
lstm_model.add(LSTM(LSTM_UNITS))
# Output width follows the label matrix, so it tracks the iloc slice above.
lstm_model.add(Dense(target_labels.shape[1], activation='sigmoid'))
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(train_data_padded, target_labels, epochs=EPOCHS, batch_size=BATCH_SIZE)

# Process the dataset
# Swap the two assignments below to score the validation set instead.
dataset_path ='./data/Task-2-test-dataset1.csv'
#dataset_path = './data/Task-2-validation-dataset.csv'
data = pd.read_csv(dataset_path)
data['cleaned_plot'] = data['plot_synopsis'].apply(clean_text)
# Reuse the tokenizer fitted on the training data; it is not refitted here.
sequences = text_tokenizer.texts_to_sequences(data['cleaned_plot'])
data_padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

predicted_probs = lstm_model.predict(data_padded)
# A label is predicted whenever its sigmoid output exceeds the threshold.
predicted_labels = (predicted_probs > DECISION_THRESHOLD).astype(int)
prediction_df = pd.DataFrame(predicted_labels, columns=LABEL_COLUMNS)
# Both frames carry the default 0..n-1 index, so the IDs line up row by row.
prediction_df.insert(0, 'ID', data['ID'])
prediction_df.to_csv('10879201-Task2-method-b.csv', index=False, header=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# The validation data for method a (SVM)

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

nltk.download('punkt')
nltk.download('stopwords')

# Function to preprocess text data
# Lazily filled holder for the stop-word set and the stemmer, so they are
# built once instead of once per synopsis.
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _CLEAN_TEXT_CACHE:
        # Imported here because this cell's import list does not bring in
        # SnowballStemmer; without this the cell raises NameError when it is
        # run on its own in a fresh kernel.
        from nltk.stem.snowball import SnowballStemmer

        # Build both before storing either, so a failure (e.g. the stopwords
        # corpus is not downloaded yet) never leaves a half-filled cache.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = SnowballStemmer('english')
        _CLEAN_TEXT_CACHE.update(stop_words=stop_words, stemmer=stemmer)
    return _CLEAN_TEXT_CACHE['stop_words'], _CLEAN_TEXT_CACHE['stemmer']


def clean_text(input_text):
    """Normalise one plot synopsis into a space-separated string of stems.

    Steps, in order: tokenise with ``word_tokenize``, lowercase every token,
    keep only purely alphabetic tokens, drop English stop words, stem the
    survivors with the English Snowball stemmer, and join them with single
    spaces.

    Args:
        input_text: The raw synopsis. Anything that is not a ``str`` (for
            example ``None``, or the float NaN that pandas yields for an
            empty CSV cell) is treated as a missing synopsis.

    Returns:
        The cleaned text; ``''`` when the input is missing or when no token
        survives the filters.
    """
    if not isinstance(input_text, str):
        # word_tokenize would raise on a non-string; an empty document keeps
        # the row in the dataset so every ID still gets a prediction.
        return ''

    stop_words, stemmer = _clean_text_resources()

    # Lowercase first: both the alphabetic test and the stop-word lookup are
    # applied to the lowercased form.
    lowered = (token.lower() for token in word_tokenize(input_text))
    return ' '.join(
        stemmer.stem(word)
        for word in lowered
        if word.isalpha() and word not in stop_words
    )

# --- Training data -> TF-IDF matrix + label matrix --------------------------
training_data = pd.read_csv('./data/Training-dataset.csv')
cleaned_training_plots = training_data['plot_synopsis'].apply(clean_text)
training_data['cleaned_plot'] = cleaned_training_plots

# Unigrams and bigrams, capped at 10000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(training_data['cleaned_plot'])

# Labels are selected by position (columns 3 to 11 inclusive), not by name.
label_block = training_data.iloc[:, 3:12]
target_labels = label_block.values

# --- Model: one linear SVM per label ----------------------------------------
# NOTE(review): only .predict() is called on svm_model in this cell, so
# probability=True looks unused here -- confirm nothing else relies on
# predict_proba before dropping it.
base_svm = SVC(kernel='linear', probability=True)
svm_model = OneVsRestClassifier(base_svm)
svm_model.fit(X_train_tfidf, target_labels)

# --- Score the chosen dataset -----------------------------------------------
#dataset_path = './data/Task-2-validation-dataset.csv'
dataset_path = './data/Task-2-test-dataset1.csv'
data = pd.read_csv(dataset_path)
data['cleaned_plot'] = data['plot_synopsis'].apply(clean_text)
# Reuse the vectorizer fitted on the training data; it is not refitted here.
X_tfidf = tfidf_vectorizer.transform(data['cleaned_plot'])

predicted_labels = svm_model.predict(X_tfidf)

# --- Write the submission file: ID followed by one column per label ---------
genre_names = ['comedy', 'cult', 'flashback', 'historical', 'murder',
               'revenge', 'romantic', 'scifi', 'violence']
prediction_df = pd.DataFrame(predicted_labels, columns=genre_names)
prediction_df.insert(0, 'ID', data['ID'])
prediction_df.to_csv('10879201-Task2-method-a.csv', index=False, header=False)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
