# Task 2

In [19]:
import pandas as pd
import numpy as np

# preprocessing
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# model 1
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

## import datasets

In [20]:
dataset='val'

In [21]:
data = pd.read_csv('data/Training-dataset.csv')
if dataset=='val':
    val = pd.read_csv('data/Task-2-validation-dataset.csv')
else:
    val = pd.read_csv('data/Task-2-test-dataset1.csv') # but is actually test but will be too much work

In [22]:
data.head()

Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,8f5203de-b2f8-4c0c-b0c1-835ba92422e9,Si wang ta,"After a recent amount of challenges, Billy Lo ...",0,0,0,0,1,1,0,0,1
1,6416fe15-6f8a-41d4-8a78-3e8f120781c7,Shattered Vengeance,"In the crime-ridden city of Tremont, renowned ...",0,0,0,0,1,1,1,0,1
2,4979fe9a-0518-41cc-b85f-f364c91053ca,L'esorciccio,Lankester Merrin is a veteran Catholic priest ...,0,1,0,0,0,0,0,0,0
3,b672850b-a1d9-44ed-9cff-025ee8b61e6f,Serendipity Through Seasons,"""Serendipity Through Seasons"" is a heartwarmin...",0,0,0,0,0,0,1,0,0
4,b4d8e8cc-a53e-48f8-be6a-6432b928a56d,The Liability,"Young and naive 19-year-old slacker, Adam (Jac...",0,0,1,0,0,0,0,0,0


In [23]:
val.head()

Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,cf32cb00-172d-40f2-a3c1-936e8a0d89d7,Shattered Hearts,"In the enchanting city of Verona, Italy, renow...",0,0,0,0,1,0,1,0,0
1,df7e125e-2d59-40e4-a126-9397e3a0ef21,Point Blank,Walker works with his friend Mal Reese to stea...,0,1,1,0,1,1,0,0,1
2,49bc73f3-9179-41cd-9774-905c7a3ac91b,Le charme discret de la bourgeoisie,The film consists of several thematically link...,1,0,1,0,0,0,0,0,0
3,0ed4822b-87af-44bc-a677-7f7abfdaccf3,A Gentleman's Dignity,A Gentleman's Dignity is about the careers and...,0,0,0,0,0,0,1,0,0
4,0b1b0fa4-43bc-41ba-9598-b3401894b96d,Carmen: A Hip Hopera,"Carmen Brown (Beyoncé) is a seductive, aspirin...",0,0,0,0,1,0,0,0,0


## Tokenisation and Preprocessing

In [24]:
# processing text

def processText(text):
    # case folding
    text = text.lower()

    # Stop words removal
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    # words = text.split() #nus: same here
    words = [lemmatizer.lemmatize(word) for word in words]
    text = ' '.join(words)
    
    return text

In [25]:
# tokenization

def tokenize(docs, maxlen):
    global tokenizer
    # tokenize the text data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(docs)

    # convert text to sequences
    sequences = tokenizer.texts_to_sequences(docs)
    
    # padding sequences to a fixed length
    padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')

    return padded_sequences

In [26]:
# vocab info
def printVocabInfo():
    global vocab_size
    vocab_size = len(tokenizer.word_index) + 1
    # print(f'Vocabulary size: {vocab_size}')
    
    # word_index = tokenizer.word_index
    # print(word_index)

In [27]:
def encodeLabels(df):
    selected_columns = df.iloc[:, 3:]
    labels = selected_columns.values
    return labels

In [28]:
def preprocessing(df, maxlen):
    documents = df['plot_synopsis'].apply(processText)
    padded_sequences = tokenize(documents, maxlen)
    labels= encodeLabels(df)
    
    return padded_sequences, labels

## Classification Model 1: LSTM

### Model Creation

In [29]:
# split data
def splitData(trainDF, valDF, maxlen):
    global X_train, y_train, X_test
    
    X_train, y_train = preprocessing(trainDF, maxlen)
    printVocabInfo()
    
    documents = valDF['plot_synopsis'].apply(processText)
    X_test = tokenize(documents, 40)

In [30]:
# create model

def createModel(hp):
    model = Sequential()

    # add an embedding layer
    model.add(Embedding(input_dim=vocab_size, output_dim=hp['embedding_dim'], input_length=hp['maxlen']))

    # add lstm layers
    model.add(LSTM(units=hp['units'], activation='tanh', return_sequences=True, dropout=hp['dropout']))
    model.add(LSTM(units=hp['units'], activation='tanh', return_sequences=True, dropout=hp['dropout']))
    model.add(LSTM(units=hp['units'], activation='tanh'))

    # add an output layer
    model.add(Dense(units=9, activation='sigmoid'))

    # compile the model
    optimizer = Adam(learning_rate=hp['learning_rate'], clipvalue=hp['clipvalue'])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [31]:
# train model
def trainModel(model, hp):
    model.fit(X_train, y_train, epochs=hp['epochs'], batch_size=hp['batchsize'])

In [32]:
def training(hp):
    model = createModel(hp)
    trainModel(model, hp)
    
    return model

### Hyper-parameter-tuning

In [33]:
hp = {
    'maxlen': 40,
    'embedding_dim':256,
    'units': 64,
    'epochs': 30,
    'batchsize': 80,
    'dropout': 0.3,
    'learning_rate': 0.01,
    'clipvalue': 0.5
}

In [34]:
splitData(data, val, hp['maxlen'])

In [36]:
model = training(hp)

In [None]:
predicted_probabilities = model.predict(X_test)
threshold = 0.49
predicted_labels = (predicted_probabilities > threshold).astype(int)



## Results

In [None]:
results1 = val[['ID']].copy()

label_columns = ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']
labels_df = pd.DataFrame(predicted_labels, columns=label_columns)

results1 = pd.concat([results1, labels_df], axis=1)

In [None]:
results1.head()

Unnamed: 0,ID,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,cf32cb00-172d-40f2-a3c1-936e8a0d89d7,0,1,1,0,1,0,0,0,1
1,df7e125e-2d59-40e4-a126-9397e3a0ef21,0,0,0,0,0,0,1,0,0
2,49bc73f3-9179-41cd-9774-905c7a3ac91b,0,0,1,1,1,0,0,0,0
3,0ed4822b-87af-44bc-a677-7f7abfdaccf3,1,1,1,0,1,0,0,0,0
4,0b1b0fa4-43bc-41ba-9598-b3401894b96d,0,0,0,0,0,0,0,1,0


In [None]:
if dataset=='val':
    results1.to_csv('data/10697685-Task2-method-b-validation.csv', header=False, index=False)
else:
    results1.to_csv('data/10697685-Task2-method-b.csv', index=False)