# **Task-1**

## Task-1-1 Data Preprocessing

In [1]:
import numpy as np
import torch.nn.functional as F  # normalisation for the cosine similarity "https://pytorch.org/docs/stable/nn.functional.html"
import torch, os, random, csv, multiprocessing, time

import nltk  # Natural Language Toolkit
nltk.download('punkt')  # google colab warning "https://www.nltk.org/api/nltk.tokenize.punkt.html", which was helpful for the word2vec tokenization process

from torch.utils.data import Dataset  # try the software engineering style for dataset not pandas flow "https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset"
from nltk import word_tokenize  # nltk tokenization tool "https://www.nltk.org/api/nltk.tokenize.html"
from gensim.models import word2vec, Word2Vec  # word2vec model and api "https://radimrehurek.com/gensim/models/word2vec.html"; similarity api "https://rare-technologies.com/word2vec-tutorial/"
from transformers import BertTokenizer, BertModel  # hugging face:) for BERT tokenizer and model "https://huggingface.co/bert-base-cased"


# parent class
class BaseDataset(Dataset):
    """Dataset of tokenised plot synopses parsed from a CSV file.

    The first CSV row is skipped as a header. Every following non-empty row is
    read as ``ID, title, plot_synopsis`` followed by any number of integer
    label columns. Each parsed sample is stored in ``self.data`` as a dict with
    the keys ``'ID'``, ``'title'``, ``'sentences_text'`` (list of tokens) and
    ``'label'`` (numpy array of ints).

    Args:
        csv_file_path: Path of the UTF-8 encoded CSV file to parse.
        max_length: Maximum sample length; only stored on the instance here.
        return_text: Flag stored on the instance; not used by this class itself.
        cache_path: Optional path of a cache file. The CSV is parsed only when
            this is ``None`` or the file does not exist.
    """

    # Width of the placeholder embedding returned by ``preprocess_text``.
    embedding_dim = 384

    def __init__(self, csv_file_path, max_length=512, return_text=True, cache_path=None):
        self.max_length = max_length  # the maximum length of samples
        self.return_text = return_text  # whether return ['sentences_text']
        self.cache_path = cache_path  # .pth :preprocessed data or weights (or actually any python object)

        # NOTE(review): when the cache file already exists, nothing in this
        # class assigns ``self.data``; presumably a subclass loads the cache
        # after calling this constructor -- confirm, otherwise ``len()`` fails.
        if cache_path is None or not os.path.exists(cache_path):
            self.data = self._load_csv(csv_file_path)

    def _load_csv(self, csv_file_path):
        """Parse the raw CSV into a list of sample dicts (header row skipped)."""
        samples = []
        # newline='' lets the csv module itself handle line breaks, including
        # those embedded in quoted synopsis fields.
        with open(csv_file_path, 'r', encoding='utf-8', newline='') as f:
            csv_reader = csv.reader(f)
            next(csv_reader, None)  # skip the header row (no-op on an empty file)
            for row in csv_reader:
                if not row:
                    # blank lines are parsed as [] and cannot be unpacked
                    continue
                id_, title, plot_synopsis = row[:3]
                samples.append({
                    'ID': id_,
                    'title': title,
                    'sentences_text': self.tokenize_fn(plot_synopsis),
                    'label': np.array([int(i) for i in row[3:]]),  # labels
                })
        return samples

    def preprocess_text(self, text):
        """Return a placeholder embedding: one all-zero row per element of ``text``."""
        return np.zeros((len(text), self.embedding_dim))

    def tokenize_fn(self, text):
        """Lower-case and tokenise ``text`` with NLTK, then tag numbers and times.

        Periods are padded with spaces first so that ``word.word`` is split.
        Tokens containing a comma (other than ``','`` itself) and all-digit
        tokens become ``'#NUMBER#'``; tokens containing a colon (other than
        ``':'`` itself) become ``'#TIME'``.

        Returns:
            The list of (possibly replaced) tokens.
        """
        text = text.replace('.', ' . ').lower()
        tagged_tokens = []
        for token in word_tokenize(text):
            if ',' in token and token != ',':
                # plain "word," was already split by NLTK, so this is e.g. "1,000"
                token = '#NUMBER#'
            elif token.isdigit():
                token = '#NUMBER#'
            elif ':' in token and token != ':':
                # NOTE(review): '#TIME' lacks the trailing '#' that '#NUMBER#'
                # has; left unchanged so the produced tokens stay the same.
                token = '#TIME'
            tagged_tokens.append(token)
        return tagged_tokens

    def __len__(self):
        """Number of parsed samples."""
        return len(self.data)


# child class
class TrainDataset(BaseDataset):
    """Training dataset that yields ids, tokens, optional embeddings and labels."""

    def __init__(self, *args, **kwargs):
        # Forward keyword arguments as well, so options such as
        # ``cache_path=...`` can be passed by name and not only by position.
        super().__init__(*args, **kwargs)

    def get_all_ids(self):
        """Return the ``'ID'`` of every sample, in dataset order."""
        return [sample['ID'] for sample in self.data]

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``.

        Returns:
            ``(id, tokens, label)`` when the sample has no ``'sentences_emb'``
            entry; otherwise ``(id, tokens, emb, label)`` if ``return_text`` is
            set, else ``(emb, label)``. ``label`` is a ``torch`` tensor and
            ``emb`` is the result of ``self.pad_sequence``.
        """
        sample = self.data[idx]
        id_ = sample['ID']
        text = sample['sentences_text']
        label = torch.tensor(sample['label'])

        if 'sentences_emb' not in sample:
            return id_, text, label

        # NOTE(review): ``pad_sequence`` is not defined on this class or on the
        # visible ``BaseDataset``; presumably a subclass provides it -- confirm.
        emb = self.pad_sequence(sample['sentences_emb'])
        if self.return_text:
            return id_, text, emb, label
        return emb, label


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Task-1-2 The Word2Vec Embedding

In [2]:
# word2vec
def build_word2vec_model(train_file_name, sentences_path='sentences.txt', model_path='word2vec.model'):
    """Train a CBOW Word2Vec model on the tokenised training set and save it.

    The tokens of every training sample are written to ``sentences_path`` (one
    sample per line, tokens separated by spaces), a 384-dimensional model is
    trained on that file and saved to ``model_path``. The saved model is then
    loaded back and its vocabulary size is printed.

    Args:
        train_file_name: Path of the training CSV handed to ``TrainDataset``.
        sentences_path: Where the intermediate corpus file is written.
        model_path: Where the trained model is saved.

    Returns:
        The model as re-loaded from ``model_path``.
    """
    # load and write sentences to a file
    dataset = TrainDataset(train_file_name)
    # Write explicitly as UTF-8: the CSV is read as UTF-8, so the platform
    # default encoding could fail on, or corrupt, non-ASCII tokens.
    with open(sentences_path, 'w', encoding='utf-8') as f:
        for i in range(len(dataset)):
            f.write(' '.join(dataset[i][1]) + '\n')  # 0:id_, 1:text, 2:label

    # build and save model
    all_sentences = list(word2vec.LineSentence(sentences_path))
    model = Word2Vec(
        all_sentences,
        vector_size=384,
        min_count=5,
        window=5,
        sg=0,  # CBOW: context->target
        workers=multiprocessing.cpu_count(),
    )
    model.save(model_path)

    # check model: make sure the saved file can be loaded back
    model = word2vec.Word2Vec.load(model_path)
    print(len(model.wv.key_to_index))  # how many unique words in the corpus(vocabulary)
    return model


# Build the word embeddings and report how long the whole build took.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
build_word2vec_model('./data/Training-dataset.csv')
t1 = time.perf_counter()
print('Time for generating Word2Vector Embeddings: %.2f ms' % (1000 * (t1 - t0)))


39828
Time for generating Word2Vector Embeddings: 75991.77 ms


## Task-1-3 Run and Save Results of Word2Vec and BERT

In [3]:
def read_data(csv_file_path):
    """Load the word-pair rows of an evaluation CSV file.

    Every non-empty row must provide at least three columns: the pair id, the
    first word and the second word. Any further columns (for example a gold
    label) are ignored. No header row is skipped; blank lines are.

    Args:
        csv_file_path: Path of the UTF-8 encoded CSV file.

    Returns:
        A list of ``(id, word1, word2)`` tuples in file order.

    Raises:
        ValueError: If a non-empty row has fewer than three columns.
    """
    pairs = []
    # newline='' is what the csv module expects so it can handle line breaks itself
    with open(csv_file_path, 'r', encoding='utf-8', newline='') as f:
        csv_reader = csv.reader(f)
        for row in csv_reader:
            if not row:
                # blank lines are parsed as [] and carry no pair
                continue
            if len(row) < 3:
                raise ValueError(
                    'line %d of %s: expected at least 3 columns, got %d'
                    % (csv_reader.line_num, csv_file_path, len(row))
                )
            id_, word1, word2 = row[:3]  # extra columns (e.g. labels) are ignored
            pairs.append((id_, word1, word2))
    return pairs

def write_data(results, save_file_path):
    """Save predictions as CSV, one ``id,prediction`` row per result.

    Args:
        results: Iterable of ``(id, prediction)`` pairs.
        save_file_path: Destination file; it is overwritten if it exists.
    """
    # csv.writer quotes ids that contain a comma or quote, so the file can be
    # read back unambiguously. lineterminator='\n' keeps the one-row-per-'\n'
    # layout instead of the writer's default '\r\n'.
    with open(save_file_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        for id_, y_pred in results:
            # str() keeps the prediction formatted exactly as before
            writer.writerow([id_, str(y_pred)])

# run evaluation for word2vec model
def get_similarity_word2vec(csv_file_path, save_file_path, oov_similarity=0.5):
    """Score every word pair with the saved Word2Vec model and write the results.

    Args:
        csv_file_path: CSV of ``id, word1, word2`` rows, read with ``read_data``.
        save_file_path: Where the ``id,similarity`` rows are written.
        oov_similarity: Score used when a word of the pair is not in the
            model vocabulary.
    """
    # load word2vector model
    word2vec_model = word2vec.Word2Vec.load('word2vec.model')
    vectors = word2vec_model.wv

    def vocab_key(word):
        # The training text is lower-cased before tokenisation, so a test word
        # that is missing as written is retried in lower case.
        return word if word in vectors.key_to_index else word.lower()

    data = read_data(csv_file_path)
    results = []

    t0 = time.perf_counter()  # monotonic clock intended for measuring durations
    for id_, word1, word2 in data:
        try:
            sim = vectors.similarity(vocab_key(word1), vocab_key(word2))
        except KeyError:
            # Only a missing vocabulary entry is expected here; any other
            # error should surface instead of being turned into a score.
            sim = oov_similarity
        results.append((id_, sim))
    t1 = time.perf_counter()
    print('Test time of Word2Vec model: %.2f ms'%(1000*(t1-t0)))

    write_data(results, save_file_path)  # results

# run evaluation for BERT model
def get_similarity_bert(csv_file_path, save_file_path):
    """Score every word pair with ``bert-base-cased`` and write the results.

    Each word is encoded on its own as ``[CLS] word-pieces [SEP]``. The hidden
    states of its word pieces are averaged and L2-normalised, and the score of
    a pair is the dot product (cosine similarity) of the two vectors.

    Args:
        csv_file_path: CSV of ``id, word1, word2`` rows, read with ``read_data``.
        save_file_path: Where the ``id,similarity`` rows are written.
    """
    # load bert embeddings
    model_name = 'bert-base-cased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    def get_embedding(sentence):
        """Return the unit-length mean word-piece embedding of ``sentence``."""
        # a word may be split into several word pieces (pretrained=pre+train+ed)
        tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']
        # vocabulary indices (not embedding weights), as a batch of one sequence
        input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

        # Inference only: skip building the autograd graph for every word.
        with torch.no_grad():
            hidden_states = model(input_ids)[0]

        word_pieces = hidden_states[:, 1:-1, :]  # drop the [cls] and [sep]
        # NOTE(review): if the tokenizer yields no word pieces (e.g. an empty
        # string) this is the mean of an empty slice -- confirm inputs are non-empty.
        embedding = word_pieces.mean(1).squeeze()

        return F.normalize(embedding, p=2, dim=0)  # unit length, so dot product == cosine

    data = read_data(csv_file_path)
    results = []

    t0 = time.perf_counter()  # monotonic clock intended for measuring durations
    for id_, word1, word2 in data:
        emb1 = get_embedding(word1)
        emb2 = get_embedding(word2)
        sim = float((emb1 * emb2).sum())  # compute the cosine similarity
        results.append((id_, sim))
    t1 = time.perf_counter()
    print('Test time of BERT model: %.2f ms'%(1000*(t1-t0)))

    write_data(results, save_file_path)  # results


###### [Warning]: Validation & Test ######

if __name__ == '__main__':
    # Score the same test pairs with both methods, in order; each run writes
    # its own prediction file.
    test_pairs_csv = 'data/Task-1-test-dataset1.csv'
    runs = (
        (get_similarity_word2vec, 'data/10879229-Task1-method-b.csv'),
        (get_similarity_bert, 'data/10879229-Task1-method-c.csv'),
    )
    for score_pairs, predictions_csv in runs:
        score_pairs(test_pairs_csv, predictions_csv)


Test time of Word2Vec model: 1.85 ms


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Test time of BERT model: 5791.48 ms
