# Base pre-train and fine-tune models for SubtaskB

## Setup

In [None]:
# Mount Google Drive at /content/drive so trained models can be saved there later.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Clone git repo containing helper scripts

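`pat` is a GitHub personal access token, which is required to clone the private repo. Never paste the token into the notebook itself; keep it in a file on your Drive or enter it when prompted.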

Create a personal access token here https://github.com/settings/tokens 

In [None]:
pat = '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'
# with open('/content/drive/MyDrive/pat.txt', 'r') as f:
    # pat = f.read().rstrip()
!git clone https://{pat}@github.com/agneknie/com4520DarwinProject.git

# install requirements
%cd com4520DarwinProject
!git checkout framework
!pip install -r requirements.txt
import site
site.main()

Cloning into 'com4520DarwinProject'...
remote: Enumerating objects: 7512, done.[K
remote: Counting objects: 100% (347/347), done.[K
remote: Compressing objects: 100% (171/171), done.[K
remote: Total 7512 (delta 179), reused 311 (delta 163), pack-reused 7165[K
Receiving objects: 100% (7512/7512), 89.44 MiB | 12.00 MiB/s, done.
Resolving deltas: 100% (838/838), done.
Updating files: 100% (6914/6914), done.
/content/com4520DarwinProject/com4520DarwinProject
Updating files: 100% (6986/6986), done.
Branch 'framework' set up to track remote branch 'framework' from 'origin'.
Switched to a new branch 'framework'
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import sys
import numpy as np
import random
import torch

from sentence_transformers import SentenceTransformer

sys.path.append( '/content/com4520DarwinProject/src' )
from data.pre_train_dataset import make_pre_train_dataset
from models.pre_train_model import make_pre_train_model
from data.extract_idioms import extract_idioms
from evaluation.evaluate import get_dev_results, format_results, save_eval_output
from models.fine_tune_model import fine_tune_model
from evaluation.evaluate import get_dev_results, format_results, save_eval_output


## Parameters

In [None]:

# Experiment settings shared by every cell below.
seed = 4
tokenize_idioms = True
languages = ['EN', 'PT']

# Paths. base_path is the current working directory, which the setup cell has
# changed to the cloned repository.
base_path = os.getcwd()
subtask_b_dataset_path = os.path.join(
    base_path,
    'data',
    'datasets',
    'SemEval_2022_Task2_SubTaskB',
)
# Shared-drive folder where trained models are stored.
drive_models_path = '/content/drive/Shareddrives/COM4520 Darwin Project - Team Quebec /Models/'

In [None]:
def set_seed(seed: int):
    """
    Seed the random number generators used in this notebook for reproducible runs.

    Adapted from the Hugging Face helper in
    https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_utils.py

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and every
    CUDA device), then switches off cuDNN benchmarking.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    # Same order as before: random -> numpy -> torch (CPU) -> torch (all CUDA devices).
    # torch.cuda.manual_seed_all is safe to call even when CUDA is not available.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Recommended by https://pytorch.org/docs/stable/notes/randomness.html
    torch.backends.cudnn.benchmark = False

    # Stricter determinism is possible but deliberately left off; it would need
    # CUBLAS_WORKSPACE_CONFIG=:16:8 and one of:
    #   torch.use_deterministic_algorithms(True)   # newer PyTorch
    #   torch.set_deterministic(True)              # older PyTorch
    
# Apply the seed chosen in the Parameters cell.
set_seed(seed)

## Pre-train Model

The model is trained in the same way as the baseline

#### Create a dataset containing generic STS data

In [None]:
# Build the generic STS data for pre-training from the repo's `data` folder.
# The helper returns three objects, bound here as the train, dev and test sets.
data_path = os.path.join(base_path, 'data')

train_data, dev_data, test_data = make_pre_train_dataset(data_path, languages=languages)




In [None]:
#INSERT AUGMENTATION REQUIREMENTS
!pip install numpy requests nlpaug
!pip install torch>=1.6.0 transformers>=4.11.3 sentencepiece
!pip install simpletransformers>=0.61.10
!pip install nltk>=3.4.5


# from nlpaug.util.file.download import DownloadUtil
# !DownloadUtil.download_word2vec(dest_dir='.') # Download word2vec model
# !DownloadUtil.download_glove(model_name='glove.6B', dest_dir='.') # Download GloVe model
# !DownloadUtil.download_fasttext(model_name='wiki-news-300d-1M', dest_dir='.') # Download fasttext model

!pip install gensim>=4.1.2

%load_ext autoreload
%autoreload 2
import importlib
import os
os.environ["MODEL_DIR"] = '../model'
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from data.idiom_dataset import load_dataset
from data.util import write_csv
import pandas as pd
# The gold dataset CSV has to be uploaded from Drive into the Colab TrainData folder
# before this cell is run.
train_data_dir = os.path.join(subtask_b_dataset_path, 'TrainData')
gold_data = pd.read_csv(os.path.join(train_data_dir, 'gold_dataset.csv'))
semeval_data = pd.read_csv(os.path.join(train_data_dir, 'train_data.csv'))

# Stack the gold rows on top of the SemEval training rows and save the combined frame.
combined_csv_path = os.path.join(train_data_dir, 'concat_dataframe.csv')
pd.concat([gold_data, semeval_data]).to_csv(combined_csv_path)

In [None]:
from nlpaug.util.file.download import DownloadUtil

# Download the pretrained embedding files into the current working directory.
# If the download fails because of too many requests, comment the call out, download the
# file by hand and upload it manually.
# NOTE(review): the word2vec download is disabled, yet insert_random_w2v / replace_random_w2v
# load 'GoogleNews-vectors-negative300.bin' from the working directory, so that file must be
# provided manually before those augmenters are used.
# DownloadUtil.download_word2vec(dest_dir='.') # Download word2vec model
DownloadUtil.download_glove(model_name='glove.6B', dest_dir='.') # Download GloVe model
DownloadUtil.download_fasttext(model_name='wiki-news-300d-1M', dest_dir='.') # Download fasttext model

In [None]:
import os
os.environ["MODEL_DIR"] = '../model'
import sklearn.datasets
import re

import nlpaug.augmenter.word as naw
import nlpaug.model.word_stats as nmw

def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)

# Corpus for the TF-IDF statistics: the 20 Newsgroups training subset with headers,
# footers and quotes removed.
train_idf_data = sklearn.datasets.fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
train_x = train_idf_data.data

# One token list per document.
train_x_tokens = list(map(_tokenizer, train_x))

# Fit nlpaug's TF-IDF model and save it to the working directory, which is where the
# TF-IDF augmenters (model_path='.') load it from.
tfidf_model = nmw.TfIdf()
tfidf_model.train(train_x_tokens)
tfidf_model.save('.')


In [None]:
# AUGMENTATION FUNCTIONS
# Each one takes the sentences to augment and returns the augmenter's output.

def augment_spelling(sentences):
  """Augment ``sentences`` with nlpaug's ``SpellingAug`` (spelling-mistake augmentation).

  A new augmenter is created on every call; ``n=1`` is passed to ``augment``.
  """
  spelling_augmenter = naw.SpellingAug()
  return spelling_augmenter.augment(sentences, n=1)

# WORD EMBEDDING AUGMENTERS

def insert_random_w2v(sentences):
  """Augment ``sentences`` with nlpaug's ``WordEmbsAug`` using the "insert" action.

  The word2vec model is loaded from ``GoogleNews-vectors-negative300.bin`` in the
  current working directory; a new augmenter is created on every call.
  """
  w2v_inserter = naw.WordEmbsAug(
    model_type='word2vec',
    model_path='GoogleNews-vectors-negative300.bin',
    action="insert",
  )
  return w2v_inserter.augment(sentences)

def replace_random_w2v(sentences):
  """Augment ``sentences`` with nlpaug's ``WordEmbsAug`` using the "substitute" action.

  The word2vec model is loaded from ``GoogleNews-vectors-negative300.bin`` in the
  current working directory; a new augmenter is created on every call.
  """
  w2v_substituter = naw.WordEmbsAug(
    model_type='word2vec',
    model_path='GoogleNews-vectors-negative300.bin',
    action="substitute",
  )
  return w2v_substituter.augment(sentences)

# TF-IDF AUGMENTERS

def insert_word_tfidf(sentences):
  """Augment ``sentences`` with nlpaug's ``TfIdfAug`` using the "insert" action.

  The TF-IDF model is loaded from the current working directory (``model_path='.'``);
  a new augmenter is created on every call.
  """
  tfidf_inserter = naw.TfIdfAug(model_path='.', action="insert")
  return tfidf_inserter.augment(sentences)

def replace_word_tfidf(sentences):
  """Augment ``sentences`` with nlpaug's ``TfIdfAug`` using the "substitute" action.

  The TF-IDF model is loaded from the current working directory (``model_path='.'``);
  a new augmenter is created on every call.
  """
  tfidf_substituter = naw.TfIdfAug(model_path='.', action="substitute")
  return tfidf_substituter.augment(sentences)


In [None]:
# Reload the combined gold + SemEval training data written to concat_dataframe.csv earlier.
# NOTE(review): that file was written with to_csv's default index, so this frame presumably
# carries an extra "Unnamed: 0" column - confirm load_dataset tolerates it.
concat = pd.read_csv(os.path.join(subtask_b_dataset_path, 'TrainData', 'concat_dataframe.csv'))

In [None]:
# Spelling-mistake augmentation. Uncomment the variant you want to run.
# languages=['EN'] is passed to load_dataset (presumably restricting it to English rows - confirm).
# Gold data only:
# aug_spelling = os.path.join(subtask_b_dataset_path, 'TrainData', 'aug_spelling_gold.csv')
# header, data=load_dataset(gold_data, transform=augment_spelling, languages=['EN'])
# write_csv([header]+data, aug_spelling)


# Gold + SemEval data: augment the combined frame and write header + rows to TrainData.
aug_spelling = os.path.join(subtask_b_dataset_path, 'TrainData', 'aug_spelling_gold+sem.csv')
header, data=load_dataset(concat, transform=augment_spelling, languages=['EN'])
write_csv([header]+data, aug_spelling)

Wrote /content/com4520DarwinProject/data/datasets/SemEval_2022_Task2_SubTaskB/TrainData/aug_spelling_gold+sem.csv


In [None]:
# word2vec insertion augmentation. Uncomment the variant you want to run.
# Requires 'GoogleNews-vectors-negative300.bin' in the working directory (see insert_random_w2v).
# Gold data only:
# insert_w2v = os.path.join(subtask_b_dataset_path, 'TrainData', 'insert_w2v_gold.csv')
# header, data=load_dataset(gold_data, transform=insert_random_w2v, languages=['EN'])
# write_csv([header]+data, insert_w2v)

# Gold + SemEval data: augment the combined frame and write header + rows to TrainData.
insert_w2v = os.path.join(subtask_b_dataset_path, 'TrainData', 'insert_w2v_gold+sem.csv')
header, data=load_dataset(concat, transform=insert_random_w2v, languages=['EN'])
write_csv([header]+data, insert_w2v)

Wrote /content/com4520DarwinProject/data/datasets/SemEval_2022_Task2_SubTaskB/TrainData/insert_w2v.csv


In [None]:
# word2vec substitution augmentation. Uncomment the variant you want to run.
# Requires 'GoogleNews-vectors-negative300.bin' in the working directory (see replace_random_w2v).
# Gold data only:
# replace_w2v = os.path.join(subtask_b_dataset_path, 'TrainData', 'replace_w2v_gold.csv')
# header, data=load_dataset(gold_data, transform=replace_random_w2v, languages=['EN'])
# write_csv([header]+data, replace_w2v)

# Gold + SemEval data: augment the combined frame and write header + rows to TrainData.
replace_w2v = os.path.join(subtask_b_dataset_path, 'TrainData', 'replace_w2v_gold+sem.csv')
header, data=load_dataset(concat, transform=replace_random_w2v, languages=['EN'])
# Write to replace_w2v. This cell used to write to insert_w2v, which overwrote the
# insertion output (or raised NameError on a fresh kernel) and never created the
# replace_w2v file.
write_csv([header]+data, replace_w2v)

In [None]:
# TF-IDF insertion augmentation. Uncomment the variant you want to run.
# Uses the TF-IDF model saved to the working directory by the training cell above.
# Gold data only:
# insert_tfidf = os.path.join(subtask_b_dataset_path, 'TrainData', 'insert_tfidf_gold.csv')
# header, data=load_dataset(gold_data, transform=insert_word_tfidf, languages=['EN'])
# write_csv([header]+data, insert_tfidf)

# Gold + SemEval data: augment the combined frame and write header + rows to TrainData.
insert_tfidf = os.path.join(subtask_b_dataset_path, 'TrainData', 'insert_tfidf_gold+sem.csv')
header, data=load_dataset(concat, transform=insert_word_tfidf, languages=['EN'])
write_csv([header]+data, insert_tfidf)

Wrote /content/com4520DarwinProject/data/datasets/SemEval_2022_Task2_SubTaskB/TrainData/insert_tfidf_gold+sem.csv


In [None]:
# TF-IDF substitution augmentation. Uncomment the variant you want to run.
# Uses the TF-IDF model saved to the working directory by the training cell above.
# Gold data only:
# replace_tfidf = os.path.join(subtask_b_dataset_path, 'TrainData', 'replace_tfidf_gold.csv')
# header, data=load_dataset(gold_data, transform=replace_word_tfidf, languages=['EN'])
# write_csv([header]+data, replace_tfidf)

# Gold + SemEval data: augment the combined frame and write header + rows to TrainData.
replace_tfidf = os.path.join(subtask_b_dataset_path, 'TrainData', 'replace_tfidf_gold+sem.csv')
header, data=load_dataset(concat, transform=replace_word_tfidf, languages=['EN'])
write_csv([header]+data, replace_tfidf)

Wrote /content/com4520DarwinProject/com4520DarwinProject/data/datasets/SemEval_2022_Task2_SubTaskB/TrainData/replace_tfidf_gold+sem.csv


In [None]:
def concat_csv(file1, file2, endfile):
  """Concatenate two CSVs from the SubTask B ``TrainData`` folder and save the result there.

  Args:
    file1: File name, inside ``TrainData``, of the CSV whose rows come first.
    file2: File name, inside ``TrainData``, of the CSV whose rows are appended.
    endfile: File name, inside ``TrainData``, that the combined CSV is written to.

  Returns:
    The concatenated ``pandas.DataFrame``. Previously nothing was returned, so callers
    that stored the result only ever received ``None``.
  """
  train_dir = os.path.join(subtask_b_dataset_path, 'TrainData')
  combined = pd.concat([
    pd.read_csv(os.path.join(train_dir, file1)),
    pd.read_csv(os.path.join(train_dir, file2)),
  ])
  # The index is still written as the first column, exactly as before, so the layout of
  # the generated files is unchanged.
  combined.to_csv(os.path.join(train_dir, endfile))
  return combined
# Build the final training CSV: original data followed by its augmented copy.
# Uncomment the combination you want to run. Names without a call are placeholders
# for combinations that have not been written yet.

# Gold data only:
# gold_spelling = concat_csv('gold_dataset.csv', 'aug_spelling_gold.csv', 'gold_spelling.csv' )
# gold_insert_w2v
# gold_replace_w2v
# gold_insert_tfidf =concat_csv('gold_dataset.csv', 'insert_tfidf_gold.csv', 'gold_insert_tfidf.csv' )
# gold_replace_tfidf =concat_csv('gold_dataset.csv', 'replace_tfidf_gold.csv', 'gold_replace_tfidf.csv' )

# Gold + SemEval data:
# gold_and_sem_spelling = concat_csv('concat_dataframe.csv', 'aug_spelling_gold+sem.csv', 'gold+sem_spelling.csv' )
# gold+sem_insert_w2v
# gold+sem_replace_w2v
# gold_and_sem_insert_tfidf = concat_csv('concat_dataframe.csv', 'insert_tfidf_gold+sem.csv', 'gold+sem_insert_tfidf.csv' )
gold_and_sem_replace_tfidf = concat_csv('concat_dataframe.csv', 'replace_tfidf_gold+sem.csv', 'gold+sem_replace_tfidf.csv' )

# SemEval data only (not implemented yet):
# sem_spelling
# sem_insert_w2v
# sem_replace_w2v
# sem_insert_tfidf
# sem_replace_tfidf

#### Train the model on this generic data

In [None]:

num_epochs = 4

# Output folder on Drive: <models>/pre_train/<tokenized|not_tokenized>/epochs_<n>
model_path = os.path.join(
    drive_models_path,
    'pre_train',
    'tokenized' if tokenize_idioms else 'not_tokenized',
    'epochs_{}'.format(num_epochs),
)
tmp_path = os.path.join(base_path, 'tmp')

# Idioms are only extracted when idiom tokenization is switched on.
idioms = extract_idioms(subtask_b_dataset_path, languages=languages) if tokenize_idioms else None

# by default uses the 'bert-base-multilingual-cased' model as a starting point
model = make_pre_train_model(
    train_data,
    dev_data,
    model_path,
    tmp_path,
    tokenize_idioms=idioms,
    num_epochs=num_epochs,
)

In [None]:
# Load the pre-trained model from Google Drive; not necessary if the model has just been trained.
# NOTE(review): the sub-path is hard-coded to the tokenized, 4-epoch model and ignores
# `tokenize_idioms` / `num_epochs` - update it by hand if those settings change.
model_path = drive_models_path + 'pre_train/tokenized/epochs_4'
model = SentenceTransformer(model_path)

#### Evaluate model on dev set

In [None]:

# Evaluate the pre-trained model; results go to dev.results.csv in the repo root
# under the 'pre_train' setting.
dev_eval_path = os.path.join(subtask_b_dataset_path, 'EvaluationData')
results_file = os.path.join(base_path, 'dev.results.csv')

results = get_dev_results(model, dev_eval_path, results_file, ['pre_train'], languages, tokenize_idioms=tokenize_idioms)

# Display the results.
format_results(results)

In [None]:
# Save the pre-trained model's evaluation output to eval.results.csv in the repo root.
# Relies on `dev_eval_path` from the previous cell; `results_file` is re-pointed here.
results_file = os.path.join(base_path, 'eval.results.csv')
save_eval_output(model, dev_eval_path, results_file, ['pre_train'], languages, tokenize_idioms=tokenize_idioms)

## Fine-tune model

The pre-train model created above is fine-tuned on the training data supplied in this https://github.com/H-TayyarMadabushi/SemEval_2022_Task2-idiomaticity repo

Multiple negatives ranking loss and triplet loss are used, following the paper https://aclanthology.org/2022.semeval-1.26/ by the team that placed 1st in the fine-tune setting.
This is different from the baseline which uses cosine similarity loss only.

In [None]:

num_epochs = 4

# Start from the pre-trained model stored on Drive (hard-coded to tokenized / 4 epochs).
model_path = drive_models_path + 'pre_train/tokenized/epochs_4'
# NOTE(review): this output folder is labelled 'dataset_baseline' / 'enhancement_none', but the
# active train_file below is the augmented gold + SemEval data. Confirm the path, otherwise
# this run may overwrite a baseline model saved at the same location.
output_path = os.path.join(drive_models_path, 'fine_tune', 'dataset_baseline', 'enhancement_none', 'tokenized' if tokenize_idioms else 'not_tokenized', 'epochs_' + str(num_epochs), 'seed_' + str(seed))
# Training file: leave exactly one `train_file` line uncommented.
# Baseline (unaugmented SemEval training data):
# train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'train_data.csv')

# Gold data + augmentation:
# train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'gold_spelling.csv')
# train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'gold_insert_tfidf.csv')
# train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'gold_replace_tfidf.csv')

# Gold + SemEval data + augmentation:
# train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'gold+sem_spelling.csv')
# train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'gold+sem_insert_tfidf.csv')
train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'gold+sem_replace_tfidf.csv')


# Fine-tune and keep the resulting model in `model` for the evaluation cells below.
model = fine_tune_model(
    model_path,
    output_path,
    train_file,
    tokenize_idioms=tokenize_idioms,
    languages=languages,
    num_epochs=num_epochs
    )

First positives sample:  The noble college IC ##X ##G ##6 Sera ##mpo ##re , 32 ##W Ski ##ba pale ##ttes au ##dite ##e Dr ##ac ##ut , is his best memorial . 
 The blue Com ##mun ##ion college spots Sera ##mpo ##re , Mord ##echa ##i O ##oo ##ops hundreds of OH ##A , is his best F ##41 ##F .
Num positives samples:  8654
First triplet sample:  Bud ea ##ves ##dro ##pping absence blood friend ( L ##OR ##P ##U Mint ##o ) is strongly impressed 9 ##C ##Z OR ##L ##H ##Z ##RC ##H ##Z AI ##R ##LA ##J ##Z of a free constitution - - not , however , AV ##Q ##NT IQ ##RS than I 5 ##GP ##P ##F ##J ##S ##Q . 
 Wu ##arch ##ive Hi ##nge noble friend ( Kar ##oki Mint ##o ) app ##rox ##imation D ##V ##M impressed with the format ##ing gu ##aran ##tor a UD ##36 constitution - - not , however , more gen ##ral than I am . 
 Dis ##tri ##but ##e my 30 ##am awarded friend ( Lord Mint ##o ) Please strongly impressed with the 291 ##2 of a free Per - - N ##2 ##R ##4 ##E , warm ##ed , im ##pea ##ch tren ##tu than I am

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1898 [00:00<?, ?it/s]

In [None]:

# Evaluate the fine-tuned model; results go to dev.results.csv in the repo root
# under the 'fine_tune' setting.
dev_eval_path = os.path.join(subtask_b_dataset_path, 'EvaluationData')
results_file = os.path.join(base_path, 'dev.results.csv')

results = get_dev_results(model, dev_eval_path, results_file, ['fine_tune'], languages, tokenize_idioms=tokenize_idioms)

# Display the results.
format_results(results)

First dev sample:  Are these inter ##ruption ##s of the good life a necessary condition of the IDhighlifeID ? 
 Are these inter ##ruption ##s of the good life a necessary condition of the expensive lifestyle ?
Num dev samples:  2181


Batches:   0%|          | 0/69 [00:00<?, ?it/s]

Batches:   0%|          | 0/69 [00:00<?, ?it/s]

Wrote /content/com4520DarwinProject/dev.results.csv


Unnamed: 0,Settings,Languages,Spearman Rank ALL,Spearman Rank Idiom Data,Spearman Rank STS Data
0,pre_train,EN,Did Not Attempt,Did Not Attempt,Did Not Attempt
1,pre_train,PT,Did Not Attempt,Did Not Attempt,Did Not Attempt
2,pre_train,"EN,PT",Did Not Attempt,Did Not Attempt,Did Not Attempt
3,fine_tune,EN,0.810566,0.427529,0.659453
4,fine_tune,PT,0.646658,0.590227,0.653128
5,fine_tune,"EN,PT",0.752339,0.542215,0.727002


In [None]:
# Save the fine-tuned model's evaluation output to eval.results.csv in the repo root.
# Relies on `dev_eval_path` from the previous cell; `results_file` is re-pointed here.
results_file = os.path.join(base_path, 'eval.results.csv')
save_eval_output(model, dev_eval_path, results_file, ['fine_tune'], languages, tokenize_idioms=tokenize_idioms)

First eval sample:  The Secretary of State [UNK] s office became aware of the error this week while fu ##lf ##illing a W ##DS ##U public records request seeking the IDmailinglistID , the total number of registered voters over 65 and the total number of program en ##rolle ##es , Ar ##do ##in said . 
 The Secretary of State [UNK] s office became aware of the error this week while fu ##lf ##illing a W ##DS ##U public records request seeking the address list , the total number of registered voters over 65 and the total number of program en ##rolle ##es , Ar ##do ##in said .
Num eval samples:  2262


Batches:   0%|          | 0/71 [00:00<?, ?it/s]

Batches:   0%|          | 0/71 [00:00<?, ?it/s]

Wrote /content/com4520DarwinProject/eval.results.csv


## Fine-tune model with dataset transform example

Fine-tune a model on a transformed dataset: each sentence has its MWE appended after a `[SEP]` marker, so the model receives the MWE as a second input alongside the sentence.

In [None]:

# Start from the pre-trained model stored on Drive (hard-coded to tokenized / 4 epochs).
model_path = drive_models_path + 'pre_train/tokenized/epochs_4'
# This example saves inside the repo folder; the Drive location is kept below for reference.
# output_path = os.path.join(drive_models_path, 'fine_tune', 'dataset_baseline', 'enhancement_add_mwe', 'tokenized' if tokenize_idioms else 'not_tokenized', 'epochs_' + str(num_epochs), 'seed_' + str(seed))
output_path = os.path.join(base_path, 'models', 'add_mwe')
# Unaugmented SemEval training data.
train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'train_data.csv')


def add_MWE(sentences, MWEs):
    """Append each sentence's MWE to it, separated by the literal ``[SEP]`` marker.

    ``sentences`` and ``MWEs`` are paired positionally; pairing stops at the shorter
    of the two. Returns a new list of the combined strings.
    """
    combined = []
    for sentence, mwe in zip(sentences, MWEs):
        combined.append(sentence + '[SEP]' + mwe)
    return combined

# Fine-tune with add_MWE passed as the dataset transform.
# Only one epoch is run here (num_epochs=1 is hard-coded, independent of the global num_epochs).
model = fine_tune_model(
    model_path,
    output_path,
    train_file,
    tokenize_idioms=tokenize_idioms,
    languages=languages,
    num_epochs=1,
    transform=add_MWE
    )

First positives sample:  Despite having the riches to af ##ford the IDhighlifeID , PS ##G captain Mar ##quin ##hos is still in touch with his past life before becoming a multi - million ##aire footballer . [SEP] high life 
 Despite having the riches to af ##ford the expensive lifestyle , PS ##G captain Mar ##quin ##hos is still in touch with his past life before becoming a multi - million ##aire footballer . [SEP] expensive lifestyle
Num positives samples:  3643
First triplet sample:  So Aaron faced the same brutal ra ##cism other Black players of the era experienced , especially as the sl ##ug ##ger approached Ruth [UNK] s IDhomerunID record . [SEP] home run 
 So Aaron faced the same brutal ra ##cism other Black players of the era experienced , especially as the sl ##ug ##ger approached Ruth [UNK] s baseball run record . [SEP] baseball run 
 So Aaron faced the same brutal ra ##cism other Black players of the era experienced , especially as the sl ##ug ##ger approached Ruth [UNK] s hou

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/733 [00:00<?, ?it/s]

In [None]:
# Evaluate the add-MWE model; the same transform is passed so evaluation inputs are built
# the same way as the training inputs. Results go to dev.results.csv under 'fine_tune'.
dev_eval_path = os.path.join(subtask_b_dataset_path, 'EvaluationData')
results_file = os.path.join(base_path, 'dev.results.csv')

results = get_dev_results(model, dev_eval_path, results_file, ['fine_tune'], languages, tokenize_idioms=tokenize_idioms, transform=add_MWE)

# Display the results.
format_results(results)

First dev sample:  Are these inter ##ruption ##s of the good life a necessary condition of the IDhighlifeID ? [SEP] high life 
 Are these inter ##ruption ##s of the good life a necessary condition of the expensive lifestyle ? [SEP]
Num dev samples:  2181


Batches:   0%|          | 0/69 [00:00<?, ?it/s]

Batches:   0%|          | 0/69 [00:00<?, ?it/s]

Wrote /content/com4520DarwinProject/dev.results.csv


Unnamed: 0,Settings,Languages,Spearman Rank ALL,Spearman Rank Idiom Data,Spearman Rank STS Data
0,pre_train,EN,Did Not Attempt,Did Not Attempt,Did Not Attempt
1,pre_train,PT,Did Not Attempt,Did Not Attempt,Did Not Attempt
2,pre_train,"EN,PT",Did Not Attempt,Did Not Attempt,Did Not Attempt
3,fine_tune,EN,0.702638,0.484462,0.76798
4,fine_tune,PT,0.567763,0.532967,0.842109
5,fine_tune,"EN,PT",0.644262,0.509719,0.850289
