# Base pre-train and fine-tune models for SubtaskB

## Setup

In [None]:
# Mount Google Drive at /content/drive so that models can be saved to it later.
# This cell assumes a Google Colab runtime (it imports `google.colab`).
from google.colab import drive
drive.mount('/content/drive')

Clone git repo containing helper scripts

`pat` is a personal access token, which is needed to clone the private repo.

Create a personal access token at https://github.com/settings/tokens

In [1]:
pat = ''
# with open('/content/drive/MyDrive/pat.txt', 'r') as f:
    # pat = f.read().rstrip()
!git clone https://{pat}@github.com/agneknie/com4520DarwinProject.git

# install requirements
%cd com4520DarwinProject
!pip install -r requirements.txt
import site
site.main()

Cloning into 'darwin-test'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 15 (delta 0), reused 15 (delta 0), pack-reused 0[K
Unpacking objects: 100% (15/15), done.
/content/darwin-test


In [1]:
import os
import sys
import numpy as np
import random
import torch

from sentence_transformers import SentenceTransformer

sys.path.append( '/content/com4520DarwinProject/src' )
from data.pre_train_dataset import make_pre_train_dataset
from models.pre_train_model import make_pre_train_model
from data.extract_idioms import extract_idioms
from evaluation.evaluate import get_dev_results, format_results, save_eval_output
from models.fine_tune_model import fine_tune_model
from evaluation.evaluate import get_dev_results, format_results, save_eval_output


ModuleNotFoundError: No module named 'transformers'

## Parameters

In [None]:

# Paths. Everything local is resolved against the current working directory.
base_path = os.getcwd()
subtask_b_dataset_path = os.path.join(
    base_path, 'data', 'datasets', 'SemEval_2022_Task2_SubTaskB'
)
# NOTE(review): the space before '/Models/' is reproduced exactly as written;
# presumably it matches the shared-drive folder name -- confirm.
drive_models_path = '/content/drive/Shareddrives/COM4520 Darwin Project - Team Quebec /Models/'

# Experiment settings shared by the cells below.
languages = ['EN', 'PT']
tokenize_idioms = True
seed = 4

In [None]:
def set_seed(seed: int):
    """
    Modified from : https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_utils.py
    Helper function for reproducible behavior: sets the seed in ``random``, ``numpy`` and ``torch``
    (CPU and all CUDA devices) and asks cuDNN for deterministic, non-benchmarked kernels.
    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # ^^ safe to call this function even if cuda is not available

    ## From https://pytorch.org/docs/stable/notes/randomness.html
    # Disable the cuDNN auto-tuner, which may pick a different algorithm per run ...
    torch.backends.cudnn.benchmark = False
    # ... and restrict cuDNN to deterministic algorithms.
    torch.backends.cudnn.deterministic = True

    ## Might want to use the following, but set CUBLAS_WORKSPACE_CONFIG=:16:8
    # try : 
    #   torch.use_deterministic_algorithms(True)
    # except AttributeError: 
    #   torch.set_deterministic( True )
    
# Apply the `seed` chosen in the Parameters section to all RNGs.
set_seed(seed)

## Pre-train Model

The model is trained in the same way as the baseline

#### Create a dataset containing generic STS data

In [3]:
# Build the pre-training dataset from <base_path>/data for the configured
# languages; the helper's result is unpacked into train / dev / test.
data_path = os.path.join(base_path, 'data')

train_data, dev_data, test_data = make_pre_train_dataset(
    data_path,
    languages=languages,
)

Downloading builder script:   0%|          | 0.00/4.24k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

Downloading and preparing dataset assin2/default to /root/.cache/huggingface/datasets/assin2/default/1.0.0/8467f7acbda82f62ab960ca869dc1e96350e0e103a1ef7eaa43bbee530b80061...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/207k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/83.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2448 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset assin2 downloaded and prepared to /root/.cache/huggingface/datasets/assin2/default/1.0.0/8467f7acbda82f62ab960ca869dc1e96350e0e103a1ef7eaa43bbee530b80061. Subsequent calls will reuse this data.




#### Train the model on this generic data

In [4]:

num_epochs = 4

# Output location on Drive: <models>/pre_train/<tokenized|not_tokenized>/epochs_<n>
model_path = os.path.join(
    drive_models_path,
    'pre_train',
    'tokenized' if tokenize_idioms else 'not_tokenized',
    f'epochs_{num_epochs}',
)
tmp_path = os.path.join(base_path, 'tmp')

# Idioms are extracted only when idiom tokenization is enabled; otherwise None is passed on.
idioms = extract_idioms(subtask_b_dataset_path, languages=languages) if tokenize_idioms else None

# by default uses the 'bert-base-multilingual-cased' model as a starting point
model = make_pre_train_model(
    train_data,
    dev_data,
    model_path,
    tmp_path,
    tokenize_idioms=idioms,
    num_epochs=num_epochs,
)

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Some weights of the model checkpoint at /content/darwin-test/tmp were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at /content/darwin-test/tmp and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.we

Warmup-steps: 1226


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3063 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3063 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3063 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3063 [00:00<?, ?it/s]

In [None]:
# Load model from google drive, not necessary if the model has just been trained
# The sub-folder follows `tokenize_idioms`, the same way the pre-train cell names
# its output, so the checkpoint loaded here matches the flag that is later passed
# to the evaluation helpers. 'epochs_4' is kept as a literal so this cell still
# works when the training cell (which defines `num_epochs`) has been skipped.
model_path = os.path.join(
    drive_models_path,
    'pre_train',
    'tokenized' if tokenize_idioms else 'not_tokenized',
    'epochs_4',
)
model = SentenceTransformer(model_path)

#### Evaluate model on dev set

In [None]:

# Run get_dev_results with the current `model` under the 'pre_train' label,
# writing to dev.results.csv, then display the formatted results.
dev_eval_path = os.path.join(subtask_b_dataset_path, 'EvaluationData')
results_file = os.path.join(base_path, 'dev.results.csv')

results = get_dev_results(
    model,
    dev_eval_path,
    results_file,
    ['pre_train'],
    languages,
    tokenize_idioms=tokenize_idioms,
)

format_results(results)

In [None]:
# Write the 'pre_train' evaluation output to eval.results.csv.
# Relies on `dev_eval_path` set by the previous cell.
results_file = os.path.join(base_path, 'eval.results.csv')
save_eval_output(
    model,
    dev_eval_path,
    results_file,
    ['pre_train'],
    languages,
    tokenize_idioms=tokenize_idioms,
)

## Fine-tune model

The pre-trained model created above is fine-tuned on the training data supplied in the https://github.com/H-TayyarMadabushi/SemEval_2022_Task2-idiomaticity repo.

Multiple negatives ranking loss and triplet loss are used, as in this paper https://aclanthology.org/2022.semeval-1.26/ by the team that placed 1st in the fine-tune setting.
This differs from the baseline, which uses only cosine similarity loss.

In [None]:

# Number of fine-tuning epochs (also used in the output folder name).
num_epochs = 4

# Start from the pre-trained checkpoint on Drive. The sub-folder follows
# `tokenize_idioms`, the same way the pre-train cell names its output, so the
# checkpoint matches the flag passed to fine_tune_model below. 'epochs_4' is the
# pre-training epoch count of that checkpoint, not this cell's `num_epochs`.
model_path = os.path.join(
    drive_models_path,
    'pre_train',
    'tokenized' if tokenize_idioms else 'not_tokenized',
    'epochs_4',
)
output_path = os.path.join(drive_models_path, 'dataset_baseline', 'enhancement_none', 'epochs_' + str(num_epochs), 'seed_' + str(seed))
train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'train_data.csv')

model = fine_tune_model(
    model_path,
    output_path,
    train_file,
    tokenize_idioms=tokenize_idioms,
    languages=languages,
    num_epochs=num_epochs
    )

In [None]:

# Run get_dev_results with the fine-tuned `model` under the 'fine_tune' label,
# writing to dev.results.csv, then display the formatted results.
dev_eval_path = os.path.join(subtask_b_dataset_path, 'EvaluationData')
results_file = os.path.join(base_path, 'dev.results.csv')

results = get_dev_results(
    model,
    dev_eval_path,
    results_file,
    ['fine_tune'],
    languages,
    tokenize_idioms=tokenize_idioms,
)

format_results(results)

In [None]:
# Write the 'fine_tune' evaluation output to eval.results.csv.
# Relies on `dev_eval_path` set by the previous cell.
results_file = os.path.join(base_path, 'eval.results.csv')
save_eval_output(
    model,
    dev_eval_path,
    results_file,
    ['fine_tune'],
    languages,
    tokenize_idioms=tokenize_idioms,
)

## Fine-tune model with dataset transform example

Fine-tune a model that is given the MWE as a second input: each sentence has `[SEP]` and its MWE appended to it.

In [None]:

# Start from the pre-trained checkpoint on Drive. The sub-folder follows
# `tokenize_idioms`, the same way the pre-train cell names its output, so the
# checkpoint matches the flag passed to fine_tune_model below.
model_path = os.path.join(
    drive_models_path,
    'pre_train',
    'tokenized' if tokenize_idioms else 'not_tokenized',
    'epochs_4',
)
# output_path = os.path.join(drive_models_path, 'dataset_baseline', 'enhancement_add_mwe', 'epochs_' + str(num_epochs), 'seed_' + str(seed))
# This example model is written under the local working directory, not to Drive.
output_path = os.path.join(base_path, 'models', 'add_mwe')
train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'train_data.csv')


def add_MWE(sentences, MWEs):
    """
    Append each MWE to its sentence, separated by the literal '[SEP]'.

    Args:
        sentences: iterable of sentence strings.
        MWEs: iterable of MWE strings, paired positionally with ``sentences``.
    Returns:
        A list with one '<sentence>[SEP]<mwe>' string per pair. Pairing uses
        ``zip``, so the result is as long as the shorter of the two inputs.
    """
    combined = []
    for text, expression in zip(sentences, MWEs):
        combined.append(text + '[SEP]' + expression)
    return combined

# Fine-tune with `add_MWE` passed as the `transform` argument.
# The epoch count is hard-coded to 1 here rather than taken from `num_epochs`.
model = fine_tune_model(
    model_path,
    output_path,
    train_file,
    tokenize_idioms=tokenize_idioms,
    languages=languages,
    num_epochs=1,
    transform=add_MWE
    )

In [None]:
# Run get_dev_results with the add-MWE `model` under the 'fine_tune' label,
# passing the same `add_MWE` transform that was used for fine-tuning, then
# display the formatted results.
dev_eval_path = os.path.join(subtask_b_dataset_path, 'EvaluationData')
results_file = os.path.join(base_path, 'dev.results.csv')

results = get_dev_results(
    model,
    dev_eval_path,
    results_file,
    ['fine_tune'],
    languages,
    tokenize_idioms=tokenize_idioms,
    transform=add_MWE,
)

format_results(results)