# Base pre-train and fine-tune models for SubtaskB

## Setup

In [None]:
# Mount Google Drive so models can be saved to it later.
# The google.colab package is provided by the Colab runtime.
from google.colab import drive
# '/content/drive' is the prefix of every Drive path used later in this notebook.
drive.mount('/content/drive')

Mounted at /content/drive


Clone git repo containing helper scripts

`pat` is a personal access token used to clone the private repo.

Create a personal access token at https://github.com/settings/tokens and save it to `pat.txt` in the root of your Google Drive.

In [None]:
# Read the GitHub personal access token from a file on the mounted Drive,
# so the token itself is never written into the notebook.
pat = ''
with open('/content/drive/MyDrive/pat.txt', 'r') as f:
    # rstrip() removes trailing whitespace (e.g. the final newline) before the
    # token is interpolated into the clone URL below.
    pat = f.read().rstrip()
# NOTE(review): the token is embedded in the clone URL; git normally records the
# remote URL in the clone's .git/config, so the token is likely persisted there --
# confirm, and consider a credential helper instead.
!git clone https://{pat}@github.com/agneknie/com4520DarwinProject.git

# install requirements
# %cd changes the notebook's working directory to the cloned repo; the parameters
# cell later derives base_path from os.getcwd().
%cd com4520DarwinProject
!git checkout framework
!pip install -r requirements.txt
# Re-run site initialisation; presumably so the packages installed above become
# importable without restarting the runtime -- TODO confirm this is still needed.
import site
site.main()

Cloning into 'com4520DarwinProject'...
remote: Enumerating objects: 7302, done.[K
remote: Counting objects: 100% (137/137), done.[K
remote: Compressing objects: 100% (86/86), done.[K
remote: Total 7302 (delta 52), reused 116 (delta 45), pack-reused 7165[K
Receiving objects: 100% (7302/7302), 40.18 MiB | 17.63 MiB/s, done.
Resolving deltas: 100% (711/711), done.
/content/com4520DarwinProject
Branch 'framework' set up to track remote branch 'framework' from 'origin'.
Switched to a new branch 'framework'
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import sys
import numpy as np
import random
import torch

from sentence_transformers import SentenceTransformer

sys.path.append( '/content/com4520DarwinProject/src' )
from data.pre_train_dataset import make_pre_train_dataset
from models.pre_train_model import make_pre_train_model
from data.extract_idioms import extract_idioms
from evaluation.evaluate import get_dev_results, format_results, save_eval_output
from models.fine_tune_model import fine_tune_model
from evaluation.evaluate import get_dev_results, format_results, save_eval_output


## Parameters

In [None]:

# --- Paths ---
# Project root: the current working directory at the time this cell runs
# (the setup cell `%cd`s into the cloned repo).
base_path = os.getcwd()
# Subtask B dataset directory inside the project.
subtask_b_dataset_path = os.path.join(
    base_path, 'data', 'datasets', 'SemEval_2022_Task2_SubTaskB'
)
# Models directory on the shared drive. The trailing '/' matters: later cells append
# to this string directly.
# NOTE(review): the space before '/Models/' is reproduced as-is -- verify it matches
# the real folder name.
drive_models_path = '/content/drive/Shareddrives/COM4520 Darwin Project - Team Quebec /Models/'

# --- Experiment settings ---
languages = ['EN', 'PT']
tokenize_idioms = True
seed = 4

In [None]:
def set_seed(seed: int):
    """
    Seed every random number generator this notebook relies on so runs are repeatable.

    Modified from : https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_utils.py

    Seeds ``random``, ``numpy`` and ``torch`` (CPU and all CUDA devices) and puts cuDNN
    into its reproducible configuration. TensorFlow is not seeded here.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    # Safe to call even if CUDA is not available.
    torch.cuda.manual_seed_all(seed)

    ## From https://pytorch.org/docs/stable/notes/randomness.html
    # benchmark = False stops cuDNN from auto-tuning (and possibly choosing a different
    # algorithm on each run); deterministic = True restricts it to deterministic
    # algorithms. The reproducibility notes recommend setting both.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    ## Might want to use the following, but set CUBLAS_WORKSPACE_CONFIG=:16:8
    # try :
    #   torch.use_deterministic_algorithms(True)
    # except AttributeError:
    #   torch.set_deterministic( True )
    
# Seed all RNGs once, before any model is loaded or trained in the cells below.
set_seed(seed)

## Fine-tune model

The pre-trained model (loaded from `model_path`) is fine-tuned on the training data supplied in the https://github.com/H-TayyarMadabushi/SemEval_2022_Task2-idiomaticity repo.

Multiple negatives ranking loss and triplet loss are used, as in this paper https://aclanthology.org/2022.semeval-1.26/ (by the team that placed 1st in the fine-tune setting).
This differs from the baseline, which uses cosine similarity loss only.

In [None]:

# Number of fine-tuning epochs; also encoded in the output directory name below.
num_epochs = 4

# Checkpoint to start from (drive_models_path already ends with '/').
# NOTE(review): the saved output of this cell is a FileNotFoundError, and the add_MWE
# fine-tune cell loads from 'pre_train/tokenized/epochs_4' instead -- confirm that
# this checkpoint exists on the shared drive.
model_path = f'{drive_models_path}base_model_tokenized'

# Output layout: fine_tune/<dataset>/<enhancement>/<tokenization>/epochs_<n>/seed_<s>
output_path = os.path.join(
    drive_models_path,
    'fine_tune',
    'dataset_baseline',
    'enhancement_none',
    'tokenized' if tokenize_idioms else 'not_tokenized',
    f'epochs_{num_epochs}',
    f'seed_{seed}',
)

# NOTE(review): train_file is not passed to the call below, which receives the dataset
# directory as its third argument, whereas the add_MWE fine-tune cell passes
# train_file in that position -- confirm which one fine_tune_model expects.
train_file = os.path.join(subtask_b_dataset_path, 'TrainData', 'train_data.csv')

model = fine_tune_model(
    model_path, output_path, subtask_b_dataset_path,
    tokenize_idioms=tokenize_idioms,
    languages=languages,
    num_epochs=num_epochs,
)

FileNotFoundError: ignored

In [None]:

# Inputs for the dev evaluation: the results CSV path under the project root and the
# EvaluationData folder of the Subtask B dataset.
results_file = os.path.join(base_path, 'dev.results.csv')
dev_eval_path = os.path.join(subtask_b_dataset_path, 'EvaluationData')

results = get_dev_results(
    model,
    dev_eval_path,
    results_file,
    ['fine_tune'],
    languages,
    tokenize_idioms=tokenize_idioms,
)

# Last expression of the cell, so its value is shown as the cell output.
format_results(results)

In [None]:
# Reuses `model` and `dev_eval_path` from the cells above, so those must be run first.
results_file = os.path.join(base_path, 'eval.results.csv')
save_eval_output(
    model,
    dev_eval_path,
    results_file,
    ['fine_tune'],
    languages,
    tokenize_idioms=tokenize_idioms,
)

## Fine-tune model with dataset transform example

Fine-tune a model that receives the MWE along with the sentence as a second input (the two are joined with `[SEP]`).

In [None]:

# Checkpoint to start from on the shared drive (drive_models_path already ends with '/').
model_path = f'{drive_models_path}pre_train/tokenized/epochs_4'

# This example writes its model under base_path instead of the shared drive.
# Shared-drive alternative, using the same layout as the baseline fine-tune cell:
# output_path = os.path.join(drive_models_path, 'fine_tune', 'dataset_baseline', 'enhancement_add_mwe', 'tokenized' if tokenize_idioms else 'not_tokenized', 'epochs_' + str(num_epochs), 'seed_' + str(seed))
output_path = os.path.join(base_path, 'models', 'add_mwe')

# Training CSV inside the Subtask B dataset directory.
train_file = os.path.join(
    subtask_b_dataset_path, 'TrainData', 'train_data.csv'
)


def add_MWE(sentences, MWEs):
    """
    Append each multi-word expression (MWE) to its sentence, separated by '[SEP]'.

    Args:
        sentences: iterable of sentence strings.
        MWEs: iterable of MWE strings, aligned one-to-one with ``sentences``.

    Returns:
        A list with one 'sentence[SEP]mwe' string per pair. Pairing uses ``zip``,
        so surplus items in the longer input are silently ignored.
    """
    combined = []
    for sentence, mwe in zip(sentences, MWEs):
        combined.append(sentence + '[SEP]' + mwe)
    return combined

# Fine-tune with add_MWE supplied as the dataset transform.
# NOTE: num_epochs is fixed to 1 here rather than using the notebook-level num_epochs;
# presumably this keeps the example quick -- adjust for a real run.
model = fine_tune_model(
    model_path, output_path, train_file,
    tokenize_idioms=tokenize_idioms,
    languages=languages,
    num_epochs=1,
    transform=add_MWE,
)

In [None]:
# NOTE(review): this is the same results_file path the baseline evaluation cell uses;
# if get_dev_results writes to it, the earlier results are overwritten -- confirm.
results_file = os.path.join(base_path, 'dev.results.csv')
dev_eval_path = os.path.join(subtask_b_dataset_path, 'EvaluationData')

# The transform passed to fine_tune_model above (add_MWE) is passed here as well.
results = get_dev_results(
    model,
    dev_eval_path,
    results_file,
    ['fine_tune'],
    languages,
    tokenize_idioms=tokenize_idioms,
    transform=add_MWE,
)

# Last expression of the cell, so its value is shown as the cell output.
format_results(results)