<a href="https://colab.research.google.com/github/aaalexlit/omdena_climate_change_challenge_notebooks/blob/main/finetune_multivers_on_climate_fever.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetune pretrained MultiVerS model on modified Climate-FEVER dataset

Following [these instructions](https://github.com/dwadden/multivers/blob/main/doc/training.md)

The idea is to reuse the code from those instructions, but replace the contents of the `covidfact` folder with the modified Climate-FEVER dataset.

In [None]:
%%capture
# Clone MultiVerS, create a virtualenv named `multivers`, and install the repo's requirements into it.
!git clone https://github.com/dwadden/multivers.git
!pip install virtualenv
!virtualenv multivers
# Activate with POSIX `.` (as the training cell does) instead of the bash-only `source`, and chain
# with `&&` so the requirements are never installed outside the virtualenv when activation fails
# (the failure would otherwise be hidden by %%capture).
!. /kaggle/working/multivers/bin/activate && pip install -r /kaggle/working/multivers/requirements.txt

In [2]:
# Fetch the pretrained `longformer_large_science` checkpoint with the repo's helper script.
!python /kaggle/working/multivers/script/get_checkpoint.py longformer_large_science


--2023-04-25 04:31:05--  https://scifact.s3.us-west-2.amazonaws.com/longchecker/latest/checkpoints/longformer_large_science.ckpt
Resolving scifact.s3.us-west-2.amazonaws.com (scifact.s3.us-west-2.amazonaws.com)... 3.5.80.127, 52.218.229.145, 52.92.194.114, ...
Connecting to scifact.s3.us-west-2.amazonaws.com (scifact.s3.us-west-2.amazonaws.com)|3.5.80.127|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1646843693 (1.5G) [binary/octet-stream]
Saving to: ‘checkpoints/longformer_large_science.ckpt’


2023-04-25 04:31:56 (31.1 MB/s) - ‘checkpoints/longformer_large_science.ckpt’ saved [1646843693/1646843693]



In [3]:
# Move the downloaded `checkpoints` directory inside the cloned repository.
!mv /kaggle/working/checkpoints /kaggle/working/multivers/checkpoints

In [4]:
%%capture
!pip install gdown

In [5]:
# Download the train data archive from Google Drive, unpack it, and delete the archive.

!gdown https://drive.google.com/uc?id=1uiG3RCA366nbLX9NoEbYX5BAi2NFXiKG
!tar -xf data_train.tar.gz
!rm data_train.tar.gz

# Move the extracted `data_train` directory inside the cloned repository.
!mv /kaggle/working/data_train /kaggle/working/multivers/data_train

Downloading...
From: https://drive.google.com/uc?id=1uiG3RCA366nbLX9NoEbYX5BAi2NFXiKG
To: /kaggle/working/data_train.tar.gz
100%|████████████████████████████████████████| 417M/417M [00:05<00:00, 71.7MB/s]


In [6]:
%%capture
!pip install datasets
!pip install jsonlines

In [None]:
from datasets import load_dataset
import pandas as pd
import jsonlines

# Convert Climate-FEVER into the claims/corpus JSONL files used for training:
# one pseudo-document per Wikipedia article (its "abstract" is the list of
# evidence sentences seen for that article) and one record per claim.
ds_orig = load_dataset("climate_fever", split='test')
df = ds_orig.to_pandas()

# Evidence labels 0 and 1 map to these strings; label 2 (NEI) is never written as evidence.
mv_label_dict = {0: 'SUPPORT', 1: 'CONTRADICT'}

docs = {}    # article title -> {"doc_id", "title", "abstract"}
doc_id = 0   # next unused document id
# (doc_id, sentence) pairs already stored in a document's abstract. The key is
# per document: the same sentence text may occur under two different articles,
# and looking it up in the wrong abstract would raise ValueError.
indexed_evidence_sents = set()

counter = 0

with jsonlines.open('claims_comb_train.jsonl', 'w') as claims_train_writer, \
    jsonlines.open('claims_comb_test.jsonl', 'w') as claims_test_writer, \
    jsonlines.open('claims_comb_dev.jsonl', 'w') as claims_dev_writer:
    for claim_row in df.to_dict('records'):
        counter += 1
        evidences = claim_row['evidences']
        cur_evidence_dict = {}
        cur_claim_doc_ids = set()
        for evidence in evidences:
            evidence_label = evidence['evidence_label']
            evidence_article = evidence['article']
            evidence_sent = evidence['evidence']
            if evidence_article in docs:
                doc = docs[evidence_article]
            else:
                doc = {"doc_id": doc_id,
                       "title": evidence_article,
                       "abstract": []}
                docs[evidence_article] = doc
                doc_id += 1
            cur_doc_id = doc["doc_id"]
            cur_claim_doc_ids.add(cur_doc_id)
            abstract = doc["abstract"]
            # Keep abstracts within Longformer's 4096 limit; 2900 whitespace-separated
            # words is the cut-off used here. Words are counted per sentence so that
            # the last word of one sentence and the first word of the next are not
            # fused into a single token, which would undercount.
            if sum(len(sent.split()) for sent in abstract) > 2900:
                continue
            sent_key = (cur_doc_id, evidence_sent)
            if sent_key not in indexed_evidence_sents:
                abstract.append(evidence_sent)
                indexed_evidence_sents.add(sent_key)
                sent_ind = len(abstract) - 1
            else:
                sent_ind = abstract.index(evidence_sent)

            # add only evidences to the documents that are not NEI
            if evidence_label != 2:
                curr_sent = {
                    "sentences": [sent_ind],
                    "label": mv_label_dict[evidence_label]
                }
                if f"{cur_doc_id}" in cur_evidence_dict:
                    # Extend the entry that carries the same label; evidence whose
                    # label differs from the one already recorded for this document
                    # is not added.
                    exist_sents = cur_evidence_dict[f"{cur_doc_id}"]
                    for s in exist_sents:
                        if s['label'] == curr_sent['label']:
                            s['sentences'].append(sent_ind)
                else:
                    cur_evidence_dict[f"{cur_doc_id}"] = [curr_sent]

        claim_doc = {
            'id': int(claim_row['claim_id']),
            'claim': claim_row['claim'],
            'cited_doc_ids': list(cur_claim_doc_ids),
            'evidence': cur_evidence_dict
        }
        # Every 4th claim goes to test, all others to train. Every 20th claim
        # (which is therefore always a test claim) is additionally written to dev.
        if counter % 4 == 0:
            claims_test_writer.write(claim_doc)
        else:
            claims_train_writer.write(claim_doc)
        if counter % 20 == 0:
            claims_dev_writer.write(claim_doc)


# Write the corpus: one line per document that has at least one sentence.
with jsonlines.open('corpus_comb_for_training.jsonl', 'w') as corpus_writer:
    for doc in docs.values():
        if doc["abstract"]:
            corpus_writer.write(doc)


In [None]:
# Empty the `covidfact` target folder, then move the generated Climate-FEVER files into it
# under the names claims_test.jsonl, claims_train.jsonl, claims_dev.jsonl and corpus.jsonl.
!rm /kaggle/working/multivers/data_train/target/covidfact/*
!mv /kaggle/working/claims_comb_test.jsonl /kaggle/working/multivers/data_train/target/covidfact/claims_test.jsonl
!mv /kaggle/working/claims_comb_train.jsonl /kaggle/working/multivers/data_train/target/covidfact/claims_train.jsonl
!mv /kaggle/working/claims_comb_dev.jsonl /kaggle/working/multivers/data_train/target/covidfact/claims_dev.jsonl
!mv /kaggle/working/corpus_comb_for_training.jsonl /kaggle/working/multivers/data_train/target/covidfact/corpus.jsonl

Manually change line 59 of `/content/multivers/script/train_target.py` so that it reads
`"multivers/multivers/train.py",` (keeping the original indentation).

In [13]:
%%writefile /kaggle/working/multivers/script/train_target.py
"""
Kickoff training on target datasets.

NOTE: Training right now doesn't work with multiple GPU's and DDP. This is known issue;
see for instance https://lightning.ai/forums/t/gradient-checkpointing-ddp-nan/398/7.
"""


import argparse
import subprocess


def get_args():
    """Parse the command-line options of the training launcher.

    Returns:
        argparse.Namespace with the attributes ``dataset`` (one of the listed
        choices, or None when omitted), ``gpus`` (str) and
        ``gradient_checkpointing`` (bool).
    """
    help_gpus = """GPU's used for training.
    If a single int, specifies the number of GPU's.
    If a comma-separated list, specifies the specific device ID's.
    For a single specific device, write it as `[device-num],`
    """

    # The text is the parser's description; passed positionally it would be
    # taken as the program name shown in the usage line.
    parser = argparse.ArgumentParser(description="Kick off model training.")
    parser.add_argument(
        "--dataset",
        type=str,
        help="The dataset to train on.",
        choices=["scifact_20", "scifact_10", "healthver", "covidfact"],
    )
    # Required: main() inspects this value unconditionally, so leaving it out
    # should be reported as a usage error rather than fail later on None.
    parser.add_argument("--gpus", type=str, required=True, help=help_gpus)
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Turning this on decreases memory usage at the cost of slower training",
    )
    args = parser.parse_args()

    return args


def main():
    """Assemble the ``train.py`` command line from the parsed options and run it.

    The number of GPUs is derived from ``--gpus`` (a count, or a comma-separated
    list of device ids) and is used to size the data-loader workers and the
    gradient accumulation.

    Raises:
        ValueError: if ``--gpus`` is missing or the GPU count is not 1, 2, 4 or 8.
        SystemExit: carrying the child's exit status when the training process
            finishes with a non-zero status.
    """
    args = get_args()
    gpus = args.gpus

    if gpus is None:
        raise ValueError("--gpus must be provided.")

    # Deal with case of specific devices
    if "," in gpus:
        n_gpus = len([x for x in gpus.split(",") if x])
    else:
        n_gpus = int(gpus)

    if n_gpus not in [1, 2, 4, 8]:
        raise ValueError("The number of GPU's must be a power of 2.")

    epochs = 5
    workers_per_gpu = 4  # Number of CPU's per gpu.
    effective_batch_size = 8  # Desired effective batch size.
    # The per-device train batch size below is 1, so the effective batch size
    # is reached through gradient accumulation across the GPUs.
    accumulate_grad_batches = effective_batch_size // n_gpus
    num_workers = workers_per_gpu * n_gpus

    cmd = [
        "python",
        "multivers/multivers/train.py",
        "--result_dir",
        "/kaggle/tmp/checkpoints_user",
        "--datasets",
        args.dataset,
        "--starting_checkpoint",
        "/kaggle/tmp/checkpoints/last.ckpt",
        "--experiment_name",
        args.dataset,
        "--num_workers",
        num_workers,
        "--gpus",
        gpus,
        "--accumulate_grad_batches",
        accumulate_grad_batches,
        "--lr",
        "1e-5",
        "--precision",
        16,
        "--max_epochs",
        epochs,
        "--scheduler_total_epochs",
        epochs,
        "--train_batch_size",
        1,
        "--eval_batch_size",
        2,
        "--encoder_name",
        "longformer-large-science",
        "--no_reweight_labels",
    ]

    # If training on more than 1 gpu, use DDP accelerator.
    if n_gpus > 1:
        cmd.extend(["--accelerator", "ddp"])

    # Turn on gradient checkpointing if requested.
    if args.gradient_checkpointing:
        cmd.append("--gradient_checkpointing")

    # The list mixes ints and strings; subprocess needs strings only.
    return_code = subprocess.call([str(part) for part in cmd])
    # Propagate a failed training run instead of reporting success.
    if return_code != 0:
        raise SystemExit(return_code)


# Start the launcher only when this file is executed as a script.
if __name__ == "__main__":
    main()


Overwriting /kaggle/working/multivers/script/train_target.py


# To start training

In [38]:
# !source /content/multivers/bin/activate; python /content/multivers/script/train_target.py \
#         --dataset=covidfact --gpus=1 --gradient_checkpointing

Global seed set to 76

fatal: not a git repository (or any of the parent directories): .git

GPU available: True, used: True

TPU available: None, using: 0 TPU cores

Using native 16bit precision.





  | Name                 | Type            | Params

---------------------------------------------------------

0 | encoder              | LongformerModel | 434 M 

1 | dropout              | Dropout         | 0     

2 | label_classifier     | FeedForward     | 1.1 M 

3 | rationale_classifier | FeedForward     | 2.1 M 

4 | metrics              | ModuleDict      | 0     

---------------------------------------------------------

437 M     Trainable params

0         Non-trainable params

437 M     Total params

1,751.052 Total estimated model params size (MB)



Epoch 0:  97% 3520/3624 [2:26:22<04:19,  2.49s/it, loss=3.12, v_num=fact]

Validating: 0it [00:00, ?it/s][A

Validating:   0% 0/110 [00:00<?, ?it/s][A

Epoch 0:  98% 3540/3624 [2:27:08<03:29,  2.49s/it, loss=3.12, v_num=fact

# to continue training
Manually change this line https://github.com/dwadden/multivers/blob/main/script/train_target.py#L65 so that training starts from the available checkpoint
at `/content/checkpoints_user/covidfact/checkpoint/last.ckpt`,

and lower the number of epochs by 1 on line 51 of the same file.

In [8]:
# !source /content/multivers/bin/activate; python /content/multivers/script/train_target.py \
#         --dataset=covidfact --gpus=1 --gradient_checkpointing

Global seed set to 76

fatal: not a git repository (or any of the parent directories): .git

Downloading: 100% 803/803 [00:00<00:00, 628kB/s]

Downloading: 100% 899k/899k [00:00<00:00, 5.65MB/s]

Downloading: 100% 456k/456k [00:00<00:00, 3.67MB/s]

Downloading: 100% 1.36M/1.36M [00:00<00:00, 12.8MB/s]

Downloading: 100% 1.74G/1.74G [00:45<00:00, 37.9MB/s]

GPU available: True, used: True

TPU available: None, using: 0 TPU cores

Using native 16bit precision.





  | Name                 | Type            | Params

---------------------------------------------------------

0 | encoder              | LongformerModel | 434 M 

1 | dropout              | Dropout         | 0     

2 | label_classifier     | FeedForward     | 1.1 M 

3 | rationale_classifier | FeedForward     | 2.1 M 

4 | metrics              | ModuleDict      | 0     

---------------------------------------------------------

437 M     Trainable params

0         Non-trainable params

437 M     Total params

1,751.052 To

# continue training on kaggle

## download the last checkpoint

In [16]:
# Copy the last saved checkpoint from the GCS bucket to the path that train_target.py
# passes as --starting_checkpoint.
!gsutil cp gs://cc-evidences-data/multivers_checkpoints/covidfact/checkpoint/last.ckpt /kaggle/tmp/checkpoints/last.ckpt

Copying gs://cc-evidences-data/multivers_checkpoints/covidfact/checkpoint/last.ckpt...
| [1 files][  4.9 GiB/  4.9 GiB]   69.6 MiB/s                                   
Operation completed over 1 objects/4.9 GiB.                                      


Manually change this line https://github.com/dwadden/multivers/blob/main/script/train_target.py#L65 so that training starts from the available checkpoint at `/content/checkpoints_user/covidfact/checkpoint/last.ckpt`,

and lower the number of epochs by 1 on line 51 of the same file.

In [32]:
# !source /kaggle/working/multivers/bin/activate; python /kaggle/working/multivers/script/train_target.py \
#         --dataset=covidfact --gpus=1 --gradient_checkpointing

Global seed set to 76
fatal: not a git repository (or any parent up to mount point /kaggle)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using native 16bit precision.

  | Name                 | Type            | Params
---------------------------------------------------------
0 | encoder              | LongformerModel | 434 M 
1 | dropout              | Dropout         | 0     
2 | label_classifier     | FeedForward     | 1.1 M 
3 | rationale_classifier | FeedForward     | 2.1 M 
4 | metrics              | ModuleDict      | 0     
---------------------------------------------------------
437 M     Trainable params
0         Non-trainable params
437 M     Total params
1,751.052 Total estimated model params size (MB)
Epoch 0:  97%|████▊| 3515/3624 [2:16:54<04:14,  2.34s/it, loss=1.17, v_num=fact]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|                                       |

In [15]:
# !source /kaggle/working/multivers/bin/activate; python /kaggle/working/multivers/script/train_target.py \
#         --dataset=covidfact --gpus=1 --gradient_checkpointing

Global seed set to 76
fatal: not a git repository (or any parent up to mount point /kaggle)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using native 16bit precision.

  | Name                 | Type            | Params
---------------------------------------------------------
0 | encoder              | LongformerModel | 434 M 
1 | dropout              | Dropout         | 0     
2 | label_classifier     | FeedForward     | 1.1 M 
3 | rationale_classifier | FeedForward     | 2.1 M 
4 | metrics              | ModuleDict      | 0     
---------------------------------------------------------
437 M     Trainable params
0         Non-trainable params
437 M     Total params
1,751.052 Total estimated model params size (MB)
Epoch 0:  97%|████▊| 3515/3624 [1:35:43<02:58,  1.63s/it, loss=2.05, v_num=fact]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|                                       |

In [None]:
# Activate the virtualenv (POSIX `.`) and launch training on the `covidfact` dataset slot
# with one GPU and gradient checkpointing enabled.
! . /kaggle/working/multivers/bin/activate; python /kaggle/working/multivers/script/train_target.py \
        --dataset=covidfact --gpus=1 --gradient_checkpointing

# Copy newly trained checkpoints to Google Cloud Storage

In [16]:
%%writefile  /kaggle/working/cc-evidence-service-c1ecd810f649.json
GCP key goes here


Writing /kaggle/working/cc-evidence-service-c1ecd810f649.json


In [17]:
import glob
import os
# Point the Google client library at the key file written by the previous cell.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/kaggle/working/cc-evidence-service-c1ecd810f649.json"

from google.cloud import storage
# Module-level client used by upload_local_directory_to_gcs below.
storage_client = storage.Client(project='cc-evidence-service')

def upload_local_directory_to_gcs(local_path, dest_bucket_name, gcs_path):
    """Recursively upload the contents of a local directory to a GCS bucket.

    Args:
        local_path: Local directory whose contents are uploaded.
        dest_bucket_name: Destination bucket, as accepted by
            ``storage_client.get_bucket``.
        gcs_path: Object-name prefix under which the files are stored; the
            directory structure below ``local_path`` is reproduced beneath it.

    Raises:
        AssertionError: if ``local_path``, or a non-file entry found below it,
            is not a directory.
    """
    # Resolve the bucket once and hand the object to the recursive helper,
    # rather than looking it up again for every subdirectory.
    bucket = storage_client.get_bucket(dest_bucket_name)
    _upload_directory_contents(bucket, local_path, gcs_path)


def _upload_directory_contents(bucket, local_dir, gcs_prefix):
    """Upload every entry of ``local_dir`` into ``bucket`` beneath ``gcs_prefix``."""
    assert os.path.isdir(local_dir), f"{local_dir} is not a directory"
    # Without recursive=True the pattern lists only the direct children of
    # local_dir (entries whose name starts with a dot are not matched), so
    # subdirectories are handled by the explicit recursion below.
    for local_entry in glob.glob(local_dir + '/**'):
        if os.path.isfile(local_entry):
            # Strip "<local_dir>/" to obtain the name relative to this directory.
            remote_path = os.path.join(gcs_prefix, local_entry[1 + len(local_dir):])
            bucket.blob(remote_path).upload_from_filename(local_entry)
        else:
            _upload_directory_contents(
                bucket, local_entry, gcs_prefix + "/" + os.path.basename(local_entry)
            )

In [23]:
# Upload everything under the training result directory to the bucket, below `multivers_checkpoints`.
upload_local_directory_to_gcs("/kaggle/tmp/checkpoints_user","cc-evidences-data", "multivers_checkpoints")