In [None]:
# Mount Google Drive so the notebook can work with files under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
!pip3 install tokenizers wandb sentencepiece



In [None]:
!pip3 install transformers huggingface-hub



In [None]:
!pip3 install datasets



In [None]:
!pip3 install beir



In [None]:
import os

# Change into the project folder with one absolute path (built from the mount
# point used by drive.mount above). The previous chain of relative chdir calls
# only worked when the current directory was /content, so re-running the cell
# raised FileNotFoundError; this form can be re-run safely.
PROJECT_DIR = '/content/drive/My Drive/Research/AIRetriever'
os.chdir(PROJECT_DIR)

In [None]:
# Directory for run artifacts such as the training log. It must be defined
# before the Utils cell, whose get_logger default path is OUTPUT_DIR + 'train'.
OUTPUT_DIR = './outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Sep 21 07:10:09 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0              47W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# CFG

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Static configuration namespace for this training notebook.

    Every attribute whose name does not start with a double underscore is
    collected by ``class2dict`` and sent to ``wandb.init`` as the run config
    when ``CFG.wandb`` is true.

    NOTE(review): only ``wandb``, ``debug``, ``model``, ``epochs``,
    ``batch_size``, ``max_len``, ``eval_steps`` and ``seed`` are read by the
    cells visible in this notebook; the other fields appear to be carried over
    from a template and are only logged.
    """

    # --- experiment tracking ---
    wandb = True                      # gates the W&B login/init cell
    _wandb_kernel = 'bluehills'
    debug = False                     # when True, the override below shortens the run

    # --- runtime ---
    apex = True
    print_freq = 100
    num_workers = 4

    # --- model ---
    model = "microsoft/mdeberta-v3-base"   # encoder checkpoint; also the W&B run/group name
    fc_dropout = 0.15
    target_size = 1
    max_len = 512                     # passed as max_seq_length to the transformer module

    # --- schedule ---
    scheduler = 'cosine'              # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 50             # [0, 50, 100]
    epochs = 3                        # epochs passed to retriever.fit
    eval_steps = 5000                 # evaluation_steps passed to retriever.fit

    # --- optimisation ---
    encoder_lr = 1e-5                 # 2e-5
    decoder_lr = 1e-5                 # 2e-5
    min_lr = 5e-7
    eps = 5e-7
    betas = (0.9, 0.999)
    batch_size = 32                   # batch size handed to TrainRetriever
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    # --- misc ---
    seed = 43                         # passed to seed_everything
    train = True


# Debug runs override the epoch count and add a single-fold list.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0]

In [None]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    import wandb

    try:
        # wandb.login() is called without an explicit key.
        print('login to wandb')
        wandb.login()
        anony = None
    except Exception:
        # Only genuine login errors fall back to an anonymous run. A bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit, so
        # interrupting the login would silently start an anonymous run.
        # The message below still refers to the Kaggle secrets flow this cell
        # was adapted from.
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

    def class2dict(f):
        """Return a dict of the attributes of ``f`` whose names do not start with ``__``."""
        return {name: getattr(f, name) for name in dir(f) if not name.startswith('__')}

    # Start the run; the checkpoint name doubles as both run name and group.
    run = wandb.init(project='BEIR_SBERT',
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

login to wandb


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbluehills[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Library

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
from math import sqrt
import shutil
import string
import pickle
import random
import joblib
import itertools
import logging
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedGroupKFold

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import torch.cuda.amp as amp

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification

from sentence_transformers import losses, models, SentenceTransformer
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.train import TrainRetriever
import pathlib, os
import logging

import datasets
import huggingface_hub
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats

%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.__version__: 2.4.1+cu121
tokenizers.__version__: 0.19.1
transformers.__version__: 4.44.2
env: TOKENIZERS_PARALLELISM=true


# Utils

In [None]:
# ====================================================
# Utils
# ====================================================

def get_logger(filename=None):
    """Return the notebook logger, writing to the console and to ``<filename>.log``.

    Args:
        filename: Log file path without the ``.log`` suffix. When omitted it
            defaults to ``OUTPUT_DIR + 'train'``, using the notebook-level
            ``OUTPUT_DIR`` if one is defined and ``'./outputs/'`` otherwise.

    Returns:
        The ``logging.Logger`` registered under this module's ``__name__``,
        carrying exactly one console handler and one file handler.
    """
    import os
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

    if filename is None:
        # Resolved at call time: evaluating OUTPUT_DIR in the signature raised
        # NameError at definition time whenever OUTPUT_DIR was not defined.
        filename = globals().get('OUTPUT_DIR', './outputs/') + 'train'

    # FileHandler does not create missing directories.
    log_dir = os.path.dirname(filename)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object on every call, so drop handlers left by
    # an earlier run of this cell; otherwise each message is emitted repeatedly.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Create the notebook-wide logger when this cell runs; get_logger attaches a
# console handler and a file handler to it.
LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` to the process environment and switches
    cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seed for each library's generator.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

# Apply the configured seed before the data-loading and model cells below run.
seed_everything(seed=CFG.seed)

In [None]:
#### Download nfcorpus.zip dataset and unzip the dataset
#### BEIR dataset to train on; the archive is downloaded and unzipped under out_dir
dataset = "nfcorpus"
out_dir = "datasets"

url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
data_path = util.download_and_unzip(url, out_dir)

#### Load the train split from the unzipped folder
# NOTE(review): one loader instance per split, exactly as before; sharing a
# single instance across splits has not been verified to be equivalent.
train_loader = GenericDataLoader(data_path)
corpus, queries, qrels = train_loader.load(split="train")

#### Not every dataset ships a dev split; comment these lines out if it is missing
dev_loader = GenericDataLoader(data_path)
dev_corpus, dev_queries, dev_qrels = dev_loader.load(split="dev")

  0%|          | 0/3633 [00:00<?, ?it/s]

  0%|          | 0/3633 [00:00<?, ?it/s]

## Model

In [None]:
#### Provide any sentence-transformers or HF model
#### Any sentence-transformers or HF checkpoint name can be used here
model_name = CFG.model

# Transformer module truncated to CFG.max_len tokens, followed by a pooling
# module sized to the transformer's word-embedding dimension.
word_embedding_model = models.Transformer(model_name, max_seq_length=CFG.max_len)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

#### Or provide pretrained sentence-transformer model
# model = SentenceTransformer("msmarco-distilbert-base-v3")

# BEIR training wrapper around the model, using the configured batch size.
retriever = TrainRetriever(model=model, batch_size=CFG.batch_size)

In [None]:
#### Prepare training samples
# Build the training examples from the train split loaded above, then wrap
# them in a dataloader with shuffling enabled.
train_samples = retriever.load_train(corpus, queries, qrels)
train_dataloader = retriever.prepare_train(train_samples, shuffle=True)

#### Training SBERT with cosine-product
# MultipleNegativesRankingLoss is constructed with its default similarity function.
train_loss = losses.MultipleNegativesRankingLoss(model=retriever.model)
#### training SBERT with dot-product
# NOTE(review): `util` in this notebook is bound to `beir.util` (see the Library
# cell); confirm it exposes `dot_score` before enabling the line below.
# train_loss = losses.MultipleNegativesRankingLoss(model=retriever.model, similarity_fct=util.dot_score)

Adding Input Examples:   0%|          | 0/81 [00:00<?, ?it/s]

In [None]:
#### Prepare dev evaluator
#### Evaluator built from the dev split
ir_evaluator = retriever.load_ir_evaluator(dev_corpus, dev_queries, dev_qrels)

#### If no dev set is present from above use dummy evaluator
# ir_evaluator = retriever.load_dummy_evaluator()

#### Model save path under ./output
# model_name may contain "/" (an "org/name" checkpoint id); the path is then a
# nested directory, which makedirs creates in full.
run_name = f"{model_name}-v1-{dataset}"
model_save_path = os.path.join("output", run_name)
os.makedirs(model_save_path, exist_ok=True)

In [None]:
#### Configure Train params
#### Training schedule taken from CFG
num_epochs = CFG.epochs
evaluation_steps = CFG.eval_steps

# Warm up for 10% of (samples * epochs / batch size), truncated to an int.
total_steps = len(train_samples) * num_epochs / retriever.batch_size
warmup_steps = int(total_steps * 0.1)

In [None]:
# Single training objective: the dataloader paired with its loss.
train_objectives = [(train_dataloader, train_loss)]

# Train with the dev evaluator, the schedule computed above, output_path set to
# model_save_path and use_amp=True.
retriever.fit(
    train_objectives=train_objectives,
    evaluator=ir_evaluator,
    epochs=num_epochs,
    output_path=model_save_path,
    warmup_steps=warmup_steps,
    evaluation_steps=evaluation_steps,
    use_amp=True,
)



Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100,Dot Accuracy@1,Dot Accuracy@3,Dot Accuracy@5,Dot Accuracy@10,Dot Precision@1,Dot Precision@3,Dot Precision@5,Dot Precision@10,Dot Recall@1,Dot Recall@3,Dot Recall@5,Dot Recall@10,Dot Ndcg@10,Dot Mrr@10,Dot Map@100
3456,2.8964,No log,0.138889,0.216049,0.265432,0.333333,0.138889,0.119342,0.119136,0.112963,0.004584,0.014514,0.02247,0.042039,0.120334,0.192817,0.062752,0.080247,0.166667,0.216049,0.287037,0.080247,0.089506,0.093827,0.087654,0.001625,0.007571,0.014594,0.027516,0.088564,0.137043,0.042158
5000,2.6423,No log,0.123457,0.243827,0.280864,0.376543,0.123457,0.139918,0.141358,0.134568,0.004805,0.018988,0.028687,0.055032,0.139787,0.199786,0.079532,0.092593,0.16358,0.20679,0.277778,0.092593,0.096708,0.097531,0.099074,0.002639,0.008652,0.017397,0.033532,0.099777,0.141493,0.05716
6912,2.4891,No log,0.169753,0.268519,0.314815,0.404321,0.169753,0.156379,0.152469,0.146914,0.010266,0.025079,0.036968,0.064671,0.160049,0.23495,0.094787,0.12037,0.191358,0.243827,0.302469,0.12037,0.118313,0.119136,0.117284,0.00401,0.010772,0.024281,0.042643,0.120961,0.170132,0.070416
10000,2.3134,No log,0.200617,0.299383,0.358025,0.404321,0.200617,0.177984,0.179012,0.162037,0.012636,0.025999,0.045882,0.075324,0.17882,0.260771,0.108199,0.12963,0.234568,0.271605,0.32716,0.12963,0.139918,0.139506,0.132407,0.004894,0.020183,0.028755,0.04933,0.136475,0.186949,0.088424
10368,2.3134,No log,0.197531,0.311728,0.367284,0.407407,0.197531,0.179012,0.182716,0.159877,0.012537,0.028172,0.046169,0.074644,0.177109,0.260449,0.108922,0.141975,0.234568,0.268519,0.330247,0.141975,0.13786,0.139506,0.130864,0.005321,0.019846,0.029084,0.049245,0.136844,0.195902,0.088159


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
!pip3 install huggingface-hub



In [None]:
# Show the Hugging Face login widget in the notebook.
# Presumably needed so the push_to_hub cell below can authenticate - confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the in-memory model to a private Hub repository; the call's return
# value is displayed as the cell output.
HUB_REPO_ID = "BlackBeenie/mdeberta-v3-base-sbert"
model.push_to_hub(HUB_REPO_ID, private=True)

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

'https://huggingface.co/BlackBeenie/mdeberta-v3-base-sbert/commit/af35951dfda1b836b4ae8c71a5fa80fb94066954'