TODO

[GitHub repository: abhishekkrthakur/commonlit-pairwise-model](https://github.com/abhishekkrthakur/commonlit-pairwise-model)

In [None]:
from IPython.display import clear_output, Image
# IPython shell escape: install the `transformers` package into the running
# environment, then wipe the cell's output with `clear_output()`.
!pip install transformers
clear_output()

In [None]:
import re
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch import nn, optim
import transformers
from numpy import random

# Seed NumPy's global RNG and PyTorch's RNG with the same fixed value.
SEED = 13
np.random.seed(SEED)
torch.manual_seed(SEED)

# CSV files used by the notebook: training data, test data and the
# sample submission.
path_tr = '/content/drive/MyDrive/CommonLit/input/train.csv'
path_test = '/content/drive/MyDrive/CommonLit/input/test.csv'
path_sub = '/content/drive/MyDrive/CommonLit/input/sample_submission.csv'

# Device name: 'cuda' when PyTorch reports CUDA as available, else 'cpu'.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


def create_folds(data, num_splits):
    """Return a shuffled copy of ``data`` with a stratified ``kfold`` column.

    The continuous ``target`` column is cut into
    ``floor(1 + log2(len(data)))`` equal-width bins, and those bin labels
    are used as the stratification classes for ``StratifiedKFold``.

    Args:
        data: DataFrame with a numeric ``target`` column. The caller's
            frame is left untouched; all work happens on a shuffled copy.
        num_splits: Number of folds to create.

    Returns:
        A new DataFrame holding the rows of ``data`` in shuffled order, with
        a fresh ``0..n-1`` index and a ``kfold`` column giving, for each
        row, the index of the fold in which that row is a validation row.

    Raises:
        ValueError: If ``data`` has no rows (the bin count would be
            ``log2(0)``, which is undefined).
    """
    if len(data) == 0:
        raise ValueError("create_folds() requires a non-empty DataFrame")

    # Shuffle first: sample() returns a new frame, so every column added
    # below lands on the copy rather than on the caller's DataFrame.
    # No random_state is passed, so the shuffle draws from NumPy's global RNG.
    data = data.sample(frac=1).reset_index(drop=True)
    data["kfold"] = -1

    num_bins = int(np.floor(1 + np.log2(len(data))))
    data.loc[:, "bins"] = pd.cut(data["target"], bins=num_bins, labels=False)

    # NOTE(review): `model_selection` is not imported in the visible part of
    # this file; presumably `from sklearn import model_selection` is
    # required for this call to work - confirm and add the import.
    kf = model_selection.StratifiedKFold(n_splits=num_splits)
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=data.bins.values)):
        # valid_idx is positional, which matches the labels because the
        # index was reset to 0..n-1 above.
        data.loc[valid_idx, "kfold"] = fold

    # "bins" was only a helper for stratification; do not leak it to callers.
    return data.drop("bins", axis=1)


# Read the training CSV and attach a 5-fold assignment to every row.
df = create_folds(pd.read_csv(path_tr), num_splits=5)

In [None]:
import psutil


class CommonlitDataset:
    """Map-style dataset yielding tokenised excerpt pairs with their targets.

    Each entry of ``excerpts`` is indexed as a pair ``(a, b)``. An item
    exposes ``b`` as ``ids1``/``mask1``, ``a`` as ``ids2``/``mask2``, and
    ``targets`` as ``[target_dict[a], target_dict[b]]``.
    """

    def __init__(self, excerpts, target_dict, error_dict, tokenizer, max_len, num_samples=None):
        # Sequence of pairs; shuffled in place by __getitem__ when
        # num_samples is set.
        self.excerpts = excerpts
        # Lookup from excerpt text to its target value.
        self.target_dict = target_dict
        # Stored but not read anywhere in this class.
        self.error_dict = error_dict
        # Callable returning a mapping with "input_ids" and "attention_mask".
        self.tokenizer = tokenizer
        # Forwarded to the tokenizer as max_length.
        self.max_len = max_len
        # Optional override for the reported dataset length.
        self.num_samples = num_samples
        # Number of __getitem__ calls since the last reshuffle.
        self.count = 0

    def __len__(self):
        """Return ``num_samples`` when set, otherwise the number of pairs."""
        return len(self.excerpts) if self.num_samples is None else self.num_samples

    def _encode(self, text):
        """Tokenise ``text`` and return ``(input_ids, attention_mask)`` as long tensors."""
        encoded = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return (
            torch.tensor(encoded["input_ids"], dtype=torch.long),
            torch.tensor(encoded["attention_mask"], dtype=torch.long),
        )

    def __getitem__(self, item):
        """Return the tensors for the pair stored at position ``item``."""
        if self.num_samples is not None:
            # Once this instance has served num_samples / cpu_count items,
            # reset the counter and reshuffle the pairs in place.
            # NOTE(review): presumably this gives each loader worker a fresh
            # subset of pairs - confirm against the training loop.
            self.count += 1
            reshuffle_after = self.num_samples / psutil.cpu_count()
            if self.count >= reshuffle_after:
                self.count = 0
                random.shuffle(self.excerpts)

        pair = self.excerpts[item]
        # text1 is the SECOND element of the pair and text2 the FIRST.
        text1 = str(pair[1])
        text2 = str(pair[0])

        # Targets are ordered (text2, text1), i.e. in the pair's own order.
        target = [self.target_dict[text2], self.target_dict[text1]]

        ids1, mask1 = self._encode(text1)
        ids2, mask2 = self._encode(text2)

        return {
            "ids1": ids1,
            "mask1": mask1,
            "ids2": ids2,
            "mask2": mask2,
            "targets": torch.tensor(target, dtype=torch.float),
        }

In [None]:
    # Training-script setup: resolve the output path, load the tokenizer and
    # the fold-annotated training data, then build the excerpt pairs used
    # for training and validation.
    # NOTE(review): this cell is indented as if it were the body of a
    # function or `if` block whose header is not present here, and it uses
    # `parse_args`, `seed_everything`, `os` and `itertools`, none of which
    # are defined or imported in the visible part of this file - confirm
    # before running.
    args = parse_args()
    seed_everything(42)
    os.makedirs(args.output_folder, exist_ok=True)
    # The output file name is built from the model name (with '/' replaced
    # by ':') and the fold number.
    output_path = os.path.join(
        args.output_folder,
        f"{args.model.replace('/',':')}__fold_{args.fold}.bin",
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(args.model)
    # NOTE(review): reads a pre-folded CSV from a relative path rather than
    # the `df` built earlier in this notebook - confirm which is intended.
    df = pd.read_csv("../input/train_folds.csv")

    # base string is excerpt where target is 0 in the dataframe
    # (.values[0] takes the first such row and raises IndexError if none exists)
    base_string = df.loc[df.target == 0, "excerpt"].values[0]

    # create dictionary out of excerpt and target columns from dataframe
    target_dict = dict(zip(df.excerpt.values.tolist(), df.target.values.tolist()))
    # Rows whose kfold equals args.fold form the validation split; all
    # other rows are used for training.
    df_train = df[df.kfold != args.fold].reset_index(drop=True)
    df_valid = df[df.kfold == args.fold].reset_index(drop=True)
    # Every unordered pair of training excerpts, materialised as a list;
    # its length grows quadratically with the number of training rows.
    training_pairs = list(itertools.combinations(df_train.excerpt.values.tolist(), 2))

    # randomize training_pairs
    random.shuffle(training_pairs)
    # Each validation excerpt is paired with the base string, which goes first.
    validation_pairs = [(base_string, k) for k in df_valid.excerpt.values.tolist()]