In [None]:
import pandas as pd
import nlp
import numpy as np
import torch
from datasets import Dataset
import json
import os

In [None]:
# Schema for one stance record: three string fields plus two sequences of strings.
features_stance = nlp.Features(
    dict(
        id=nlp.Value("string"),
        rumor=nlp.Value("string"),
        label=nlp.Value("string"),
        timeline=nlp.Sequence(nlp.Value("string")),
        evidence=nlp.Sequence(nlp.Value("string")),
    )
)

In [None]:
# Split naming used in the keys below:
#   train: training set for production
#   dev:   significantly smaller dataset for rapid training
#   test:  test set outside the training set, used to measure model accuracy
#
# Each row is (key, builder, file, extra reader arguments). The "cw" files are
# tab-separated; the "st" files are the JSON files written by clean_to_extra_clean.
datasets = {
    task_name: nlp.load_dataset(builder, name=task_name, data_files=data_file, **reader_kwargs)
    for task_name, builder, data_file, reader_kwargs in (
        ("cw-train", "csv", "data/checkworthy/english_train.tsv", {"delimiter": "\t"}),
        ("cw-dev", "csv", "data/checkworthy/english_dev.tsv", {"delimiter": "\t"}),
        ("st-train", "json", "data/stance/cleaned_train_extra.json", {}),
        ("st-dev", "json", "data/stance/cleaned_dev_extra.json", {}),
    )
}
# TODO: Add the other datasets, including the other one about stance detection.
# TODO: Figure out which dataset is which, i.e. which one is for training, which one is for testing, etc.

In [None]:

#pds = pd.read_json("data/stance/cleaned_train.json", lines=True)
#pds["timeline"] = pds["timeline"].apply(lambda timeline: [x[2] for x in timeline])
#pds["evidence"] = pds["evidence"].apply(lambda evidence: [x[2] for x in evidence])
#pds["timeline"] = pds["timeline"][0]
#pds["evidence"] = pds["evidence"][0]
#d = [{"id": } for i in pds.to_dict]
#tasts = pds.to_dict(orient="records")
#tasta = json.dumps(tasts)
#with open("data/stance/cleaned_train_extra.json", "w") as f:
#    for element in tasts:
#        json.dump(element, f)
#        f.write("\n")
#pds.to_json("data/stance/cleaned_train_extra.json", orient="index")
#tast = Dataset.from_pandas(pds)
#test = pds.iloc[:1]["timeline"][0]

#tests = datasets["cw-train"]
def clean_to_extra_clean(f_loc, fname, end_fname):
    """Flatten the timeline/evidence entries of a JSON-lines file and save the result.

    Reads ``fname`` from the directory ``f_loc`` as line-delimited JSON, replaces
    every entry of the "timeline" and "evidence" lists with that entry's element
    at index 2, and writes the records to ``end_fname`` in the same directory,
    one JSON object per line.

    Args:
        f_loc: Directory holding the input file; the output file is written here too.
        fname: Name of the input file.
        end_fname: Name of the output file (opened in "w" mode, so it is overwritten).

    Returns:
        None.
    """
    source_path = os.path.join(f_loc, fname)
    target_path = os.path.join(f_loc, end_fname)

    frame = pd.read_json(source_path, lines=True)
    for column in ("timeline", "evidence"):
        # Keep only the third item of each entry in the list.
        frame[column] = frame[column].apply(lambda entries: [entry[2] for entry in entries])

    records = frame.to_dict(orient="records")
    with open(target_path, "w") as out_file:
        for record in records:
            json.dump(record, out_file)
            out_file.write("\n")
# The calls below regenerate the cleaned files; those files already exist, so there is no need to run them again.
#clean_to_extra_clean("data/stance", "cleaned_train.json", "cleaned_train_extra.json")
#clean_to_extra_clean("data/stance", "cleaned_dev.json", "cleaned_dev_extra.json")


In [None]:
# Print the name of every loaded dataset followed by its training example at index 1.
for task_name, dataset in datasets.items():
    print(task_name)
    print(dataset["train"][1])
    print()

In [None]:
#https://colab.research.google.com/github/zphang/zphang.github.io/blob/master/files/notebooks/Multi_task_Training_with_Transformers_NLP.ipynb#scrollTo=aVX5hFlzmLka
import transformers
import torch.nn as nn
class MultiTask(transformers.PreTrainedModel):
    """Container of per-task models that all point at one shared encoder module."""

    def __init__(self, encoder, taskmodels_dict):
        """Keep the shared encoder and register the task models as submodules.

        Args:
            encoder: The encoder module shared by the task models.
            taskmodels_dict: Mapping of task name to task model; stored as an
                ``nn.ModuleDict``.
        """
        super().__init__(transformers.PretrainedConfig())
        self.encoder = encoder
        self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)

    @classmethod
    def create(cls, model_name, model_type_dict, model_conf_dict):
        """Build one model per task and make them all use the first model's encoder.

        Args:
            model_name: Checkpoint name passed to every ``from_pretrained`` call.
            model_type_dict: Mapping of task name to the model class to instantiate.
            model_conf_dict: Mapping of task name to the config for that task's model.

        Returns:
            A ``MultiTask`` wrapping the shared encoder and the task models.

        Raises:
            KeyError: If a created model's class is not supported by
                ``get_encoder_attr_name``.
        """
        encoder = None
        task_models = {}

        for task_name, model_type in model_type_dict.items():
            task_model = model_type.from_pretrained(model_name, config=model_conf_dict[task_name])
            encoder_attr = cls.get_encoder_attr_name(task_model)

            if encoder is None:
                # The first model created donates its encoder.
                encoder = getattr(task_model, encoder_attr)
            else:
                # Every later model has its own encoder replaced by the shared one.
                setattr(task_model, encoder_attr, encoder)
            task_models[task_name] = task_model

        return cls(encoder=encoder, taskmodels_dict=task_models)

    @classmethod
    def get_encoder_attr_name(cls, model):
        """Return the attribute name under which ``model`` stores its encoder.

        The name is chosen from the prefix of the model's class name.

        Raises:
            KeyError: If the class name starts with neither "Bert" nor "Roberta".
        """
        model_class_name = model.__class__.__name__
        for prefix, attr_name in (("Bert", "bert"), ("Roberta", "roberta")):
            if model_class_name.startswith(prefix):
                return attr_name
        raise KeyError(f"Add support for new model {model_class_name}")

    def forward(self, task_name, **kwargs):
        """Run the model registered for ``task_name`` with the given keyword inputs."""
        task_model = self.taskmodels_dict[task_name]
        return task_model(**kwargs)
#test = MultiTask(transformers.BertModel.from_pretrained("bert-base-uncased"), datasets)

In [None]:
model_name = "roberta-base"

# Every dataset key gets a sequence-classification model class ...
model_type_dict = dict.fromkeys(
    ("cw-train", "cw-dev", "st-train", "st-dev"),
    transformers.AutoModelForSequenceClassification,
)

# ... and its own config object with three output labels.
model_config_dict = {
    task_name: transformers.AutoConfig.from_pretrained(model_name, num_labels=3)
    for task_name in model_type_dict
}
# TODO: Add the other datasets to these two dictionaries. Should probably use AutoModelForTokenClassification for the other datasets.

multitask_model = MultiTask.create(
    model_name=model_name,
    model_type_dict=model_type_dict,
    model_conf_dict=model_config_dict,
)

In [None]:
# Print the data pointer of the word-embedding weights, first via the shared
# encoder and then via each task model; matching values mean the weights are shared.
if not model_name.startswith("roberta"):
    print("Exercise for the reader: add a check for other model architectures =)")
else:
    print(multitask_model.encoder.embeddings.word_embeddings.weight.data_ptr())
    for task_name in ("cw-train", "cw-dev", "st-train", "st-dev"):
        print(multitask_model.taskmodels_dict[task_name].roberta.embeddings.word_embeddings.weight.data_ptr())

In [None]:
# Tokenizer loaded for the same checkpoint name (`model_name`) as the models.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [None]:
from sklearn import preprocessing
max_length = 128


def _fit_label_encoder(dataset_names, label_column):
    """Fit a single LabelEncoder on every label of the named datasets.

    Args:
        dataset_names: Keys of ``datasets`` whose "train" split is scanned.
        label_column: Name of the column that holds the string labels.

    Returns:
        A fitted ``preprocessing.LabelEncoder``.
    """
    labels = []
    for dataset_name in dataset_names:
        labels.extend(datasets[dataset_name]["train"][label_column])
    return preprocessing.LabelEncoder().fit(labels)


# The encoders are fitted ONCE, up front. The converters below are applied with
# `map(..., batched=True)`, i.e. once per batch; fitting a new encoder inside the
# converter made the label -> id mapping depend on which labels happened to be
# in that batch, so the same label could get different ids in different batches.
cw_label_encoder = _fit_label_encoder(("cw-train", "cw-dev"), "class_label")
st_label_encoder = _fit_label_encoder(("st-train", "st-dev"), "label")


def _convert_features(batch, text_column, label_column, label_encoder):
    """Tokenize one batch of texts and attach integer-encoded labels.

    Args:
        batch: Mapping of column name to the batch's values for that column.
        text_column: Column holding the texts to tokenize.
        label_column: Column holding the string labels.
        label_encoder: Already fitted encoder used to turn labels into ids.

    Returns:
        The tokenizer output with an additional "labels" entry.

    Raises:
        ValueError: If the batch contains a label the encoder was not fitted on.
    """
    features = tokenizer.batch_encode_plus(
        batch[text_column],
        max_length=max_length,
        # Pad AND truncate to max_length so that every row has the same length;
        # padding alone leaves longer texts untruncated, and rows of unequal
        # length cannot be stacked into one tensor by the collator.
        padding="max_length",
        truncation=True,
    )
    features["labels"] = label_encoder.transform(batch[label_column])
    return features


def convert_cw_train_features(batch):
    """Convert a batch of the "cw-train" dataset ('Text' -> tokens, 'class_label' -> ids)."""
    return _convert_features(batch, "Text", "class_label", cw_label_encoder)


def convert_cw_dev_features(batch):
    """Convert a batch of the "cw-dev" dataset ('Text' -> tokens, 'class_label' -> ids)."""
    return _convert_features(batch, "Text", "class_label", cw_label_encoder)


# NOTE(review): the stance schema in this notebook spells the text field "rumor"
# (lowercase) while these converters read 'Rumor' -- confirm against the JSON files.
def convert_st_train_features(batch):
    """Convert a batch of the "st-train" dataset ('Rumor' -> tokens, 'label' -> ids)."""
    return _convert_features(batch, "Rumor", "label", st_label_encoder)


def convert_st_dev_features(batch):
    """Convert a batch of the "st-dev" dataset ('Rumor' -> tokens, 'label' -> ids)."""
    return _convert_features(batch, "Rumor", "label", st_label_encoder)


# TODO: Create converters for the other datasets as well; token classification
# will probably need different handling than sequence classification.

convert_func_dict = {
    "cw-train": convert_cw_train_features,
    "cw-dev": convert_cw_dev_features,

    "st-train": convert_st_train_features,
    "st-dev": convert_st_dev_features
}

In [None]:
# Columns handed to torch after conversion; the same three for every dataset key.
columns_dict = {
    task_name: ['input_ids', 'attention_mask', 'labels']
    for task_name in ("cw-train", "cw-dev", "st-train", "st-dev")
}
# TODO: Add the remaining datasets here as well.

features_dict = {}
for task_name, dataset in datasets.items():
    features_dict[task_name] = {}
    for phase, phase_dataset in dataset.items():
        # Run the task's converter over the whole split, without reusing cached results.
        features_dict[task_name][phase] = phase_dataset.map(
            convert_func_dict[task_name],
            batched=True,
            load_from_cache_file=False,
        )
        print(task_name, phase, len(phase_dataset), len(features_dict[task_name][phase]))
        # References for "TypeError: new(): invalid data type 'str'" when iterating the dataloader:
        #   https://stackoverflow.com/questions/72003569/pytorch-typeerror-new-invalid-data-type-str-when-converting-nested-list
        #   https://github.com/huggingface/datasets/issues/469
        #   https://stackoverflow.com/questions/77241228/typeerror-invalid-data-type-str-when-using-dataloader-to-train-scibert
        # Only the columns listed for this task are exposed, as torch tensors.
        features_dict[task_name][phase].set_format(
            type='torch',
            columns=columns_dict[task_name],
        )
        print(task_name, phase, len(phase_dataset), len(features_dict[task_name][phase]))

In [None]:
from transformers import DefaultDataCollator

class NLPDataCollator(DefaultDataCollator):
    """Collator that turns a list of dict examples into one dict of batched tensors."""

    def collate_batch(self, features):
        """Combine individual examples into a single batch.

        Args:
            features: Non-empty list of examples. When the first example is a
                dict, every example is expected to be a dict of tensors with
                the same keys.

        Returns:
            For dict examples, a dict with one stacked tensor per key. "labels"
            is built as a long tensor when the first example's labels are int64
            and as a float tensor otherwise; it is left out when the first
            example has no labels. Other example types are delegated to the
            parent collator.
        """
        first = features[0]
        if not isinstance(first, dict):
            # Delegate to the stock collator. Releases that define
            # `collate_batch` keep being called that way; releases that only
            # expose the callable interface are invoked through `__call__`.
            parent = super()
            if hasattr(parent, "collate_batch"):
                return parent.collate_batch(features)
            return parent.__call__(features)

        # Always start from an empty dict: previously `batch` was only created
        # when labels were present, so unlabeled examples raised on the first
        # assignment in the loop below.
        batch = {}
        if first.get("labels") is not None:
            if first["labels"].dtype == torch.int64:
                label_dtype = torch.long
            else:
                label_dtype = torch.float
            batch["labels"] = torch.tensor([f["labels"] for f in features], dtype=label_dtype)
        for key in first:
            if key != "labels":
                batch[key] = torch.stack([f[key] for f in features])
        return batch
    

class StrIgnoreDevice(str):
    """A ``str`` with a ``to`` method that does nothing.

    Used for the "task_name" entry that ``DataLoaderWithTaskname`` adds to each
    batch; presumably so that code calling ``.to(device)`` on every batch value
    does not fail on the string.
    """

    def to(self, device):
        """Ignore ``device`` and return this very object."""
        return self


In [None]:
class DataLoaderWithTaskname:
    """Wrapper around a data loader that stamps each yielded batch with a task name."""

    def __init__(self, task_name, data_loader):
        """Remember the task name and the wrapped loader.

        Args:
            task_name: Name written into every batch under the "task_name" key.
            data_loader: Loader to wrap; must provide ``batch_size``, ``dataset``,
                ``__len__`` and ``__iter__``.
        """
        self.task_name, self.data_loader = task_name, data_loader

        # Expose the wrapped loader's batch size and dataset on the wrapper.
        self.batch_size = data_loader.batch_size
        self.dataset = data_loader.dataset

    def __len__(self):
        """Return the length of the wrapped loader."""
        return len(self.data_loader)

    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
    # https://stackoverflow.com/questions/77241228/typeerror-invalid-data-type-str-when-using-dataloader-to-train-scibert
    def __iter__(self):
        """Yield the wrapped loader's batches after setting their "task_name" entry."""
        for tagged_batch in self.data_loader:
            tagged_batch["task_name"] = StrIgnoreDevice(self.task_name)
            yield tagged_batch

class MultitaskDataloader:
    """Combine several task data loaders and draw their batches in a shuffled task order."""

    def __init__(self, dataloader_dict):
        """Record the loaders and their sizes.

        Args:
            dataloader_dict: Mapping of task name to a data loader that has
                ``__len__``, ``__iter__`` and a ``dataset`` attribute.
        """
        self.dataloader_dict = dataloader_dict
        self.num_batches_dict = {}
        for name, loader in self.dataloader_dict.items():
            self.num_batches_dict[name] = len(loader)
        self.task_name_list = list(self.dataloader_dict)
        # Placeholder list with one slot per example across all wrapped datasets.
        total_examples = sum(len(loader.dataset) for loader in self.dataloader_dict.values())
        self.dataset = [None] * total_examples

    def __len__(self):
        """Return the total number of batches over all tasks."""
        return sum(self.num_batches_dict.values())

    def __iter__(self):
        """Yield every batch of every task exactly once.

        A schedule holding each task's index once per batch of that task is
        shuffled with ``np.random.shuffle``; batches are then pulled from the
        matching loader in schedule order.
        """
        schedule = np.array([
            task_index
            for task_index, name in enumerate(self.task_name_list)
            for _ in range(self.num_batches_dict[name])
        ])
        np.random.shuffle(schedule)
        iterators = {name: iter(loader) for name, loader in self.dataloader_dict.items()}
        for task_index in schedule:
            yield next(iterators[self.task_name_list[task_index]])


In [None]:
from transformers import Trainer
#from transformers.data.data_collator import torch_default_data_collator
from transformers.trainer_utils import is_tf_available
from transformers.trainer import get_dataloader_sampler
#from accelerate.state import is_tf_available
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from torch.utils.data.dataloader import DataLoader
class MultitaskTrainer(Trainer):
    """Trainer whose training loader interleaves batches from several task datasets."""

    def get_single_train_dataloader(self, task_name, train_dataset):
        """Build the task-tagged training loader for a single task.

        Args:
            task_name: Name attached to every batch of this loader.
            train_dataset: Dataset to draw this task's batches from.

        Returns:
            A ``DataLoaderWithTaskname`` around a randomly sampled ``DataLoader``.

        Raises:
            ValueError: If the trainer has no ``train_dataset``.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        # A RandomSampler is always used. Author's note: switching to
        # DistributedSampler when local_rank != -1 broke the code with an error
        # about a missing default process group.
        sampler = RandomSampler(train_dataset)
        base_loader = DataLoader(
            train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=sampler,
            collate_fn=self.data_collator.collate_batch,
        )
        return DataLoaderWithTaskname(task_name=task_name, data_loader=base_loader)

    def get_train_dataloader(self):
        """Return a ``MultitaskDataloader`` over one tagged loader per training task."""
        per_task_loaders = {}
        for task_name, dataset in self.train_dataset.items():
            per_task_loaders[task_name] = self.get_single_train_dataloader(task_name, dataset)
        return MultitaskDataloader(per_task_loaders)
            

    

In [None]:
# Use the "train" split of every converted dataset for training.
train_dataset = {name: splits["train"] for name, splits in features_dict.items()}

trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        output_dir="models/multitask_model",
        overwrite_output_dir=True,
        do_train=True,
        num_train_epochs=6,
        per_device_train_batch_size=8,
        learning_rate=1e-5,
        save_steps=5000,
    ),
    train_dataset=train_dataset,
    data_collator=NLPDataCollator(),
)


In [None]:
# Start training with the MultitaskTrainer built in the previous cell.
trainer.train()