In [2]:
import torch
import pandas as pd
import transformers

from torch.utils.data import Dataset, DataLoader, DataLoader
from transformers import (
    RobertaModel,
    RobertaTokenizerFast,
    RobertaForSequenceClassification, TrainingArguments, Trainer
)
from transformers import AutoModel, AutoTokenizer
from torch import nn

from pprint import pprint

In [None]:
# Sentence-pair templates keyed by context id. Each value is
# [sentence-1 template, sentence-2 template].
# Placeholders that preprocess_dataset substitutes:
#   <con>          -> the concept
#   <prop_list>    -> the joined conjunct-property list
#   <predict_prop> -> the property to predict
#   <[MASK]>       -> the tokenizer's mask token
context_templates = {
    1: [
        "concept <con> can be described as <prop_list>.",
        "concept <con> can be described as <predict_prop>.",
    ],
    2: [
        "concept <con> can be described as <prop_list>?",
        "<[MASK]>, concept <con> can be described as <predict_prop>.",
    ],
    3: [
        "concept <con> can be described as <predict_prop>?",
        "<[MASK]>, concept <con> can be described as <prop_list>.",
    ],
}


def preprocess_dataset(data_df, tokenizer, context_id = 3, templates = None):
    """Build the two input sentences for every concept/property row.

    Args:
        data_df: DataFrame with string columns "concept", "conjuct_prop"
            (properties separated by ", ", or the sentinel
            "no_similar_property") and "predict_prop". It is NOT modified;
            a processed copy is returned.
        tokenizer: object exposing ``mask_token`` and ``mask_token_id``.
        context_id: key selecting the [sentence-1, sentence-2] template pair.
        templates: optional mapping ``context_id -> [sent_1, sent_2]``.
            Defaults to the notebook-level ``context_templates``.

    Returns:
        A copy of ``data_df`` whose "conjuct_prop" column is rewritten as a
        natural-language list ("a, b, and c") and which has two new columns,
        "sent_1" and "sent_2".

    Raises:
        ValueError: if ``context_id`` is not a key of the template mapping
            (this includes ``None``).
    """
    if templates is None:
        templates = context_templates

    # Fail early with a clear message instead of a NameError further down.
    if context_id not in templates:
        raise ValueError(
            f"Unknown context_id {context_id!r}; expected one of {sorted(templates)}"
        )

    sent_1_template, sent_2_template = templates[context_id]

    print (f"Mask token : {tokenizer.mask_token}")
    print (f"Mask token ID : {tokenizer.mask_token_id}")
    print ("sent_1_template :", sent_1_template)
    print ("sent_2_template :", sent_2_template)

    def preprocess_conjuct_prop(conjuct_props):
        # Missing values and the sentinel both mean "no properties".
        if pd.isna(conjuct_props) or conjuct_props == "no_similar_property":
            return ""

        props = conjuct_props.split(", ")
        if len(props) >= 2:
            props[-1] = "and " + props[-1]

        return ", ".join(props)

    def fill_template(template, concept, predict_prop, conjuct_props):
        # Substitute every placeholder in both sentences: which placeholder
        # appears in which sentence depends on the chosen template pair.
        return (
            template.replace("<[MASK]>", tokenizer.mask_token)
            .replace("<con>", concept)
            .replace("<predict_prop>", predict_prop)
            .replace("<prop_list>", conjuct_props)
        )

    # Work on a copy so the caller's frame stays untouched and the function
    # can be called repeatedly on the same input.
    data_df = data_df.copy()
    data_df["conjuct_prop"] = data_df["conjuct_prop"].apply(preprocess_conjuct_prop)

    rows = list(
        zip(data_df["concept"], data_df["predict_prop"], data_df["conjuct_prop"])
    )
    data_df["sent_1"] = [
        fill_template(sent_1_template, concept, predict_prop, conjuct_props)
        for concept, predict_prop, conjuct_props in rows
    ]
    data_df["sent_2"] = [
        fill_template(sent_2_template, concept, predict_prop, conjuct_props)
        for concept, predict_prop, conjuct_props in rows
    ]

    return data_df

class ConceptPropertyAugmentationDataset(Dataset):
    """Torch dataset of tokenized (sentence 1, sentence 2) pairs with a float label.

    Rows are read from a headerless TSV with the columns
    concept, conjuct_prop, predict_prop, labels and turned into sentence
    pairs by ``preprocess_dataset``.
    """

    def __init__(self, data_file, max_len = 60, context_id=3, max_rows=100):
        """Read the TSV, load the tokenizer and build the sentence columns.

        Args:
            data_file: path (or file-like object) of the tab-separated data.
            max_len: sequences are padded/truncated to this many tokens.
            context_id: template pair forwarded to ``preprocess_dataset``.
            max_rows: keep only the first ``max_rows`` rows; ``None`` reads
                the whole file. The default of 100 preserves the earlier
                hard-coded cap.
        """
        raw_df = pd.read_csv(
            data_file,
            sep="\t",
            header=None,
            names=["concept", "conjuct_prop", "predict_prop", "labels"],
            dtype={
                "concept": str,
                "conjuct_prop": str,
                "predict_prop": str,
                "labels": float,
            },
            nrows=max_rows,
        )

        self.tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
        self.max_len = max_len
        self.context_id = context_id
        self.mask_token = self.tokenizer.mask_token

        self.data_df = preprocess_dataset(
            data_df=raw_df, tokenizer=self.tokenizer, context_id=self.context_id
        )

    def __len__(self):
        """Number of examples."""
        return len(self.data_df)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th sentence pair.

        Returns the tokenizer output with "input_ids" and "attention_mask"
        (each keeps the leading dimension of 1 that ``return_tensors="pt"``
        produces for a single example) plus a scalar float32 "labels" tensor.
        """
        # Positional lookup: independent of whatever index the frame carries.
        row = self.data_df.iloc[idx]

        encoded_dict = self.tokenizer(
            text=row["sent_1"],
            text_pair=row["sent_2"],
            max_length=self.max_len,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=False,
        )

        # float32 is the dtype the BCE-with-logits loss expects for targets.
        encoded_dict["labels"] = torch.tensor(row["labels"], dtype=torch.float)

        return encoded_dict
    


# train_file = ""
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere unless this points at a local copy of the TSV.
valid_file = "/home/amitgajbhiye/Downloads/embeddings_con_prop/deberta_nli_predict_prop_similar/sim3_deberta_nli_predict_prop_similar_5_neg_valid_mscg_cnetp.tsv"

# Build the dataset from the validation TSV using the constructor defaults.
valid_data = ConceptPropertyAugmentationDataset(data_file = valid_file)
    

In [None]:
class ModelPropConjuctionJoint(nn.Module):
    """Binary classifier that scores the hidden state at the mask token.

    A pretrained encoder embeds the sentence pair; the last-layer vector at
    the (single) mask-token position of each sequence goes through dropout
    and a linear layer that produces one logit per sequence.
    """

    def __init__(self, encoder_name="roberta-base", mask_token_id=50264):
        """
        Args:
            encoder_name: checkpoint name or path passed to
                ``AutoModel.from_pretrained``.
            mask_token_id: vocabulary id of the mask token. The default is
                the value previously hard-coded in ``forward``; it must
                match the tokenizer used to encode the inputs.
        """
        super().__init__()

        self.encoder = AutoModel.from_pretrained(encoder_name)
        self.mask_token_id = mask_token_id

        self.dropout = nn.Dropout(self.encoder.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.encoder.config.hidden_size, 1)
        self.loss_fct = nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and classify the mask-token vectors.

        Args:
            input_ids: token ids shaped (batch, seq_len) or
                (batch, 1, seq_len).
            attention_mask: attention mask with the same shape as
                ``input_ids``.
            labels: optional targets, one per sequence. When omitted, no
                loss is computed.

        Returns:
            Tuple ``(loss, mask_logits, mask_vectors)``: ``loss`` is ``None``
            without labels, ``mask_logits`` has shape (batch,), and
            ``mask_vectors`` has shape (batch, hidden_size) (after dropout).

        Raises:
            ValueError: if any sequence does not contain exactly one mask
                token (e.g. it was truncated away).
        """
        # Flatten any extra singleton axis while always keeping the batch
        # axis; a bare squeeze() would also drop the batch axis for a batch
        # of one.
        seq_len = input_ids.size(-1)
        input_ids = input_ids.reshape(-1, seq_len)
        attention_mask = attention_mask.reshape(-1, seq_len)

        output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = output.last_hidden_state

        is_mask = input_ids == self.mask_token_id
        masks_per_row = is_mask.sum(dim=1)
        if not bool(torch.all(masks_per_row == 1)):
            raise ValueError(
                "Each input sequence must contain exactly one mask token "
                f"(id {self.mask_token_id}); found counts {masks_per_row.tolist()}"
            )

        # Boolean indexing walks rows in order, so with one mask per row the
        # result is aligned with the batch: (batch, hidden_size).
        mask_vectors = hidden_states[is_mask]

        mask_vectors = self.dropout(mask_vectors)
        mask_logits = self.classifier(mask_vectors).view(-1)

        loss = None
        if labels is not None:
            labels = labels.view(-1).float()
            loss = self.loss_fct(mask_logits, labels)

        return (loss, mask_logits, mask_vectors)

In [None]:
metric_name = "accuracy"
model_name = "dummy_roberta_base"
batch_size = 16

model = ModelPropConjuctionJoint()


def compute_metrics(eval_pred):
    """Accuracy of thresholded mask logits, reported under ``metric_name``.

    The model returns ``(loss, mask_logits, mask_vectors)``; the Trainer is
    expected to hand the non-loss outputs over as a tuple whose first entry
    holds the logits. A logit > 0 corresponds to a sigmoid probability > 0.5.
    """
    predictions = eval_pred.predictions
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]

    predicted = (predictions.reshape(-1) > 0).astype(int)
    expected = eval_pred.label_ids.reshape(-1).astype(int)

    return {metric_name: float((predicted == expected).mean())}


# Per-epoch evaluation and best-model selection on `metric_name` need both an
# evaluation set and a compute_metrics function, so both are supplied below.
args = TrainingArguments(output_dir = ".",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

# NOTE(review): smoke test only — the validation split is used for both
# training and evaluation because no training file is loaded in this notebook.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=valid_data,
    eval_dataset=valid_data,
    compute_metrics=compute_metrics,
)


trainer.train()

In [3]:
# Load the pretrained "roberta-base" fast tokenizer for the exploratory cells that follow.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [4]:
# Display the tokenizer's CLS token (bare expression so the notebook renders it).
tokenizer.cls_token

'<s>'

# NOTE(review): this repeats the `context_templates` mapping defined near the
# top of the notebook; keep a single definition to avoid the two drifting apart.
# Each value is [sentence-1 template, sentence-2 template].
context_templates = {
    1: [
        "concept <con> can be described as <prop_list>.",
        "concept <con> can be described as <predict_prop>.",
    ],
    2: [
        "concept <con> can be described as <prop_list>?",
        "<[MASK]>, concept <con> can be described as <predict_prop>.",
    ],
    3: [
        "concept <con> can be described as <predict_prop>?",
        "<[MASK]>, concept <con> can be described as <prop_list>.",
    ],
}

# NOTE(review): the same tokenizer is already created in an earlier cell.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def preprocess_dataset (concept_property_file, context_id = None):
    """Read a concept/property TSV and build the two input sentences per row.

    Uses the notebook-level ``tokenizer`` (for its mask token) and
    ``context_templates``.

    Args:
        concept_property_file: path or file-like object of a headerless,
            tab-separated file with the columns
            concept, conjuct_prop, predict_prop, labels.
        context_id: key of the template pair in ``context_templates``.
            Must be given explicitly; the default ``None`` is rejected.

    Returns:
        Tuple of three pandas Series: ``(sent_1, sent_2, labels)``.

    Raises:
        ValueError: if ``context_id`` is not a key of ``context_templates``.
    """
    # Fail before any I/O instead of hitting a NameError on the template
    # variables once the sentences are built.
    if context_id not in context_templates:
        raise ValueError(
            f"Unknown context_id {context_id!r}; expected one of {sorted(context_templates)}"
        )

    sent_1_template, sent_2_template = context_templates[context_id]

    data_df = pd.read_csv(
                concept_property_file,
                sep="\t",
                header=None,
                names=["concept", "conjuct_prop", "predict_prop", "labels"],
                dtype={
                    "concept": str,
                    "conjuct_prop": str,
                    "predict_prop": str,
                    "labels": int,
                },
            )

    print (f"Mask Token : {tokenizer.mask_token}")
    print ("sent_1_template :", sent_1_template)
    print ("sent_2_template :", sent_2_template)

    def preprocess_conjuct_prop (conjuct_props):
        # An empty field (read as NaN) and the sentinel both mean "no properties".
        if pd.isna(conjuct_props) or conjuct_props == "no_similar_property":
            return ""

        props = conjuct_props.split(", ")
        if len(props) >= 2:
            props[-1] = "and " + props[-1]

        return ", ".join(props)

    def fill_template(template, concept, predict_prop, conjuct_props):
        # Substitute every placeholder in both sentences: which placeholder
        # appears in which sentence depends on the chosen template pair.
        return (
            template.replace("<[MASK]>", tokenizer.mask_token)
            .replace("<con>", concept)
            .replace("<predict_prop>", predict_prop)
            .replace("<prop_list>", conjuct_props)
        )

    data_df["conjuct_prop"] = data_df["conjuct_prop"].apply(preprocess_conjuct_prop)

    rows = list(
        zip(data_df["concept"], data_df["predict_prop"], data_df["conjuct_prop"])
    )
    data_df["sent_1"] = [
        fill_template(sent_1_template, concept, predict_prop, conjuct_props)
        for concept, predict_prop, conjuct_props in rows
    ]
    data_df["sent_2"] = [
        fill_template(sent_2_template, concept, predict_prop, conjuct_props)
        for concept, predict_prop, conjuct_props in rows
    ]

    print (data_df[["sent_1", "sent_2"]].head(n=20))

    return data_df["sent_1"], data_df["sent_2"], data_df["labels"]

    

# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere unless this points at a local copy of the TSV.
valid_file = "/home/amitgajbhiye/Downloads/embeddings_con_prop/deberta_nli_predict_prop_similar/sim3_deberta_nli_predict_prop_similar_5_neg_valid_mscg_cnetp.tsv"

# Build the sentence pairs and labels with template pair 3.
sent_1, sent_2, labels = preprocess_dataset(valid_file, context_id=3)    
    

# Preview the first example (bare expression so the notebook renders it).
sent_1[0], sent_2[0], labels[0]

def preprocess_function(sent_1, sent_2):
    """Encode a sentence pair with the notebook-level tokenizer, truncation enabled."""
    encoding = tokenizer(sent_1, sent_2, truncation=True)
    return encoding

# Tokenize the first sentence pair and display the resulting encoding.
tok = preprocess_function(sent_1[0], sent_2[0])

tok

# Show the name/path the tokenizer reports for itself.
tokenizer.name_or_path

# Check whether the substring "roberta" occurs in that name.
"roberta" in tokenizer.name_or_path

# Encode the first sentence pair as PyTorch tensors so it can be fed to the model.
inp = tokenizer(sent_1[0], sent_2[0], return_tensors="pt")

# NOTE(review): this rebinds `model`, which an earlier cell assigns to a
# ModelPropConjuctionJoint instance; use a distinct name if both are needed.
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

"roberta" in model.name_or_path

# Forward pass on the encoded pair; calling the model with no inputs raises.
model(**inp)