In [1]:
import os
from pprint import pprint

import pandas as pd
import transformers
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

In [None]:
# Sentence-pair templates keyed by context id. Each entry is a
# [sent_1_template, sent_2_template] pair that preprocess_dataset unpacks.
# Placeholders substituted by preprocess_dataset:
#   <con>          - value of the "concept" column
#   <prop_list>    - the joined "conjuct_prop" string
#   <predict_prop> - value of the "predict_prop" column
#   <[MASK]>       - the tokenizer's mask token
context_templates = {
    1: [
        "concept <con> can be described as <prop_list>.",
        "concept <con> can be described as <predict_prop>.",
    ],
    2: [
        "concept <con> can be described as <prop_list>?",
        "<[MASK]>, concept <con> can be described as <predict_prop>.",
    ],
    3: [
        "concept <con> can be described as <predict_prop>?",
        "<[MASK]>, concept <con> can be described as <prop_list>.",
    ],
}

# Shared tokenizer: preprocess_dataset reads its mask token and
# preprocess_function uses it to encode sentence pairs.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")


def _join_conjunct_props(conjuct_props):
    """Turn a raw ``"a, b, c"`` property string into ``"a, b, and c"``.

    The sentinel ``"no_similar_property"`` becomes the empty string and a
    single property is returned unchanged.
    """
    if conjuct_props == "no_similar_property":
        return ""

    props = conjuct_props.split(", ")
    if len(props) >= 2:
        props[-1] = "and " + props[-1]
    return ", ".join(props)


def _fill_template(template, concept, predict_prop, prop_list, mask_token):
    """Substitute every supported placeholder that occurs in ``template``."""
    substitutions = (
        ("<[MASK]>", mask_token),
        ("<con>", concept),
        ("<predict_prop>", predict_prop),
        ("<prop_list>", prop_list),
    )
    for placeholder, value in substitutions:
        template = template.replace(placeholder, value)
    return template


def preprocess_dataset(concept_property_file, context_id=None):
    """Build the two template sentences for every row of a concept/property TSV.

    The file is read as a headerless, tab-separated table with the columns
    ``concept``, ``conjuct_prop``, ``predict_prop`` and ``labels``. The
    ``conjuct_prop`` string is rewritten into a natural-language list and both
    templates registered under ``context_id`` in ``context_templates`` are
    filled in for each row.

    Args:
        concept_property_file: Path (or file-like object) accepted by
            ``pandas.read_csv``.
        context_id: Key into ``context_templates``. Required: the default of
            ``None`` is kept only for signature compatibility and is rejected.

    Returns:
        A ``(sent_1, sent_2, labels)`` tuple of pandas Series.

    Raises:
        ValueError: If ``context_id`` is ``None``.
        KeyError: If ``context_id`` is not a key of ``context_templates``.
    """
    # Validate before any I/O: without a template pair there is nothing to
    # build, and previously this surfaced as a NameError after the file was read.
    if context_id is None:
        raise ValueError(
            f"context_id is required; choose one of {sorted(context_templates)}."
        )
    if context_id not in context_templates:
        raise KeyError(
            f"Unknown context_id {context_id!r}; "
            f"choose one of {sorted(context_templates)}."
        )
    sent_1_template, sent_2_template = context_templates[context_id]
    mask_token = tokenizer.mask_token

    data_df = pd.read_csv(
        concept_property_file,
        sep="\t",
        header=None,
        names=["concept", "conjuct_prop", "predict_prop", "labels"],
        dtype={
            "concept": str,
            "conjuct_prop": str,
            "predict_prop": str,
            "labels": int,
        },
        # Keep words such as "null", "nan" or "NA" as text; pandas would
        # otherwise turn them into float NaN and break the string handling below.
        keep_default_na=False,
    )

    print(f"Mask Token : {mask_token}")

    data_df["conjuct_prop"] = data_df["conjuct_prop"].apply(_join_conjunct_props)

    print("sent_1_template :", sent_1_template)
    print("sent_2_template :", sent_2_template)

    # Fill every placeholder in both templates. Which sentence carries
    # <prop_list> and which carries <predict_prop> differs per context id.
    for column, template in (("sent_1", sent_1_template), ("sent_2", sent_2_template)):
        data_df[column] = [
            _fill_template(template, concept, predict_prop, prop_list, mask_token)
            for concept, prop_list, predict_prop in zip(
                data_df["concept"], data_df["conjuct_prop"], data_df["predict_prop"]
            )
        ]

    print(data_df[["sent_1", "sent_2"]].head(n=20))

    return data_df["sent_1"], data_df["sent_2"], data_df["labels"]
    
    

# Path to the validation TSV. Set the CON_PROP_VALID_FILE environment variable
# to run this notebook on another machine; the fallback is the original
# hard-coded location.
valid_file = os.environ.get(
    "CON_PROP_VALID_FILE",
    "/home/amitgajbhiye/Downloads/embeddings_con_prop/deberta_nli_predict_prop_similar/sim3_deberta_nli_predict_prop_similar_5_neg_valid_mscg_cnetp.tsv",
)

# Context 3 asks about the predicted property first and puts the masked
# statement with the property list second.
sent_1, sent_2, labels = preprocess_dataset(valid_file, context_id=3)
    

In [None]:
# Peek at the first example: both generated sentences and its label.
sent_1[0], sent_2[0], labels[0]

In [None]:
def preprocess_function(sent_1, sent_2):
    """Encode a sentence pair with the module-level ``tokenizer``.

    Both arguments are forwarded positionally to ``tokenizer`` together with
    ``truncation=True``; whatever the tokenizer returns is handed back as-is.
    """
    encoded_pair = tokenizer(sent_1, sent_2, truncation=True)
    return encoded_pair

In [None]:
# Tokenize the first sentence pair as a smoke test of preprocess_function.
tok = preprocess_function(sent_1[0], sent_2[0])

In [None]:
# Display the encoding produced for the first sentence pair.
tok

In [None]:
# Show the tokenizer's name_or_path attribute.
tokenizer.name_or_path

In [None]:
# Check whether the substring "roberta" occurs in the tokenizer's name_or_path.
"roberta" in tokenizer.name_or_path

In [None]:
# Encode the first sentence pair by calling the tokenizer directly, matching
# preprocess_function; `encode_plus` is deprecated in favour of `__call__`.
inp = tokenizer(sent_1[0], sent_2[0])

In [None]:
# Load the sequence-classification model from the "roberta-base" checkpoint.
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

In [None]:
# Check whether the substring "roberta" occurs in the model's name_or_path.
"roberta" in model.name_or_path

In [None]:
# Run a single forward pass on the first sentence pair. Calling `model()` with
# no inputs raises a ValueError, so the pair is tokenized into PyTorch tensors
# and unpacked as keyword arguments.
model(**tokenizer(sent_1[0], sent_2[0], truncation=True, return_tensors="pt"))