In [1]:
import json
import os

import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import tqdm

# Limit this process to GPUs 0-3; only takes effect if set before CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

2023-01-20 22:58:51.811491: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-20 22:58:53.764703: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2023-01-20 22:58:53.764797: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64


In [2]:
# Seed NumPy and PyTorch (CPU generator plus every CUDA device) with one value.
seed = 42
for _seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)

# One handle per GPU index 0-3, so each model can be pinned to its own card.
device0, device1, device2, device3 = (
    torch.device(f"cuda:{gpu_idx}") for gpu_idx in range(4)
)

In [3]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM
    )

def load_model(model_path, device):
    """Load a seq2seq checkpoint and its tokenizer, ready for inference.

    Args:
        model_path: Hub identifier or local checkpoint directory passed to
            ``from_pretrained``.
        device: Target device for the model. Ignored when ``model_path``
            contains ``'xxl'``, in which case ``model.parallelize()`` is
            called instead of ``model.to(device)``.

    Returns:
        ``(model, tokenizer)`` with the model switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # The model loader reads the config itself, so no separate AutoConfig load is needed.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, cache_dir="./huggingface_models")
    if 'xxl' in model_path:
        print('working w flan-xxl')
        # NOTE(review): parallelize() appears to be deprecated in newer transformers
        # releases in favour of device_map -- confirm against the installed version.
        model.parallelize()
    else:
        model.to(device)
    model.eval()

    return model, tokenizer

In [4]:
def generate_transl(prompt, model, tokenizer, device, max_input_length=128):
    """Sample one completion for ``prompt`` and return it as decoded text.

    Args:
        prompt: Input text for the encoder.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the encoded inputs are moved to; a falsy value
            falls back to ``"cuda"``.
        max_input_length: Token budget for the encoded prompt. The prompt is
            padded to, and truncated at, this length, so a long few-shot
            prompt loses whatever falls outside the budget. Defaults to 128,
            the previously hard-coded value.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    target_device = device if device else "cuda"
    features = tokenizer(
        prompt,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(target_device)

    with torch.no_grad():
        # Decoding settings follow https://huggingface.co/blog/how-to-generate
        generated_ids = model.generate(
            **features,
            no_repeat_ngram_size=2,
            min_length=10,
            do_sample=True,
            max_length=128,
            top_k=5,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [5]:
def extract_us_and_zh_norm(en1):
    """Return the two quoted sentences embedded in an NLI question string.

    The first sentence runs from the first double quote up to ``" entail,``;
    the second runs from just after ``to the sentence "`` up to ``"? Please``.
    Raises ``ValueError`` (from ``str.index``) when a marker is missing.
    """
    us_start = en1.index('"') + 1
    us_end = en1.index('" entail,')
    us_norm = en1[us_start:us_end]

    zh_marker = 'to the sentence "'
    zh_start = en1.index(zh_marker) + len(zh_marker)
    zh_end = en1.index('"? Please')
    zh_norm = en1[zh_start:zh_end]

    return us_norm, zh_norm

# Worked example: one question in the dataset's prompt format.
en1 = (
    'Does the sentence "It\'s bad to hate your mom." entail, contradict, '
    'or has no relation to the sentence '
    '"It is not necessary to be filial to your mother to a great extent."? '
    'Please answer between "Entailment", "Contradiction" or "No Relation" '
    'and explain your decision.'
)
extract_us_and_zh_norm(en1)

("It's bad to hate your mom.",
 'It is not necessary to be filial to your mother to a great extent.')

In [6]:
# Load the validation split. The encoding is explicit so the read does not
# depend on the platform's locale default.
with open('./data/socnli_t5/socNLI_val.json', 'r', encoding='utf-8') as f:
    val = json.load(f)
        
def get_fs_prompt_for_dream(val, n=10):
    """Build a few-shot prompt from the first ``n`` rows of ``val``.

    Each row contributes a "Premise ... Hypothesis ... ?" question line
    followed by an "Answer : <label>. Explanation : <text>" line; the
    demonstrations are separated by blank lines. The label is the text before
    the first period of ``en2`` and the explanation is only the text between
    the first and second period.
    """
    demonstrations = []
    for row in val[:n]:
        pair = row['translation']
        us, zh = extract_us_and_zh_norm(pair['en1'])
        question = f"Premise: [Premise - social norm] {us} Hypothesis: [Hypothesis - social norm] {zh} Is there a contradiction, entailment, or no relation between the premise and hypothesis?"
        gold_parts = pair['en2'].split(".")
        label, rationale = gold_parts[0], gold_parts[1].strip()
        answer = f"Answer : {label}. Explanation : {rationale}"
        demonstrations.append(f"{question}\n{answer}")
    return "\n\n".join(demonstrations)

def extract_resp_from_dream(ans):
    """Split a DREAM-style response into ``(label, explanation)``.

    The label is the stripped text between the first ``': '`` and the first
    ``'. Expl'``. The explanation is everything after ``'Explanation'`` plus
    two further characters, stripped, with every ``'- '`` removed.
    Raises ``ValueError`` (from ``str.index``) when a marker is missing.
    """
    label_start = ans.index(': ') + 2
    label_end = ans.index('. Expl')
    label = ans[label_start:label_end].strip()

    marker = "Explanation"
    # +2 skips the separator that follows the marker (e.g. " :" or " ;").
    tail = ans[ans.index(marker) + len(marker) + 2:]
    explanation = tail.strip().replace('- ', '')

    return label, explanation

# Worked example: a raw DREAM response with a ';' separator after "Explanation".
ans = (
    "  Answer : Contradiction. Explanation ; "
    "The Chinese and US cultures are different when it comes to family and child-rearing "
    "and so the US norm would be based on the idea of respect and honoring the mother and father "
    "while the Chinese norm is centered on personal choice and family planning"
)
extract_resp_from_dream(ans)

('Contradiction',
 'The Chinese and US cultures are different when it comes to family and child-rearing and so the US norm would be based on the idea of respect and honoring the mother and father while the Chinese norm is centered on personal choice and family planning')

In [20]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

def _split_label_and_explanation(text):
    """Return (text before the first '.', stripped text between the first and second '.')."""
    parts = text.split('.')
    return parts[0], parts[1].strip()


def _label_from_response(resp):
    """Map a free-form response to a label by substring match, defaulting to 'No Relation'."""
    if 'Entailment' in resp:
        return 'Entailment'
    if 'Contradiction' in resp:
        return 'Contradiction'
    return 'No Relation'


def eval_model(model, tokenizer, device, model_name,
                verbose=False, debug=False,
                test_path = "./data/socnli_t5/socNLI_test.json",
                output_dir="./data/model_outputs_test_set"):
    """Run ``model`` over a test file and collect predictions in a DataFrame.

    Args:
        model, tokenizer, device: Forwarded to ``generate_transl``.
        model_name: Selects prompting and parsing. ``'dream'`` and
            ``'t5-flan'``/``'t5-flan-xxl'`` get a 10-example few-shot prompt
            built from the validation file; ``'t5-io'``/``'t5-ir_o'`` are
            label-only; any other name is parsed as "<label>. <explanation>".
        verbose: Print ``y_true y_pred`` for every example.
        debug: Print intermediate strings and stop once ``i > 10``.
        test_path: Test file. Paths containing ``t5_IO`` or ``t5_IR`` are
            read as JSON lines, everything else as a single JSON document.
        output_dir: Directory for ``<model_name>.csv``; created if missing.

    Returns:
        DataFrame with columns ``ID, y_true, y_pred, expl_true, expl_pred``.
        Responses that cannot be parsed are recorded as ``'fail'``.
    """
    few_shot_t5 = {'t5-flan', 't5-flan-xxl'}
    label_only = {'t5-io', 't5-ir_o'}

    with open(test_path, "r", encoding="utf-8") as f:
        if "t5_IO" in test_path or "t5_IR" in test_path:
            # JSON-lines file: one record per non-empty line.
            test_data = [json.loads(line) for line in f if line.strip()]
        else:
            test_data = json.load(f)

    # The validation split is only needed to build few-shot demonstrations.
    prompt = ""
    if model_name == 'dream' or model_name in few_shot_t5:
        with open('./data/socnli_t5/socNLI_val.json', 'r', encoding="utf-8") as f:
            val = json.load(f)
        if model_name == 'dream':
            prompt = get_fs_prompt_for_dream(val, n=10)
        else:
            prompt = "\n\n".join([r['translation']['en1']
                                  + "\n"
                                  + r['translation']['en2']
                                  for r in val[:10]])

    preds = []
    # Wrapping the list (not the enumerate) lets tqdm show total and ETA.
    for i, t in enumerate(tqdm(test_data)):

        tr = t['translation']
        if model_name in few_shot_t5:
            # 'No Relation' is swapped for 'Neutral' on the way in and restored on the way out.
            model_input = f"{prompt}\n\n{tr['en1']}".replace('No Relation', 'Neutral')
            resp = generate_transl(model_input, model, tokenizer, device).replace('..', '.')
            resp = resp.replace('Neutral', 'No Relation')
        elif model_name == 'dream':
            us, zh = extract_us_and_zh_norm(tr['en1'])
            if debug:
                print(us)
                print(zh)
            model_input = f"{prompt}\n\nPremise: [Premise - social norm] {us}. Hypothesis: [Hypothesis - social norm] {zh}. Is there a contradiction, entailment, or no relation between the premise and hypothesis?"
            # Send the few-shot prompt together with this test item; sending only
            # `prompt` would never show the model the example being scored.
            resp = generate_transl(model_input, model, tokenizer, device)
        else:
            resp = generate_transl(tr['en1'], model, tokenizer, device)

        if debug:
            print("RESPONSE: ", resp)

        if model_name == 'dream':
            y_true, expl_true = _split_label_and_explanation(tr['en2'])
            try:
                y_pred, expl_pred = extract_resp_from_dream(resp)
            except ValueError:
                # A malformed generation is scored as a failure instead of aborting the run.
                y_pred, expl_pred = 'fail', 'fail'
            if debug:
                print(y_pred)
                print(expl_pred)
                print('*')
        elif model_name in label_only:
            y_pred = _label_from_response(resp)
            y_true = tr['en2']
            expl_pred, expl_true = None, None
        else:
            y_true, expl_true = _split_label_and_explanation(tr['en2'])
            if any(tag in resp for tag in ('Entailment.', 'Contradiction.', 'No Relation.')):
                y_pred, expl_pred = _split_label_and_explanation(resp)
            else:
                y_pred, expl_pred = 'fail', 'fail'

        if verbose:
            print(y_true, y_pred)
        preds.append([i, y_true, y_pred, expl_true, expl_pred])
        if debug and i>10:
            break

    preds_df = pd.DataFrame(preds, columns=['ID', 'y_true', 'y_pred', 'expl_true', 'expl_pred'])
    os.makedirs(output_dir, exist_ok=True)
    preds_df.to_csv(os.path.join(output_dir, f'{model_name}.csv'), index=False)

    return preds_df


## SocNorm

In [18]:
# socnli_path = "/local/nlpswordfish/a.saakyan/socnorms/t5-socnli-batch2/checkpoint-140"
# soc_model, soc_tok = load_model(socnli_path, device1)
#pdf_socnorm = eval_model(soc_model, soc_tok, device1, 't5-socnorm')

768it [09:34,  1.34it/s]


In [20]:
#f1_score(pdf_socnorm['y_true'], pdf_socnorm['y_pred'], average='macro')

0.5452316190056435

## ESNLI

In [None]:
# esnli_path = "/home/a.saakyan/models/t5-esnli-batch2/checkpoint-268"
# esnli_model, esnli_tok = load_model(esnli_path, device0)
# pdf_esnli= eval_model(esnli_model, esnli_tok, device0, 't5-esnli')

In [22]:
#f1_score(pdf_esnli['y_true'], pdf_esnli['y_pred'], average='macro')

0.334823198909099

## FLAN

In [30]:
# flan_path = "google/flan-t5-xl"
# flan_model, flan_tok = load_model(flan_path, device0)

In [61]:
# pdf_flan= eval_model(flan_model, flan_tok, device0, 't5-flan')
# f1_score(pdf_flan['y_true'], pdf_flan['y_pred'], average='macro')

768it [07:12,  1.77it/s]


0.07217330409986283

## DREAM

In [None]:
# The evaluation cell below needs dream_model and dream_tokenizer, so this load
# must run for the notebook to execute top to bottom on a fresh kernel.
dream_model = AutoModelForSeq2SeqLM.from_pretrained(
    "allenai/System3_DREAM_FLUTE_social_norm_FigLang2022",
    cache_dir="./huggingface_models",
).to(device2)
dream_tokenizer = AutoTokenizer.from_pretrained("t5-3b")

In [105]:
# Score the DREAM checkpoint on the default test split, then report macro-F1.
pdf_dream = eval_model(
    model=dream_model,
    tokenizer=dream_tokenizer,
    device=device2,
    model_name='dream',
)
f1_score(pdf_dream['y_true'], pdf_dream['y_pred'], average='macro')

768it [12:04,  1.06it/s]


0.17675943278990255

## I->OR vs I->O

In [None]:
# Checkpoint for the 't5-io' run, placed on GPU 0.
io_path = "./t5-socnli-io/checkpoint-70/"
io_model, io_tok = load_model(model_path=io_path, device=device0)

In [19]:
# Evaluate the 't5-io' model on its own test file.
pdf_io = eval_model(
    model=io_model,
    tokenizer=io_tok,
    device=device0,
    model_name='t5-io',
    test_path="./data/socnli_t5_IO/test.json",
)

768it [03:14,  3.95it/s]


In [24]:
from sklearn.metrics import accuracy_score, f1_score

# Accuracy on the first line, macro-F1 on the second.
print(
    accuracy_score(pdf_io['y_true'], pdf_io['y_pred']),
    f1_score(pdf_io['y_true'], pdf_io['y_pred'], average='macro'),
    sep="\n",
)

0.5338541666666666
0.4973825601778697


In [25]:
# Load the 't5-ir_o' checkpoint on GPU 1 and evaluate it on its own test file.
ir_o_path = "./t5-socnli-ir_o/checkpoint-70/"
ir_o_model, ir_o_tok = load_model(model_path=ir_o_path, device=device1)
pdf_ir_o = eval_model(
    model=ir_o_model,
    tokenizer=ir_o_tok,
    device=device1,
    model_name='t5-ir_o',
    test_path="./data/socnli_t5_IR_O/test.json",
)

768it [03:18,  3.88it/s]


In [26]:
# Display the per-example prediction table returned by the 't5-ir_o' evaluation.
pdf_ir_o

Unnamed: 0,ID,y_true,y_pred,expl_true,expl_pred
0,0,No Relation,No Relation,,
1,1,Entailment,Entailment,,
2,2,Entailment,Contradiction,,
3,3,Contradiction,Contradiction,,
4,4,No Relation,No Relation,,
...,...,...,...,...,...
763,763,No Relation,No Relation,,
764,764,Entailment,Entailment,,
765,765,No Relation,No Relation,,
766,766,No Relation,No Relation,,


In [27]:
from sklearn.metrics import accuracy_score, f1_score

# Accuracy on the first line, macro-F1 on the second.
print(
    accuracy_score(pdf_ir_o['y_true'], pdf_ir_o['y_pred']),
    f1_score(pdf_ir_o['y_true'], pdf_ir_o['y_pred'], average='macro'),
    sep="\n",
)

0.9609375
0.9474649172241077
