In [2]:
import yaml
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import StoppingCriteria, StoppingCriteriaList

In [56]:
# Relative path to the YAML run configuration that the following cell loads into config_dict.
path_to_config = "../configs/gpt2-small-echo.yaml"

In [57]:
# Open the YAML config as a stream and load it into config_dict.
with open(path_to_config, "r") as stream:
    try:
        config_dict = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print("Configuration load failed!")
        print(exc)
        # Re-raise: swallowing the error would leave config_dict undefined and
        # make every later cell fail with an unrelated NameError.
        raise

In [58]:
# Prefer the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [59]:
# Load the fine-tuned GPT-2 model and its tokenizer from the directories named
# by the "output_model_dir" / "output_tokenizer_dir" entries of the config.
model = GPT2LMHeadModel.from_pretrained(config_dict["output_model_dir"])
tokenizer = GPT2Tokenizer.from_pretrained(config_dict["output_tokenizer_dir"])


In [60]:
# Move the model onto the selected device; as the last expression of the cell,
# the returned module's repr is what gets displayed below.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [31]:
# Hand-written sample prompt: an "Instruction:" line followed directly by one Therapist/Client exchange.
alpha_text = "Instruction: The following is an interaction between a therapist and a client. Act as the therapist and give a reflection to the client's response. The reflection must be a statement and not a question. The reflection must be a rephrasing of the client's response.\nTherapist: It's great to hear you want to reduce your smoking. What would it look like when you have reduced your smoking addiction?\nClient: Better health condition, less money spent but more stress.\n"

In [32]:
# Hand-written sample prompt with no instruction: just one Therapist/Client exchange.
beta_text = "Therapist: Now, what is one thing about your smoking habit that you would like to change?\nClient: I'd like to quit completely. Maybe not quit entirely, but go down to 1 a day or only smoke during social gatherings.\n"

In [33]:
# Hand-written sample prompt: instruction, a "###" separator, then one Therapist/Client exchange.
charlie_text = "Instruction: The following is an interaction between a therapist and a client. Act as the therapist and give a reflection to the client's response. The reflection must be a statement and not a question. The reflection must be a rephrasing of the client's response.\n\n###\n\nTherapist: What else do you dislike about smoking?\nClient: The money i spend on cigarettes\n"

In [34]:
# Hand-written sample prompt: instruction, one Therapist/Client exchange, then a trailing "###" separator.
# NOTE(review): the deltaInput template built later in this notebook starts with lowercase
# "instruction: " and ends with one more newline ("\n\n###\n\n\n") -- confirm which form matches training.
delta_text = "Instruction: The following is an interaction between a therapist and a client. Act as the therapist and give a reflection to the client's response. The reflection must be a statement and not a question. The reflection must be a rephrasing of the client's response.\n\nTherapist: What else do you dislike about smoking?\nClient: The money i spend on cigarettes\n\n###\n\n"

In [35]:
# Hand-written sample prompt with "### Instruction" / "Conversation" style headers.
# NOTE(review): the echoInput template built later in this notebook uses "### Instruction:\n"
# (newline after the header) and "### Conversation:\n", whereas this sample has
# "### Instruction: " and " Conversation:\n" -- confirm which form matches training.
echo_text = "### Instruction: The following is an interaction between a therapist and a client. Act as the therapist and give a reflection to the client's response. The reflection must be a statement and not a question. The reflection must be a rephrasing of the client's response.\n\n Conversation:\nTherapist: What else do you dislike about smoking?\nClient: The money i spend on cigarettes\n"

In [15]:
# Load the holdout set of instruction / question / answer rows into a DataFrame.
df = pd.read_csv("../data/instruct-reflections/holdout-set/holdout-instruct-question-answer.csv")

In [16]:
# Inspect the column names available in the loaded holdout frame.
df.columns

Index(['PromptAndResponse', 'instruction', 'question', 'answer'], dtype='object')

In [17]:
# Build one prompt column per template (alpha .. echo) from the instruction, question and
# answer columns. The string arithmetic is vectorised over whole columns rather than
# assigning cell by cell with df.loc inside an iterrows() loop.
# Note: rows with a missing instruction/question/answer produce NaN prompts here.
instruction = df["instruction"]
question = df["question"]
answer = df["answer"]

# The Therapist/Client exchange shared by every template.
dialogue = "Therapist: " + question + "\n" + "Client: " + answer + "\n"

# alpha: instruction line immediately followed by the exchange.
df["alphaInput"] = "Instruction: " + instruction + "\n" + dialogue

# beta: the exchange only, no instruction.
df["betaInput"] = dialogue

# charlie: instruction, "###" separator, then the exchange.
df["charlieInput"] = "Instruction: " + instruction + "\n\n###\n\n" + dialogue

# delta: instruction, blank line, the exchange, then a trailing "###" separator.
# NOTE(review): unlike alpha/charlie this template starts with lowercase "instruction: " --
# kept byte-for-byte; confirm it matches the format the delta model was trained on.
df["deltaInput"] = (
    "instruction: " + instruction + "\n\n"
    + "Therapist: " + question + "\n" + "Client: " + answer + "\n\n###\n\n\n"
)

# echo: "### Instruction:" and "### Conversation:" headers around the exchange.
df["echoInput"] = "### Instruction:\n" + instruction + "\n\n" + "### Conversation:\n" + dialogue

In [18]:
'''
stop_criteria.py contains the class definition for StopTokenCriteria. 
'''

# The model.generate() function accepts as an argument a child of the abstract base class StoppingCriteria
# that defines the stopping criteria of the autoregressive loop embedded in model.generate(). The criteria defined
# is evaluated after each token is generated.
# StopTokenCriteria defines a callable StoppingCriteria that stops (returns True) when the latest
# generated token decodes to the stop token (e.g. a \n character) and the string "Therapist:" is present in the decoded sequence.
class StopTokenCriteria(StoppingCriteria):
    """Stopping criterion: halt once a stop token appears and a marker string is present.

    The criterion returns True when the most recent token decodes to exactly
    ``stop_token`` and ``required_text`` occurs in the decoded sequence.

    Args:
        stop_token: String the latest token must decode to (for example a newline).
        tokenizer: Tokenizer used to decode token ids back into text.
        required_text: Marker that must occur in the decoded sequence before
            stopping is allowed. Defaults to ``"Therapist:"``.
    """

    def __init__(self, stop_token, tokenizer, required_text="Therapist:"):
        self.stop_token = stop_token
        self.tokenizer = tokenizer
        self.required_text = required_text

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when generation should stop; only the last row of ``input_ids`` is inspected."""
        sequence = input_ids[-1]

        # Cheap check first: does the newest token decode to the stop token?
        if self.tokenizer.decode(sequence[-1]) != self.stop_token:
            return False

        # The marker must also be present somewhere in the decoded row.
        # NOTE(review): the whole row is decoded, so if it contains the prompt tokens a marker
        # that is already part of the prompt satisfies this check too -- confirm that is intended.
        return self.required_text in self.tokenizer.decode(sequence)

In [19]:
def inference_model(model, tokenizer, config_dict, text, stopping_token="\n", use_stopping_criteria=True):
    """Generate a continuation of ``text`` with ``model`` and return it decoded.

    Args:
        model: Causal language model providing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt and decode the output.
        config_dict: Configuration mapping; its ``'refgen'`` entry must provide
            ``'do_sample'``, ``'top_k'`` and ``'top_p'``.
        text: Prompt string to condition the generation on.
        stopping_token: Token string handed to ``StopTokenCriteria``.
        use_stopping_criteria: When False, no custom stopping criterion is installed and
            generation ends only through the limits passed to ``generate()``.

    Returns:
        The first returned sequence decoded to a string with special tokens removed.
    """
    model.eval()
    generation_config = config_dict['refgen']

    # Encode the prompt and place it on the same device as the model, so the function
    # does not rely on a notebook-level `device` variable.
    input_ids = tokenizer.encode(
        text, return_tensors="pt", padding=True, truncation=True
    ).to(model.device)

    stopping_criteria = StoppingCriteriaList(
        [StopTokenCriteria(stopping_token, tokenizer)] if use_stopping_criteria else []
    )

    # Condition on the encoded prompt and let the model continue it.
    with torch.no_grad():
        sample_outputs = model.generate(input_ids,
                                        bos_token_id=tokenizer.bos_token_id,
                                        temperature = 0.8,
                                        # flag to use a sampling technique or greedy
                                        do_sample=generation_config['do_sample'],
                                        # penalize model for duplicating words
                                        repetition_penalty = 1.1,
                                        pad_token_id=tokenizer.pad_token_id,
                                        # of proposed words, only select from top k of them
                                        top_k=generation_config['top_k'],
                                        # upper bound on the total sequence length (prompt + generated tokens)
                                        max_length=256,
                                        stopping_criteria=stopping_criteria,
                                        # nucleus sampling: keep the smallest set of most probable
                                        # tokens whose cumulative probability reaches top_p
                                        top_p=generation_config['top_p'],
                                        # num of independently computed returned sequences for each element in the batch.
                                        num_return_sequences=1
                                       )
        output = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
    return output

In [53]:
# Display the config path currently held in the kernel (sanity check before running inference).
path_to_config

'../configs/gpt2-small-delta.yaml'

In [62]:
# Generate a reflection for every echo-format prompt and store it in 'echoReflection'.
# Results are written row by row, so rows already processed keep their value if the run stops early.
n_rows = len(df)
for idx, prompt in df['echoInput'].items():
    print(f"{idx}: {n_rows}")

    generated = inference_model(model, tokenizer, config_dict, prompt)

    # Strip the prompt, the speaker tag and every newline so only the reflection text remains.
    for fragment in (prompt, "Therapist: ", "\n"):
        generated = generated.replace(fragment, '')

    df.loc[idx, 'echoReflection'] = generated

0: 1321
1: 1321
2: 1321
3: 1321
4: 1321
5: 1321
6: 1321
7: 1321
8: 1321
9: 1321
10: 1321
11: 1321
12: 1321
13: 1321
14: 1321
15: 1321
16: 1321
17: 1321
18: 1321
19: 1321
20: 1321
21: 1321
22: 1321
23: 1321
24: 1321
25: 1321
26: 1321
27: 1321
28: 1321
29: 1321
30: 1321
31: 1321
32: 1321
33: 1321
34: 1321
35: 1321
36: 1321
37: 1321
38: 1321
39: 1321
40: 1321
41: 1321
42: 1321
43: 1321
44: 1321
45: 1321
46: 1321
47: 1321
48: 1321
49: 1321
50: 1321
51: 1321
52: 1321
53: 1321
54: 1321
55: 1321
56: 1321
57: 1321
58: 1321
59: 1321
60: 1321
61: 1321
62: 1321
63: 1321
64: 1321
65: 1321
66: 1321
67: 1321
68: 1321
69: 1321
70: 1321
71: 1321
72: 1321
73: 1321
74: 1321
75: 1321
76: 1321
77: 1321
78: 1321
79: 1321
80: 1321
81: 1321
82: 1321
83: 1321
84: 1321
85: 1321
86: 1321
87: 1321
88: 1321
89: 1321
90: 1321
91: 1321
92: 1321
93: 1321
94: 1321
95: 1321
96: 1321
97: 1321
98: 1321
99: 1321
100: 1321
101: 1321
102: 1321
103: 1321
104: 1321
105: 1321
106: 1321
107: 1321
108: 1321
109: 1321
110: 1321


In [63]:
# Display the frame, including the echoReflection column written by the inference loop.
df

Unnamed: 0,PromptAndResponse,instruction,question,answer,alphaInput,betaInput,charlieInput,deltaInput,echoInput,alphaReflection,betaReflection,charlieReflection,deltaReflection,echoReflection
0,BOT: What will it look like when you have made...,The following is an interaction between a ther...,What will it look like when you have made this...,"well, i think , i´ll leave the addiction progr...",Instruction: The following is an interaction b...,Therapist: What will it look like when you hav...,Instruction: The following is an interaction b...,instruction: The following is an interaction b...,### Instruction:\nThe following is an interact...,Instruction: The following is an interaction b...,What will it look like when you have made this...,Instruction: The following is an interaction b...,instruction: The following is an interaction b...,### Instruction:The following is an interactio...
1,BOT: What will it look like when you have made...,The following is an interaction between a ther...,What will it look like when you have made this...,weird but healthier,Instruction: The following is an interaction b...,Therapist: What will it look like when you hav...,Instruction: The following is an interaction b...,instruction: The following is an interaction b...,### Instruction:\nThe following is an interact...,It seems that the change in your smoking habit...,It seems that you are uncertain about the heal...,It seems that you believe making changes to yo...,It sounds like you believe that making this ch...,It sounds like you're not sure what making a c...
2,BOT: What will it look like when you have made...,The following is an interaction between a ther...,What will it look like when you have made this...,"unfortunately, I can't imagine that",Instruction: The following is an interaction b...,Therapist: What will it look like when you hav...,Instruction: The following is an interaction b...,instruction: The following is an interaction b...,### Instruction:\nThe following is an interact...,It seems that you're currently experiencing di...,It seems that you're uncertain about how this ...,It seems that you're uncertain about what the ...,It seems that you are uncertain about how the ...,It seems that you're uncertain about how the c...
3,BOT: What will it look like when you have made...,The following is an interaction between a ther...,What will it look like when you have made this...,"the same, as I do not desire a change",Instruction: The following is an interaction b...,Therapist: What will it look like when you hav...,Instruction: The following is an interaction b...,instruction: The following is an interaction b...,### Instruction:\nThe following is an interact...,You feel that things will essentially remain t...,You feel that there won't be any significant o...,You feel that making a change in your smoking ...,You feel that there won't be any noticeable di...,It sounds like you're not interested in changi...
4,BOT: What will it look like when you have made...,The following is an interaction between a ther...,What will it look like when you have made this...,the same,Instruction: The following is an interaction b...,Therapist: What will it look like when you hav...,Instruction: The following is an interaction b...,instruction: The following is an interaction b...,### Instruction:\nThe following is an interact...,"So, you feel as though there won't be any noti...",You feel that things will essentially remain t...,It seems that you feel there won't be any noti...,You feel that things will essentially remain t...,You feel that things will essentially remain t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1316,"BOT: Finally, what are the steps you need to t...",The following is an interaction between a ther...,"Finally, what are the steps you need to take t...","At the beginning, I try to get off the dose of...",Instruction: The following is an interaction b...,"Therapist: Finally, what are the steps you nee...",Instruction: The following is an interaction b...,instruction: The following is an interaction b...,### Instruction:\nThe following is an interact...,It sounds like you're planning to at the begin...,It sounds like you're planning to avoid the ha...,It sounds like you're planning to leave the nu...,You plan to start by reducing the number of ci...,It sounds like you're considering exploring al...
1317,"BOT: Finally, what are the steps you need to t...",The following is an interaction between a ther...,"Finally, what are the steps you need to take t...",Actually just to do it,Instruction: The following is an interaction b...,"Therapist: Finally, what are the steps you nee...",Instruction: The following is an interaction b...,instruction: The following is an interaction b...,### Instruction:\nThe following is an interact...,It sounds like you're ready to take action and...,You just need to be ready to begin this change...,It sounds like you're ready to take action and...,It sounds like you're ready to take action and...,It sounds like you're truly determined to quit...
1318,"BOT: Finally, what are the steps you need to t...",The following is an interaction between a ther...,"Finally, what are the steps you need to take t...",Absolutely no steps,Instruction: The following is an interaction b...,"Therapist: Finally, what are the steps you nee...",Instruction: The following is an interaction b...,instruction: The following is an interaction b...,### Instruction:\nThe following is an interact...,You don't have any specific steps in mind for ...,You recognize that fully committing to the pla...,You recognize that none of the steps are neces...,You are fully certain about the necessary step...,You don't have any specific steps in mind for ...
1319,"BOT: Finally, what are the steps you need to t...",The following is an interaction between a ther...,"Finally, what are the steps you need to take t...",1. Don't buy cigarettes. 2. smoke less,Instruction: The following is an interaction b...,"Therapist: Finally, what are the steps you nee...",Instruction: The following is an interaction b...,instruction: The following is an interaction b...,### Instruction:\nThe following is an interact...,You plan to stop purchasing cigarettes as a st...,You plan to reduce your cigarette consumption ...,You plan to reduce the cost of cigarettes and ...,You are determined to stop purchasing cigarett...,You plan to stop purchasing cigarettes as a st...


In [17]:
text = "ABC is a startup based in New York City and Paris"

# Encode the sample sentence into a PyTorch tensor of token ids and move it to the target device.
tokenized_text = tokenizer.encode(
    text, return_tensors="pt", padding=True, truncation=True
)
input_ids = tokenized_text.to(device)

# Separately build a fixed-length encoding (truncated/padded to 256 tokens) for inspection.
# It is no longer converted into input_ids: that assignment was overwritten immediately.
encodings_dict = tokenizer(text, truncation=True, max_length=256, padding="max_length")

In [22]:
# Display the max_length-padded encoding of the sample sentence built in the preceding cell.
encodings_dict

NameError: name 'encodings_dict' is not defined

In [20]:
# Perplexity of the sample sentence: exp of the loss the model returns when the
# input ids are also passed as labels. No gradients are needed, hence no_grad().
with torch.no_grad():   
    loss = model(input_ids = input_ids, labels = input_ids).loss
ppl = torch.exp(loss)
print(ppl)

tensor(2.2428e+08, device='cuda:0')


In [21]:
def sentence_perplexity(text):
    """Return exp(language-modelling loss) of ``text`` under the notebook's ``model``.

    The token ids are moved to the model's device before the forward pass, and the
    pass runs under ``torch.no_grad()`` because only the loss value is needed.
    """
    token_ids = tokenizer(text, return_tensors="pt")["input_ids"].to(model.device)
    with torch.no_grad():
        loss = model(input_ids=token_ids, labels=token_ids).loss
    return torch.exp(loss)


# Probe sentences, in the same order as before: the startup sentence, a Wikipedia-style
# sentence, a single word, and a nonsense string.
probe_sentences = [
    "ABC is a startup based in New York City and Paris",
    "Generative Pretrained Transformer is an opensource artificial intelligence created by OpenAI in February 2019",
    "Instruction",
    "RABBA RABBA RABBA",
]

for sentence in probe_sentences:
    ppl = sentence_perplexity(sentence)
    print(ppl)

tensor(2.2428e+08)
tensor(8.8227e+08)
tensor(1.)
tensor(7.0527e+09)


In [43]:
from datasets import load_dataset
from datasets import Dataset

In [44]:
# Wrap the holdout DataFrame (including the columns added above) in a datasets.Dataset.
dataset = Dataset.from_pandas(df)

In [None]:
# Join every text in the evaluation split with blank lines and tokenize it as one long sequence.
# NOTE(review): `test` is not defined anywhere in the visible notebook, so this cell raises
# NameError as written -- confirm which dataset/column was meant here.
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

In [None]:
# Sliding-window perplexity over the long token sequence in `encodings`.
# Each window holds up to max_length tokens and advances by `stride`; only tokens not
# scored by an earlier window contribute to the loss (earlier positions are masked with -100).
max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)

nll_sum = 0.0   # total negative log-likelihood over all scored tokens
n_tokens = 0    # number of tokens that contributed to nll_sum
prev_end_loc = 0
# Plain range(): tqdm is not imported anywhere in this notebook, so wrapping the
# iterator in tqdm(...) raised NameError.
for begin_loc in range(0, seq_len, stride):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    # The model shifts labels left by one internally, so the labels that actually
    # enter the loss are the unmasked entries of target_ids[:, 1:].
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    if num_loss_tokens > 0:
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

        # outputs.loss is the mean over the scored labels; weight it by their count so
        # that a short final window does not count as much as a full one.
        nll_sum += outputs.loss * num_loss_tokens
        n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

if n_tokens == 0:
    raise ValueError("No tokens were scored; `encodings` must contain at least two tokens.")

# Perplexity = exp(average negative log-likelihood per scored token).
ppl = torch.exp(nll_sum / n_tokens)