# This pipeline sets up batch inference for DialoGPT-large and BlenderBot 1.0

Installs Hugging Face `transformers` for BlenderBot and DialoGPT \
Installs `sentence-transformers` for sentence-similarity scores

In [None]:
!pip3 install transformers
!pip3 install -U sentence-transformers



In [None]:
import csv
import json
import os
import time

# Set before any CUDA work happens so kernel launches run synchronously and
# device-side errors are reported at the call that caused them.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# torch.cuda.is_available has to be *called*: the bare function object is always
# truthy, which would select 'cuda' even on a machine without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'  # uncomment to force CPU
print(device)

cuda


## Mount Google Drive and go to SALT Lab folder

In [None]:
# Mount Google Drive at /content/drive so its files are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
path = 'drive/MyDrive/Colab Notebooks/SALT Lab' 
%cd $path

/content/drive/MyDrive/Colab Notebooks/SALT Lab


# Load CSVs as pandas dataframes
`standard_contexts` = unedited contexts \
`perturbed_contexts` = AAVE contexts \
`short_contexts` = shorter list of contexts

In [None]:
# Read the context CSVs (paths are relative to the current working directory),
# using the first column of each file as the DataFrame index.
# NOTE(review): perturbed_path is identical to standard_path, so perturbed_df is
# read from the same file as standard_df — confirm whether a separate perturbed
# file was intended.
standard_path = 'data/reddit_filtered.csv'
perturbed_path = 'data/reddit_filtered.csv'
standard_df = pd.read_csv(standard_path, index_col = 0)
perturbed_df = pd.read_csv(perturbed_path, index_col = 0)

In [None]:
# Pull the 'context' column out of each frame; the generation helpers read standard_contexts.
standard_contexts = standard_df['context']
perturbed_contexts = perturbed_df['context']
# Small slice (0:10) of the standard contexts for quick experiments.
short_contexts = standard_df['context'][0:10]

# Set up DialoGPT-large model using Hugging Face
## Function `get_dialogpt_outputs()`
@param num_samples: number of samples to input \
@param num_copies: number of outputs per sample \
@param temp: temperature \
@output all outputs (array of size num_samples x num_copies)

In [None]:
# DialoGPT is a decoder-only (causal) model and prompts are generated in padded
# batches, so pad on the left: with right padding the model would have to continue
# from pad tokens for every prompt shorter than the longest one in the batch.
Dialogpt_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large", padding_side="left")
# Register the end-of-text token as the pad token so batches can be padded.
Dialogpt_tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})
Dialogpt_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large").to(device=device)

In [None]:
def get_dialogpt_outputs(num_samples, num_copies, temp, contexts=None, batch_size=5):
  """Sample DialoGPT replies for the first `num_samples` contexts.

  Contexts are sent to the model in batches of `batch_size`. The last batch may be
  smaller, so every requested context gets replies even when `num_samples` is not a
  multiple of `batch_size`.

  @param num_samples: number of contexts to respond to (clipped to len(contexts))
  @param num_copies: number of sampled replies per context
  @param temp: sampling temperature passed to generate()
  @param contexts: sequence of context strings; defaults to the global standard_contexts
  @param batch_size: number of contexts per generate() call
  @return list with one entry per context, each a list of num_copies reply strings
  @raise ValueError: if batch_size is smaller than 1
  """
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1")
  if contexts is None:
    contexts = standard_contexts
  # Never ask for more contexts than exist; an empty batch cannot be tokenized.
  num_samples = min(num_samples, len(contexts))

  all_outputs = []
  for start in tqdm(range(0, num_samples, batch_size)):
    stop = min(start + batch_size, num_samples)

    # Pull the batched inputs and append the EOS token to the end of each input.
    inputs = [text + Dialogpt_tokenizer.eos_token for text in list(contexts[start:stop])]

    # Encode the inputs as one padded batch.
    # NOTE(review): decoder-only models are normally batched with left padding;
    # check Dialogpt_tokenizer.padding_side.
    input_info = Dialogpt_tokenizer(inputs, padding = True, return_tensors = 'pt').to(device=device)
    input_ids = input_info['input_ids']
    attention_mask = input_info['attention_mask']

    # Generate len(inputs) * num_copies sequences; the copies belonging to one
    # input are adjacent in the returned batch.
    chat_history_ids = Dialogpt_model.generate(
        input_ids,
        max_length=1000,
        do_sample = True,
        temperature = temp,
        num_return_sequences = num_copies,
        pad_token_id = Dialogpt_tokenizer.eos_token_id,
        attention_mask = attention_mask)

    # Each returned sequence starts with the (padded) prompt; decode only what follows it.
    prompt_length = input_ids.shape[-1]
    replies = [
        Dialogpt_tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        for sequence in chat_history_ids
    ]

    # Regroup the flat reply list into one list of num_copies replies per input.
    all_outputs.extend(
        replies[x : x + num_copies] for x in range(0, len(replies), num_copies))
  return all_outputs



## DialoGPT sanity test

In [None]:
# Sanity check: num_samples=5, num_copies=5, temp=0.5.
print(get_dialogpt_outputs(5, 5, 0.5))

100%|██████████| 1/1 [00:05<00:00,  5.51s/it]

[["I don't know. I have social anxiety and I'm not American.", "It's just a thing that's been around for a long time.", 'Social anxiety is a symptom of depression', "Yes, it's an American thing.", "It's a social thing."], ["No, he's related to Mr. Riddle", 'No, just a very good artist.', 'No, but his mother is.', "No, but that's a great name for a band.", 'Nope, just a random name.'], ['Hue hue!', 'Gracias!!', 'Gracias!', 'Aqui no te preocupes?', 'Gracias!'], ['I wonder if he was a teacher at the same school as the guy who signed the letter.', 'I know. I was surprised that they signed a letter.', 'I thought he was a teacher?', 'I wonder how much that teacher got for that...', 'Khabib is a fighter, not a teacher.'], ["I think you're the one with the most comments this week.", "I'm a fan of your comment karma", 'I was the first to comment on this.', 'I thought I was the only one.', "No you're the one who posted the picture."]]





# Set up Blenderbot-1B-distill using Hugging Face
## Function `get_blenderbot_outputs()`
@param num_samples: number of samples to input \
@param num_copies: number of outputs per sample \
@param temp: temperature \
@output all outputs (array of size num_samples x num_copies)

In [None]:
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration

# Load the BlenderBot tokenizer and model from the same checkpoint id and move
# the model to `device`.
blenderbot_path = "facebook/blenderbot-1B-distill"
blenderbot_tokenizer = BlenderbotTokenizer.from_pretrained(blenderbot_path)
blenderbot_model = BlenderbotForConditionalGeneration.from_pretrained(blenderbot_path).to(device=device)

In [None]:
# Batched BlenderBot response sampling over the standard contexts
import time
def get_blenderbot_outputs(num_samples, num_copies, temp, contexts=None, batch_size=2):
  """Sample BlenderBot replies for the first `num_samples` contexts.

  Contexts are sent to the model in batches of `batch_size`. The last batch may be
  smaller, so every requested context gets replies even when `num_samples` is not a
  multiple of `batch_size`.

  @param num_samples: number of contexts to respond to (clipped to len(contexts))
  @param num_copies: number of sampled replies per context
  @param temp: sampling temperature passed to generate()
  @param contexts: sequence of context strings; defaults to the global standard_contexts
  @param batch_size: number of contexts per generate() call
  @return list with one entry per context, each a list of num_copies reply strings
  @raise ValueError: if batch_size is smaller than 1
  """
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1")
  if contexts is None:
    contexts = standard_contexts
  # Never ask for more contexts than exist; an empty batch cannot be tokenized.
  num_samples = min(num_samples, len(contexts))

  all_outputs = []
  for start in tqdm(range(0, num_samples, batch_size)):
    stop = min(start + batch_size, num_samples)

    # Pull up to batch_size inputs and encode them as one padded batch.
    samples = list(contexts[start:stop])
    input_info = blenderbot_tokenizer(samples, padding = True, return_tensors='pt').to(device=device)
    input_ids = input_info['input_ids']
    attention_mask = input_info['attention_mask']

    # Generate len(samples) * num_copies sequences; the copies belonging to one
    # input are adjacent in the returned batch.
    chat_history_ids = blenderbot_model.generate(
        input_ids,
        max_length=1000,
        do_sample = True,
        temperature = temp,
        num_return_sequences = num_copies,
        attention_mask = attention_mask)

    # Decode every generated sequence in full.
    replies = [
        blenderbot_tokenizer.decode(sequence, skip_special_tokens=True)
        for sequence in chat_history_ids
    ]

    # Regroup the flat reply list into one list of num_copies replies per input.
    all_outputs.extend(
        replies[x : x + num_copies] for x in range(0, len(replies), num_copies))
  return all_outputs

## Blenderbot sanity test

In [None]:
# Sanity check: num_samples=2, num_copies=10, temp=1.
print(get_blenderbot_outputs(2, 10, 1))

100%|██████████| 1/1 [00:14<00:00, 14.28s/it]

[[" I'm not sure, but I do know that social anxiety is a fear of social situations.", " I'm not sure, but I think it has something to do with the fact that a lot of people are introverted.", " I'm not really sure, but I do know that it is a mental disorder where people feel anxious in social situations.", " I'm not sure, but I do know that social anxiety is a fear of social interaction.", " I'm not sure, but I do know that social anxiety is a fear of social situations.", " I'm not sure, but I do know that social anxiety is a mental disorder characterized by fear of social situations.", " I'm not sure, but I do know it can be caused by a combination of genetic and environmental factors.", " I'm not sure, but I do know that social anxiety disorder is a mental disorder characterized by excessive fear of social situations.", " I'm not sure, but I do know that social anxiety is a fear of social situations.", " I'm not sure, but I do know that social anxiety disorder is a mental disorder."],




# Dialog model temperature volatility tests
In this section we will test how diverse model outputs are at different temperature values. \
We will use BLEU and sentence-embedding similarities to gauge model output.

In [None]:
from itertools import combinations
import nltk
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model whose encode() output is compared with cosine similarity.
semantic_similarity_model = SentenceTransformer('all-mpnet-base-v2')

## BLEU and Semantic Similarity Sanity Tests (Just make sure they're working)

In [None]:
def bleu_sanity_test():
  """Return the BLEU score of two sentences that differ only in their last word.

  sentence_bleu expects token sequences. A raw string is iterated character by
  character, which would score character n-grams instead of word n-grams, so both
  sentences are split on whitespace first. Smoothing keeps the score from
  collapsing when some n-gram order has no overlap.
  """
  reference = "I think I might have to go with you."
  hypothesis = "I think I might have to go with poo."
  smoothing = nltk.translate.bleu_score.SmoothingFunction().method1
  score = nltk.translate.bleu_score.sentence_bleu(
      [reference.split()], hypothesis.split(), smoothing_function=smoothing)
  return score
def semantic_similarity_sanity_test():
  """Return the embedding cosine similarity of two near-identical sentences."""
  sentence_pair = [
      "I think I might have to go with you.",
      "I think I might have to go with poo.",
  ]
  # Encode both sentences in one call, then compare the two resulting vectors.
  embeddings = semantic_similarity_model.encode(sentence_pair)
  similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1])
  return float(similarity)

# Run both sanity checks and print their scores.
print(bleu_sanity_test())
print(semantic_similarity_sanity_test())

0.897426966486071
0.32228589057922363


In [None]:
def ss_tests():
  """Print BLEU and embedding similarity for each adjacent pair of example sentences.

  Sentences are split on whitespace before scoring because sentence_bleu expects
  token sequences; a raw string would be scored character by character. Smoothing
  avoids degenerate scores when a pair shares no higher-order n-grams.
  """
  test = [
      "I'm not sure what you're trying to say.",
      "You're a good man.",
      "I'll be your new best friend.",
      'You have a lot of comments.',
      "It's a bot.",
      'I know that feel bro.',
      'You were the one that said it, though.',
      "This is why I don't post on this sub anymore.",
      "I'm so happy you're back! We all miss you.",
      'I got you fam.',
  ]
  bleu_module = nltk.translate.bleu_score
  smoothing = bleu_module.SmoothingFunction().method1
  # Walk over consecutive (reference, hypothesis) pairs.
  for reference, hypothesis in zip(test, test[1:]):
    print(reference)
    print(hypothesis)
    bleu_score = bleu_module.sentence_bleu(
        [reference.split()], hypothesis.split(), smoothing_function=smoothing)
    sentence_embeddings = semantic_similarity_model.encode([reference, hypothesis])
    ss_score = float(util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1]))
    print("bleu score = " + str(bleu_score))
    print("ss_score = " + str(ss_score))
# Print the scores for the example sentence pairs.
ss_tests()


I'm not sure what you're trying to say.
You're a good man.
bleu score = 0.10529499920771897
ss_score = 0.08433771133422852
You're a good man.
I'll be your new best friend.
bleu score = 0.4057108879449503
ss_score = 0.20012082159519196
I'll be your new best friend.
You have a lot of comments.
bleu score = 0.5020923136190463
ss_score = 0.19615882635116577
You have a lot of comments.
It's a bot.
bleu score = 0.09213888389040659
ss_score = 0.17537584900856018
It's a bot.
I know that feel bro.
bleu score = 0.38260294162784475
ss_score = 0.10398069024085999
I know that feel bro.
You were the one that said it, though.
bleu score = 0.15257340614701648
ss_score = 0.13090452551841736
You were the one that said it, though.
This is why I don't post on this sub anymore.
bleu score = 0.27486893254130634
ss_score = 0.1572614163160324
This is why I don't post on this sub anymore.
I'm so happy you're back! We all miss you.
bleu score = 0.5203529238931589
ss_score = 0.12132645398378372
I'm so happy you'

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


bleu score = 0.030983872032022023
ss_score = 0.15337564051151276


# BLEU & semantic similarity functions on lists of num_samples x num_copies
Function `bleu()` \
@return all pairwise BLEU scores and their average

Function `semantic_similarity()` \
@return all pairwise semantic similarities (based on sentence embeddings) and their average



In [None]:
def list_to_json(data, name):
  """Write `data` as JSON to the file `name` and print a confirmation."""
  with open(name, "w") as out_file:
    out_file.write(json.dumps(data))
  print("saved as json")

In [None]:
def bleu(all_outputs):
  """Score every unordered pair of outputs within each group with BLEU.

  Outputs are split on whitespace before scoring because sentence_bleu expects
  token sequences; a raw string would be scored character by character. Smoothing
  avoids degenerate scores when a pair shares no higher-order n-grams.

  @param all_outputs: list of groups, each a list of output strings
  @return (scores, average): the flat list of pairwise scores and their mean;
          the mean is 0.0 when there are no pairs (no groups, or groups of size < 2)
  """
  bleu_module = nltk.translate.bleu_score
  smoothing = bleu_module.SmoothingFunction().method1
  scores = []
  for group in all_outputs:
    tokenized = [text.split() for text in group]
    for ref, trans in combinations(tokenized, 2):
      scores.append(bleu_module.sentence_bleu([ref], trans, smoothing_function=smoothing))
  average = sum(scores) / len(scores) if scores else 0.0
  return scores, average

def semantic_similarity(all_outputs):
  """Score every unordered pair of outputs within each group by embedding cosine similarity.

  @param all_outputs: list of groups, each a list of output strings
  @return (scores, average): the flat list of pairwise similarities and their mean;
          the mean is 0.0 when there are no pairs (no groups, or groups of size < 2)
  """
  scores = []
  for group in all_outputs:
    # One encode() call per group, then compare every pair of its embeddings.
    sentence_embeddings = semantic_similarity_model.encode(group)
    for a, b in combinations(sentence_embeddings, 2):
      scores.append(float(util.pytorch_cos_sim(a, b)))
  average = sum(scores) / len(scores) if scores else 0.0
  return scores, average
def pairwise_bleu(first, second):
  """Average BLEU of every string in `second` scored against every string in `first`.

  Each string in `first` serves as the reference once: the scores of all strings in
  `second` against it are averaged, and those per-reference averages are averaged
  again. Strings are split on whitespace first because sentence_bleu expects token
  sequences; a raw string would be scored character by character. Smoothing avoids
  degenerate scores when a pair shares no higher-order n-grams.

  @param first: list of reference strings
  @param second: list of hypothesis strings
  @return mean of the per-reference mean BLEU scores
  @raise ZeroDivisionError: if `first` or `second` is empty
  """
  bleu_module = nltk.translate.bleu_score
  smoothing = bleu_module.SmoothingFunction().method1
  # Tokenize the hypotheses once instead of once per reference.
  hypotheses = [text.split() for text in second]
  avg_scores = []
  for reference in first:
    reference_tokens = reference.split()
    scores = [
        bleu_module.sentence_bleu([reference_tokens], hypothesis, smoothing_function=smoothing)
        for hypothesis in hypotheses
    ]
    avg_scores.append(sum(scores) / len(scores))
  return sum(avg_scores) / len(avg_scores)
def pairwise_semantic_similarity(first, second):
  """Average embedding cosine similarity between every string in `first` and every string in `second`.

  For each embedding of `first`, its similarities to all embeddings of `second` are
  averaged; the result is the mean of those per-row averages.
  """
  left_vectors = semantic_similarity_model.encode(first)
  right_vectors = semantic_similarity_model.encode(second)

  row_means = []
  for left in left_vectors:
    row = [float(util.pytorch_cos_sim(left, right)) for right in right_vectors]
    row_means.append(sum(row) / len(row))
  return sum(row_means) / len(row_means)
# all_outputs = [["hi how are you doing today", "hello how is your mother", "you looking good today ;)"]]

## Model_volatility gets all SS/BLEU similarity scores from DialoGPT or Blenderbot.

In [None]:
def model_volatility(num_samples, num_copies, max_temp):
  """Sweep BlenderBot sampling temperatures and save BLEU / embedding-similarity scores.

  Temperatures run from 0.8 in steps of 0.1 up to, but not including,
  int(10 * max_temp) / 10. For each temperature, num_copies * 2 replies are sampled
  per context, and all pairwise scores plus their average are recorded.

  The four result lists (one entry per temperature) are written to
  blenderbot_bleu_scores.json, blenderbot_bleu_avgs.json, blenderbot_ss_scores.json
  and blenderbot_ss_avgs.json in the current working directory.

  NOTE: only BlenderBot is swept here; the analogous DialoGPT sweep
  (get_dialogpt_outputs, temperatures starting at 0.2, dialogpt_*.json files) is
  currently disabled.
  """
  bleu_per_temp, bleu_means = [], []
  ss_per_temp, ss_means = [], []

  temperatures = [tenths / 10 for tenths in range(8, int(10 * max_temp))]
  for temperature in temperatures:
    responses = get_blenderbot_outputs(num_samples, num_copies * 2, temperature)

    pair_scores, mean_score = bleu(responses)
    bleu_per_temp.append(pair_scores)
    bleu_means.append(mean_score)

    pair_scores, mean_score = semantic_similarity(responses)
    ss_per_temp.append(pair_scores)
    ss_means.append(mean_score)

  # Persist everything only after the whole sweep has finished.
  results = (
      (bleu_per_temp, "blenderbot_bleu_scores.json"),
      (bleu_means, "blenderbot_bleu_avgs.json"),
      (ss_per_temp, "blenderbot_ss_scores.json"),
      (ss_means, "blenderbot_ss_avgs.json"),
  )
  for payload, filename in results:
    list_to_json(payload, filename)

# Run the sweep with num_samples=60, num_copies=10, max_temp=1.7.
model_volatility(60, 10, 1.7)

NameError: ignored

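## `pairwise_model_volatility` gets normalized scores on DialoGPT and Blenderbot: for each context, the sampled outputs are split into two sets A and B (e.g. 5 outputs each), and the score is the average over all pairs of one output from A and one output from B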

In [None]:
def pairwise_model_volatility(num_samples, num_copies, max_temp):
  """Sweep DialoGPT temperatures and save cross-half BLEU / embedding-similarity averages.

  `max_temp` is given in tenths: temperatures are 0.3, 0.4, ... up to, but not
  including, max_temp / 10. For each temperature, num_copies * 2 replies are sampled
  per context; the first num_copies replies are scored against the remaining ones
  with pairwise_bleu and pairwise_semantic_similarity.

  Written to data/: dialogpt_bleu_pairwise_avgs.json and
  dialogpt_ss_pairwise_avgs.json (one list per temperature with one average per
  context) and dialogpt_pairwise_outputs.json (the raw replies per temperature).

  NOTE: only DialoGPT is swept here; the analogous BlenderBot sweep
  (get_blenderbot_outputs, temperatures starting at 1.1, blenderbot_*_pairwise_avgs.json)
  is currently disabled.
  """
  bleu_by_temp = []
  ss_by_temp = []
  outputs_by_temp = []

  for tenths in range(3, max_temp):
    temperature = tenths / 10
    responses = get_dialogpt_outputs(num_samples, num_copies * 2, temperature)
    outputs_by_temp.append(responses)

    bleu_row, ss_row = [], []
    for copies in responses:
      # Split one context's replies into two halves and compare them with each other.
      half_a, half_b = copies[0:num_copies], copies[num_copies:]
      bleu_row.append(pairwise_bleu(half_a, half_b))
      ss_row.append(pairwise_semantic_similarity(half_a, half_b))

    bleu_by_temp.append(bleu_row)
    ss_by_temp.append(ss_row)

  list_to_json(bleu_by_temp, "data/dialogpt_bleu_pairwise_avgs.json")
  list_to_json(ss_by_temp, "data/dialogpt_ss_pairwise_avgs.json")
  list_to_json(outputs_by_temp, "data/dialogpt_pairwise_outputs.json")

# Run the sweep with num_samples=1000, num_copies=5, max_temp=6 (temperatures 0.3-0.5).
pairwise_model_volatility(1000, 5, 6)

100%|██████████| 200/200 [11:16<00:00,  3.38s/it]
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
100%|██████████| 200/200 [10:25<00:00,  3.13s/it]
100%|██████████| 200/200 [10:33<00:00,  3.17s/it]


saved as json
saved as json
saved as json


In [None]:
from matplotlib import pyplot as plt

def data_viz(scores, start_temp, xlabel="temperature", ylabel="similarity score"):
  """Plot one score per temperature step.

  The i-th score is drawn at temperature start_temp + i / 10.

  @param scores: sequence of scores, one per temperature step
  @param start_temp: temperature that belongs to scores[0]
  @param xlabel: x-axis label
  @param ylabel: y-axis label
  """
  y = scores
  x = (np.arange(len(y)) / 10) + start_temp
  plt.plot(x, y)
  # The axes show scores against temperature, so label them accordingly.
  plt.ylabel(ylabel)
  plt.xlabel(xlabel)
  plt.show()
  