In [1]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Model
import numpy as np
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm_notebook
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from yellowbrick.cluster.elbow import kelbow_visualizer
import random

In [2]:
# Load word vectors
GloVe = {}
with open("glove.6B/glove.6B.100d.txt", "r", encoding="utf-8") as vector_file:
    for line in vector_file:
        line_content = line.split()
        word = line_content[0]
        # There's probably a better way to read strings into a FloatTensor
        word_vec = torch.from_numpy(np.asarray(line_content[1:], "float32"))
        GloVe[word] = word_vec

In [22]:
# Initialize model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [36]:
test = ["I like them"] * 5

In [4]:
test = "I like them"

In [37]:
inputs = tokenizer(test, return_tensors="pt")
inputs

{'input_ids': tensor([[ 40, 588, 606],
        [ 40, 588, 606],
        [ 40, 588, 606],
        [ 40, 588, 606],
        [ 40, 588, 606]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]])}

In [40]:
outputs = model(**inputs, labels=inputs["input_ids"])
logits = outputs.logits

In [41]:
next_token_scores = logits[:, -1, :].softmax(dim=-1)

In [42]:
next_token_scores

tensor([[9.4995e-03, 3.1773e-03, 6.6273e-07,  ..., 2.7145e-08, 2.0966e-08,
         2.8246e-05],
        [9.4995e-03, 3.1773e-03, 6.6273e-07,  ..., 2.7145e-08, 2.0966e-08,
         2.8246e-05],
        [9.4995e-03, 3.1773e-03, 6.6273e-07,  ..., 2.7145e-08, 2.0966e-08,
         2.8246e-05],
        [9.4995e-03, 3.1773e-03, 6.6273e-07,  ..., 2.7145e-08, 2.0966e-08,
         2.8246e-05],
        [9.4995e-03, 3.1773e-03, 6.6273e-07,  ..., 2.7145e-08, 2.0966e-08,
         2.8246e-05]], grad_fn=<SoftmaxBackward0>)

In [43]:
sorted_vals, indices = torch.sort(next_token_scores)
sorted_vals

tensor([[9.3337e-13, 1.1713e-12, 1.2867e-12,  ..., 7.4071e-02, 1.1940e-01,
         1.3058e-01],
        [9.3337e-13, 1.1713e-12, 1.2867e-12,  ..., 7.4071e-02, 1.1940e-01,
         1.3058e-01],
        [9.3337e-13, 1.1713e-12, 1.2867e-12,  ..., 7.4071e-02, 1.1940e-01,
         1.3058e-01],
        [9.3337e-13, 1.1713e-12, 1.2867e-12,  ..., 7.4071e-02, 1.1940e-01,
         1.3058e-01],
        [9.3337e-13, 1.1713e-12, 1.2867e-12,  ..., 7.4071e-02, 1.1940e-01,
         1.3058e-01]], grad_fn=<SortBackward0>)

In [231]:
indices

tensor([[ 8994, 17629, 31204,  ...,   477,    11,    13]])

In [232]:
torch.sum(sorted_vals, dim=1)

tensor([1.0000], grad_fn=<SumBackward1>)

In [196]:
x = torch.stack((sorted_vals, indices), dim=1)

In [197]:
[top_p(tup, p=0.9) for tup in x]

[(tensor([0.1306, 0.1194, 0.0741, 0.0513, 0.0418, 0.0409, 0.0306, 0.0305, 0.0294,
          0.0285, 0.0244, 0.0228, 0.0207, 0.0207, 0.0130, 0.0121, 0.0102, 0.0095,
          0.0090, 0.0078, 0.0076, 0.0076, 0.0070, 0.0062, 0.0052, 0.0052, 0.0050,
          0.0040, 0.0039, 0.0038, 0.0038, 0.0037, 0.0035, 0.0034, 0.0033, 0.0032,
          0.0031, 0.0028, 0.0028, 0.0025, 0.0025, 0.0024, 0.0023, 0.0023, 0.0021,
          0.0020, 0.0019, 0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.0018, 0.0017,
          0.0017, 0.0017, 0.0017, 0.0016, 0.0016, 0.0015, 0.0015, 0.0015, 0.0014,
          0.0013, 0.0013, 0.0013, 0.0013, 0.0012, 0.0012, 0.0012, 0.0012, 0.0011,
          0.0011, 0.0010, 0.0009, 0.0009, 0.0008, 0.0008, 0.0008, 0.0008, 0.0008,
          0.0008, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0006,
          0.0006, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006,
          0.0006, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006, 0.0005, 0.0005,
          0.0005

In [72]:
import multiprocess as mp

In [100]:
def top_p(tup, p=0.9):
    sorted_vals, indices = tup
    trunc_sorted_vals = []
    sum_so_far = 0
    # reversed?
    for val in reversed(sorted_vals):
        sum_so_far += val
        trunc_sorted_vals.append(val)
        if sum_so_far > p:
            break
    sorted_vals = torch.FloatTensor(trunc_sorted_vals)
    indices = indices[-len(sorted_vals):]
    return sorted_vals, indices

In [165]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [88]:
import time
start = time.time()
with mp.Pool(5) as pool:
    z = pool.map(top_p, x)
end = time.time()
end-start

0.06428003311157227

In [94]:
def top_p(sorted_vals, indices, p):
    trunc_sorted_vals = []
    sum_so_far = 0
    # reversed?
    for val in reversed(sorted_vals):
        sum_so_far += val
        trunc_sorted_vals.append(val)
        if sum_so_far > p:
            break
    sorted_vals = torch.FloatTensor(trunc_sorted_vals)
    indices = indices[-len(sorted_vals):]
    return sorted_vals, indices

In [234]:
top_p(sorted_vals[:], indices[:], 0.9)

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [142]:
sorted_vals[-1:][:]

tensor([[9.3337e-13, 1.1713e-12, 1.2867e-12,  ..., 7.4071e-02, 1.1940e-01,
         1.3058e-01]], grad_fn=<SliceBackward0>)

In [144]:
sorted_vals[:, -9:]

tensor([[0.0294, 0.0305, 0.0306, 0.0409, 0.0418, 0.0513, 0.0741, 0.1194, 0.1306],
        [0.0294, 0.0305, 0.0306, 0.0409, 0.0418, 0.0513, 0.0741, 0.1194, 0.1306],
        [0.0294, 0.0305, 0.0306, 0.0409, 0.0418, 0.0513, 0.0741, 0.1194, 0.1306],
        [0.0294, 0.0305, 0.0306, 0.0409, 0.0418, 0.0513, 0.0741, 0.1194, 0.1306],
        [0.0294, 0.0305, 0.0306, 0.0409, 0.0418, 0.0513, 0.0741, 0.1194, 0.1306]],
       grad_fn=<SliceBackward0>)

In [95]:
def get_embeddings(sorted_vals, indices, top_embeddings):
    for word_idx in range(len(indices)):
        word = tokenizer.decode(indices[word_idx])
        if word.strip().lower() not in GloVe.keys():
            sorted_vals[word_idx] = 0  # disregard this token
            top_embeddings.append(GloVe['failure']) # TOFIX
        else:
            if word[1:].isalpha() or word.isalpha():
                top_embeddings.append(GloVe[word.strip().lower()])
            else:
                top_embeddings.append(GloVe[word.strip()])
    return top_embeddings

In [96]:
def generate_one(prompts, top_p):
    inputs = tokenizer(prompts, return_tensors="pt")
    outputs = model(**inputs, labels=inputs["input_ids"])
    logits = outputs.logits
    next_token_scores = logits[:, -1, :].softmax(dim=-1)
    sorted_vals, indices = torch.sort(next_token_scores)
    
    
    if top_p > 0:
        # Sequential
        x = zip(sorted_vals, indices)
        res = [top_p(tup, p=0.9) for tup in x]
        
        # Multiprocessing
        """
        with mp.Pool(5) as pool:
            res = pool.map(top_p, x)
        """
        
    else:
        # else, we just do top-k
        sorted_vals = sorted_vals[:, -top_k_val:]
        indices = indices[:, -top_k_val:]
        res = list(zip(sorted_vals.detach(), indices.detach()))
        
    top_embeddings = []

    for tup in res:
        new = []
        get_embeddings(tup[0], tup[1], new)
        top_embeddings.append(new)
        
    all_scores = []
    for prompt in top_embeddings:
        dist_score = [distance_score(embed, wb_embeddings, clusters, n_clusters) for embed in prompt]
        all_scores.append(dis_score)
        
    final_ranked_indices = []
    sorted_vals = []
    
    for i, tup in enumerate(res): 
        temp = rerank(tup[0], tup[1], all_scores[i])
        final_ranked_indices.append(temp[1][-SEARCH_SPACE_NUM:])
        sorted_vals.append(temp[2][-SEARCH_SPACE_NUM:])
    
    
    if top_p_val > 0:
        
        idx, norm_scores = sample_idx(sorted_vals[:])
        
    
        
    
        
    
        
    
    
        
    
    

SyntaxError: expected ':' (1363173589.py, line 32)

In [122]:
prompts = ["Hi there"] * 5

In [123]:
inputs = tokenizer(prompts, return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])
logits = outputs.logits
next_token_scores = logits[:, -1, :].softmax(dim=-1)
sorted_vals, indices = torch.sort(next_token_scores)

In [126]:
x = zip(sorted_vals, indices)
res = [top_p(tup, p=0.9) for tup in x]

In [128]:
top_embeddings = []
get_embeddings(res[0][0], res[0][1], top_embeddings)

TypeError: get_embeddings() takes 2 positional arguments but 3 were given

In [129]:
top_embeddings = []
    
for tup in res:
    new = []
    get_embeddings(tup[0], tup[1], new)
    top_embeddings.append(new)

TypeError: expected Tensor as element 0 in argument 0, but got zip

In [130]:
top_embeddings

[[tensor([-0.8028,  0.8250,  1.3844, -0.9616,  0.2254, -0.4348,  0.2530, -0.5236,
           0.2971, -0.0506,  0.1638, -0.4544,  0.3626,  0.2230,  0.3763, -0.5319,
          -0.0291,  0.4642, -0.5286,  0.1935,  0.8662,  0.9178,  0.3805,  0.7841,
           0.3984,  0.1749, -0.9661, -0.9013, -0.1527, -0.5995,  0.7019, -0.1805,
          -0.3948, -0.2389,  0.0319, -0.0933,  0.6410,  0.7903, -0.7418, -0.2871,
           0.0695, -0.3711,  0.2521, -0.3146, -0.3681, -0.0047, -0.4559, -0.6999,
           0.5577, -0.5440, -0.4708, -0.5593,  0.2780,  1.6203, -1.2591, -2.3143,
           0.0667,  0.0913,  1.5506,  0.0349, -0.5734,  0.8203,  0.7456, -1.0320,
           0.2456,  0.0899,  0.3529,  0.2577, -0.1680,  0.0872, -0.0389, -0.2063,
          -0.8773, -0.3230, -0.3081, -0.0748,  0.5029, -0.5205, -0.8123, -0.3095,
           1.5353, -0.4889,  0.2946, -0.7429, -1.9640, -0.0813, -0.0747,  0.4639,
           0.1714, -0.0093,  0.1878, -0.4220, -0.0744,  0.0377, -0.7361,  0.4739,
           0.316