In [1]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
from lib import levenshtein_distance, normalize_sequence, optimal_replacement_policy, corrupt_sequence
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single source of truth for the pretrained checkpoint, so the tokenizer and the
# model are always loaded from the same name and cannot drift apart.
PRETRAINED_NAME = "distilbert-base-uncased"

tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_NAME)
model = DistilBertModel.from_pretrained(PRETRAINED_NAME)



Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Register [EMT] as an additional special token. This appends a NEW vocabulary
# entry (it does not overwrite [unused0]); the size printed here is the size
# before the addition.
print("initial tokenizer vocab size: ", len(tokenizer))
# Add the token, then resize the model's embedding matrix to the new tokenizer
# length so the freshly added id has an embedding row.
tokenizer.add_special_tokens({'additional_special_tokens': ['[EMT]']})
model.resize_token_embeddings(len(tokenizer))



initial tokenizer vocab size:  30522


Embedding(30523, 768)

In [4]:
# Sanity check: tokenize the bare "[EMT]" string and display the resulting
# tensors, to confirm it maps to a single id between the two special tokens the
# tokenizer adds at both ends.
test_str = ["[EMT]"]
inputs = tokenizer(test_str, return_tensors="pt")
inputs

{'input_ids': tensor([[  101, 30522,   102]]), 'attention_mask': tensor([[1, 1, 1]])}

In [5]:
# Forward pass on the single "[EMT]" example; presumably a smoke test that the
# resized embedding matrix accepts the new token id.
# NOTE(review): gradients are tracked here; if this is inference-only, consider
# running it under torch.no_grad() — confirm intent.
outputs = model(**inputs)


In [6]:
# Load the tweets table and keep the raw "text" column as a plain Python list;
# the last line previews the first ten entries.
df = pd.read_csv("Tweets.csv")
texts = df.loc[:, "text"].to_list()
texts[0:10]

['@VirginAmerica What @dhepburn said.',
 "@VirginAmerica plus you've added commercials to the experience... tacky.",
 "@VirginAmerica I didn't today... Must mean I need to take another trip!",
 '@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse',
 "@VirginAmerica and it's a really big bad thing about it",
 "@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA",
 '@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)',
 '@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP',
 "@virginamerica Well, I didn't…but NOW I DO! :-D",
 "@VirginAmerica it was amazing, and arrived an hour early. You're too good to me."]

In [7]:
# Integer id the tokenizer assigned to the "[EMT]" special token; it is passed
# to the lib helpers (normalize_sequence, corrupt_sequence, ...) in later cells.
EMT_ID = tokenizer.convert_tokens_to_ids("[EMT]")




def empty_tokenized_to_str(sequence):
    """Decode the odd-position entries of ``sequence`` into a string.

    Keeps indices 1, 3, 5, ... (never the final element) and decodes them with
    the module-level ``tokenizer``.

    NOTE(review): presumably the skipped even positions hold the [EMT]
    placeholders inserted by ``normalize_sequence`` -- confirm against lib.
    """
    odd_positions = slice(1, -1, 2)
    return tokenizer.decode(sequence[odd_positions])
        
    


In [14]:
# Corruption demo: pick one random tweet, tokenize it (dropping the special
# tokens at both ends), normalize it with EMT_ID, corrupt it, and print both
# versions as text.
# NOTE(review): this passes tokenizer.vocab_size while the replacement-policy
# cell passes len(tokenizer); confirm whether excluding the added [EMT] id from
# the size here is intentional.
rand_text = np.random.choice(texts)
tokens = normalize_sequence(tokenizer.encode(rand_text)[1:-1], EMT_ID)
corrupted_tokens = corrupt_sequence(
    tokens,
    tokenizer.vocab_size,
    EMT_ID,
    10,
)
for label, seq in (("original: ", tokens), ("corrupted: ", corrupted_tokens)):
    print(label, empty_tokenized_to_str(seq))

original:  @ southwestair on hold with airline 45 min and counting. service is terrible!
corrupted:  southwestenneair on mir with airline min [unused30] counting. bois terrible 265


In [None]:
# Walk one random tweet toward another, one token replacement per step,
# following the labels returned by optimal_replacement_policy.
initial_text = np.random.choice(texts)
target_text = np.random.choice(texts)

print("INITIAL", initial_text)
print("TARGET", target_text)

# Token ids without the special tokens the tokenizer adds at both ends.
initial_sequence = tokenizer(initial_text)["input_ids"][1:-1]
target_sequence = tokenizer(target_text)["input_ids"][1:-1]

# Edit distance between the raw (un-normalized) token sequences.
distance = levenshtein_distance(initial_sequence, target_sequence)
print("DISTANCE", distance)

initial_sequence = normalize_sequence(initial_sequence, EMT_ID)
target_sequence = normalize_sequence(target_sequence, EMT_ID)

# Upper bound on the number of replacement steps attempted.
MAX_STEPS = 100

for i in range(MAX_STEPS):
    labels = optimal_replacement_policy(initial_sequence, target_sequence, len(tokenizer), EMT_ID)
    # Small uniform noise so argmax picks randomly among tied labels.
    # assumes labels is a 2-D float array (one row per position) -- TODO confirm
    labels += np.random.uniform(0, 0.1, labels.shape)
    new_ids = np.argmax(labels, axis=1)

    # Positions where the policy's choice differs from the current sequence.
    mismatched = np.where(new_ids != initial_sequence)[0]
    if mismatched.size > 0:
        # Apply exactly one of the suggested replacements, chosen at random,
        # then re-normalize the sequence.
        replacement_id = np.random.choice(mismatched)
        initial_sequence[replacement_id] = new_ids[replacement_id]
        initial_sequence = normalize_sequence(initial_sequence, EMT_ID)
    print(i+1, empty_tokenized_to_str(initial_sequence))

    # np.array_equal works for both lists and NumPy arrays (and for differing
    # lengths); a bare `==` on arrays yields an element-wise result whose truth
    # value is ambiguous and raises ValueError.
    if np.array_equal(initial_sequence, target_sequence):
        print("DONE")
        break
    

