In [1]:
import random

import pandas as pd
import numpy as np

from lib import optimal_replacement_policy, levenshtein_distance, normalize_sequence, get_replacement_tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the project's replacement tokenizer for the DistilBERT uncased
# checkpoint and look up the vocabulary id of the "[EMT]" token. That id is
# handed to normalize_sequence / optimal_replacement_policy further down.
checkpoint_name = "distilbert-base-uncased"
empty_token = "[EMT]"

tokenizer = get_replacement_tokenizer(checkpoint_name)
empty_id = tokenizer.convert_tokens_to_ids(empty_token)

In [3]:
# Load the tweets table from the working directory and keep the "text"
# column as a plain Python list to sample from later.
df = pd.read_csv("Tweets.csv")
text_column = df["text"]
text_samples = text_column.tolist()

# Preview the first ten tweets (bare last expression -> rich cell output).
text_samples[0:10]

['@VirginAmerica What @dhepburn said.',
 "@VirginAmerica plus you've added commercials to the experience... tacky.",
 "@VirginAmerica I didn't today... Must mean I need to take another trip!",
 '@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse',
 "@VirginAmerica and it's a really big bad thing about it",
 "@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA",
 '@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)',
 '@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP',
 "@virginamerica Well, I didn't…but NOW I DO! :-D",
 "@VirginAmerica it was amazing, and arrived an hour early. You're too good to me."]

In [5]:
# Walk a randomly chosen tweet towards another randomly chosen tweet, one
# token replacement per step, following the argmax of the labels returned by
# optimal_replacement_policy (plus a little uniform noise).
#
# NOTE: np.random is not seeded in this cell, so every run draws a different
# tweet pair and a different replacement order.

MAX_STEPS = 100  # upper bound on replacement steps before giving up

# Keep the raw strings under their own names instead of reusing the
# *_sequence variables for both text and token ids.
initial_text = np.random.choice(text_samples)
target_text = np.random.choice(text_samples)

print("INITIAL", initial_text)
print("TARGET", target_text)

# Drop the first and last ids produced by the tokenizer
# (presumably the [CLS]/[SEP] special tokens -- TODO confirm against lib).
initial_sequence = tokenizer(initial_text)["input_ids"][1:-1]
target_sequence = tokenizer(target_text)["input_ids"][1:-1]

# Edit distance is measured on the raw token ids, before normalization.
distance = levenshtein_distance(initial_sequence, target_sequence)
print("DISTANCE", distance)

initial_sequence = normalize_sequence(initial_sequence, empty_id)
target_sequence = normalize_sequence(target_sequence, empty_id)

for i in range(MAX_STEPS):
    labels = optimal_replacement_policy(initial_sequence, target_sequence, len(tokenizer), empty_id)
    # Uniform noise in [0, 0.1) is added before the argmax
    # (presumably to break ties between equally good replacements at random).
    labels += np.random.uniform(0, 0.1, labels.shape)
    new_ids = np.argmax(labels, axis=1)

    # Positions where the policy's preferred token differs from the current one.
    differs = new_ids != initial_sequence
    if differs.any():
        # Apply exactly one of the suggested replacements, chosen at random,
        # then re-normalize because the replacement can change the layout.
        replacement_id = np.random.choice(np.where(differs)[0])
        initial_sequence[replacement_id] = new_ids[replacement_id]
        initial_sequence = normalize_sequence(initial_sequence, empty_id)

    # Every second element, skipping the first and last, is decoded for display
    # (presumably the remaining slots hold empty_id placeholders -- TODO confirm).
    print(i + 1, tokenizer.decode(initial_sequence[1:-1:2]))

    # np.array_equal yields a single bool whether the sequences are lists or
    # ndarrays and whether or not their lengths match; a bare `==` inside `if`
    # is ambiguous (or raises) for NumPy arrays.
    if np.array_equal(initial_sequence, target_sequence):
        print("DONE")
        break
    



INITIAL @AmericanAir I lost my (basket) ballbag on your plane
TARGET @united Gate agent hooked me up with alternate flights. If you have a way to PREVENT the constant issues, that would rock.
DISTANCE 25
[ 1.  1. 12. 13. 12. 13. 12. 13. 12. 13. 12. 13. 12. 13. 12. 13. 12. 13.
 12. 13. 12. 13. 12. 13. 12. 13. 12. 13. 12.]
1 @ americanair i lost my way basket ) ballbag on your plane
[ 1.  1.  9. 10.  9. 10.  9. 10.  9. 10.  9. 10.  9.  1.  3.  4.  3.  4.
  3.  4.  3.  4.  3.  4.  3.  4.  3.  4.  3.]
2 @ americanair i lost my have way basket ) ballbag on your plane
[1. 1. 7. 8. 7. 8. 7. 8. 7. 8. 7. 8. 7. 1. 1. 1. 3. 4. 3. 4. 3. 4. 3. 4.
 3. 4. 3. 4. 3. 4. 3.]
3 @ meir i lost my have way basket ) ballbag on your plane
[1. 1. 4. 1. 3. 4. 3. 4. 3. 4. 3. 4. 3. 1. 1. 1. 3. 4. 3. 4. 3. 4. 3. 4.
 3. 4. 3. 4. 3. 4. 3.]
4 @ meir i lost my have way basket constant ) ballbag on your plane
[1. 1. 4. 1. 3. 4. 3. 4. 3. 4. 3. 4. 3. 1. 1. 1. 2. 3. 2. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1.]
5 @ meir i