In [74]:
import re
import difflib
import tiktoken

import pandas as pd
pd.set_option('display.max_columns', None) 

In [4]:
# Model whose tokenizer the notebook uses; `enc` is the shared encoder that
# every later cell (and get_tiktokens) relies on.
MODEL_NAME = "gpt-3.5"
enc = tiktoken.encoding_for_model(MODEL_NAME)

In [10]:
# Small sample containing a made-up word, so we can see it split into several tokens.
test_string = "Hello world! This is a tesssszst."

# Encode the sample into a list of integer token ids.
test_tok = enc.encode(test_string)

test_tok

[9906, 1917, 0, 1115, 374, 264, 80930, 784, 89, 267, 13]

In [13]:
# Look up the raw bytes behind each token id, one token at a time.
byte_level = [enc.decode_single_token_bytes(token) for token in test_tok]

byte_level

[b'Hello',
 b' world',
 b'!',
 b' This',
 b' is',
 b' a',
 b' tess',
 b'ss',
 b'z',
 b'st',
 b'.']

In [14]:
# Decode each token's bytes to text. This is a strict UTF-8 decode, which is
# fine here because the sample string is plain ASCII.
string_level = [x.decode('utf-8') for x in byte_level]

string_level

['Hello', ' world', '!', ' This', ' is', ' a', ' tess', 'ss', 'z', 'st', '.']

In [28]:
# Two sample sentences that share most of their words but differ in a few
# places; used below to try out token-level alignment.
s1 = "This is a really big string and I just keep tatatalking until I reach the end woooooow."
s2 = "This is a string and I just keep tatatalking until I change right before the end."

In [16]:
def get_tiktokens(s):
    """Split ``s`` into the text pieces of its tokens.

    The string is encoded with the notebook-level encoder ``enc`` and every
    token id is mapped back to its own bytes, so the result has exactly one
    string per token, in order.

    Args:
        s: Text to tokenize.

    Returns:
        list[str]: One decoded piece per token.
    """
    token_ids = enc.encode(s)
    token_bytes = [enc.decode_single_token_bytes(token_id) for token_id in token_ids]

    # Token bytes are decoded one token at a time, and a single token is not
    # guaranteed to be valid UTF-8 on its own (e.g. when a multi-byte character
    # is split across tokens). A strict decode would raise UnicodeDecodeError
    # there; 'backslashreplace' renders the offending bytes as backslash-escaped
    # hex instead, which also keeps different byte fragments distinguishable
    # when the pieces are later compared for alignment.
    return [piece.decode('utf-8', errors='backslashreplace') for piece in token_bytes]
    

In [20]:
# Tokenize both sample sentences and show the two token lists together.
# NOTE(review): the saved output under this cell has no ' really' / ' big'
# tokens although the current s1 contains "really big" -- it looks like it was
# produced with an earlier s1; re-run the notebook top to bottom to refresh it.
l1 = get_tiktokens(s1)
l2 = get_tiktokens(s2)
l1,l2

(['This',
  ' is',
  ' a',
  ' string',
  ' and',
  ' I',
  ' just',
  ' keep',
  ' tat',
  'at',
  'alking',
  ' until',
  ' I',
  ' reach',
  ' the',
  ' end',
  '.'],
 ['This',
  ' is',
  ' a',
  ' string',
  ' and',
  ' I',
  ' just',
  ' keep',
  ' tat',
  'at',
  'alking',
  ' until',
  ' I',
  ' change',
  ' right',
  ' before',
  ' the',
  ' end',
  '.'])

In [21]:
# Print the blocks difflib finds in common between the two token lists.
# Each Match(a, b, size) means l1[a:a+size] == l2[b:b+size]; the final block
# has size 0 and only marks the end of both lists.
for match in difflib.SequenceMatcher(a=l1, b=l2).get_matching_blocks():
    print(match)

Match(a=0, b=0, size=13)
Match(a=14, b=16, size=3)
Match(a=17, b=19, size=0)


In [53]:
from random import randint

In [70]:
def equalize(s1, s2):
    """Tokenize two strings, align their tokens, and show the padded result.

    Both strings are tokenized with ``get_tiktokens`` and each token gets a
    placeholder "logprob" (a random integer from 1 to 100). The token lists are
    aligned with ``difflib.SequenceMatcher``; wherever one side has tokens the
    other lacks, the other side is padded with ``'-'`` (in both its token row
    and its logprob row) so that all four rows end up with the same length.

    The four rows are printed and then displayed as a DataFrame with the row
    labels ``run1``, ``run1lp``, ``run2``, ``run2lp``. Nothing is returned.

    Args:
        s1: First text.
        s2: Second text.
    """
    l1 = get_tiktokens(s1)
    l2 = get_tiktokens(s2)

    # pretend generate some logprobs corresponding to each token
    l1_logprobs = [randint(1, 100) for _ in l1]
    l2_logprobs = [randint(1, 100) for _ in l2]

    res1, res2 = [], []
    l1_logprobs_padded, l2_logprobs_padded = [], []

    # autojunk=False: by default difflib treats items that repeat often as junk
    # once the second sequence has 200+ items, which stops common tokens from
    # anchoring matches and can wreck the alignment of longer texts.
    matcher = difflib.SequenceMatcher(a=l1, b=l2, autojunk=False)

    # Positions just past the previous matching block in l1 / l2.
    a_pos = b_pos = 0
    for match in matcher.get_matching_blocks():
        a_gap = match.a - a_pos
        b_gap = match.b - b_pos

        # Tokens present only in s1: keep them in row 1, pad row 2.
        res1 += l1[a_pos:match.a]
        l1_logprobs_padded += l1_logprobs[a_pos:match.a]
        res2 += ['-'] * a_gap
        l2_logprobs_padded += ['-'] * a_gap

        # Tokens present only in s2: pad row 1, keep them in row 2.
        res1 += ['-'] * b_gap
        l1_logprobs_padded += ['-'] * b_gap
        res2 += l2[b_pos:match.b]
        l2_logprobs_padded += l2_logprobs[b_pos:match.b]

        # The matching block itself goes into both rows unchanged.
        a_end = match.a + match.size
        b_end = match.b + match.size
        res1 += l1[match.a:a_end]
        res2 += l2[match.b:b_end]
        l1_logprobs_padded += l1_logprobs[match.a:a_end]
        l2_logprobs_padded += l2_logprobs[match.b:b_end]

        a_pos, b_pos = a_end, b_end

    print(res1)
    print(l1_logprobs_padded)
    print(res2)
    print(l2_logprobs_padded)

    display(pd.DataFrame([res1, l1_logprobs_padded, res2, l2_logprobs_padded], index=["run1", "run1lp", "run2", "run2lp"]))

In [75]:
# Try the alignment on the two sample sentences; prints the padded rows and shows them as a table.
equalize(s1,s2)

['This', ' is', ' a', ' really', ' big', ' string', ' and', ' I', ' just', ' keep', ' tat', 'at', 'alking', ' until', ' I', ' reach', '-', '-', '-', ' the', ' end', ' w', 'oooo', 'o', 'ow', '.']
[6, 25, 99, 68, 10, 11, 33, 47, 35, 7, 5, 7, 45, 47, 45, 4, '-', '-', '-', 88, 56, 34, 46, 34, 4, 41]
['This', ' is', ' a', '-', '-', ' string', ' and', ' I', ' just', ' keep', ' tat', 'at', 'alking', ' until', ' I', '-', ' change', ' right', ' before', ' the', ' end', '-', '-', '-', '-', '.']
[3, 51, 84, '-', '-', 62, 4, 87, 77, 60, 36, 14, 10, 66, 31, '-', 58, 98, 31, 76, 53, '-', '-', '-', '-', 56]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
run1,This,is,a,really,big,string,and,I,just,keep,tat,at,alking,until,I,reach,-,-,-,the,end,w,oooo,o,ow,.
run1lp,6,25,99,68,10,11,33,47,35,7,5,7,45,47,45,4,-,-,-,88,56,34,46,34,4,41
run2,This,is,a,-,-,string,and,I,just,keep,tat,at,alking,until,I,-,change,right,before,the,end,-,-,-,-,.
run2lp,3,51,84,-,-,62,4,87,77,60,36,14,10,66,31,-,58,98,31,76,53,-,-,-,-,56


# Dataset for each run

- 5 dicts, each of which produces a list of tokens and a list of logprobs, one for each chosen token (so both lists have the same length).

**Below I am simulating this, just replace with real data**

I simulate this by taking a long string `s` as the first run; the 4 other runs are obtained by dropping a few random token indices from it (so that we can at least see some alignment here).

In [78]:
# Reference text for the simulated first run.
s = "This is a really big string and I just keep tatatalking until I reach the end woooooow hello hello 141313 generating random reproductible why not is it ?"

# Tokens of the reference text plus one made-up integer "logprob" (1 to 15) per token.
tokens = get_tiktokens(s)
logprobs = [randint(1,15) for _ in range(len(tokens))]
# Show both lengths; they should be equal.
len(tokens), len(logprobs)

(37, 37)

In [80]:
# Maps run number -> (tokens, logprobs). Run 1 holds the full reference lists;
# the remaining runs are added in a later cell.
dataset_5_runs = {
    1: (tokens, logprobs),
}

In [82]:
import random

In [88]:
# Simulate runs 2..5 by removing a few tokens (and their logprobs) from run 1.
for run_number in range(2, 5+1):
    # random.sample picks distinct indices, so every simulated run drops the
    # same number of tokens (random.choices samples with replacement and could
    # pick an index twice). The min() guard keeps this valid for tiny inputs.
    drop_indices = set(random.sample(range(len(tokens)), k=min(4, len(tokens))))
    dataset_5_runs[run_number] = (
        [x for i,x in enumerate(tokens) if i not in drop_indices],
        [x for i,x in enumerate(logprobs) if i not in drop_indices],
    )

# Do pairwise sequence matching - use run 1 as the reference

TODO: ask at work whether there are standard methods for multi-string (multiple sequence) alignment.

In [123]:
def generate_tokens_sequence_alignment(l1, l1_logprobs, l2, l2_logprobs, gap='-'):
    """Align two token sequences and pad both to a common length.

    The token lists are compared with ``difflib.SequenceMatcher``. Matching
    stretches are copied to both outputs unchanged. Between two matches, the
    tokens found only in ``l1`` are emitted first (with ``gap`` placed on the
    ``l2`` side), then the tokens found only in ``l2`` (with ``gap`` placed on
    the ``l1`` side). Each logprob list is sliced and padded exactly like its
    token list.

    Args:
        l1: Tokens of the first sequence (items must be hashable).
        l1_logprobs: One value per token of ``l1``.
        l2: Tokens of the second sequence (items must be hashable).
        l2_logprobs: One value per token of ``l2``.
        gap: Placeholder inserted where a sequence has no counterpart.
            Defaults to ``'-'``.

    Returns:
        tuple: ``(res1, l1_logprobs_padded, res2, l2_logprobs_padded)``, four
        lists. When each logprob list has the same length as its token list,
        all four have the same length.
    """
    res1, res2 = [], []
    l1_logprobs_padded, l2_logprobs_padded = [], []

    # autojunk=False: by default difflib treats items that repeat often as junk
    # once the second sequence has 200+ items. Real token streams are long and
    # full of repeated tokens, so the heuristic would stop those tokens from
    # anchoring matches and produce a badly shifted alignment.
    matcher = difflib.SequenceMatcher(a=l1, b=l2, autojunk=False)

    # Positions just past the previous matching block in l1 / l2.
    next_a = next_b = 0
    # The final block returned by difflib has size 0 and sits at the end of
    # both sequences, so the unmatched tails are flushed by the same code path.
    for block in matcher.get_matching_blocks():
        only_in_1 = block.a - next_a
        only_in_2 = block.b - next_b

        # Tokens present only in l1: keep them on side 1, pad side 2.
        res1 += l1[next_a:block.a]
        l1_logprobs_padded += l1_logprobs[next_a:block.a]
        res2 += [gap] * only_in_1
        l2_logprobs_padded += [gap] * only_in_1

        # Tokens present only in l2: pad side 1, keep them on side 2.
        res1 += [gap] * only_in_2
        l1_logprobs_padded += [gap] * only_in_2
        res2 += l2[next_b:block.b]
        l2_logprobs_padded += l2_logprobs[next_b:block.b]

        # The matching block itself goes to both sides unchanged.
        end_a = block.a + block.size
        end_b = block.b + block.size
        res1 += l1[block.a:end_a]
        res2 += l2[block.b:end_b]
        l1_logprobs_padded += l1_logprobs[block.a:end_a]
        l2_logprobs_padded += l2_logprobs[block.b:end_b]

        next_a, next_b = end_a, end_b

    return res1, l1_logprobs_padded, res2, l2_logprobs_padded

In [124]:
# Row labels: a token row followed by a logprob row for each of the 5 runs.
indices = [
    label
    for run_num in range(1, 5+1)
    for label in (f"run_{run_num}", f"run_{run_num}_logprob")
]

# Run 1 is the reference that every other run is aligned against.
t1, lp1 = dataset_5_runs[1]
t2, lp2 = dataset_5_runs[2]

# Runs 1 and 2: keep both padded sides, so the reference rows come from this alignment.
r1, lp1pad, r2, lp2pad = generate_tokens_sequence_alignment(t1, lp1, t2, lp2)
rows = [r1, lp1pad, r2, lp2pad]

# Runs 3, 4, 5: align against run 1 again, but keep only the run's own padded rows.
for run_num in (3, 4, 5):
    t, lp = dataset_5_runs[run_num]
    _, _, r, lppad = generate_tokens_sequence_alignment(t1, lp1, t, lp)
    rows += [r, lppad]
    

In [125]:
# One DataFrame row per aligned list, labelled run_N / run_N_logprob; the
# columns are positions in the alignment.
df = pd.DataFrame(rows, index=indices)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
run_1,This,is,a,really,big,string,and,I,just,keep,tat,at,alking,until,I,reach,the,end,w,oooo,o,ow,hello,hello,,141,313,generating,random,re,product,ible,why,not,is,it,?
run_1_logprob,5,2,12,1,13,3,8,8,2,10,9,7,6,2,11,10,4,5,11,2,15,15,4,14,4.0,2,10,3,13,5,4,12,8,5,14,13,8
run_2,This,is,a,-,big,string,and,I,just,keep,tat,-,alking,until,I,-,the,-,w,oooo,o,ow,hello,hello,,141,313,generating,random,re,product,ible,why,not,is,it,?
run_2_logprob,5,2,12,-,13,3,8,8,2,10,9,-,6,2,11,-,4,-,11,2,15,15,4,14,4.0,2,10,3,13,5,4,12,8,5,14,13,8
run_3,This,is,a,-,big,string,and,I,just,keep,tat,-,alking,until,I,reach,-,end,w,oooo,o,ow,hello,hello,,141,313,generating,random,re,product,ible,why,not,is,it,-
run_3_logprob,5,2,12,-,13,3,8,8,2,10,9,-,6,2,11,10,-,5,11,2,15,15,4,14,4.0,2,10,3,13,5,4,12,8,5,14,13,-
run_4,This,is,a,really,big,string,and,-,just,keep,tat,at,alking,-,I,reach,-,end,w,oooo,-,ow,hello,hello,,141,313,generating,random,re,product,ible,why,not,is,it,?
run_4_logprob,5,2,12,1,13,3,8,-,2,10,9,7,6,-,11,10,-,5,11,2,-,15,4,14,4.0,2,10,3,13,5,4,12,8,5,14,13,8
run_5,This,is,a,really,-,string,and,I,just,keep,tat,-,alking,until,I,reach,the,end,w,oooo,-,ow,hello,hello,,141,313,generating,random,-,product,ible,why,not,is,it,?
run_5_logprob,5,2,12,1,-,3,8,8,2,10,9,-,6,2,11,10,4,5,11,2,-,15,4,14,4.0,2,10,3,13,-,4,12,8,5,14,13,8


In [126]:
# show without logprobs: keep only the token rows (labels without the "_logprob" suffix)

df1 = df.loc[df.index.isin(["run_1", "run_2", "run_3", "run_4", "run_5"])]

df1


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
run_1,This,is,a,really,big,string,and,I,just,keep,tat,at,alking,until,I,reach,the,end,w,oooo,o,ow,hello,hello,,141,313,generating,random,re,product,ible,why,not,is,it,?
run_2,This,is,a,-,big,string,and,I,just,keep,tat,-,alking,until,I,-,the,-,w,oooo,o,ow,hello,hello,,141,313,generating,random,re,product,ible,why,not,is,it,?
run_3,This,is,a,-,big,string,and,I,just,keep,tat,-,alking,until,I,reach,-,end,w,oooo,o,ow,hello,hello,,141,313,generating,random,re,product,ible,why,not,is,it,-
run_4,This,is,a,really,big,string,and,-,just,keep,tat,at,alking,-,I,reach,-,end,w,oooo,-,ow,hello,hello,,141,313,generating,random,re,product,ible,why,not,is,it,?
run_5,This,is,a,really,-,string,and,I,just,keep,tat,-,alking,until,I,reach,the,end,w,oooo,-,ow,hello,hello,,141,313,generating,random,-,product,ible,why,not,is,it,?
