In [142]:
import ast
import json
import glob
import pandas as pd
import re
import requests
import string

from pathlib import Path

# Load results

In [244]:
def clean_result(df_row):
    '''
    Turn a row's raw model output into a list of lowercase words.

    The text under df_row['result'] is lowercased, every punctuation character
    except the hyphen is removed (hyphens are kept so infixed forms such as
    'b-iz-all' stay in one piece), and the remainder is split on whitespace.

    Returns:
        list of str: the cleaned tokens, possibly empty.
    '''
    lowered = df_row['result'].lower()
    # Deletion table: all of string.punctuation minus '-'.
    strip_table = str.maketrans('', '', string.punctuation.replace("-", ""))
    return lowered.translate(strip_table).split()

def _first_word_containing(df_row, marker):
    '''
    Return the first cleaned word of the row's model output that contains `marker`.

    Shared implementation for cr_fuckin / cr_iz / cr_diddly, which differ only
    in the marker they search for.

    Raises:
        IndexError: if no cleaned word contains `marker`. The exception type
            matches what indexing an empty match list raised before, but the
            message now names the marker and the offending output.
    '''
    for word in clean_result(df_row):
        if marker in word:
            return word
    raise IndexError(
        f"no word containing {marker!r} in model output {df_row['result']!r}"
    )


def cr_fuckin(df_row):
    '''Return the first cleaned output word containing 'fuckin'.'''
    return _first_word_containing(df_row, 'fuckin')


def cr_iz(df_row):
    '''Return the first cleaned output word containing '-iz-'.'''
    return _first_word_containing(df_row, '-iz-')


def cr_diddly(df_row):
    '''Return the first cleaned output word containing 'diddly'.'''
    return _first_word_containing(df_row, 'diddly')

def create_new_df(filename):
    '''
    Load a results JSON file into a DataFrame with one row per word.

    The JSON is read with the words as columns, so the frame is transposed to
    put one word per row. The words are then moved into a 'word' column, the
    rows get a 0..n-1 integer index named 'index', and the boolean
    'predicted_correctly' column is converted to 0/1 integers.

    Args:
        filename: path to the results JSON file.

    Returns:
        pandas.DataFrame: the reshaped results.
    '''
    # transpose() replaces the deprecated DataFrame.swapaxes("index", "columns").
    df = pd.read_json(filename).transpose()

    # Move the words out of the index into a 'word' column; reset_index
    # leaves a fresh 0..n-1 integer index, which is then labelled 'index'.
    df = df.reset_index().rename(columns={'index': 'word'})
    df = df.rename_axis('index')

    # convert T/F of 'predicted_correctly' to int
    df['predicted_correctly'] = df['predicted_correctly'].astype(int)

    return df

In [248]:
# Result files, one per infix type, under the text-davinci-003 results folder.
expletive_results_path, iz_results_path, diddly_results_path = (
    'results/text-davinci-003_results/fuckin/fuckin_5shot_v0.json',
    'results/text-davinci-003_results/iz/iz_3shot_v1.json',
    'results/text-davinci-003_results/diddly/diddly_3shot_v1.json',
)

# Load each file into a per-word DataFrame, in the same order as the paths.
expletive_results, iz_results, diddly_results = [
    create_new_df(results_path)
    for results_path in (expletive_results_path, iz_results_path, diddly_results_path)
]

In [249]:
# Add a 'clean_result' column to each frame, holding the infixed word
# extracted from the raw model output by the matching cr_* function.
for frame, extract in (
    (expletive_results, cr_fuckin),
    (iz_results, cr_iz),
    (diddly_results, cr_diddly),
):
    frame['clean_result'] = frame.apply(extract, axis=1)

In [255]:
# Preview the first rows only instead of rendering the whole frame.
iz_results.head(10)

Unnamed: 0_level_0,word,num_examples,examples,result,correct_infix,predicted_correctly,logprobs,clean_result
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,ahead,3,"[[offal, off-iz-al], [aroma, ar-iz-oma], [work...",Ahead = A-iz-head,ah-iz-ead,0,"[{' ': -0.02799315, ' ': -3.717036, ' ': -6.5...",a-iz-head
1,ball,3,"[[throne, thr-iz-one], [lay, l-iz-ay], [shit, ...",'b-iz-all',b-iz-all,1,"[{' ': -0.018682674, ' ': -4.154232, ' ': -6....",b-iz-all
2,bank,3,"[[bats, b-iz-ats], [ball, b-iz-all], [straw, s...",The infixed word would be 'b-iz-ank'.,b-iz-ank,1,"[{' ': -0.019420315, ' ': -4.0985103, ' ': -6...",b-iz-ank
3,bar,3,"[[ahead, ah-iz-ead], [syrup, s-iz-yrup], [doub...",Answer: 'b-iz-ar',b-iz-ar,1,"[{' ': -0.020190492, ' ': -4.0340643, ' ': -6...",b-iz-ar
4,bark,3,"[[bank, b-iz-ank], [stars, st-iz-ars], [cocain...",B-iz-ark,b-iz-ark,1,"[{' ': -0.018716563000000002, ' ': -4.1590667,...",b-iz-ark
5,behave,3,"[[strong, str-iz-ong], [bar, b-iz-ar], [lake, ...",'beh-iz-ave',beh-iz-ave,1,"[{' ': -0.015786301000000003, ' ': -4.3116636,...",beh-iz-ave
6,behind,3,"[[bark, b-iz-ark], [behave, beh-iz-ave], [come...",'beh-iz-ind',beh-iz-ind,1,"[{' ': -0.02089729, ' ': -4.0209603, ' ': -6....",beh-iz-ind
7,beyond,3,"[[doll, d-iz-oll], [behind, beh-iz-ind], [meri...",be-iz-yond,bey-iz-ond,0,"[{' ': -0.024678594, ' ': -3.8205884, ' ': -6...",be-iz-yond
8,bitch,3,"[[beyond, bey-iz-ond], [black, bl-iz-ack], [do...",'b-iz-itch',b-izn-itch,0,"[{' ': -0.015076797000000001, ' ': -4.464945, ...",b-iz-itch
9,black,3,"[[damn, d-iz-amn], [bitch, b-izn-itch], [coinc...",'bl-iz-ack',bl-iz-ack,1,"[{' ': -0.019239651, ' ': -4.1069584, ' ': -6...",bl-iz-ack


# Error 1: Does it infix?
- If you remove the infix from the output, is the input word retained?

In [273]:
def infix_eval(infix, results_df):
    '''
    Percentage of rows whose model output is the input word with only the infix added.

    For each row, every occurrence of `infix` is removed from 'clean_result'
    and the remainder is compared with the stripped, lowercased 'word'.

    Args:
        infix: the infix string to remove, e.g. '-iz-'.
        results_df: DataFrame with 'clean_result' and 'word' columns.

    Returns:
        float: percentage (0-100) of matching rows, or NaN for an empty frame
            (previously an empty frame raised ZeroDivisionError).
    '''
    total = len(results_df)
    if total == 0:
        return float('nan')

    model_outputs = results_df['clean_result'].tolist()
    originals = results_df['word'].tolist()

    matches = sum(
        output.replace(infix, '') == original.strip().lower()
        for output, original in zip(model_outputs, originals)
    )

    return matches / total * 100

In [274]:
# Percentage of -iz- outputs that reduce to the input word once '-iz-' is removed.
infix_eval('-iz-', iz_results)

97.0

In [240]:
# Count (not percentage) of expletive outputs that reduce to the input word
# once '-fuckin-' is removed.
# `results` was never defined in this notebook; the expletive results frame is
# used instead. `fuckin_results` and `base` are reused by the syllable and
# stress cells further down.
fuckin_results = expletive_results['clean_result'].tolist()
base = expletive_results['word'].tolist()

infix_results = [
    output.replace('-fuckin-', '') == word.strip().lower()
    for output, word in zip(fuckin_results, base)
]

sum(infix_results)

96

# Error 2: Does it respect syllable boundaries?
- Is it actually inserting the infix in between syllables (even if they're the wrong ones)?

In [213]:
# load relevant dataset
expletive_path = 'data/infix_dataset/expletive_data.csv'
# Read from expletive_path; the previous argument, `fuckin_dataset`, was never defined.
expletive_dataset = pd.read_csv(expletive_path, index_col=0)
# Syllable data; the cells below read its 'word', 'syllables' and 'stress_pattern' columns.
full_dataset = pd.read_csv('data/syllable_data/all_syllable_data.csv')

In [236]:
def syllable_eval(infix, model_output, actual_syllables):
    '''
    Check whether the infix was inserted at a syllable boundary.

    The part of the model output before the first infix must equal the
    concatenation of the first k syllables for some k from 0 to
    len(actual_syllables). This means an empty prefix or the whole word also
    counts as a boundary. Comparison is case-insensitive.

    This method of evaluation only really applies to expletive and diddly infixation

    Returns:
        bool: True if the prefix ends on a syllable boundary, else False.
    '''
    prefix = model_output.lower().split(infix)[0]
    syllables = [syl.lower() for syl in actual_syllables]

    # Fast path; indexing here also raises IndexError for an empty syllable list.
    if prefix == syllables[0]:
        return True

    # Every cumulative join of leading syllables: '', s0, s0+s1, ..., whole word.
    boundaries = [''.join(syllables[:count]) for count in range(len(syllables) + 1)]
    return prefix in boundaries

In [237]:
# Count expletive outputs whose infix sits on a syllable boundary.
syllable_results = []

for word, result in zip(base, fuckin_results):
    # The syllable dataset is matched on the uppercased word.
    matches = full_dataset.loc[full_dataset['word'] == word.strip().upper()]
    if matches.empty:
        # Explicit check instead of catching IndexError, so an IndexError from
        # the evaluation itself is no longer reported as a missing word.
        print("Word not found in dataset:", word)
        continue

    # 'syllables' holds a Python-literal string, hence literal_eval.
    correct_syllables = ast.literal_eval(matches.iloc[0]['syllables'])
    syllable_results.append(syllable_eval('-fuckin-', result, correct_syllables))

sum(syllable_results)
        
        

86

# Error 3: Does it respect stress pattern?
- Even if it is inserting between syllables, is it doing it between the right ones?

In [200]:
def stress_eval(infix, model_output, stress_pattern):
    '''
    Check whether the infix was placed right before the primary-stressed syllable.

    The text after the last infix in the model output is compared with the
    lowercased concatenation of the syllables from the first entry whose stress
    marker is '1' through the end of the word (the stressed syllable itself is
    included). If no entry has marker '1', the comparison is against the whole
    word.
    This code assumes that primary stress is occurring internally and that there is only one.
    Presumably each entry of `stress_pattern` is a (syllable, stress) pair -- confirm
    against the syllable dataset.

    Returns:
        bool: True if the output's tail matches the expected tail.
    '''
    tail = model_output.lower().split(infix)[-1]

    # Position of the first primary-stressed entry; 0 when none is found.
    stressed_at = next(
        (pos for pos, entry in enumerate(stress_pattern) if entry[1] == '1'),
        0,
    )
    expected_tail = ''.join(entry[0].lower() for entry in stress_pattern[stressed_at:])

    return tail == expected_tail

In [242]:
# Count expletive outputs whose infix sits right before the primary-stressed syllable.
stress_results = []

for word, result in zip(base, fuckin_results):
    # The syllable dataset is matched on the uppercased word.
    matches = full_dataset.loc[full_dataset['word'] == word.strip().upper()]
    if matches.empty:
        # Explicit check instead of catching IndexError, so an IndexError from
        # the evaluation itself is no longer reported as a missing word.
        print("Word not found in dataset:", word)
        continue

    # 'stress_pattern' holds a Python-literal string, hence literal_eval.
    correct_stress = ast.literal_eval(matches.iloc[0]['stress_pattern'])
    stress_results.append(stress_eval('-fuckin-', result, correct_stress))

sum(stress_results)

59

# Tokenization with tiktoken

In [275]:
# NOTE(review): consider moving this import to the imports cell at the top of the notebook.
import tiktoken
# text-davinci-002 and 003 use p50k_base
# Tokenizer looked up by model name; get_token_bytes reads this module-level `encoding`.
encoding = tiktoken.encoding_for_model('text-davinci-003')

In [279]:
def get_token_bytes(word):
    '''
    Tokenize `word` with the module-level `encoding` and return each token's bytes.

    Returns:
        list: one decoded bytes value per token, in token order.
    '''
    return list(map(encoding.decode_single_token_bytes, encoding.encode(word)))

In [283]:
# Take an explicit copy of the selected columns so that adding 'tokens' writes
# to an independent frame and does not trigger SettingWithCopyWarning.
tiktoken_df = expletive_results[['word', 'correct_infix', 'clean_result', 'predicted_correctly']].copy()
tiktoken_df['tokens'] = tiktoken_df['word'].apply(get_token_bytes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tiktoken_df['tokens'] = tiktoken_df['word'].apply(get_token_bytes)


In [284]:
# Preview the first rows only instead of rendering the whole frame.
tiktoken_df.head(10)

Unnamed: 0_level_0,word,correct_infix,clean_result,predicted_correctly,tokens
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,advance,ad-fuckin-vance,ad-fuckin-vance,1,"[b'ad', b'vance']"
1,amalgamated,amalga-fuckin-mated,amal-fuckin-gamated,0,"[b'am', b'alg', b'am', b'ated']"
2,anticipatory,antici-fuckin-patory,an-fuckin-ticipatory,0,"[b'ant', b'icip', b'atory']"
3,authentic,au-fuckin-thentic,authe-fuckin-ntic,0,"[b'authent', b'ic']"
4,autopsy,au-fuckin-topsy,au-fuckin-topsy,1,"[b'aut', b'opsy']"
5,awake,a-fuckin-wake,awake-fuckin-,0,"[b'aw', b'ake']"
6,aware,a-fuckin-ware,a-fuckin-ware,1,[b'aware']
7,bacteria,bac-fuckin-teria,bacte-fuckin-ria,0,"[b'b', b'acteria']"
8,bagatelle,baga-fuckin-telle,baga-fuckin-tel,0,"[b'bag', b'atel', b'le']"
9,balloon,ba-fuckin-lloon,bal-fuckin-loon,0,"[b'ball', b'oon']"
