In [2]:
import ast
import json
import glob
import pandas as pd
import re
import requests
import string

from pathlib import Path

# Load results

In [3]:
def create_new_df(filename):
    """Load a results JSON file into a DataFrame with one row per top-level key.

    The JSON is read with ``pd.read_json`` and transposed, so the labels that
    ``read_json`` produced as columns (the test words) become rows. Those
    labels are then moved into a regular 'word' column.

    Args:
        filename: Path or file-like object accepted by ``pd.read_json``.
            The data must contain a 'predicted_correctly' field.

    Returns:
        DataFrame with a 'word' column, one column per remaining JSON field,
        an integer index named 'index', and 'predicted_correctly' cast to int.
    """
    # DataFrame.swapaxes is deprecated since pandas 2.1; transpose() is the
    # documented replacement and gives the same result for index<->columns.
    df = pd.read_json(filename).transpose()

    # use integer indices and set words to new column
    df = (
        df.reset_index()
          .rename(columns={'index': 'word'})
          .rename_axis('index')
    )

    # convert T/F of 'predicted_correctly' to int
    df['predicted_correctly'] = df['predicted_correctly'].astype(int)

    return df


def clean_result(df_row):
    """Normalise a row's raw model output into a list of lowercase tokens.

    Reads ``df_row['result']``, lowercases it, removes every character in
    ``string.punctuation`` except the hyphen, and splits on whitespace.

    Args:
        df_row: Mapping-like row exposing a string under the 'result' key.

    Returns:
        List of cleaned tokens (hyphens inside tokens are preserved).
    """
    lowered = df_row['result'].lower()
    # Hyphens are kept because they delimit the infix inside a word.
    removable = string.punctuation.replace("-", "")
    return lowered.translate(str.maketrans('', '', removable)).split()


# Infix-specific functions to use for df.apply()
def cr_fuckin(df_row):
    """Return the first cleaned token of the row's output that contains 'fuckin'.

    Raises:
        IndexError: if no token of the cleaned output contains 'fuckin'.
    """
    tokens = clean_result(df_row)
    matching = list(filter(lambda token: 'fuckin' in token, tokens))
    return matching[0]
    

def cr_iz(df_row):
    """Return the first cleaned token of the row's output that contains '-iz-'.

    Raises:
        IndexError: if no token of the cleaned output contains '-iz-'.
    """
    tokens = clean_result(df_row)
    matching = list(filter(lambda token: '-iz-' in token, tokens))
    return matching[0]


def cr_diddly(df_row):
    """Return the first cleaned token of the row's output that contains 'diddly'.

    Raises:
        IndexError: if no token of the cleaned output contains 'diddly'.
    """
    tokens = clean_result(df_row)
    matching = list(filter(lambda token: 'diddly' in token, tokens))
    return matching[0]

In [52]:
# Location of the saved model outputs for the expletive ('fuckin') run.
# The iz and diddly runs are currently disabled (commented out).
expletive_results_path = 'results/text-davinci-003_results/fuckin/fuckin_5shot_v0.json'
# iz_results_path = 'results/text-davinci-003_results/iz/iz_3shot_v1.json'
# diddly_results_path = 'results/text-davinci-003_results/diddly/diddly_3shot_v1.json'

# Load the outputs into a DataFrame with one row per word (see create_new_df).
expletive_results = create_new_df(expletive_results_path)
# iz_results = create_new_df(iz_results_path)
# diddly_results = create_new_df(diddly_results_path)

In [55]:
# Add a 'clean_result' column: cr_fuckin is applied row-wise (axis=1) and returns
# the first cleaned token of each output that contains the infix.
expletive_results['clean_result'] = expletive_results.apply(cr_fuckin, axis=1)
# iz_results['clean_result'] = iz_results.apply(cr_iz, axis=1)
# diddly_results['clean_result'] = diddly_results.apply(cr_diddly, axis=1)

# Error 1: Does it infix?
- If you remove the infix from the output, is the input word retained?

In [53]:
def infix_at_edge(word, infix):
    """Tell whether `infix` sits at the very start or very end of `word`.

    A single optional hyphen on either side of the infix is tolerated, so
    'fuckin-word', '-fuckin-word' and 'word-fuckin' all count as edge hits.

    Args:
        word: The string to inspect.
        infix: The literal infix text (escaped before use in the regex).

    Returns:
        True if the infix is attached to either edge of the word, else False.
    """
    escaped = re.escape(infix)
    # First alternative anchors at the start, second anchors at the end.
    edge_pattern = rf"(^(-?{escaped}-?))|((-?{escaped}-?)$)"
    return re.search(edge_pattern, word) is not None

def infix_eval(infix, results_df):
    '''
    Scores how often the model performed a real infixation.

    An output fails if the infix (ignoring its dashes) is attached to the
    start or end of the output word. Otherwise it passes only when removing
    the infix, either with its dashes or without them, leaves exactly the
    original word (stripped and lowercased), i.e. nothing else was inserted
    or deleted.

    Reads the 'clean_result' and 'word' columns of results_df and returns
    the percentage (0-100) of rows that pass.
    '''
    bare_infix = infix.replace('-', '')
    outputs = results_df['clean_result'].tolist()
    base_words = results_df['word'].tolist()

    def keeps_base_word(output, base_word):
        # An infix stuck to either edge of the word is never a valid infixation.
        if infix_at_edge(output, bare_infix):
            return False

        without_dashed = output.replace(infix, '')
        without_bare = output.replace(bare_infix, '')
        target = base_word.strip().lower()

        return (without_dashed == target) or (without_bare == target)

    verdicts = [keeps_base_word(out, base) for out, base in zip(outputs, base_words)]

    return sum(verdicts)/len(results_df) * 100

In [56]:
# Percentage of expletive outputs that pass the infixation check in infix_eval.
infix_eval('-fuckin-', expletive_results)

95.0

# Error 2: Does it respect syllable boundaries?
- Is it actually inserting the infix in between syllables (even if they're the wrong ones)?

In [None]:
# Load the reference datasets. NOTE: these are NOT the model results.
expletive_path = 'data/infix_dataset/expletive_data.csv'
expletive_dataset = pd.read_csv(expletive_path, index_col=0)
# Later cells look words up in this frame and read its 'word', 'syllables'
# and 'stress_pattern' columns.
full_dataset = pd.read_csv('data/syllable_data/all_syllable_data.csv')

In [None]:
def syllable_eval(infix, model_output, actual_syllables):
    '''
    Checks whether the text before the infix ends on a syllable boundary.

    The lowercased model output is split on `infix`, and the first piece is
    compared against every cumulative concatenation of the lowercased
    syllables. Note that the empty string and the full word are both part of
    that set, so an output that starts with the infix, or that contains no
    infix but equals the whole word, also returns True.

    This method of evaluation only really applies to expletive and diddly infixation.
    Raises IndexError if `actual_syllables` is empty.
    '''
    prefix = model_output.lower().split(infix)[0]
    syllables = [syl.lower() for syl in actual_syllables]

    # Fast path: the infix comes right after the first syllable.
    if prefix == syllables[0]:
        return True

    # Collect every boundary: nothing, first syllable, first two, ..., whole word.
    boundaries = {''}
    running = ''
    for syl in syllables:
        running += syl
        boundaries.add(running)

    return prefix in boundaries

In [None]:
# Count the expletive outputs whose infix lands on a syllable boundary.
syllable_results = []
originals = expletive_results['word'].tolist()
# Evaluate the cleaned, single-word model output. Indexing the DataFrame with
# an integer (expletive_results[i]) would be a column lookup and raise KeyError.
model_outputs = expletive_results['clean_result'].tolist()

for original, model_output in zip(originals, model_outputs):
    # Look the word up in the syllable dataset using its upper-cased form.
    matches = full_dataset.loc[full_dataset['word'] == original.strip().upper()]
    if matches.empty:
        print("Word not found in dataset:", original)
        continue

    # The 'syllables' cell holds a Python literal stored as text.
    correct_syllables = ast.literal_eval(matches['syllables'].iloc[0])
    syllable_results.append(syllable_eval('-fuckin-', model_output, correct_syllables))

sum(syllable_results)

# Error 3: Does it respect the stress pattern?
- Even if it is inserting between syllables, is it doing it between the right ones?

In [None]:
def stress_eval(infix, model_output, stress_pattern):
    '''
    Checks whether the infix was placed immediately before the syllable
    carrying primary stress.

    `stress_pattern` is a sequence of entries whose first item is the syllable
    text and whose second item is its stress marker; the first entry marked
    '1' is taken as the primary stress (index 0 is used if none is marked).
    The lowercased syllables from that entry onward are concatenated and
    compared with the part of the lowercased model output that follows the
    last occurrence of `infix`.

    This code assumes that primary stress is occurring internally and that
    there is only one.
    '''
    tail = model_output.lower().split(infix)[-1]

    # Position of the first primary-stressed syllable, defaulting to the start.
    stressed_at = next(
        (pos for pos, entry in enumerate(stress_pattern) if entry[1] == '1'),
        0,
    )
    expected_tail = ''.join(entry[0].lower() for entry in stress_pattern[stressed_at:])

    return tail == expected_tail

In [None]:
# Count the expletive outputs whose infix sits right before the primary stress.
stress_results = []
# Defined here so this cell does not rely on a variable left over from another cell.
originals = expletive_results['word'].tolist()
# Evaluate the cleaned, single-word model output. Indexing the DataFrame with
# an integer (expletive_results[i]) would be a column lookup and raise KeyError.
model_outputs = expletive_results['clean_result'].tolist()

for original, model_output in zip(originals, model_outputs):
    # Look the word up in the syllable dataset using its upper-cased form.
    matches = full_dataset.loc[full_dataset['word'] == original.strip().upper()]
    if matches.empty:
        print("Word not found in dataset:", original)
        continue

    # The 'stress_pattern' cell holds a Python literal stored as text.
    correct_stress = ast.literal_eval(matches['stress_pattern'].iloc[0])
    stress_results.append(stress_eval('-fuckin-', model_output, correct_stress))

sum(stress_results)

# tiktoken

In [None]:
import tiktoken
# text-davinci-002 and 003 use p50k_base
# Ask tiktoken for the encoding registered for this model name; the
# module-level `encoding` is used by get_token_bytes below.
encoding = tiktoken.encoding_for_model('text-davinci-003')

In [None]:
def get_token_bytes(word):
    """Tokenize `word` with the module-level `encoding` and return the bytes of each token.

    Args:
        word: Text to encode.

    Returns:
        List with one bytes object per token, in token order.
    """
    return [
        encoding.decode_single_token_bytes(token_id)
        for token_id in encoding.encode(word)
    ]

# Take an explicit copy of the selected columns so that adding 'tokens' below
# writes to an independent frame and does not raise SettingWithCopyWarning.
tiktoken_df = expletive_results[['word', 'correct_infix', 'clean_result', 'predicted_correctly']].copy()
# One list of token byte strings per original word.
tiktoken_df['tokens'] = tiktoken_df['word'].apply(get_token_bytes)

In [None]:
# Display the per-word token table.
tiktoken_df