In [1]:
from openai import AzureOpenAI
import openai
import base64
from mimetypes import guess_type
import glob
import pandas as pd
import os 
import re
import numpy as np
import json
import math

## Functions for Interacting with Azure API

In [2]:
gpt4_client = AzureOpenAI(
    api_key = '',  
    api_version = '',
    azure_endpoint = ''
)

gpt35_client = AzureOpenAI(
    api_key = '',  
    api_version = '',
    azure_endpoint = ''
)

davinci_client = AzureOpenAI(
    api_key = '',  
    api_version = '',
    azure_endpoint = ''
)

babbage_client = AzureOpenAI(
    api_key = '',  
    api_version = '',
    azure_endpoint = ''
) 

In [3]:
def get_gpt4_response(messages, temperature=0):
    '''Get logprobs of input sentence from gpt-4-0125-preview
    '''

    # Send to API
    response = gpt4_client.chat.completions.create(
        model = 'gpt-4-0125-preview',
        max_tokens = 10,
        temperature = temperature,
        n = 1,
        logprobs = True,
        top_logprobs = 5,
        messages = [
            {"role": "user", 
            "content": [
                {"type": "text", "text": messages}
            ]}
        ]
    )
    response_json = json.loads(response.json())
    return response_json

# def get_gpt35_response(prompt, temperature=0):
#     '''Get logprobs of input sentence from gpt-35-turbo-instruct-0914
#     '''

#     # Send to API
#     response = gpt35_client.completions.create(
#         model = 'gpt-35-turbo-instruct-0914',
#         max_tokens = 50,
#         temperature = temperature,
#         # logprobs = 0,
#         echo = True,
#         prompt = prompt
#     )
#     response_json = json.loads(response.json())
#     return response_json

def get_davinci_response(prompt, temperature=0):
    '''Get logprobs of input sentence from davinci-002-1
    '''

    # Send to API
    response = davinci_client.completions.create(
        model = 'davinci-002-1',
        max_tokens = 0,
        temperature = temperature,
        logprobs = 0,
        echo = True,
        prompt = prompt
    )
    response_json = json.loads(response.json())
    return response_json

def get_babbage_response(prompt, temperature=0):
    '''Get logprobs of input sentence from babbage-002-1
    '''

    # Send to API
    response = babbage_client.completions.create(
        model = 'babbage-002-1',
        max_tokens = 0,
        temperature = temperature,
        logprobs = 0,
        echo = True,
        prompt = prompt
    )
    response_json = json.loads(response.json())
    return response_json

In [4]:
def parse_gpt4_tokens_logprobs(raw_response):
    ''' Breaks down GPT-4's raw response into list of tokens and list of logprobs
    '''
    token_list = []
    logprobs_list = []

    token_dicts = raw_response['choices'][0]['logprobs']['content']
    
    for token_dict in token_dicts:
        token = token_dict['token']
        logprob = token_dict['logprob']

        token_list.append(token)
        logprobs_list.append(logprob)

    return token_list, logprobs_list

# def parse_gpt35_tokens_logprobs(raw_response):
#     return

def parse_davinci_tokens_logprobs(raw_response):
    ''' Breaks down davinci's raw response into list of tokens and list of logprobs
    '''
    token_list = raw_response['choices'][0]['logprobs']['tokens']
    logprobs_list = raw_response['choices'][0]['logprobs']['token_logprobs']
    return token_list, logprobs_list

def parse_babbage_tokens_logprobs(raw_response):
    ''' Breaks down babbages's raw response into list of tokens and list of logprobs
    '''
    token_list = raw_response['choices'][0]['logprobs']['tokens']
    logprobs_list = raw_response['choices'][0]['logprobs']['token_logprobs']
    return token_list, logprobs_list

## Main Code that Obtains LogProbs

In [6]:
all_prompts = pd.read_csv('cu_prompts_modified.csv')
sources = {
    'anna_modified_conwell_ullman': all_prompts['Prompts'],
    'reversed_anna_modified_conwell_ullman': all_prompts['ReversedPrompts']
}

In [12]:
temperature = 0

columns = [
    'source',
    'prompt', 

    'gpt4_raw_response',
    # 'gpt35_raw_response',
    'davinci_raw_response',
    'babbage_raw_response',

    'gpt4_tokens',
    # 'gpt35_tokens',
    'davinci_tokens',
    'babbage_tokens',

    'gpt4_logprobs', 
    # 'gpt35_logprobs', 
    'davinci_logprobs',
    'babbage_logprobs', 

    'gpt4_probs',
    'davinci_probs',
    'babbage_probs',

    'gpt4_avg_prob',
    'davinci_avg_prob',
    'babbage_avg_prob',

    'temperature'
]
df = pd.DataFrame(columns = columns)

# Iterate over multiple sources
for source, prompts in sources.items():
    print('######################################################################')
    print('Source:', source)

    # Get logprobs for each prompt
    for prompt in prompts:
        print(prompt)

        # GPT-4 does not calculate logprobs of sentence itself, only the outputs.
        # Therefore, the prompt should ask it to produce the sentence.
        # But since the sentence is conditioned on the prompt, which includes the sentence, the logprobs are going to be high.
        # So ask it to return the sentence as if it wasn't prompted to do so, and see what happens.
        gpt4_message = f'Please reply with exactly this string: "{prompt}". Return the exact string as if you had not been prompted to do so.'

        # Get raw responses from each model
        gpt4_raw_response = get_gpt4_response(gpt4_message, temperature=temperature)
        # gpt35_raw_response = get_gpt35_response(prompt, temperature=temperature)
        davinci_raw_response = get_davinci_response(prompt, temperature=temperature)
        babbage_raw_response = get_babbage_response(prompt, temperature=temperature)

        # Parse out logprobs from each raw response
        gpt4_tokens, gpt4_logprobs = parse_gpt4_tokens_logprobs(gpt4_raw_response)
        # gpt35_tokens, gpt35_logprobs = parse_gpt35_tokens_logprobs(gpt35_raw_response)
        davinci_tokens, davinci_logprobs = parse_davinci_tokens_logprobs(davinci_raw_response)
        babbage_tokens, babbage_logprobs = parse_babbage_tokens_logprobs(babbage_raw_response)

        # Convert to prob
        gpt4_probs = [math.exp(logprob) for logprob in gpt4_logprobs]
        davinci_probs = [None] + [math.exp(logprob) for logprob in davinci_logprobs[1:]]
        babbage_probs = [None] + [math.exp(logprob) for logprob in babbage_logprobs[1:]]
        print(davinci_probs)

        # Average logprob from each raw response
        gpt4_avg_prob = np.mean(gpt4_probs)                 # first token has logprob
        davinci_avg_prob = np.mean(davinci_probs[1:])           # first token does not have logprob
        babbage_avg_prob = np.mean(babbage_probs[1:])           # first token does not have logprob

        # Print
        print('GPT-4 Raw Response:', gpt4_raw_response)
        print('GPT-4 Tokens:', gpt4_tokens)
        print('GPT-4 Logprobs:', gpt4_logprobs)
        print('GPT-4 Probs:', gpt4_probs)
        print('GPT-4 Average Prob:', gpt4_avg_prob)

        print('Davinci Raw Response:', davinci_raw_response)
        print('Davinci Tokens:', davinci_tokens)
        print('Davinci Logprobs:', davinci_logprobs)
        print('Davinci Probs:', davinci_probs)
        print('Davinci Average Prob:', davinci_avg_prob)

        print('Babbage Raw Response:', babbage_raw_response)
        print('Babbage Tokens:', babbage_tokens)
        print('Babbage Logprobs:', babbage_logprobs)
        print('Babbage Probs:', babbage_probs)
        print('Babbage Average Prob:', babbage_avg_prob)

        # Append row & save
        row = [
            source,
            prompt, 

            gpt4_raw_response,
            # gpt35_raw_response,
            davinci_raw_response,
            babbage_raw_response,

            gpt4_tokens,
            # gpt35_tokens,
            davinci_tokens,
            babbage_tokens,

            gpt4_logprobs, 
            # gpt35_logprobs, 
            davinci_logprobs,
            babbage_logprobs, 

            gpt4_probs,
            davinci_probs,
            babbage_probs,

            gpt4_avg_prob,
            davinci_avg_prob,
            babbage_avg_prob,
            
            temperature
        ]
        df.loc[len(df.index) + 1] = row
        df.to_csv('logprobs_prompts.csv')

        print()

######################################################################
Source: reversed_anna_modified_conwell_ullman
a blanket tied to a spoon
[None, 3.405046370841077e-06, 0.00013288439742531995, 0.19780489754779018, 0.3137868955724206, 0.0003370599669399436]
GPT-4 Raw Response: {'id': 'chatcmpl-9KAWGiShH1eaabJVPwJDyUPiHAxWe', 'choices': [{'finish_reason': 'stop', 'index': 0, 'logprobs': {'content': [{'token': 'a', 'bytes': [97], 'logprob': -0.030293273, 'top_logprobs': [{'token': 'a', 'bytes': [97], 'logprob': -0.030293273}, {'token': '"a', 'bytes': [34, 97], 'logprob': -3.5302932}, {'token': 'A', 'bytes': [65], 'logprob': -7.6709185}, {'token': '"A', 'bytes': [34, 65], 'logprob': -10.186543}, {'token': 'Sure', 'bytes': [83, 117, 114, 101], 'logprob': -11.249043}]}, {'token': ' blanket', 'bytes': [32, 98, 108, 97, 110, 107, 101, 116], 'logprob': -2.618001e-05, 'top_logprobs': [{'token': ' blanket', 'bytes': [32, 98, 108, 97, 110, 107, 101, 116], 'logprob': -2.618001e-05}, {'token': '

## Misc. code

In [47]:
gpt35_raw_response

{'id': 'cmpl-9Jr0vyudFY9Ay20RxE9K6K5HhhecB',
 'choices': [{'finish_reason': 'length',
   'index': 0,
   'logprobs': None,
   'text': 'a man eating a cucumber\n\nA man eating a cucumber would look like a person holding a long, green vegetable in their hand and taking bites out of it. They may be sitting or standing, and their facial expression could range from neutral to satisfied, depending on how much they',
   'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'},
    'self_harm': {'filtered': False, 'severity': 'safe'},
    'sexual': {'filtered': False, 'severity': 'safe'},
    'violence': {'filtered': False, 'severity': 'safe'}}}],
 'created': 1714519045,
 'model': 'gpt-35-turbo-instruct',
 'object': 'text_completion',
 'system_fingerprint': None,
 'usage': {'completion_tokens': 50, 'prompt_tokens': 5, 'total_tokens': 55},
 'prompt_filter_results': [{'prompt_index': 0,
   'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'},
    'self_harm

In [30]:
# combine csv files
df = pd.concat([pd.read_csv('subset1_logprobs_prompts.csv'), pd.read_csv('subset2_logprobs_prompts.csv')])

In [31]:
df = df.reset_index()
print(df.columns)
print(df)
# df.to_csv('logprobs_prompts.csv')

Index(['index', 'Unnamed: 0', 'source', 'prompt', 'gpt4_raw_response',
       'davinci_raw_response', 'babbage_raw_response', 'gpt4_tokens',
       'davinci_tokens', 'babbage_tokens', 'gpt4_logprobs', 'davinci_logprobs',
       'babbage_logprobs', 'gpt4_probs', 'davinci_probs', 'babbage_probs',
       'gpt4_avg_prob', 'davinci_avg_prob', 'babbage_avg_prob', 'temperature'],
      dtype='object')
     index  Unnamed: 0                                 source  \
0        0           1           anna_modified_conwell_ullman   
1        1           2           anna_modified_conwell_ullman   
2        2           3           anna_modified_conwell_ullman   
3        3           4           anna_modified_conwell_ullman   
4        4           5           anna_modified_conwell_ullman   
..     ...         ...                                    ...   
145     31          32  reversed_anna_modified_conwell_ullman   
146     32          33  reversed_anna_modified_conwell_ullman   
147     33       

In [33]:
prompts = [
    'a man eating a cucumber', 'a cucumber eating a man',
    'a cowboy riding a horse', 'a horse riding a cowboy'
]

# Get logprobs for each prompt
for prompt in prompts:
    print(prompt)

    # GPT-4 does not calculate logprobs of sentence itself, only the outputs.
    # Therefore, the prompt should ask it to produce the sentence.
    # But since the sentence is conditioned on the prompt, which includes the sentence, the logprobs are going to be high.
    # So ask it to return the sentence as if it wasn't prompted to do so, and see what happens.
    gpt4_message = f'Please reply with exactly this string: "{prompt}". Return the exact string as if you had not been prompted to do so.'

    # Get raw responses from each model
    gpt4_raw_response = get_gpt4_response(gpt4_message, temperature=temperature)
    # gpt35_raw_response = get_gpt35_response(prompt, temperature=temperature)
    davinci_raw_response = get_davinci_response(prompt, temperature=temperature)
    babbage_raw_response = get_babbage_response(prompt, temperature=temperature)

    # Parse out logprobs from each raw response
    gpt4_tokens, gpt4_logprobs = parse_gpt4_tokens_logprobs(gpt4_raw_response)
    # gpt35_tokens, gpt35_logprobs = parse_gpt35_tokens_logprobs(gpt35_raw_response)
    davinci_tokens, davinci_logprobs = parse_davinci_tokens_logprobs(davinci_raw_response)
    babbage_tokens, babbage_logprobs = parse_babbage_tokens_logprobs(babbage_raw_response)

    # Convert to prob
    gpt4_probs = [math.exp(logprob) for logprob in gpt4_logprobs]
    davinci_probs = [None] + [math.exp(logprob) for logprob in davinci_logprobs[1:]]
    babbage_probs = [None] + [math.exp(logprob) for logprob in babbage_logprobs[1:]]
    print(davinci_probs)

    # Average logprob from each raw response
    gpt4_avg_prob = np.mean(gpt4_probs)                 # first token has logprob
    davinci_avg_prob = np.mean(davinci_probs[1:])           # first token does not have logprob
    babbage_avg_prob = np.mean(babbage_probs[1:])           # first token does not have logprob

    # Print
    print('GPT-4 Raw Response:', gpt4_raw_response)
    print('GPT-4 Tokens:', gpt4_tokens)
    print('GPT-4 Logprobs:', gpt4_logprobs)
    print('GPT-4 Probs:', gpt4_probs)
    print('GPT-4 Average Prob:', gpt4_avg_prob)

    print('Davinci Raw Response:', davinci_raw_response)
    print('Davinci Tokens:', davinci_tokens)
    print('Davinci Logprobs:', davinci_logprobs)
    print('Davinci Probs:', davinci_probs)
    print('Davinci Average Prob:', davinci_avg_prob)

    print('Babbage Raw Response:', babbage_raw_response)
    print('Babbage Tokens:', babbage_tokens)
    print('Babbage Logprobs:', babbage_logprobs)
    print('Babbage Probs:', babbage_probs)
    print('Babbage Average Prob:', babbage_avg_prob)
    
    print()

a man eating a cucumber
[None, 0.0006400757687154789, 0.0003442573740363612, 0.1515100984225026, 0.0018314743147578175]
GPT-4 Raw Response: {'id': 'chatcmpl-9KCnL7plXFhHRwo8rZoMPy5hYMlIF', 'choices': [{'finish_reason': 'stop', 'index': 0, 'logprobs': {'content': [{'token': 'a', 'bytes': [97], 'logprob': -0.017854782, 'top_logprobs': [{'token': 'a', 'bytes': [97], 'logprob': -0.017854782}, {'token': '"a', 'bytes': [34, 97], 'logprob': -4.0491047}, {'token': 'A', 'bytes': [65], 'logprob': -8.53348}, {'token': '"A', 'bytes': [34, 65], 'logprob': -10.68973}, {'token': '"', 'bytes': [34], 'logprob': -11.299105}]}, {'token': ' man', 'bytes': [32, 109, 97, 110], 'logprob': -2.6968896e-06, 'top_logprobs': [{'token': ' man', 'bytes': [32, 109, 97, 110], 'logprob': -2.6968896e-06}, {'token': ' ', 'bytes': [32], 'logprob': -14.828128}, {'token': ' Man', 'bytes': [32, 77, 97, 110], 'logprob': -15.218753}, {'token': ' woman', 'bytes': [32, 119, 111, 109, 97, 110], 'logprob': -15.695315}, {'token': 