In [None]:
import os
from openai import OpenAI
import pandas as pd
import ast 
from datetime import date

# Working Dir.
# All prompt, data and output paths used below are relative to this folder.
# Defaults to the original project location; set IRPD_WORK_DIR to run the
# notebook from a different checkout without editing this cell.
os.chdir(os.environ.get('IRPD_WORK_DIR', '/Users/fogellmcmuffin/Documents/ra/team_discussions/AI/'))

# The OpenAI key must come from the environment, never from the notebook source.
# NOTE(review): a key was previously hardcoded on this line; treat it as
# exposed and revoke it in the OpenAI dashboard.
if not os.environ.get('OPENAI_API_KEY'):
  raise RuntimeError(
    "OPENAI_API_KEY is not set. Export it in the shell that launches Jupyter "
    "(or load it from an untracked .env file) before running this notebook."
  )

In [None]:
####################
## Model Settings ##
####################

# Model Settings
# These constants are passed to every chat completion request (see GPT_response)
# and are recorded in the test info file (see test_info).
MODEL = 'gpt-4o-2024-05-13'
TEMPERATURE = 0
MAX_TOKENS = 1300
TOP_P = 1
FREQUENCY_PENALTY = 0
PRESENCE_PENALTY = 0

# Calling for OpenAI client
# The API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(
    organization='org-WLFAmqjnKmywM0wd6loMyGJq',    # RA_WORK
    project='proj_vOr6WeeCFk5IjZLCdksFLWUd',    # IRPD_CODING
    api_key=os.environ.get('OPENAI_API_KEY'),
)

In [None]:
###############
## Functions ##
###############

## General/Help functions
def file_to_string(file_path):  # File read-to-string functions
  """Return the full contents of the text file at ``file_path`` as a string.

  The file is decoded as UTF-8 explicitly so the result does not depend on
  the platform's default locale encoding (the response parser in this
  notebook also reads UTF-8).

  Raises:
    OSError: if the file cannot be opened.
    UnicodeDecodeError: if the file is not valid UTF-8.
  """
  with open(file_path, 'r', encoding='utf-8') as file:
    return file.read()


def write_file(file_path, file_write):  # File write function
  """Write the string ``file_write`` to ``file_path``, replacing any existing file.

  The file is encoded as UTF-8 explicitly: the response parser in this
  notebook reads these files back as UTF-8, and model output can contain
  non-ASCII characters that a non-UTF-8 locale default could not encode.

  Raises:
    OSError: if the file cannot be opened for writing.
  """
  with open(file_path, 'w', encoding='utf-8') as file:
    file.write(file_write)


def test_info(test, data_name):  # Function to get test info
  """Build the plain-text header describing a test run.

  The text lists the module-level model settings (MODEL, TEMPERATURE,
  MAX_TOKENS, TOP_P, FREQUENCY_PENALTY, PRESENCE_PENALTY) followed by the
  test label, the data file name(s) and today's date.

  Args:
    test: label of the run; must be a string.
    data_name: description of the data used; must be a string.

  Returns:
    The multi-line info string, ending with a newline.
  """
  report_lines = [
    'ChatGPT Model Information:',
    'format: OpenAI API',
    f'model: {MODEL}',
    f'temperature: {TEMPERATURE}',
    f'max tokens: {MAX_TOKENS}',
    f'top p: {TOP_P}',
    f'frequency penalty: {FREQUENCY_PENALTY}',
    f'presence penalty: {PRESENCE_PENALTY}',
    '',  # blank line between the two sections
    'Test Information:',
    'test: ' + test,
    'data: ' + data_name,
    f'date: {date.today()}',
  ]
  return '\n'.join(report_lines) + '\n'


def get_test_dir(test_type='test', cycle=False):  # Function to get test directory
  """Return the directory name of the next (or the latest) test run.

  Numbered runs are looked up relative to the current working directory:
  ``output/test_<n>`` for ``test_type='test'`` and ``output/_subtests/<n>``
  for ``test_type='subtest'``. Entries that do not follow that naming
  pattern are ignored.

  Args:
    test_type: ``'test'`` or ``'subtest'``.
    cycle: when False, return the name for a new run (highest existing
      number + 1, or 1 when none exist yet); otherwise return the name of
      the highest-numbered existing run.

  Returns:
    The directory name only (e.g. ``'test_4'`` or ``'7'``), not a full path.

  Raises:
    ValueError: if ``test_type`` is unknown, or if an existing run is
      requested but none is present.
  """
  if test_type == 'test':
    parent = 'output/'
    prefix = 'test_'
  elif test_type == 'subtest':
    parent = 'output/_subtests/'
    prefix = ''
  else:
    raise ValueError(f"test_type must be 'test' or 'subtest', got {test_type!r}")

  # A missing parent folder simply means no runs exist yet.
  entries = os.listdir(parent) if os.path.isdir(parent) else []
  numbers = [
    int(name[len(prefix):])
    for name in entries
    if name.startswith(prefix) and name[len(prefix):].isdigit()
  ]

  if cycle == False:  # noqa: E712 - keeps the original comparison semantics
    number = max(numbers, default=0) + 1
  else:
    if not numbers:
      raise ValueError(f"No existing {test_type} directories found in {parent!r}")
    number = max(numbers)

  return f"{prefix}{number}"


## Final output functions
def extract_dict_from_file(file_path):  # Extracting info from GPT response text files
    """Parse one GPT response file into a flat record.

    The response is expected to contain the text "Step-by-step Reasoning: "
    followed by the reasoning, then "Python Dictionary:" followed by a
    Python dict literal that has an 'assigned_categories' list.

    Args:
        file_path: path of the UTF-8 response text file.

    Returns:
        A dict whose first key is 'gpt_reasoning', followed by every key of
        the parsed dictionary, plus one ``<category>: 1`` entry for each
        item in 'assigned_categories'.

    Raises:
        ValueError: if no dict literal can be found or parsed.
        KeyError: if the parsed dict has no 'assigned_categories' key.
    """
    with open(file_path, 'r', encoding='utf-8') as file:    # Opening response file
        text = file.read()

    start_keyword = "Step-by-step Reasoning: "
    end_keyword = "Python Dictionary:"
    reasoning_at = text.find(start_keyword)
    dict_marker_at = text.find(end_keyword)

    # Extracting dictionary
    # Prefer the first '{' after the dictionary marker so braces inside the
    # reasoning text are not mistaken for the dictionary; fall back to the
    # first '{' in the file when the marker is absent or nothing follows it.
    start = text.find('{', dict_marker_at) if dict_marker_at != -1 else -1
    if start == -1:
        start = text.find('{')
    end = text.find('}', start) + 1 if start != -1 else 0
    if start == -1 or end == 0:
        raise ValueError(f"No dictionary found in GPT response file: {file_path}")

    try:
        cat_dict = ast.literal_eval(text[start:end])  # Turning dictionary string into python dictionary
    except (ValueError, SyntaxError) as err:
        raise ValueError(f"Could not parse dictionary in GPT response file: {file_path}") from err
    if not isinstance(cat_dict, dict):
        raise ValueError(f"Expected a dictionary in GPT response file: {file_path}")

    for category in cat_dict['assigned_categories']:  # Making a binary key for each assigned category
        cat_dict[category] = 1

    # Extracting GPT reasoning
    # str.find returns -1 for a missing keyword; handle that explicitly
    # instead of slicing from a meaningless offset.
    reason_start = reasoning_at + len(start_keyword) if reasoning_at != -1 else 0
    reason_end = dict_marker_at if dict_marker_at != -1 else start
    reasoning = text[reason_start:reason_end].strip()

    # Making sure gpt_reasoning is the first key
    return {'gpt_reasoning': reasoning, **cat_dict}


def response_df(response_dir, test_df):  # Turning dictionary list into GPT coded dataframe
    """Combine all parsed GPT responses in a folder with the test data.

    Each regular, non-hidden file in ``response_dir`` is parsed with
    ``extract_dict_from_file``. The records become one row each; the
    'assigned_categories' column is dropped, missing category flags are
    filled with 0, and the result is outer-merged onto ``test_df`` by
    'window_number'.

    Args:
        response_dir: folder holding the response text files.
        test_df: DataFrame with a 'window_number' column.

    Returns:
        The merged DataFrame.

    Raises:
        ValueError: if ``response_dir`` contains no response files.
    """
    resp_list = []

    # Sorted so the column order of the result does not depend on the
    # arbitrary order returned by os.listdir.
    for file in sorted(os.listdir(response_dir)):
        file_path = os.path.join(response_dir, file)
        # Skip hidden files (e.g. macOS .DS_Store) and sub-directories,
        # which are not GPT responses and would break the parser.
        if file.startswith('.') or not os.path.isfile(file_path):
            continue
        resp_list.append(extract_dict_from_file(file_path))

    if not resp_list:
        raise ValueError(f"No GPT response files found in {response_dir!r}")

    df = pd.DataFrame.from_records(resp_list)
    df = df.drop(['assigned_categories'], axis=1)
    df = df.fillna(0)  # categories not assigned to a window are coded 0

    return pd.merge(test_df, df, on='window_number', how='outer')


def ucoop_udef_rename(df, prefix):  # Function to add ucoop or udef prefix to created category columns
    """Return a copy of ``df`` with every category column renamed to ``<prefix>_<name>``.

    The columns 'summary', 'unilateral_cooperation', 'window_number' and
    'gpt_reasoning' keep their names; all other columns are treated as
    category columns. All four fixed columns must be present, otherwise
    pandas raises KeyError.
    """
    fixed_columns = ['summary', 'unilateral_cooperation', 'window_number', 'gpt_reasoning']

    # Whatever is left after removing the fixed columns is a category column.
    leftover_names = df.drop(columns=fixed_columns).columns.to_list()

    prefixed_names = {}
    for name in leftover_names:
        prefixed_names[name] = f'{prefix}_{name}'

    return df.rename(columns=prefixed_names)


## Prompt request functions
def GPT_response(sys, user):  # Simple GPT response function
  """Send one system + user prompt pair to the chat completions API.

  Uses the module-level ``client`` and the model setting constants.

  Args:
    sys: system prompt (converted with ``str``).
    user: user prompt (converted with ``str``).

  Returns:
    The message content of the first choice in the response.
  """
  conversation = [
    {"role": "system", "content": str(sys)},
    {"role": "user", "content": str(user)},
  ]
  request_settings = dict(
    model=MODEL,
    temperature=TEMPERATURE,
    max_tokens=MAX_TOKENS,
    top_p=TOP_P,
    frequency_penalty=FREQUENCY_PENALTY,
    presence_penalty=PRESENCE_PENALTY,
  )

  # Requesting chat completion
  completion = client.chat.completions.create(messages=conversation, **request_settings)

  return completion.choices[0].message.content  # GPT response var


def stage_1_output(treatment, test_type='test'):  # Stage 1 function: Creating categories
  """Run stage 1: send all summaries of a treatment to GPT in one request per instance type.

  For both the 'ucoop' and 'udef' instance types this reads the stage 1
  system prompt and the summary CSV, sends the whole CSV (as a string of
  records) as the user prompt, and stores the system prompt, user prompt
  and response as text files in a newly created test directory. A test
  info file is written to the same directory.

  Args:
    treatment: treatment name used in the prompt and data file names.
    test_type: ``'test'`` (output/test_<n>) or ``'subtest'``
      (output/_subtests/<n>).

  Returns:
    None. Prints "Stage 1 Complete" when finished.

  Raises:
    ValueError: if ``test_type`` is not 'test' or 'subtest'.
    FileExistsError: if the target directories already exist.
  """
  # Fail before any file or API work instead of hitting an undefined
  # variable further down.
  if test_type not in ('test', 'subtest'):
    raise ValueError(f"test_type must be 'test' or 'subtest', got {test_type!r}")

  # System prompts
  sys_ucoop = file_to_string(file_path=f'prompts/ucoop/{treatment}/sys_1_{treatment}_ucoop.md')
  sys_udef = file_to_string(file_path=f'prompts/udef/{treatment}/sys_1_{treatment}_udef.md')

  # Summary data (User prompts)
  df_ucoop = pd.read_csv(f'test_data/RAsum_{treatment}_ucoop.csv')
  df_udef = pd.read_csv(f'test_data/RAsum_{treatment}_udef.csv')

  df_ucoop['window_number'] = df_ucoop['window_number'].astype(int)   # Making sure window number is an integer
  df_udef['window_number'] = df_udef['window_number'].astype(int)

  user_ucoop = str(df_ucoop.to_dict('records')) # Turning data into a list of dictionaries, then to string
  user_udef = str(df_udef.to_dict('records'))

  # Aggregating prompts
  window_prompts = [['ucoop', sys_ucoop, user_ucoop], ['udef', sys_udef, user_udef]]

  # Making test directory
  test = get_test_dir(test_type=test_type)
  if test_type == 'test':
    test_dir = os.path.join('output/', test)
    file_prefix = f't{test[5:]}'  # 'test_12' -> 't12'
    info_path = os.path.join(test_dir, f'{file_prefix}_test_info.txt')  # Test info path
    test_label = f"Test {test[5:]}"
  else:
    test_dir = os.path.join('output/_subtests/', test)
    file_prefix = test
    info_path = os.path.join(test_dir, f'{test}__subtest_info.txt')  # Test info path
    test_label = f"Subtest {test}"

  os.makedirs(test_dir, exist_ok=False)

  # Test info
  info = test_info(test=test_label, data_name=f'RAsum_{treatment}_ucoop.csv & RAsum_{treatment}_udef.csv')
  write_file(file_path=info_path, file_write=info)

  # GPT requests
  for label, sys_prmpt, user_prmpt in window_prompts:  # Requests for both ucoop and udef instances
    inst_dir = os.path.join(test_dir, f'stage_1_{label}') # Creating ind. instance directory
    os.makedirs(inst_dir, exist_ok=False)

    # GPT request output
    output = GPT_response(sys=sys_prmpt, user=user_prmpt)

    # Shared path stem for the prompt & response files of this instance
    stem = os.path.join(inst_dir, f'{file_prefix}_stg_1_{label}')

    # Writing .txt files for prompts & GPT response
    write_file(file_path=f'{stem}_sys_prmpt.txt', file_write=sys_prmpt)
    write_file(file_path=f'{stem}_user_prmpt.txt', file_write=user_prmpt)
    write_file(file_path=f'{stem}_response.txt', file_write=str(output))

  print("Stage 1 Complete")


def stage_2_output(treatment, max_windows=None, test_type='test'): # Stage 2 function: Assigning categories to individual summaries (one request per summary)
  """Run stage 2: ask GPT to assign categories to each summary individually.

  Works inside the highest-numbered existing test directory (the one
  returned by ``get_test_dir(..., cycle=True)``). For both the 'ucoop' and
  'udef' instance types it sends one request per data row, stores every
  user prompt and response as a text file, parses the responses into
  category columns, and finally writes the combined table to
  ``<prefix>_final_output.csv`` in the test directory.

  Args:
    treatment: treatment name used in the prompt and data file names.
    max_windows: when given, only the first ``max_windows`` rows of each
      data file are processed.
    test_type: ``'test'`` (output/test_<n>) or ``'subtest'``
      (output/_subtests/<n>).

  Returns:
    None. Prints "Stage 2 Complete" when finished.

  Raises:
    ValueError: if ``test_type`` is not 'test' or 'subtest'.
    FileExistsError: if the stage 2 instance directories already exist.
  """
  # Fail before any file or API work instead of hitting an undefined
  # variable further down.
  if test_type not in ('test', 'subtest'):
    raise ValueError(f"test_type must be 'test' or 'subtest', got {test_type!r}")

  # System Prompts
  sys_ucoop = file_to_string(file_path=f'prompts/ucoop/{treatment}/sys_2_{treatment}_ucoop.md')
  sys_udef = file_to_string(file_path=f'prompts/udef/{treatment}/sys_2_{treatment}_udef.md')

  # Summary data (User prompts)
  df_ucoop = pd.read_csv(f'test_data/RAsum_{treatment}_ucoop.csv')
  df_udef = pd.read_csv(f'test_data/RAsum_{treatment}_udef.csv')

  df_ucoop['window_number'] = df_ucoop['window_number'].astype(int)   # Making sure window number is an integer
  df_udef['window_number'] = df_udef['window_number'].astype(int)

  # Adjusting to max windows for Stage 2
  # .copy() makes the subset an independent frame, so adding the
  # 'unilateral_cooperation' column below does not write to a slice of the
  # original (SettingWithCopyWarning).
  if max_windows is not None:
    df_ucoop = df_ucoop[:max_windows].copy()
    df_udef = df_udef[:max_windows].copy()

  # Aggregating prompts
  window_prompts = [['ucoop', sys_ucoop, df_ucoop], ['udef', sys_udef, df_udef]]

  # Getting test directory
  test = get_test_dir(test_type=test_type, cycle=True)
  if test_type == 'test':
    test_dir = os.path.join('output/', test)
    file_prefix = f't{test[5:]}'  # 'test_12' -> 't12'
  else:
    test_dir = os.path.join('output/_subtests/', test)
    file_prefix = test

  coded_frames = []  # coded ucoop frame first, then udef
  for label, sys_prmpt, df in window_prompts:
    # Requests for both ucoop and udef instances
    inst_dir = os.path.join(test_dir, f'stage_2_{label}') # Creating ind. instance directory
    os.makedirs(inst_dir, exist_ok=False)

    # System prompt for ucoop or udef data
    sys_prmpt_path = os.path.join(inst_dir, f'{file_prefix}_stg_2_{label}_sys_prmpt.txt')
    write_file(file_path=sys_prmpt_path, file_write=sys_prmpt)

    # Prompt & Response paths
    prompt_path = os.path.join(inst_dir, 'prompts')
    response_path = os.path.join(inst_dir, 'responses')
    os.makedirs(prompt_path, exist_ok=True)
    os.makedirs(response_path, exist_ok=True)

    # Requesting chat completion for each row
    for k in range(len(df)):
      row = df.iloc[k].to_dict()  # Creating a dictionary for each indv. row

      output = GPT_response(sys_prmpt, str(row))  # GPT request output

      # Creating paths for prompts & GPT responses using window_numbers
      window_number = row['window_number']
      user_prmpt_path = os.path.join(prompt_path, f'{file_prefix}_{window_number}_user_prmpt.txt')
      output_path = os.path.join(response_path, f'{file_prefix}_{window_number}_response.txt')

      # Writing .txt files for prompts & GPT response
      write_file(user_prmpt_path, str(row))
      write_file(output_path, str(output))

    # Preliminaries to final output
    # The flag is added only after the requests so it is not part of the prompts.
    df['unilateral_cooperation'] = 1 if label == 'ucoop' else 0
    coded = response_df(response_dir=response_path, test_df=df)  # Coding GPT classifications
    coded_frames.append(ucoop_udef_rename(coded, label))  # Adding ucoop/udef prefix to categories

  # Final output dataframe
  GPT_df = pd.concat(coded_frames, ignore_index=True, sort=False)
  GPT_df = GPT_df.fillna(0)  # category columns that exist for only one instance type are coded 0 for the other
  final_out_path = os.path.join(test_dir, f"{file_prefix}_final_output.csv")
  GPT_df.to_csv(final_out_path, index=False)

  print("Stage 2 Complete")

In [None]:
##############
## Requests ##
##############

## Stage 1
# Creates a new subtest directory and sends the stage 1 requests for the
# 'noise' treatment (this calls the OpenAI API).
stage_1_output(
    test_type='subtest',
    treatment='noise',
)

In [None]:
## Stage 2
# Works inside the highest-numbered existing subtest directory and codes the
# first 5 windows of each data file (one OpenAI API request per window).
stage_2_output(
    treatment='noise',
    max_windows=5,
    test_type='subtest',
)