In [1]:
import os

# Change the current working directory
#os.chdir('/Users/robertocerina/Desktop/CSSci Semester IV/')

In [3]:
# Import Necessary Libraries
import numpy as np
import pandas as pd
import random

In [9]:
# Load Twitter Data for January 6th of 2020
df = pd.read_excel('../../data/test/1000_random_rows_24_07_2020.xlsx')

In [10]:
# Below follows an example of how to label data using the OpenAI API. 
# You can do this yourself by setting up an API key on the OpenAI website. 
# Here I'm just showing this as an example, no need for you to replicate this part. 

# !pip install openai==0.28.1

import openai
import time # for error handling

In [11]:
# API settings for Azure OpenAI
openai.api_type = "azure"
openai.api_base = "https://computationalsocialsciences.openai.azure.com/"
openai.api_version = "2023-07-01-preview"

system_prompt = "You will be asked to label a tweet according to some characteristics. Always answer with the name of the characteristic, follower by the label. E.g. Lexical Diversity: 5. Do not use any other language."
user_prompt = ('Lexical Diversity: Rate the variety and complexity of the vocabulary used in the tweet on a 7-point Likert scale (1 = very low diversity, 7 = very high diversity).\n'
               'Sentiment Analysis: Categorize the emotional tone of the tweet as either negative, neutral, or positive.\n'
               'Spelling and Grammar Quality: Assess the quality of spelling and grammar in the tweet on a 7-point Likert scale (1 = very poor quality, full of errors; 7 = excellent quality, no errors).\n'
               'Content Originality: Rate the originality of the content on a 7-point Likert scale (1 = completely unoriginal, likely plagiarized; 7 = highly original, unique content).\n'
               'Use of URLs: For URLs included in the tweet, classify the Tweet in one of the following categories: [`URL present + leads to reliable sources`, `URL present + leads to unreliable sources`,`URL present + reliability of sources not attainable`, `URL not present`]\n'
               'Hashtag Usage: Report on the use of hashtags in the tweet as one of the following categories: [`Yes - genuine usage`, `Yes - artificial usage`, `No`]\n'
               'Political Leanings: Provide an assessment on whether the Tweet`s content can be categorised as Liberal, Conservative, Independent or Cannot say.\n'
               'Bot Assessment: Provide an assessment on whether the tweet was likely written by a bot. Answer with Yes or No.\n'
               'Content Accuracy Assessment: Provide an assessment on a 7-point Likert representing the accuracy of the claims in the tweet, (1 = all claims in the tweet are verifiably inaccurate; 7 = all claims are true and verifiable).\n'
               'Conspiracy Assessment: Report on whether the tweet contributes to spread conspiracy theories [`Yes`,`No`,`Cannot say`] .\n'
               'Offensive Assessment: This tweet is offensive [`Yes`,`No`,`Cannot say`] .\n\n\n'
              'Tweet:')

# Function to parse the API response and return a dictionary
def parse_api_response(response):
    # Split the response by new lines
    lines = response.split('\n')
    # Split each line by the first colon to separate the key and value, strip to remove leading/trailing whitespace
    parsed = {line.split(':', 1)[0].strip(): line.split(':', 1)[1].strip() for line in lines if ':' in line}
    return parsed

In [1]:
for index, row in df.iterrows():
    tweet_text = row['text']
    
    # Shuffle user prompt tasks to avoid systematic order bias
    user_prompt_sections = user_prompt.split('\n\n')
    random.shuffle(user_prompt_sections)
    shuffled_user_prompt = '\n\n'.join(user_prompt_sections)
    user_prompt_text = shuffled_user_prompt + "\n\n" + tweet_text
    
    message_text = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt_text}
    ]

    # Initialize a counter for the number of retries
    retries = 0
    max_retries = 30

    while retries < max_retries:
        try:
            # Calling the Azure OpenAI API
            completion = openai.ChatCompletion.create(
                engine="GPT3",
                messages=message_text,
                temperature=0,
                api_key='ADD_API',  
                stop=None
            )
            
            # If the call was successful, break out of the loop
            print('Completed call to API')
            break
        except openai.error.APIConnectionError as e:
            print(f"APIConnectionError: {e}. Retrying in 60 seconds...")
            time.sleep(60)  # Wait for 60 seconds before retrying
            retries += 1
            continue
        except openai.error.InvalidRequestError as e:
            # This assumes that InvalidRequestError is the correct exception for content moderation issues
            # You might need to adjust based on the actual exception and error message for content moderation
            print(f"Content moderation error: {e}. Skipping...")
            df.at[index, 'Offensive Assessment'] = 'Content Moderation'
            break  # Skip further processing for this row

    if retries >= max_retries:
        # Handle the case where max retries have been reached
        print("Max retries reached or content moderation issue detected. Moving to the next item.")
        continue

    # Proceed with processing if the API call was successful and no moderation issues were encountered
    if 'choices' in completion and len(completion['choices']) > 0:
        response_content = completion['choices'][0]['message']['content']
       
        # Parse the API response
        response_data = parse_api_response(response_content)
        
        # Update the DataFrame with the parsed data
        for key, value in response_data.items():
            if key not in df:
                df[key] = pd.NA
            df.at[index, key] = value
        
        # Print the updated DataFrame row to see the changes
        print(df.loc[index])
        print(df)
        print(index)
        
        # Optionally, save the DataFrame periodically or after each update
        df.to_csv('gpt_labels_random_1000.csv', index=False) 

NameError: name 'df' is not defined