In [14]:
!pip install openai



In [15]:
import os
import time
from datetime import datetime
from getpass import getpass

import openai
import pandas as pd

In [16]:
# Load your OpenAI API key.
# The key is taken from the OPENAI_API_KEY environment variable so that the
# secret is never saved inside the notebook; when the variable is unset (or
# empty) you are prompted for the key and the typed value is not echoed.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [17]:
def dereference_pronouns_with_openai(text, model="gpt-3.5-turbo", delay=1.0, verbose=True):
    """
    Ask an OpenAI chat model to dereference the pronouns in `text`.

    Parameters:
        text:    the passage to rewrite.
        model:   name of the chat model passed to the API.
        delay:   seconds to sleep after each API call (a simple client-side
                 throttle); values <= 0 disable the pause.
        verbose: when True, print the input and the model output.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace. If `text` is not a non-blank string (None, NaN, "", ...)
        it is returned unchanged and no API request is made.

    Raises:
        Any exception raised by openai.ChatCompletion.create is propagated
        to the caller.
    """
    # Nothing to dereference: do not spend an API call on a missing value,
    # which would otherwise be formatted into the prompt as 'nan' / 'None'.
    if not isinstance(text, str) or not text.strip():
        return text

    if verbose:
        print ("INPUT: ", text)

    # Use the chat model structure for the API call
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that dereferences pronouns in text."},
            {"role": "user", "content": f"Dereference all pronouns in the following text without making any other changes: '{text}'"}
        ]
    )
    output_text = response.choices[0].message['content'].strip()

    if verbose:
        print ("OUTPUT: ", output_text)
        print ()

    # Pause between consecutive requests so a long run does not hammer the API.
    if delay and delay > 0:
        time.sleep(delay)
    return output_text


In [18]:
# Define a function to apply the dereferencing and handle errors
def robust_dereference(row, max_attempts=3, retry_delay=10, dereference_fn=None):
    """
    Dereference the pronouns of one dataframe row, retrying on errors.

    Parameters:
        row:            a row (pandas Series) with a 'Text' entry and,
                        optionally, a 'Modified_Text' entry.
        max_attempts:   how many times the dereferencing call is tried before
                        giving up (at least one attempt is always made).
        retry_delay:    seconds to wait between two attempts.
        dereference_fn: callable taking the text and returning the rewritten
                        text; defaults to dereference_pronouns_with_openai.

    Returns:
        The existing 'Modified_Text' value when the row was already processed,
        otherwise the rewritten text, or None when every attempt failed.
        Errors are printed, never raised, so one bad row does not abort a
        whole DataFrame.apply run.
    """
    # If the text is already processed, skip. A missing column simply means
    # "not processed yet".
    existing = row.get('Modified_Text')
    if pd.notna(existing):
        return existing

    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            # Resolved inside the try block so that a missing helper is
            # reported like any other failure instead of crashing the run.
            call = dereference_fn if dereference_fn is not None else dereference_pronouns_with_openai
            return call(row['Text'])
        except Exception as e:
            print(f"Error processing row {row.name} (attempt {attempt}/{attempts}): {e}")
            if attempt < attempts:
                time.sleep(retry_delay)  # Back off before the next attempt
    return None


In [19]:
# Location of the corpus CSV. To work from a local file instead, point this at
# your own path, e.g. "/path_to_your_file/Your_CSV_File.csv".
CORPUS_CSV = "https://raw.githubusercontent.com/barrycforever/MSDS_453_Public/main/MSDS453_ClassCorpus/MSDS453_QA_20220906.csv"

# Read the whole file into a dataframe.
df = pd.read_csv(CORPUS_CSV)

# Note: the URL above is the QA dataset, while the example outputs shown
# further down come from the class corpus of a particular quarter.

In [20]:
# Don't experiment with all 200 rows of the dataframe the first time!
# Raise N_SAMPLE_ROWS once the pipeline works on a handful of rows.
N_SAMPLE_ROWS = 3

# Take an explicit copy so that the column assignments made later operate on
# an independent frame rather than on a slice of the full dataframe.
df = df.head(N_SAMPLE_ROWS).copy()

In [21]:
# Display the working dataframe before any text is modified.
df

Unnamed: 0,Doc_ID,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title
0,0,XOY_Doc1_Mean_Girls,`` In a wasteland of dumb movies about teenage...,XOY_Doc1_Mean_Girls,XOY,Comedy,Positive,Mean_Girls
1,1,XOY_Doc2_Mean_Girls,Although Mean Girls is likely to be frequently...,XOY_Doc2_Mean_Girls,XOY,Comedy,Negative,Mean_Girls
2,2,XOY_Doc3_Mean_Girls,She is immediately befriended by two of North ...,XOY_Doc3_Mean_Girls,XOY,Comedy,Negative,Mean_Girls


In [23]:
# Create the column for the modified text only when it does not exist yet.
# Rows that already hold a value are skipped by robust_dereference, so keeping
# an existing column lets a re-run resume instead of redoing every API call.
if 'Modified_Text' not in df.columns:
    df['Modified_Text'] = None

In [24]:
# Bracket the run with timestamps so the total duration can be read off.
run_started = datetime.now()
print("Start:", run_started)
print()

# robust_dereference is called once per row (axis=1); the values it returns
# become the new contents of the 'Modified_Text' column.
modified_column = df.apply(robust_dereference, axis=1)
df['Modified_Text'] = modified_column

print()
print("Stop:", datetime.now())

Start: 2023-10-25 04:33:09.040541
INPUT:  `` In a wasteland of dumb movies about teenagers , `` Mean Girls '' is a smart and funny one . It even contains some wisdom , although I hesitate to mention that lest I scare off its target audience . The TV ads , which show Lindsay Lohan landing ass over teakettle in a garbage can , are probably right on the money ; since that scene is nothing at all like the rest of the movie , was it filmed specifically to use in the commercials ? Lohan stars as Cady Heron , a high school junior who was home-schooled in Africa while her parents worked there as anthropologists . She is therefore the smartest girl in school when her dad is hired by Northwestern and she enrolls in Evanston Township High School -- which , like all American high schools in the movies , is physically located in Toronto . What 's she 's not smart about are the ways cliques work in high school , and how you 're categorized and stereotyped by who you hang with and how you dress . Cad

In [25]:
# Move the dereferenced text into 'Text' and remove the helper column.
# The guard makes the cell safe to re-run: once 'Modified_Text' has been
# dropped there is nothing left to do.
if 'Modified_Text' in df.columns:
    # Rows whose dereferencing failed hold None; keep their original text
    # instead of overwriting it with a missing value.
    df['Text'] = df['Modified_Text'].fillna(df['Text'])
    df = df.drop(columns=['Modified_Text'])

In [26]:
# Display the dataframe again to inspect the current contents of 'Text'.
df

Unnamed: 0,Doc_ID,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title
0,0,XOY_Doc1_Mean_Girls,`` In a wasteland of dumb movies about teenage...,XOY_Doc1_Mean_Girls,XOY,Comedy,Positive,Mean_Girls
1,1,XOY_Doc2_Mean_Girls,Although Mean Girls is likely to be frequently...,XOY_Doc2_Mean_Girls,XOY,Comedy,Negative,Mean_Girls
2,2,XOY_Doc3_Mean_Girls,Janis (Lizzy Caplan) is immediately befriended...,XOY_Doc3_Mean_Girls,XOY,Comedy,Negative,Mean_Girls


In [27]:
# Save the modified dataframe to a new CSV file
#df.to_csv("/path_to_save/Modified_Movie_Reviews.csv", index=False)
#df.to_csv("MSDS453_ClassCorpus_Final_Sec57_v2_20230928.modified.csv")