In [2]:
import pandas as pd
import openai
import numpy as np
import time
import tiktoken
openai.api_key = ''
import concurrent.futures
import math


In [3]:
df = pd.read_csv('classified_data_complete_final.csv')
df["date"] = pd.to_datetime(df["date"])

In [4]:
df.head(3)

Unnamed: 0,date,title,text,governor,tokens,Relevant,clean
0,2023-05-24,"Hike, Skip, or Pause?","Thank you, Peter, and thank you for the opport...",Governor Christopher J. Waller,1000,Inflation,Inflation
1,2023-05-24,"Hike, Skip, or Pause?","is running too high. Likewise, narrower defin...",Governor Christopher J. Waller,1000,"Inflation, Interest Rates","Inflation, Interest Rates"
2,2023-05-24,"Hike, Skip, or Pause?","are three options: hike, skip, or pause. Let ...",Governor Christopher J. Waller,642,"The speech mentions inflation, interest rates,...","Inflation, Interest Rates, Economic Growth"


In [5]:
df_to_classify = df[df['clean'] != "No Topic"]

In [9]:
df_to_classify["score"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_classify["score"] = np.nan


In [17]:
# Function to make a single API call to OpenAI's API
def make_api_call(speech):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a knowledgeable assistant specialized in identifying sentiment in Federal Reserve Speeches. Please label each speech only with these 5 labels: Dovish, Mostly Dovish, Neutral, Mostly Hawkish, Hawkish."},
                {"role": "user", "content": "'We believe inflation is temporary and we will keep interest rates low to support economic recovery.'\n Sentiment (Dovish, Mostly Dovish, Neutral, Mostly Hawkish, Hawkish):"},
                {"role": "assistant", "content": "Dovish"},
                {"role": "user", "content": "'The current economic situation may require us to maintain a lower interest rate for a while.'\n Sentiment (Dovish, Mostly Dovish, Neutral, Mostly Hawkish, Hawkish):"},
                {"role": "assistant", "content": "Mostly Dovish"},
                {"role": "user", "content": "'We're monitoring the situation carefully and will adjust monetary policy as needed.'\n Sentiment (Dovish, Mostly Dovish, Neutral, Mostly Hawkish, Hawkish):"},
                {"role": "assistant", "content": "Neutral"},
                {"role": "user", "content": "'Given the positive economic indicators, it may be necessary to start considering raising interest rates.'\n Sentiment (Dovish, Mostly Dovish, Neutral, Mostly Hawkish, Hawkish):"},
                {"role": "assistant", "content": "Mostly Hawkish"},
                {"role": "user", "content": "'The risk of inflation is high, it's critical to increase interest rates immediately.'\n Sentiment (Dovish, Mostly Dovish, Neutral, Mostly Hawkish, Hawkish):"},
                {"role": "assistant", "content": "Hawkish"},
                {"role": "user", "content": f"""Speech: "{speech}" \n Sentiment (Dovish, Mostly Dovish, Neutral, Mostly Hawkish, Hawkish):"""}],
            temperature=1,
            max_tokens=30
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"An error occurred while making API call: {e}")
        return None



def process_dataframe(df, x=20, y=2):
    """
    Processes the given DataFrame by making API calls in batches.
    
    :param df: DataFrame to process.
    :param x: Number of API calls to make at once.
    :param y: Time to wait between batches in minutes.
    """
    # Calculate the number of batches
    num_batches = math.ceil(len(df) / x)

    # Loop through the DataFrame in batches
    for i in range(num_batches):
        start_idx = i * x
        end_idx = start_idx + x
        
        # Select the rows where "Relevant" is NaN
        
        rows_to_update = df.iloc[start_idx:end_idx][df['score'].isna().iloc[start_idx:end_idx]]
        
        # Break the loop if there are no more NaN values
        if rows_to_update.empty:
            break

        # Making X API calls concurrently
        with concurrent.futures.ThreadPoolExecutor(max_workers=x) as executor:
            results = list(executor.map(make_api_call, rows_to_update['text']))

        # Insert the results into the 'Relevant' column of the DataFrame
        df.loc[rows_to_update.index, 'score'] = results
        
        # Wait for y minutes before the next batch
        df.to_csv("fixed.csv", index=False)
        print(f"currently classifying rows:{start_idx} to {end_idx}")
        time.sleep(y * 60)

    # Return the processed DataFrame
    return df

# Process the DataFrame


In [12]:
scored_df = process_dataframe(df_to_classify, x=20, y=0.8)

currently classifying rows:0 to 20
currently classifying rows:20 to 40
currently classifying rows:40 to 60
currently classifying rows:60 to 80
currently classifying rows:80 to 100
currently classifying rows:100 to 120
An error occurred while making API call: The server is overloaded or not ready yet.
currently classifying rows:120 to 140
currently classifying rows:140 to 160
currently classifying rows:160 to 180
currently classifying rows:180 to 200
currently classifying rows:200 to 220
currently classifying rows:220 to 240
currently classifying rows:240 to 260
currently classifying rows:260 to 280
currently classifying rows:280 to 300
currently classifying rows:300 to 320
An error occurred while making API call: The server is overloaded or not ready yet.
currently classifying rows:320 to 340
currently classifying rows:340 to 360
currently classifying rows:360 to 380
An error occurred while making API call: The server is overloaded or not ready yet.
currently classifying rows:380 to 40

In [30]:
scored_df = pd.read_csv("classified_data_complete_final_scored.csv")
scored_df.value_counts("score")

score
Neutral                                                                                                                                                                                          1151
Mostly Dovish                                                                                                                                                                                     497
Mostly Hawkish                                                                                                                                                                                    187
Dovish                                                                                                                                                                                             34
Hawkish                                                                                                                                                                                            32
Most

In [35]:
categories = ['Inflation', 'Interest Rates', 'Economic Growth', 'Employment', 'No Topic']

def clean_categories(row):
    # Check if the row is a string
    if isinstance(row, str):
        # Split the categories and strip whitespace
        row_categories = [category.strip() for category in row.split(',')]
        # Filter out any categories not in the acceptable list
        row_categories = [category for category in row_categories if category in categories]
        # If 'No Topic' is not the only category, remove it
        if 'No Topic' in row_categories and len(row_categories) > 1:
            row_categories.remove('No Topic')
        # Join the categories back together
        return ', '.join(row_categories)
    else:
        # If the row is not a string, return 'No Topic'
        return 'No Topic'

# Apply the function to the 'clean' column
scored_df['clean'][scored_df["clean"].isna()] = scored_df['clean'][scored_df["clean"].isna()].apply(clean_categories)

scored_df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scored_df['clean'][scored_df["clean"].isna()] = scored_df['clean'][scored_df["clean"].isna()].apply(clean_categories)


date        0
title       0
text        0
governor    0
tokens      0
Relevant    0
clean       0
score       0
dtype: int64

In [36]:
scored_df.to_csv("classified_data_final_scored_cleaned.csv", index=False)