In [6]:
import os
import json
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import pandas as pd

In [7]:
# Function to initialize OpenAI
def initialize_openai():
    """Create an OpenAI API client authenticated from the environment.

    An actual ``OpenAI`` client instance is returned rather than the
    ``openai`` module, so this function relies only on names imported at the
    top of the notebook and does not depend on another cell having run
    ``import openai`` first.

    Returns:
        An ``OpenAI`` client constructed with the value of the
        ``OPENAI_API_KEY`` environment variable. A new client object is
        built on every call.
    """
    return OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Function to get OpenAI instructions
def get_openai_instructions():
    """Return the fixed prompt that asks for a sentiment label.

    The prompt requests one of negative, mixed, or positive, returned as a
    JSON value under the key 'label'.
    """
    prompt = (
        "Label the overall economic sentiment in the following Beige Book text as negative, mixed, or positive. "
        "Return the label as a JSON value with a key of 'label' and nothing else:"
    )
    return prompt

In [8]:
# Function to generate OpenAI completion
def generate_openai_completion(client, sentence, instructions):
    """Issue one chat-completion request and return the first choice's message.

    Args:
        client: Object exposing ``chat.completions.create``.
        sentence: Text sent as the user message.
        instructions: Text sent as the system message.

    Returns:
        The ``message`` attribute of the first entry in the response's
        ``choices``.
    """
    system_turn = {"role": "system", "content": instructions}
    user_turn = {"role": "user", "content": sentence}

    # Fixed model, seed and JSON response format are passed with every request.
    request = {
        "model": "gpt-4o-2024-08-06",
        "messages": [system_turn, user_turn],
        "seed": 1985,
        "response_format": {"type": "json_object"},
    }

    response = client.chat.completions.create(**request)
    first_choice = response.choices[0]
    return first_choice.message

# Function to process a single sentence
def process_sentence(sentence):
    """Label one piece of text with a freshly initialized client.

    Obtains a client from ``initialize_openai`` and the prompt from
    ``get_openai_instructions``, then returns whatever
    ``generate_openai_completion`` returns for ``sentence``.
    """
    # Arguments are evaluated left to right, so the client is still created
    # before the instructions are fetched.
    return generate_openai_completion(
        initialize_openai(),
        sentence,
        get_openai_instructions(),
    )

# Function to process multiple sentences
def process_sentences(sentences):
    """Run ``process_sentence`` over ``sentences`` on a pool of 4 threads.

    Returns:
        A list of results in the same order as ``sentences``.
    """
    pool = ThreadPoolExecutor(max_workers=4)
    with pool:
        # Materialize inside the ``with`` so every task finishes before the
        # pool is shut down.
        labelled = [outcome for outcome in pool.map(process_sentence, sentences)]
    return labelled

# Function to calculate means for each topic
def calculate_topic_means(sentiment_scores):
    """Average the non-None scores of every topic.

    Args:
        sentiment_scores: Mapping of topic to an iterable of scores, where
            individual scores may be None.

    Returns:
        A dict with one entry per topic: the ``np.mean`` of its non-None
        scores, or None when no such scores exist.
    """
    averages = {}
    for name, values in sentiment_scores.items():
        present = list(filter(lambda value: value is not None, values))
        averages[name] = np.mean(present) if present else None
    return averages

In [9]:
from langchain_community.document_loaders import TextLoader # For loading text documents for use in LLMs
from langchain_community.document_loaders import DirectoryLoader # For loading directories of text documents for use in LLMs
import magic # This is needed for some reason for the DirectoryLoader to work
import pandas as pd
import json

# Raw string (r"...") so the backslashes in this Windows path are kept
# literally instead of being interpreted as escape sequences.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
path = r"C:\Users\MCOB PHD 14\Dropbox\Charlie's Dissertation\Beige Books\test_texts"

# Point the loader at that directory and restrict it to .txt files via the
# glob pattern (presumably searched recursively because of "**" — confirm
# against the DirectoryLoader documentation).
loader = DirectoryLoader(path, glob="**/*.txt")

# Read in all the matching documents; later cells iterate over `docs` and use
# each item's `metadata['source']` and `page_content`.
docs = loader.load()


In [10]:
import pandas as pd
import openai

# Document-level wrapper: labels a whole document in a single call.
def process_document(document_content):
    """Label an entire document by delegating to ``process_sentence``.

    Args:
        document_content: Full text of the document.

    Returns:
        Whatever ``process_sentence`` returns for that text, passed through
        unchanged.
    """
    return process_sentence(document_content)

# Initialize list to store results
document_results = []

# Process each document in `docs`
for document in docs:
    # Extract the file name from the source path. os.path.basename follows the
    # running platform's separator rules; splitting on "\\" only worked for
    # Windows-style paths and otherwise left the full path in Beige_Book,
    # which would stop the later merge on file name from matching anything.
    beige_book = os.path.basename(document.metadata['source'])
    page_content = document.page_content

    # Process the entire document to get a single sentiment label
    document_label = process_document(page_content)

    # Append document-level result
    document_results.append({
        'Beige_Book': beige_book,
        'Document_Label': document_label
    })

    # Print progress
    print(f"Processed Beige Book: {beige_book}")

# Create DataFrame with document-level results (built once from the list of
# records rather than grown row by row)
dfOpenAI_labels = pd.DataFrame(document_results)

# Save DataFrame to CSV
dfOpenAI_labels.to_csv('dfOpenAI_labels.csv', index=False)

print("All documents processed and saved to 'dfOpenAI_labels.csv'")


Processed Beige Book: 1970_ch (1)_chunk_4.txt
Processed Beige Book: 1970_cl (6)_chunk_1.txt
Processed Beige Book: 1970_ny (1)_chunk_3.txt
Processed Beige Book: 1971_kc (12)_chunk_2.txt
Processed Beige Book: 1972_at (9)_chunk_2.txt
Processed Beige Book: 1972_kc (5)_chunk_1.txt
Processed Beige Book: 1972_ny (10)_chunk_2.txt
Processed Beige Book: 1973_cl (10)_chunk_2.txt
Processed Beige Book: 1973_cl (12)_chunk_3.txt
Processed Beige Book: 1973_cl (2)_chunk_3.txt
Processed Beige Book: 1973_sl (5)_chunk_4.txt
Processed Beige Book: 1974_at (1)_chunk_5.txt
Processed Beige Book: 1975_at (8)_chunk_4.txt
Processed Beige Book: 1975_bo (5)_chunk_3.txt
Processed Beige Book: 1976_da (9)_chunk_2.txt
Processed Beige Book: 1976_ns (7)_chunk_4.txt
Processed Beige Book: 1976_ny (6)_chunk_3.txt
Processed Beige Book: 1976_ph (7)_chunk_1.txt
Processed Beige Book: 1976_ri (4)_chunk_1.txt
Processed Beige Book: 1976_sf (3)_chunk_4.txt
Processed Beige Book: 1977_ch (12)_chunk_3.txt
Processed Beige Book: 1977_cl

In [24]:
# Load the sentiment scores CSV into `human_df`.
# NOTE: despite its name, `excel_path` points to a .csv file, which is why it
# is read with pd.read_csv.
# NOTE(review): this is an absolute, machine-specific path.
excel_path = r"C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books/manual_sentiment.csv"
human_df = pd.read_csv(excel_path)

# Define the label function
def label_sentiment(score):
    """Bucket a numeric sentiment score into one of three integer codes.

    Returns:
        0 (negative) when ``score <= -0.3``, 1 (mixed) when ``score <= 0.2``,
        and 2 (positive) in every other case. A NaN score fails both
        comparisons and therefore also yields 2.
    """
    return 0 if score <= -0.3 else (1 if score <= 0.2 else 2)

# Score every row with label_sentiment, then swap the numeric codes for their
# names (0 -> negative, 1 -> mixed, 2 -> positive) in a single chained step.
human_df['label'] = (
    human_df['human_sentiment']
    .apply(label_sentiment)
    .replace({0: 'negative', 1: 'mixed', 2: 'positive'})
)

human_df.head()

Unnamed: 0,Document,file_names,human_sentiment,scorer,label
0,1,1970_at (7)_chunk_1.txt,-0.9,CS,negative
1,2,1970_bo (4)_chunk_2.txt,0.2,CS,mixed
2,3,1970_ch (1)_chunk_4.txt,-0.5,CS,negative
3,4,1970_ch (5)_chunk_2.txt,-0.7,CS,negative
4,5,1970_ch (7)_chunk_2.txt,-0.5,CS,negative


In [25]:
# Ran in 1 minute and 34.8 seconds
# The cost was $0.15

import json

# Function to extract label from the ChatCompletionMessage content
def extract_label(chat_message):
    """Return the 'label' value from a message whose content is a JSON string.

    Args:
        chat_message: Object with a ``content`` attribute expected to hold a
            JSON object such as ``{"label": "negative"}``.

    Returns:
        The value stored under 'label', or None when it cannot be extracted.
        Extraction fails when the object has no ``content`` attribute, the
        content is None or otherwise not a string, the content is not valid
        JSON, the JSON is not an object, or the 'label' key is missing. A
        message describing the failure is printed in those cases.
    """
    try:
        # Extract the content attribute directly
        content = chat_message.content
        # Parse the JSON string
        content_dict = json.loads(content)
        # Return the label value
        return content_dict['label']
    except (json.JSONDecodeError, KeyError, AttributeError, TypeError) as e:
        # TypeError covers content=None (json.loads rejects non-string input)
        # and JSON that parses to something other than an object (e.g. a
        # list), so one bad response no longer aborts the whole .apply().
        print(f"Error extracting label: {e}")
        return None

# Parse each stored response in Document_Label into a plain label column
dfOpenAI_labels['Sentiment_Label'] = dfOpenAI_labels['Document_Label'].apply(extract_label)

# Inner-join the model labels to the human labels so that only documents
# present in both frames survive: 'Beige_Book' in dfOpenAI_labels is matched
# against 'file_names' in human_df. Only 'file_names' and 'label' are taken
# from human_df, and 'label' is renamed to 'Human_Label' in the same chain.
merged_df = dfOpenAI_labels.merge(
    human_df[['file_names', 'label']],
    how='inner',
    left_on='Beige_Book',
    right_on='file_names',
).rename(columns={'label': 'Human_Label'})


Unnamed: 0,Beige_Book,Document_Label,Sentiment_Label
0,1970_ch (1)_chunk_4.txt,"ChatCompletionMessage(content='{""label"": ""nega...",negative
1,1970_cl (6)_chunk_1.txt,"ChatCompletionMessage(content='{""label"":""mixed...",mixed
2,1970_ny (1)_chunk_3.txt,"ChatCompletionMessage(content='{""label"":""negat...",negative
3,1971_kc (12)_chunk_2.txt,"ChatCompletionMessage(content='{""label"": ""mixe...",mixed
4,1972_at (9)_chunk_2.txt,"ChatCompletionMessage(content='{""label"": ""mixe...",mixed


In [36]:
# Export to CSV
#merged_df.to_csv('GPT_classification.csv', index=False)

In [34]:
from sklearn.metrics import classification_report, accuracy_score

# Per-class precision/recall/F1 of the model labels against the human labels,
# with the rows listed in the order given by `labels`
class_report = classification_report(
    y_true=merged_df['Human_Label'],
    y_pred=merged_df['Sentiment_Label'],
    labels=['positive', 'negative', 'mixed'],
)
print("\nClassification Report:", class_report, sep="\n")

# Overall share of documents where the two labels agree
accuracy = accuracy_score(
    y_true=merged_df['Human_Label'],
    y_pred=merged_df['Sentiment_Label'],
)
print(f'Accuracy: {accuracy:.3f}')


Classification Report:
              precision    recall  f1-score   support

    positive       0.96      0.31      0.47        78
    negative       0.86      0.62      0.72        39
       mixed       0.53      0.94      0.68        83

    accuracy                           0.63       200
   macro avg       0.78      0.62      0.62       200
weighted avg       0.76      0.63      0.60       200

Accuracy: 0.630
