<a href="https://colab.research.google.com/github/casllmproject/dialectic_intersubjectivity/blob/main/FOX_NEWS_ANALYSIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import openai
import numpy as np
import json
import time
import os
from dotenv import load_dotenv
from collections import Counter
from collections import deque
from tqdm import tqdm
import re
import tiktoken # for token counting
from collections import defaultdict
import textwrap

In [None]:
# Read the source CSV into a DataFrame (its 'body' column is coded further down)
df = pd.read_csv(filepath_or_buffer='{folder}/fox_oct27.csv')

In [None]:
# Number of rows in the loaded dataset
df.shape[0]

In [None]:
# Cell output: the column labels of the loaded dataset
df.columns

In [None]:
# Set the API key by reading it from a local text file.
# The context manager closes the file handle as soon as the key is read, and
# strip() removes any surrounding whitespace (not only a trailing "\n").
with open("{folder}_casllm_apikey.txt", "r") as key_file:
    openai.api_key = key_file.read().strip()

In [None]:
# Function to code the text with only numeric responses using default model
def code_text_ft(text):
    """Code sentiment toward Joe Biden and Donald Trump for one text.

    The text is lowercased, split into chunks with ``textwrap.wrap`` and each
    chunk is sent to the default ``gpt-4o-2024-08-06`` chat model, which is
    asked for two integers on a -2..2 scale. Despite the ``_ft`` suffix in
    the name, this variant does not call a fine-tuned model.

    Args:
        text: The text to code. Non-string values (for example the NaN that
            pandas uses for a missing cell) yield empty results instead of
            raising.

    Returns:
        A tuple ``(biden_responses, trump_responses)`` of two lists of ints
        with one entry per successfully coded chunk. Chunks whose request
        failed or whose reply could not be parsed are skipped, so the lists
        may be shorter than the number of chunks, or empty.
    """
    # A missing body is not a string; calling .lower() on it would raise and
    # abort the whole DataFrame.apply run.
    if not isinstance(text, str):
        return [], []

    # Convert the text to lowercase to handle case sensitivity
    text = text.lower()

    # Split text into smaller chunks. `width` is a character count, not a
    # token count; words longer than `width` are kept whole.
    chunks = textwrap.wrap(text, width=2000, break_long_words=False)

    # Initialize lists to store responses for each chunk
    biden_responses = []
    trump_responses = []

    # Iterate through each chunk
    for chunk in chunks:
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant that codes sentiment toward Joe Biden and Donald Trump."
            },
            {
                "role": "user",
                "content": f"""
                For the following text, please provide only numeric sentiment coding for both Joe Biden and Donald Trump without explanation:

                Text: "{chunk}"

                1. Sentiment toward Joe Biden (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).
                2. Sentiment toward Donald Trump (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).

                Please respond with just the two numbers, one for Joe Biden and one for Donald Trump, separated by a comma. Example: 1, -1
                """
            }
        ]

        try:
            # Call the default model via the OpenAI API
            response = openai.ChatCompletion.create(
                model="gpt-4o-2024-08-06",  # Use default model
                messages=messages,
                max_tokens=10,  # Keep the token count low since we expect a short response
                temperature=0
            )

            # Extract the response content and clean it
            result = response['choices'][0]['message']['content'].strip()
        except Exception as e:
            # Best-effort coding: a failed request or malformed payload skips
            # this chunk only, and is reported as a request problem.
            print(f"Error calling model: {e}")
            continue

        print(f"Response: {result}")  # Debugging: print the response to verify

        try:
            # Parse the response: expecting a format like "1, -1".
            # Both a non-integer field and a wrong field count raise ValueError.
            biden_sentiment, trump_sentiment = map(int, result.split(','))
        except ValueError as e:
            print(f"Error parsing response: {e}")
            continue  # Skip this chunk in case of error

        # Append each sentiment value to the respective lists
        biden_responses.append(biden_sentiment)
        trump_responses.append(trump_sentiment)

    # Return lists of responses for Biden and Trump
    return biden_responses, trump_responses

# Load the dataset
dfdz = pd.read_csv('{folder}/fox_oct27.csv')

# Code every 'body' value chunk by chunk; each element of `responses` is the
# (biden_list, trump_list) tuple returned by code_text_ft for one row.
responses = dfdz['body'].apply(code_text_ft)

# Separate the responses into Biden and Trump lists (one list per row)
biden_chunked = [resp[0] for resp in responses]
trump_chunked = [resp[1] for resp in responses]

# Average the chunk scores per row. A row with no usable chunk responses gets
# NaN rather than 0: 0 means "neutral" on the coding scale, so using it as a
# filler would silently pull later means toward neutral. pandas' mean()/std()
# skip NaN by default.
dfdz['Biden_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in biden_chunked
]
dfdz['Trump_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in trump_chunked
]

# Save the averaged sentiments to a CSV file
average_sentiments = dfdz[['Biden_Sentiment', 'Trump_Sentiment']]
average_sentiments.to_csv('{folder}/coded_fox_oct27_averaged_d.csv', index=False)

# Flatten the per-row lists into one long list of chunk scores per candidate
flat_biden_responses = [score for scores in biden_chunked for score in scores]
flat_trump_responses = [score for scores in trump_chunked for score in scores]

# Create a DataFrame for chunk-level responses
chunk_responses_df = pd.DataFrame({
    'Biden_Chunk_Responses': flat_biden_responses,
    'Trump_Chunk_Responses': flat_trump_responses
})

# Save the chunk responses to a separate CSV file. The path uses the same
# '{folder}/' prefix as the message printed below, so the file lands where
# the message says it does.
chunk_responses_df.to_csv('{folder}/coded_fox_oct27_chunks_d.csv', index=False)

print("Averaged sentiments saved to '{folder}/coded_fox_oct27_averaged_d.csv'")
print("Chunk responses saved to '{folder}/coded_fox_oct27_chunks_d.csv'")

In [None]:
# Cell output: the two per-row averaged sentiment columns
average_sentiments

In [None]:
# Cell output: the flattened chunk-level responses
chunk_responses_df

In [None]:
# Reload the averaged sentiments saved for the default-model run
df = pd.read_csv(filepath_or_buffer='{folder}/coded_fox_oct27_averaged_d.csv')

# Keep just the "Biden_Sentiment" and "Trump_Sentiment" columns
df_selected = df.loc[:, ['Biden_Sentiment', 'Trump_Sentiment']]

# Write the two-column table to its own CSV file
df_selected.to_csv('{folder}/coded_fox_sentiments_only_d.csv', index=False)

# Mean and standard deviation of each column
biden_mean, trump_mean = (
    df_selected[column].mean() for column in ('Biden_Sentiment', 'Trump_Sentiment')
)
biden_sd, trump_sd = (
    df_selected[column].std() for column in ('Biden_Sentiment', 'Trump_Sentiment')
)

# Print the results, one line per candidate
print(
    f"Biden Sentiment - Mean: {biden_mean}, SD: {biden_sd}",
    f"Trump Sentiment - Mean: {trump_mean}, SD: {trump_sd}",
    sep="\n",
)


In [None]:
# Function to code the text with only numeric responses using fine-tuned model
def code_text_ft(text):
    """Code sentiment toward Joe Biden and Donald Trump for one text.

    The text is lowercased, split into chunks with ``textwrap.wrap`` and each
    chunk is sent to the fine-tuned ``gpt-4o-2024-08-06`` chat model, which
    is asked for two integers on a -2..2 scale.

    Args:
        text: The text to code. Non-string values (for example the NaN that
            pandas uses for a missing cell) yield empty results instead of
            raising.

    Returns:
        A tuple ``(biden_responses, trump_responses)`` of two lists of ints
        with one entry per successfully coded chunk. Chunks whose request
        failed or whose reply could not be parsed are skipped, so the lists
        may be shorter than the number of chunks, or empty.
    """
    # A missing body is not a string; calling .lower() on it would raise and
    # abort the whole DataFrame.apply run.
    if not isinstance(text, str):
        return [], []

    # Convert the text to lowercase to handle case sensitivity
    text = text.lower()

    # Split text into smaller chunks. `width` is a character count, not a
    # token count; words longer than `width` are kept whole.
    chunks = textwrap.wrap(text, width=2000, break_long_words=False)

    # Initialize lists to store responses for each chunk
    biden_responses = []
    trump_responses = []

    # Iterate through each chunk
    for chunk in chunks:
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant that codes sentiment toward Joe Biden and Donald Trump."
            },
            {
                "role": "user",
                "content": f"""
                For the following text, please provide only numeric sentiment coding for both Joe Biden and Donald Trump without explanation:

                Text: "{chunk}"

                1. Sentiment toward Joe Biden (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).
                2. Sentiment toward Donald Trump (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).

                Please respond with just the two numbers, one for Joe Biden and one for Donald Trump, separated by a comma. Example: 1, -1
                """
            }
        ]

        try:
            # Call the fine-tuned model via the OpenAI API
            response = openai.ChatCompletion.create(
                model="ft:gpt-4o-2024-08-06:personal::{ft_model_ID}",  # Use fine-tuned model
                messages=messages,
                max_tokens=10,  # Keep the token count low since we expect a short response
                temperature=0
            )

            # Extract the response content and clean it
            result = response['choices'][0]['message']['content'].strip()
        except Exception as e:
            # Best-effort coding: a failed request or malformed payload skips
            # this chunk only, and is reported as a request problem.
            print(f"Error calling model: {e}")
            continue

        print(f"Response: {result}")  # Debugging: print the response to verify

        try:
            # Parse the response: expecting a format like "1, -1".
            # Both a non-integer field and a wrong field count raise ValueError.
            biden_sentiment, trump_sentiment = map(int, result.split(','))
        except ValueError as e:
            print(f"Error parsing response: {e}")
            continue  # Skip this chunk in case of error

        # Append each sentiment value to the respective lists
        biden_responses.append(biden_sentiment)
        trump_responses.append(trump_sentiment)

    # Return lists of responses for Biden and Trump
    return biden_responses, trump_responses

# Load the dataset
dfftz = pd.read_csv('{folder}/fox_oct27.csv')

# Code every 'body' value chunk by chunk; each element of `responses` is the
# (biden_list, trump_list) tuple returned by code_text_ft for one row.
responses = dfftz['body'].apply(code_text_ft)

# Separate the responses into Biden and Trump lists (one list per row)
biden_chunked = [resp[0] for resp in responses]
trump_chunked = [resp[1] for resp in responses]

# Average the chunk scores per row. A row with no usable chunk responses gets
# NaN rather than 0: 0 means "neutral" on the coding scale, so using it as a
# filler would silently pull later means toward neutral. pandas' mean()/std()
# skip NaN by default.
dfftz['Biden_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in biden_chunked
]
dfftz['Trump_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in trump_chunked
]

# Save the averaged sentiments to a CSV file
average_sentiments = dfftz[['Biden_Sentiment', 'Trump_Sentiment']]
average_sentiments.to_csv('{folder}/coded_fox_oct27_averaged_ft.csv', index=False)

# Flatten the per-row lists into one long list of chunk scores per candidate
flat_biden_responses = [score for scores in biden_chunked for score in scores]
flat_trump_responses = [score for scores in trump_chunked for score in scores]

# Create a DataFrame for chunk-level responses
chunk_responses_df = pd.DataFrame({
    'Biden_Chunk_Responses': flat_biden_responses,
    'Trump_Chunk_Responses': flat_trump_responses
})

# Save the chunk responses to a separate CSV file
chunk_responses_df.to_csv('{folder}/coded_fox_oct27_chunks_ft.csv', index=False)

print("Averaged sentiments saved to '{folder}/coded_fox_oct27_averaged_ft.csv'")
print("Chunk responses saved to '{folder}/coded_fox_oct27_chunks_ft.csv'")


In [None]:
# Cell output: the two per-row averaged sentiment columns
average_sentiments

In [None]:
# Cell output: the flattened chunk-level responses
chunk_responses_df

In [None]:
# Reload the averaged sentiments saved for the fine-tuned-model run
df = pd.read_csv(filepath_or_buffer='{folder}/coded_fox_oct27_averaged_ft.csv')

# Keep just the "Biden_Sentiment" and "Trump_Sentiment" columns
df_selected = df.loc[:, ['Biden_Sentiment', 'Trump_Sentiment']]

# Write the two-column table to its own CSV file
df_selected.to_csv('{folder}/coded_fox_sentiments_only_ft.csv', index=False)

# Mean and standard deviation of each column
biden_mean, trump_mean = (
    df_selected[column].mean() for column in ('Biden_Sentiment', 'Trump_Sentiment')
)
biden_sd, trump_sd = (
    df_selected[column].std() for column in ('Biden_Sentiment', 'Trump_Sentiment')
)

# Print the results, one line per candidate
print(
    f"Biden Sentiment - Mean: {biden_mean}, SD: {biden_sd}",
    f"Trump Sentiment - Mean: {trump_mean}, SD: {trump_sd}",
    sep="\n",
)


In [None]:
# Function to code the text with only numeric responses using default model+Dem-persona
def code_text_ft(text):
    """Code sentiment toward Joe Biden and Donald Trump under a Democrat persona.

    The text is lowercased, split into chunks with ``textwrap.wrap`` and each
    chunk is sent to the default ``gpt-4o-2024-08-06`` chat model. The system
    prompt asks the model to answer as the Democrat persona described there;
    the user prompt asks for two integers on a -2..2 scale. Despite the
    ``_ft`` suffix in the name, this variant does not call a fine-tuned model.

    Args:
        text: The text to code. Non-string values (for example the NaN that
            pandas uses for a missing cell) yield empty results instead of
            raising.

    Returns:
        A tuple ``(biden_responses, trump_responses)`` of two lists of ints
        with one entry per successfully coded chunk. Chunks whose request
        failed or whose reply could not be parsed are skipped, so the lists
        may be shorter than the number of chunks, or empty.
    """
    # A missing body is not a string; calling .lower() on it would raise and
    # abort the whole DataFrame.apply run.
    if not isinstance(text, str):
        return [], []

    # Convert the text to lowercase to handle case sensitivity
    text = text.lower()

    # Split text into smaller chunks. `width` is a character count, not a
    # token count; words longer than `width` are kept whole.
    chunks = textwrap.wrap(text, width=2000, break_long_words=False)

    # Initialize lists to store responses for each chunk
    biden_responses = []
    trump_responses = []

    # Iterate through each chunk
    for chunk in chunks:
        messages = [
            {
                "role": "system",
                "content": """
                You are simulating the persona of a U.S. citizen who is a woman in her 20s, black, with a college degree, Democrat, and middle income.
                You will code the sentiment toward Joe Biden and Donald Trump based on this persona.
                """
            },
            {
                "role": "user",
                "content": f"""
                For the following text, please provide only numeric sentiment coding for both Joe Biden and Donald Trump without explanation:

                Text: "{chunk}"

                1. Sentiment toward Joe Biden (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).
                2. Sentiment toward Donald Trump (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).

                Please respond with just the two numbers, one for Joe Biden and one for Donald Trump, separated by a comma. Example: 1, -1
                """
            }
        ]

        try:
            # Call the default model via the OpenAI API
            response = openai.ChatCompletion.create(
                model="gpt-4o-2024-08-06",  # Use default model
                messages=messages,
                max_tokens=10,  # Keep the token count low since we expect a short response
                temperature=0
            )

            # Extract the response content and clean it
            result = response['choices'][0]['message']['content'].strip()
        except Exception as e:
            # Best-effort coding: a failed request or malformed payload skips
            # this chunk only, and is reported as a request problem.
            print(f"Error calling model: {e}")
            continue

        print(f"Response: {result}")  # Debugging: print the response to verify

        try:
            # Parse the response: expecting a format like "1, -1".
            # Both a non-integer field and a wrong field count raise ValueError.
            biden_sentiment, trump_sentiment = map(int, result.split(','))
        except ValueError as e:
            print(f"Error parsing response: {e}")
            continue  # Skip this chunk in case of error

        # Append each sentiment value to the respective lists
        biden_responses.append(biden_sentiment)
        trump_responses.append(trump_sentiment)

    # Return lists of responses for Biden and Trump
    return biden_responses, trump_responses

# Load the dataset
dfdemz = pd.read_csv('{folder}/fox_oct27.csv')

# Code every 'body' value chunk by chunk; each element of `responses` is the
# (biden_list, trump_list) tuple returned by code_text_ft for one row.
responses = dfdemz['body'].apply(code_text_ft)

# Separate the responses into Biden and Trump lists (one list per row)
biden_chunked = [resp[0] for resp in responses]
trump_chunked = [resp[1] for resp in responses]

# Average the chunk scores per row. A row with no usable chunk responses gets
# NaN rather than 0: 0 means "neutral" on the coding scale, so using it as a
# filler would silently pull later means toward neutral. pandas' mean()/std()
# skip NaN by default.
dfdemz['Biden_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in biden_chunked
]
dfdemz['Trump_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in trump_chunked
]

# Save the averaged sentiments to a CSV file
average_sentiments = dfdemz[['Biden_Sentiment', 'Trump_Sentiment']]
average_sentiments.to_csv('{folder}/coded_fox_oct27_averaged_demz.csv', index=False)

# Flatten the per-row lists into one long list of chunk scores per candidate
flat_biden_responses = [score for scores in biden_chunked for score in scores]
flat_trump_responses = [score for scores in trump_chunked for score in scores]

# Create a DataFrame for chunk-level responses
chunk_responses_df = pd.DataFrame({
    'Biden_Chunk_Responses': flat_biden_responses,
    'Trump_Chunk_Responses': flat_trump_responses
})

# Save the chunk responses to a separate CSV file
chunk_responses_df.to_csv('{folder}/coded_fox_oct27_chunks_demz.csv', index=False)

print("Averaged sentiments saved to '{folder}/coded_fox_oct27_averaged_demz.csv'")
print("Chunk responses saved to '{folder}/coded_fox_oct27_chunks_demz.csv'")

In [None]:
# Cell output: the two per-row averaged sentiment columns
average_sentiments

In [None]:
# Cell output: the flattened chunk-level responses
chunk_responses_df

In [None]:
# Reload the averaged sentiments saved for the default-model Democrat-persona run
df = pd.read_csv(filepath_or_buffer='{folder}/coded_fox_oct27_averaged_demz.csv')

# Keep just the "Biden_Sentiment" and "Trump_Sentiment" columns
df_selected = df.loc[:, ['Biden_Sentiment', 'Trump_Sentiment']]

# Write the two-column table to its own CSV file
df_selected.to_csv('{folder}/coded_fox_sentiments_only_demz.csv', index=False)

# Mean and standard deviation of each column
biden_mean, trump_mean = (
    df_selected[column].mean() for column in ('Biden_Sentiment', 'Trump_Sentiment')
)
biden_sd, trump_sd = (
    df_selected[column].std() for column in ('Biden_Sentiment', 'Trump_Sentiment')
)

# Print the results, one line per candidate
print(
    f"Biden Sentiment - Mean: {biden_mean}, SD: {biden_sd}",
    f"Trump Sentiment - Mean: {trump_mean}, SD: {trump_sd}",
    sep="\n",
)

In [None]:
# Function to code the text with only numeric responses using default model+Rep-persona
def code_text_ft(text):
    """Code sentiment toward Joe Biden and Donald Trump under a Republican persona.

    The text is lowercased, split into chunks with ``textwrap.wrap`` and each
    chunk is sent to the default ``gpt-4o-2024-08-06`` chat model. The system
    prompt asks the model to answer as the Republican persona described
    there; the user prompt asks for two integers on a -2..2 scale. Despite
    the ``_ft`` suffix in the name, this variant does not call a fine-tuned
    model.

    Args:
        text: The text to code. Non-string values (for example the NaN that
            pandas uses for a missing cell) yield empty results instead of
            raising.

    Returns:
        A tuple ``(biden_responses, trump_responses)`` of two lists of ints
        with one entry per successfully coded chunk. Chunks whose request
        failed or whose reply could not be parsed are skipped, so the lists
        may be shorter than the number of chunks, or empty.
    """
    # A missing body is not a string; calling .lower() on it would raise and
    # abort the whole DataFrame.apply run.
    if not isinstance(text, str):
        return [], []

    # Convert the text to lowercase to handle case sensitivity
    text = text.lower()

    # Split text into smaller chunks. `width` is a character count, not a
    # token count; words longer than `width` are kept whole.
    chunks = textwrap.wrap(text, width=2000, break_long_words=False)

    # Initialize lists to store responses for each chunk
    biden_responses = []
    trump_responses = []

    # Iterate through each chunk
    for chunk in chunks:
        messages = [
            {
                "role": "system",
                "content": """
                You are simulating the persona of a U.S. citizen who is a man in his 50s, white, with a HS degree, Republican, and upper-middle income.
                You will code the sentiment toward Joe Biden and Donald Trump based on this persona.
                """
            },
            {
                "role": "user",
                "content": f"""
                For the following text, please provide only numeric sentiment coding for both Joe Biden and Donald Trump without explanation:

                Text: "{chunk}"

                1. Sentiment toward Joe Biden (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).
                2. Sentiment toward Donald Trump (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).

                Please respond with just the two numbers, one for Joe Biden and one for Donald Trump, separated by a comma. Example: 1, -1
                """
            }
        ]

        try:
            # Call the default model via the OpenAI API
            response = openai.ChatCompletion.create(
                model="gpt-4o-2024-08-06",  # Use default model
                messages=messages,
                max_tokens=10,  # Keep the token count low since we expect a short response
                temperature=0
            )

            # Extract the response content and clean it
            result = response['choices'][0]['message']['content'].strip()
        except Exception as e:
            # Best-effort coding: a failed request or malformed payload skips
            # this chunk only, and is reported as a request problem.
            print(f"Error calling model: {e}")
            continue

        print(f"Response: {result}")  # Debugging: print the response to verify

        try:
            # Parse the response: expecting a format like "1, -1".
            # Both a non-integer field and a wrong field count raise ValueError.
            biden_sentiment, trump_sentiment = map(int, result.split(','))
        except ValueError as e:
            print(f"Error parsing response: {e}")
            continue  # Skip this chunk in case of error

        # Append each sentiment value to the respective lists
        biden_responses.append(biden_sentiment)
        trump_responses.append(trump_sentiment)

    # Return lists of responses for Biden and Trump
    return biden_responses, trump_responses

# Load the dataset
dfrepz = pd.read_csv('{folder}/fox_oct27.csv')

# Code every 'body' value chunk by chunk; each element of `responses` is the
# (biden_list, trump_list) tuple returned by code_text_ft for one row.
responses = dfrepz['body'].apply(code_text_ft)

# Separate the responses into Biden and Trump lists (one list per row)
biden_chunked = [resp[0] for resp in responses]
trump_chunked = [resp[1] for resp in responses]

# Average the chunk scores per row. A row with no usable chunk responses gets
# NaN rather than 0: 0 means "neutral" on the coding scale, so using it as a
# filler would silently pull later means toward neutral. pandas' mean()/std()
# skip NaN by default.
dfrepz['Biden_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in biden_chunked
]
dfrepz['Trump_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in trump_chunked
]

# Save the averaged sentiments to a CSV file
average_sentiments = dfrepz[['Biden_Sentiment', 'Trump_Sentiment']]
average_sentiments.to_csv('{folder}/coded_fox_oct27_averaged_repz.csv', index=False)

# Flatten the per-row lists into one long list of chunk scores per candidate
flat_biden_responses = [score for scores in biden_chunked for score in scores]
flat_trump_responses = [score for scores in trump_chunked for score in scores]

# Create a DataFrame for chunk-level responses
chunk_responses_df = pd.DataFrame({
    'Biden_Chunk_Responses': flat_biden_responses,
    'Trump_Chunk_Responses': flat_trump_responses
})

# Save the chunk responses to a separate CSV file
chunk_responses_df.to_csv('{folder}/coded_fox_oct27_chunks_repz.csv', index=False)

print("Averaged sentiments saved to '{folder}/coded_fox_oct27_averaged_repz.csv'")
print("Chunk responses saved to '{folder}/coded_fox_oct27_chunks_repz.csv'")

In [None]:
# Cell output: the two per-row averaged sentiment columns
average_sentiments

In [None]:
# Cell output: the flattened chunk-level responses
chunk_responses_df

In [None]:
# Reload the averaged sentiments saved for the default-model Republican-persona
# run. The path uses the same '{folder}/' prefix the coding step writes to,
# instead of a hard-coded Drive directory, so this cell reads the file that
# was just produced.
df = pd.read_csv('{folder}/coded_fox_oct27_averaged_repz.csv')

# Select only the "Biden_Sentiment" and "Trump_Sentiment" columns
df_selected = df[['Biden_Sentiment', 'Trump_Sentiment']]

# Save the two-column table next to the other outputs
df_selected.to_csv('{folder}/coded_fox_sentiments_only_repz.csv', index=False)

# Calculate mean and standard deviation for each column
biden_mean = df_selected['Biden_Sentiment'].mean()
biden_sd = df_selected['Biden_Sentiment'].std()
trump_mean = df_selected['Trump_Sentiment'].mean()
trump_sd = df_selected['Trump_Sentiment'].std()

# Print the results
print(f"Biden Sentiment - Mean: {biden_mean}, SD: {biden_sd}")
print(f"Trump Sentiment - Mean: {trump_mean}, SD: {trump_sd}")

In [None]:
# Function to code the text with only numeric responses using fine-tuned model+Dem-persona
def code_text_ft(text):
    """Code sentiment toward Joe Biden and Donald Trump under a Democrat persona.

    The text is lowercased, split into chunks with ``textwrap.wrap`` and each
    chunk is sent to the fine-tuned ``gpt-4o-2024-08-06`` chat model. The
    system prompt asks the model to answer as the Democrat persona described
    there; the user prompt asks for two integers on a -2..2 scale.

    Args:
        text: The text to code. Non-string values (for example the NaN that
            pandas uses for a missing cell) yield empty results instead of
            raising.

    Returns:
        A tuple ``(biden_responses, trump_responses)`` of two lists of ints
        with one entry per successfully coded chunk. Chunks whose request
        failed or whose reply could not be parsed are skipped, so the lists
        may be shorter than the number of chunks, or empty.
    """
    # A missing body is not a string; calling .lower() on it would raise and
    # abort the whole DataFrame.apply run.
    if not isinstance(text, str):
        return [], []

    # Convert the text to lowercase to handle case sensitivity
    text = text.lower()

    # Split text into smaller chunks. `width` is a character count, not a
    # token count; words longer than `width` are kept whole.
    chunks = textwrap.wrap(text, width=2000, break_long_words=False)

    # Initialize lists to store responses for each chunk
    biden_responses = []
    trump_responses = []

    # Iterate through each chunk
    for chunk in chunks:
        messages = [
            {
                "role": "system",
                "content": """
                You are simulating the persona of a U.S. citizen who is a woman in her 20s, black, with a college degree, Democrat, and middle income.
                You will code the sentiment toward Joe Biden and Donald Trump based on this persona.
                """
            },
            {
                "role": "user",
                "content": f"""
                For the following text, please provide only numeric sentiment coding for both Joe Biden and Donald Trump without explanation:

                Text: "{chunk}"

                1. Sentiment toward Joe Biden (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).
                2. Sentiment toward Donald Trump (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).

                Please respond with just the two numbers, one for Joe Biden and one for Donald Trump, separated by a comma. Example: 1, -1
                """
            }
        ]

        try:
            # Call the fine-tuned model via the OpenAI API
            response = openai.ChatCompletion.create(
                model="ft:gpt-4o-2024-08-06:personal::{ft_model_ID}",  # Use fine-tuned model
                messages=messages,
                max_tokens=10,  # Keep the token count low since we expect a short response
                temperature=0
            )

            # Extract the response content and clean it
            result = response['choices'][0]['message']['content'].strip()
        except Exception as e:
            # Best-effort coding: a failed request or malformed payload skips
            # this chunk only, and is reported as a request problem.
            print(f"Error calling model: {e}")
            continue

        print(f"Response: {result}")  # Debugging: print the response to verify

        try:
            # Parse the response: expecting a format like "1, -1".
            # Both a non-integer field and a wrong field count raise ValueError.
            biden_sentiment, trump_sentiment = map(int, result.split(','))
        except ValueError as e:
            print(f"Error parsing response: {e}")
            continue  # Skip this chunk in case of error

        # Append each sentiment value to the respective lists
        biden_responses.append(biden_sentiment)
        trump_responses.append(trump_sentiment)

    # Return lists of responses for Biden and Trump
    return biden_responses, trump_responses

# Load the dataset. The path uses the same '{folder}/' prefix as the outputs
# written below, instead of a hard-coded Drive directory.
dfdemft = pd.read_csv('{folder}/fox_oct27.csv')

# Code every 'body' value chunk by chunk; each element of `responses` is the
# (biden_list, trump_list) tuple returned by code_text_ft for one row.
responses = dfdemft['body'].apply(code_text_ft)

# Separate the responses into Biden and Trump lists (one list per row)
biden_chunked = [resp[0] for resp in responses]
trump_chunked = [resp[1] for resp in responses]

# Average the chunk scores per row. A row with no usable chunk responses gets
# NaN rather than 0: 0 means "neutral" on the coding scale, so using it as a
# filler would silently pull later means toward neutral. pandas' mean()/std()
# skip NaN by default.
dfdemft['Biden_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in biden_chunked
]
dfdemft['Trump_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in trump_chunked
]

# Save the averaged sentiments to a CSV file
average_sentiments = dfdemft[['Biden_Sentiment', 'Trump_Sentiment']]
average_sentiments.to_csv('{folder}/coded_fox_oct27_averaged_demft.csv', index=False)

# Flatten the per-row lists into one long list of chunk scores per candidate
flat_biden_responses = [score for scores in biden_chunked for score in scores]
flat_trump_responses = [score for scores in trump_chunked for score in scores]

# Create a DataFrame for chunk-level responses
chunk_responses_df = pd.DataFrame({
    'Biden_Chunk_Responses': flat_biden_responses,
    'Trump_Chunk_Responses': flat_trump_responses
})

# Save the chunk responses to a separate CSV file
chunk_responses_df.to_csv('{folder}/coded_fox_oct27_chunks_demft.csv', index=False)

print("Averaged sentiments saved to '{folder}/coded_fox_oct27_averaged_demft.csv'")
print("Chunk responses saved to '{folder}/coded_fox_oct27_chunks_demft.csv'")

In [None]:
# Cell output: the two per-row averaged sentiment columns
average_sentiments

In [None]:
# Cell output: the flattened chunk-level responses
chunk_responses_df

In [None]:
# Reload the averaged sentiments saved for the fine-tuned Democrat-persona run
df = pd.read_csv(filepath_or_buffer='{folder}/coded_fox_oct27_averaged_demft.csv')

# Keep just the "Biden_Sentiment" and "Trump_Sentiment" columns
df_selected = df.loc[:, ['Biden_Sentiment', 'Trump_Sentiment']]

# Write the two-column table to its own CSV file
df_selected.to_csv('{folder}/coded_fox_sentiments_only_demft.csv', index=False)

# Mean and standard deviation of each column
biden_mean, trump_mean = (
    df_selected[column].mean() for column in ('Biden_Sentiment', 'Trump_Sentiment')
)
biden_sd, trump_sd = (
    df_selected[column].std() for column in ('Biden_Sentiment', 'Trump_Sentiment')
)

# Print the results, one line per candidate
print(
    f"Biden Sentiment - Mean: {biden_mean}, SD: {biden_sd}",
    f"Trump Sentiment - Mean: {trump_mean}, SD: {trump_sd}",
    sep="\n",
)

In [None]:
# Function to code the text with only numeric responses using fine-tuned model+Rep-persona
def code_text_ft(text):
    """Code sentiment toward Joe Biden and Donald Trump under a Republican persona.

    The text is lowercased, split into chunks with ``textwrap.wrap`` and each
    chunk is sent to the fine-tuned ``gpt-4o-2024-08-06`` chat model. The
    system prompt asks the model to answer as the Republican persona
    described there; the user prompt asks for two integers on a -2..2 scale.

    Args:
        text: The text to code. Non-string values (for example the NaN that
            pandas uses for a missing cell) yield empty results instead of
            raising.

    Returns:
        A tuple ``(biden_responses, trump_responses)`` of two lists of ints
        with one entry per successfully coded chunk. Chunks whose request
        failed or whose reply could not be parsed are skipped, so the lists
        may be shorter than the number of chunks, or empty.
    """
    # A missing body is not a string; calling .lower() on it would raise and
    # abort the whole DataFrame.apply run.
    if not isinstance(text, str):
        return [], []

    # Convert the text to lowercase to handle case sensitivity
    text = text.lower()

    # Split text into smaller chunks. `width` is a character count, not a
    # token count; words longer than `width` are kept whole.
    chunks = textwrap.wrap(text, width=2000, break_long_words=False)

    # Initialize lists to store responses for each chunk
    biden_responses = []
    trump_responses = []

    # Iterate through each chunk
    for chunk in chunks:
        messages = [
            {
                "role": "system",
                "content": """
                You are simulating the persona of a U.S. citizen who is a man in his 50s, white, with a HS degree, Republican, and upper-middle income.
                You will code the sentiment toward Joe Biden and Donald Trump based on this persona.
                """
            },
            {
                "role": "user",
                "content": f"""
                For the following text, please provide only numeric sentiment coding for both Joe Biden and Donald Trump without explanation:

                Text: "{chunk}"

                1. Sentiment toward Joe Biden (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).
                2. Sentiment toward Donald Trump (-2 for very negative, -1 for negative, 0 for neutral, 1 for positive, 2 for very positive).

                Please respond with just the two numbers, one for Joe Biden and one for Donald Trump, separated by a comma. Example: 1, -1
                """
            }
        ]

        try:
            # Call the fine-tuned model via the OpenAI API
            response = openai.ChatCompletion.create(
                model="ft:gpt-4o-2024-08-06:personal::{ft_model_ID}",  # Use fine-tuned model
                messages=messages,
                max_tokens=10,  # Keep the token count low since we expect a short response
                temperature=0
            )

            # Extract the response content and clean it
            result = response['choices'][0]['message']['content'].strip()
        except Exception as e:
            # Best-effort coding: a failed request or malformed payload skips
            # this chunk only, and is reported as a request problem.
            print(f"Error calling model: {e}")
            continue

        print(f"Response: {result}")  # Debugging: print the response to verify

        try:
            # Parse the response: expecting a format like "1, -1".
            # Both a non-integer field and a wrong field count raise ValueError.
            biden_sentiment, trump_sentiment = map(int, result.split(','))
        except ValueError as e:
            print(f"Error parsing response: {e}")
            continue  # Skip this chunk in case of error

        # Append each sentiment value to the respective lists
        biden_responses.append(biden_sentiment)
        trump_responses.append(trump_sentiment)

    # Return lists of responses for Biden and Trump
    return biden_responses, trump_responses

# Load the dataset
dfrepft = pd.read_csv('{folder}/fox_oct27.csv')

# Code every 'body' value chunk by chunk; each element of `responses` is the
# (biden_list, trump_list) tuple returned by code_text_ft for one row.
responses = dfrepft['body'].apply(code_text_ft)

# Separate the responses into Biden and Trump lists (one list per row)
biden_chunked = [resp[0] for resp in responses]
trump_chunked = [resp[1] for resp in responses]

# Average the chunk scores per row. A row with no usable chunk responses gets
# NaN rather than 0: 0 means "neutral" on the coding scale, so using it as a
# filler would silently pull later means toward neutral. pandas' mean()/std()
# skip NaN by default.
dfrepft['Biden_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in biden_chunked
]
dfrepft['Trump_Sentiment'] = [
    sum(scores) / len(scores) if scores else float('nan')
    for scores in trump_chunked
]

# Save the averaged sentiments to a CSV file
average_sentiments = dfrepft[['Biden_Sentiment', 'Trump_Sentiment']]
average_sentiments.to_csv('{folder}/coded_fox_oct27_averaged_repft.csv', index=False)

# Flatten the per-row lists into one long list of chunk scores per candidate
flat_biden_responses = [score for scores in biden_chunked for score in scores]
flat_trump_responses = [score for scores in trump_chunked for score in scores]

# Create a DataFrame for chunk-level responses
chunk_responses_df = pd.DataFrame({
    'Biden_Chunk_Responses': flat_biden_responses,
    'Trump_Chunk_Responses': flat_trump_responses
})

# Save the chunk responses to a separate CSV file
chunk_responses_df.to_csv('{folder}/coded_fox_oct27_chunks_repft.csv', index=False)

print("Averaged sentiments saved to '{folder}/coded_fox_oct27_averaged_repft.csv'")
print("Chunk responses saved to '{folder}/coded_fox_oct27_chunks_repft.csv'")

In [None]:
# Cell output: the two per-row averaged sentiment columns
average_sentiments

In [None]:
# Cell output: the flattened chunk-level responses
chunk_responses_df

In [None]:
# Reload the averaged sentiments saved for the fine-tuned Republican-persona run
df = pd.read_csv(filepath_or_buffer='{folder}/coded_fox_oct27_averaged_repft.csv')

# Keep just the "Biden_Sentiment" and "Trump_Sentiment" columns
df_selected = df.loc[:, ['Biden_Sentiment', 'Trump_Sentiment']]

# Write the two-column table to its own CSV file
df_selected.to_csv('{folder}/coded_fox_sentiments_only_repft.csv', index=False)

# Mean and standard deviation of each column
biden_mean, trump_mean = (
    df_selected[column].mean() for column in ('Biden_Sentiment', 'Trump_Sentiment')
)
biden_sd, trump_sd = (
    df_selected[column].std() for column in ('Biden_Sentiment', 'Trump_Sentiment')
)

# Print the results, one line per candidate
print(
    f"Biden Sentiment - Mean: {biden_mean}, SD: {biden_sd}",
    f"Trump Sentiment - Mean: {trump_mean}, SD: {trump_sd}",
    sep="\n",
)