In [None]:
import os

from openai import OpenAI

# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# pasted into (and accidentally committed with) the notebook. The literal
# placeholder is kept only as a fallback for the original workflow.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "INSERT_KEY"))  # Set OPENAI_API_KEY or replace the placeholder

In [5]:
import pandas as pd
import openai
import time
import random
import re
import unicodedata
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ========================
# Configurable Parameters
# ========================
NUM_SAMPLES = 100  # Number of rows processed when USE_FULL_DATASET is False
PRINT_SENTENCES = True  # Toggle sentence display
RANDOM_SEED = 42  # Reproducibility for dataset sampling
MODEL = "gpt-4o"  # Change model if needed (must be a key of PRICING for cost tracking)
TEMPERATURE = 0  # Controls randomness (0 = deterministic, >0 = more creative)
LLM_SEED = 42  # OpenAI API seed for reproducibility
USE_FULL_DATASET = False  # Set to True to process the entire dataset (NUM_SAMPLES is then ignored)

random.seed(RANDOM_SEED)

PRICING = {
    "gpt-4o": {"input": 0.0025, "output": 0.0100},  # $ per 1K tokens
    "gpt-4-turbo": {"input": 0.0100, "output": 0.0300},
    "gpt-3.5-turbo": {"input": 0.0020, "output": 0.0020},
}

# Load dataset
input_file = "train.tsv"
# nrows=None tells pandas to read every row, so USE_FULL_DATASET actually takes
# effect instead of the row limit being applied unconditionally.
df = pd.read_csv(
    input_file,
    sep="\t",
    header=None,
    nrows=None if USE_FULL_DATASET else NUM_SAMPLES,
)
df.columns = ["ID", "Label", "Statement", "Subjects", "Speaker", "Job_Title", "State", "Party",
              "Barely_True_Count", "False_Count", "Half_True_Count", "Mostly_True_Count", "Pants_Fire_Count", "Context"]

def preprocess_text(text):
    """Return *text* NFKC-normalized, filtered to a small ASCII set, and space-collapsed.

    After normalization, every character other than ASCII letters, digits,
    ``. , ! ? ' "`` and the space character is dropped; remaining whitespace
    runs are then squeezed to a single space and the ends are trimmed.
    """
    normalized = unicodedata.normalize('NFKC', text)
    # Drop everything outside the allowed character set (this also removes
    # tabs and newlines, since only the plain space is whitelisted).
    filtered = re.sub(r"[^a-zA-Z0-9.,!?\'\" ]", "", normalized)
    return re.sub(r"\s+", " ", filtered).strip()

token_usage = []  # total tokens reported for each API call that returned a response
cost_usage = []  # estimated dollar cost of each API call that returned a response

def classify_with_llm(statement, speaker, party, barely_true, false, half_true, mostly_true, pants_on_fire):
    """Classify a statement with the chat model and track token usage and cost.

    Builds a prompt from the statement, the speaker, the party and the
    speaker's per-category statement counts, sends it to ``MODEL`` through the
    module-level ``client`` and parses the JSON reply.

    Args:
        statement: Text of the claim; embedded in the prompt via ``json.dumps``.
        speaker: Speaker identifier inserted into the prompt.
        party: Party affiliation inserted into the prompt.
        barely_true, false, half_true, mostly_true, pants_on_fire: Counts of
            the speaker's earlier statements in each category.

    Returns:
        dict: The parsed JSON object from the model (expected to carry a
        ``"label"`` key). On any failure a dict ``{"label": None, "reason": ...}``
        is returned, so callers can always use ``.get("label")``.

    Side effects:
        Appends to the module-level ``token_usage`` and ``cost_usage`` lists
        whenever the API returns a response, and prints a message on failure.
    """
    credibility_profile = f"""
    Speaker's historical truthfulness record:
    - Barely True: {barely_true}
    - False: {false}
    - Half True: {half_true}
    - Mostly True: {mostly_true}
    - Pants on Fire: {pants_on_fire}
    """

    prompt = f"""
    You are a fact-checking AI trained to classify statements into six categories:

    **pants-fire**: Completely false, absurd, or ridiculous claims with no factual basis.
    **false**: Completely untrue statements, even if they are widely believed.
    **barely-true**: Contains some truth but is misleading or exaggerated.
    **half-true**: A mix of truth and falsehoods, missing key context.
    **mostly-true**: Largely accurate but lacks key details.
    **true**: Entirely factual and verifiable.

    You need to do it with accuracy >60%

    Do classification step by step:

    1) Check whether this statement aligns with the speaker principles and his or her public position
    2) Check whether this statement aligns with party position
    3) Check the credibility history. Compare the amount of previous statements for each category and choose the one with the biggest number
    4) Make a decision based on the information from previous steps

    Here are the examples of each category:

    **pants-fire**: In the case of a catastrophic event, the Atlanta-area offices of the Centers for Disease Control and Prevention will self-destruct.
    **false**: What (the Obama administration is) going to come out with in the next several months is youre not even going to be able to burn coal very limitedly in the existing plants.
    **barely-true**: Most of the (Affordable Care Act) has already in some sense been waived or otherwise suspended.
    **half-true**: Water rates in Manila, Philippines, were raised up to 845 percent when a subsidiary of the World Bank became a partial owner.
    **mostly-true**: The United States has the highest corporate tax rate in the free world.
    **true**: Says Paul Ryan is still endorsing Trump.

    **Statement:** {json.dumps(statement)}

    **Context:**
    - Speaker: {speaker}
    - Party: {party}
    - Credibility: {credibility_profile}

    Return a JSON object:
    {{ "label": "one of the six categories" }}
    """

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=20,
            response_format={"type": "json_object"},
            temperature=TEMPERATURE,
            seed=LLM_SEED
        )

        # Extract the raw JSON text and the usage counters from the response
        classification_data = response.choices[0].message.content
        usage = response.usage
        input_tokens = usage.prompt_tokens
        output_tokens = usage.completion_tokens
        total_tokens = usage.total_tokens

        # PRICING values are dollars per 1K tokens
        input_cost = (input_tokens / 1000) * PRICING[MODEL]["input"]
        output_cost = (output_tokens / 1000) * PRICING[MODEL]["output"]
        cost = input_cost + output_cost

        # Usage is recorded before parsing so a malformed reply is still billed
        token_usage.append(total_tokens)
        cost_usage.append(cost)

        parsed = json.loads(classification_data)
        if not isinstance(parsed, dict):
            # Valid JSON but not an object: callers rely on dict access.
            return {"label": None, "reason": "Response was not a JSON object"}
        return parsed
    except json.JSONDecodeError as e:
        print(f"JSON decoding error: {e}\nResponse received: {classification_data}")
        return {"label": None, "reason": "Malformed JSON response"}

    except Exception as e:
        # Per-row boundary: report the failure and keep the batch running.
        # Return the same dict shape as every other path (the previous bare
        # "error" string broke callers that call .get on the result).
        print(f"Error during API call: {e}")
        return {"label": None, "reason": f"API call failed: {e}"}

def classify_row(row):
    """Classify one dataframe row and return its predicted label.

    Passes the row's statement, speaker, party and the five per-category
    count columns to ``classify_with_llm``.

    Args:
        row: A row exposing the keys "Statement", "Speaker", "Party",
            "Barely_True_Count", "False_Count", "Half_True_Count",
            "Mostly_True_Count" and "Pants_Fire_Count".

    Returns:
        The value stored under "label" in the classifier's result, or None
        when the result has no such key or is not a dict.
    """
    result = classify_with_llm(
        statement=row["Statement"],
        speaker=row["Speaker"],
        party=row["Party"],
        barely_true=row["Barely_True_Count"],
        false=row["False_Count"],
        half_true=row["Half_True_Count"],
        mostly_true=row["Mostly_True_Count"],
        pants_on_fire=row["Pants_Fire_Count"]
    )
    # A failed call may come back as a non-dict marker (e.g. an error string);
    # treat that as "no label" instead of crashing the whole run on .get().
    if not isinstance(result, dict):
        return None
    return result.get("label", None)


# Classify every row sequentially and time the whole pass
start_time = time.time()
df["Predicted_Label"] = df.apply(classify_row, axis=1)

# Efficiency & Technical Metrics Tracking
total_time = time.time() - start_time

total_tokens = sum(token_usage)
avg_tokens_per_article = total_tokens / len(df)
total_cost = sum(cost_usage)
cost_per_classification = total_cost / len(df)

# Evaluation Metrics
# Rows whose classification failed carry a missing label. Mixing missing
# values with string labels makes the sklearn metrics fail, so score those
# rows under an explicit placeholder class (which can never match a gold label).
y_true = df["Label"]
y_pred = df["Predicted_Label"].fillna("unknown").astype(str)

accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)

eval_results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "Processing Time (s)": total_time,
    "Total Cost ($)": total_cost,
    "Cost per Classification ($)": cost_per_classification,
    "Total Tokens Used": total_tokens,
    "Average Tokens per Article": avg_tokens_per_article,
}

print("\nEvaluation Metrics:")
for key, value in eval_results.items():
    print(f"{key}: {value:.6f}")

df.head()



Evaluation Metrics:
Accuracy: 0.510000
Precision: 0.629908
Recall: 0.518628
F1-score: 0.475870
Processing Time (s): 122.046496
Total Cost ($): 0.144168
Cost per Classification ($): 0.001442
Total Tokens Used: 53125.000000
Average Tokens per Article: 531.250000


Unnamed: 0,ID,Label,Statement,Subjects,Speaker,Job_Title,State,Party,Barely_True_Count,False_Count,Half_True_Count,Mostly_True_Count,Pants_Fire_Count,Context,Predicted_Label
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0,0,0,a mailer,false
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.,half-true
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver,half-true
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release,pants-fire
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN,half-true
