In [None]:
# Install the Python client libraries used below (-q keeps pip output quiet).
# NOTE(review): scikit-learn, statsmodels, scipy and numpy are imported later
# but are not installed here - presumably preinstalled on Colab; confirm.
!pip install ollama openai pandas tqdm -q

In [None]:
# Install Ollama
# pciutils is installed first - presumably so the installer can detect the
# GPU; confirm against the Ollama install script.
!sudo apt-get install -y pciutils
# Download the Ollama install script and pipe it straight into sh.
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# run ollama server on Colab
import os
import threading
import subprocess

def start_ollama():
    """Launch ``ollama serve`` as a child process.

    OLLAMA_HOST and OLLAMA_ORIGINS are written into this process's
    environment before the server is spawned, so the child inherits them.
    The Popen handle is not kept; the function returns None.
    """
    server_settings = {
        'OLLAMA_HOST': '0.0.0.0:11434',
        'OLLAMA_ORIGINS': '*',
    }
    for variable, value in server_settings.items():
        os.environ[variable] = value

    serve_command = ["ollama", "serve"]
    subprocess.Popen(serve_command)

# Run start_ollama on its own thread. Popen does not block, so the thread
# ends right after spawning the server, which keeps running as a child process.
# NOTE(review): nothing here waits for the server to accept connections
# before later cells talk to it.
ollama_thread = threading.Thread(target=start_ollama)
ollama_thread.start()

In [None]:
# Download the LLM used by classify_claim (requested there with the ":latest" tag).
!ollama pull mannix/gemma2-9b-simpo

In [None]:
# Import packages
import json

import numpy as np
import ollama
import pandas as pd
from scipy.stats import chi2_contingency, chisquare
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from statsmodels.stats.proportion import proportions_ztest
from tqdm import tqdm


In [None]:
# Register tqdm with pandas so Series.progress_apply shows a progress bar
tqdm.pandas()

In [None]:
# Codebook of climate change denial argument categories. The text is inserted
# verbatim into the classification prompts, so any edit changes what the
# models see. Each entry is "<code> <description>"; 0.0 means "None of the above".
categories_codebook = """
Climate Change Denial Arguments Codebook:
- 1.1 Ice, permafrost, or snow cover isn't melting.
- 1.2 We're heading into global cooling or a new ice age.
- 1.3 Cold weather or snow means there's no global warming.
- 1.4 The climate hasn't warmed or changed in recent decades.
- 1.5 The oceans are cooling, or they're not warming.
- 1.6 Sea level rise is exaggerated or isn't accelerating.
- 1.7 Extreme weather isn't increasing, has always happened, or isn't linked to climate change.
- 1.8 They changed the term from 'global warming' to 'climate change' because it's not really warming.
- 2.1 Climate change is just part of natural cycles or variations.
- 2.2 Human impacts other than greenhouse gases (like aerosols or land use) are the cause.
- 2.3 There's no real evidence that CO2 or the greenhouse effect is driving climate change.
- 2.4 CO2 levels aren't rising, or the ocean's pH isn't dropping.
- 2.5 Human CO2 emissions are too small to make a difference.
- 3.1 The climate isn't very sensitive to CO2, and there are feedbacks that reduce warming.
- 3.2 Species, plants, or coral reefs aren't affected by climate change yet, or they are even benefiting.
- 3.3 CO2 is good, not a pollutant.
- 3.4 The temperature increase is only a few degrees, which isn't a big deal.
- 3.5 Climate change doesn't contribute to human conflict or threaten national security.
- 3.6 Climate change doesn't have negative effects on health.
- 4.1 Climate policies, whether mitigation or adaptation, are harmful.
- 4.2 Climate policies are ineffective or flawed.
- 4.3 The problem is too hard to solve.
- 4.4 Clean energy technologies or biofuels won't work.
- 4.5 We need energy from fossil fuels or nuclear power.
- 5.1 Climate science is uncertain, unsound, or unreliable (refers to data, methods, or models).
- 5.2 The climate movement is alarmist, wrong, political, biased, or hypocritical.
- 5.3 Climate change science or policy is a conspiracy or a deception.
- 0.0 None of the above.
"""

In [None]:
# Main function
def classify_claim(claim):
   """Classify one claim against the codebook with the local Ollama model.

   Builds a prompt from the module-level ``categories_codebook`` and the
   claim, sends it to ``mannix/gemma2-9b-simpo:latest`` through
   ``ollama.chat`` with JSON output requested, and reads the ``category``
   field of the reply.

   Args:
       claim: Text of the claim to classify; interpolated into the prompt.

   Returns:
       The category code as a float (for example ``1.1``), or ``None`` when
       the reply cannot be parsed into a number. The problem and the raw
       reply are printed in that case.
   """
   prompt = f"""
   Given the following Climate Change Denial Arguments Codebook:
   {categories_codebook}
   Classify the following claim into one of the categories. Pick the one that fits best - if multiple, pick the most relevant one.
   Claim: {claim}
   Output only the category number as a float in JSON format, like this: {{"category": 1.1}}
   """
   response = ollama.chat(
       model='mannix/gemma2-9b-simpo:latest',
       messages=[
           {"role": "system", "content": "You are a climate change claim classification assistant. Classify the given claim according to the codebook."},
           {"role": "user", "content": prompt}
       ],
       format='json',
       # Temperature 0 reduces run-to-run variation in the labels and matches
       # the temperature=0 setting used by classify_claim_openai.
       options={'temperature': 0}
   )
   content = response['message']['content']
   try:
       result = json.loads(content)
       return float(result['category'])
   except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
       # TypeError covers valid JSON of the wrong shape, e.g. a null category
       # or a top-level list, which would otherwise abort the whole apply().
       print(f"Error parsing LLM response: {e}")
       print(f"Full response: {content}")
       return None

In [None]:

def gwet_ac1(ratings1, ratings2):
    """Calculate Gwet's AC1 agreement coefficient for two raters.

    AC1 = (pa - pe) / (1 - pe), where ``pa`` is the share of items on which
    both raters chose the same category and
    ``pe = sum_k pi_k * (1 - pi_k) / (q - 1)``, with ``pi_k`` the share of all
    2n ratings that fall in category k and ``q`` the number of categories
    observed across both raters.

    Missing ratings (``None`` or NaN) never count as agreement and are not
    treated as categories, but the items stay in the denominators, so a
    missing rating lowers the score like a disagreement.

    Args:
        ratings1: Iterable of category codes from the first rater.
        ratings2: Iterable of category codes from the second rater, aligned
            item by item with ``ratings1``.

    Returns:
        The AC1 coefficient as a float. When fewer than two categories are
        observed the chance term is undefined and is taken as 0, so the
        observed agreement ``pa`` is returned (1.0 for identical ratings).

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    first = list(ratings1)
    second = list(ratings2)
    n = len(first)
    if n != len(second):
        raise ValueError(
            f"ratings must have the same length, got {n} and {len(second)}"
        )
    if n == 0:
        raise ValueError("ratings must not be empty")

    def is_missing(value):
        # NaN is the only float that is not equal to itself.
        return value is None or (isinstance(value, float) and value != value)

    # Observed agreement. If a is present and a == b, then b is present too.
    matches = sum(1 for a, b in zip(first, second) if not is_missing(a) and a == b)
    pa = matches / n

    # Count ratings per category over both raters. Missing values are skipped
    # here: every NaN object would otherwise register as a separate category
    # and inflate q.
    counts = {}
    for value in first + second:
        if not is_missing(value):
            counts[value] = counts.get(value, 0) + 1

    q = len(counts)
    if q < 2:
        return pa

    # Chance agreement
    shares = [counts[category] / (2 * n) for category in sorted(counts)]
    peg = sum(p * (1 - p) for p in shares) / (q - 1)

    return (pa - peg) / (1 - peg)

In [None]:
def test_randomness(codes):
   """Perform tests of randomness"""
   unique_codes = sorted(set(codes))

   if len(unique_codes) == 2:  # Binary case
       count = sum(codes == unique_codes[1])
       nobs = len(codes)
       stat, pval = proportions_ztest(count, nobs, 0.5)
       return pval
   else:  # Multiple categories
       observed = pd.Series(codes).value_counts()
       expected = np.ones(len(unique_codes)) * len(codes) / len(unique_codes)
       stat, pval = chi2_contingency([observed, expected])[0:2]
       return pval

In [None]:
# Read the claims dataset straight from the project's GitHub repository
claims_csv_url = 'https://raw.githubusercontent.com/aaubs/llm-content-analysis/main/data/contrarian_claims_reasons.csv'
df = pd.read_csv(claims_csv_url)


In [None]:
# Classify every claim in the 'text' column with the Ollama model;
# progress_apply shows a tqdm progress bar while it runs.
new_model_codes = df['text'].progress_apply(classify_claim)
df['new_model_code'] = new_model_codes

In [None]:
# Cast every code column to float so the coders and models can be compared
# value for value.
code_columns = ['original_code', 'replicated_code', 'model_code', 'new_model_code']
for code_column in code_columns:
    df[code_column] = df[code_column].astype(float)

In [None]:
# Agreement (Gwet's AC1) for each pair of code columns, then the randomness
# test p-value for each model's codes.
ac1_column_pairs = {
    'human_human_ac1': ('original_code', 'replicated_code'),
    'human_model_ac1': ('original_code', 'model_code'),
    'human_newmodel_ac1': ('original_code', 'new_model_code'),
    'model_newmodel_ac1': ('model_code', 'new_model_code'),
}

results = {
    metric: gwet_ac1(df[left], df[right])
    for metric, (left, right) in ac1_column_pairs.items()
}
results['randomness_pval_original'] = test_randomness(df['model_code'])
results['randomness_pval_new'] = test_randomness(df['new_model_code'])

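Gwet’s AC1 is read on the same scale as other chance-corrected agreement statistics such as Cohen’s kappa: 1 means perfect agreement, 0 means agreement no better than chance, and negative values mean agreement worse than chance. What counts as “good enough” depends on the application; the thresholds below follow the commonly used Landis and Koch benchmark and should be treated as a rule of thumb: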

General Interpretation:

	•	0.81 to 1.00: Almost perfect agreement
	•	0.61 to 0.80: Substantial agreement
	•	0.41 to 0.60: Moderate agreement
	•	0.21 to 0.40: Fair agreement
	•	0.00 to 0.20: Slight agreement
	•	Below 0.00: Poor or no agreement (worse than chance)

In [None]:
# Report each metric to three decimals, agreement scores first.
ac1_report_rows = [
    ("Human-Human", 'human_human_ac1'),
    ("Human-Original Model", 'human_model_ac1'),
    ("Human-New Model", 'human_newmodel_ac1'),
    ("Model-Model", 'model_newmodel_ac1'),
]
pval_report_rows = [
    ("Original Model", 'randomness_pval_original'),
    ("New Model", 'randomness_pval_new'),
]

print("Agreement Metrics (Gwet's AC1):")
for row_label, result_key in ac1_report_rows:
    print(f"{row_label}: {results[result_key]:.3f}")

print("\nRandomness Test p-values:")
for row_label, result_key in pval_report_rows:
    print(f"{row_label}: {results[result_key]:.3f}")

In [None]:
# Convert float codes to string labels for the confusion matrix
# (the classification report in the next cell reuses these columns).
df['model_code_str'] = df['model_code'].astype(str)
df['new_model_code_str'] = df['new_model_code'].astype(str)

# Fix the label order explicitly, so rows and columns can be named after the
# real category codes instead of their positions in the matrix.
actual_labels = sorted(set(df['model_code_str']) | set(df['new_model_code_str']))

# Rows ("True_") are the original model's codes and columns ("Pred_") are the
# new model's codes.
conf_matrix = confusion_matrix(
    df['model_code_str'],
    df['new_model_code_str'],
    labels=actual_labels
)

conf_df = pd.DataFrame(
    conf_matrix,
    index=[f'True_{label}' for label in actual_labels],
    columns=[f'Pred_{label}' for label in actual_labels]
)

# Add row/column totals
conf_df['Total'] = conf_df.sum(axis=1)
conf_df.loc['Total'] = conf_df.sum()

print("\nConfusion Matrix:")
conf_df

In [None]:
# Per-category precision/recall of the new model, with the original model's
# codes used as the reference labels.
report_text = classification_report(df['model_code_str'], df['new_model_code_str'])
print("\nClassification Report (New Model vs Original Model):")
print(report_text)

## Together AI (via the OpenAI-compatible client)

In [None]:
from google.colab import userdata

In [None]:
from openai import OpenAI

In [None]:
# Point the OpenAI client at Together's endpoint; the API key is read from
# Colab's user secrets rather than being written into the notebook.
TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')

together_client_settings = {
    "base_url": "https://api.together.xyz/v1",
    "api_key": TOGETHER_API_KEY,
}
client = OpenAI(**together_client_settings)

In [None]:
def classify_claim_openai(claim):
   """Classify one claim against the codebook through the OpenAI-style client.

   Builds a prompt from the module-level ``categories_codebook`` and the
   claim, sends it to ``meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo`` via the
   module-level ``client`` with temperature 0 and JSON output requested, and
   reads the ``category`` field of the reply.

   Args:
       claim: Text of the claim to classify; interpolated into the prompt.

   Returns:
       The category code as a float (for example ``1.1``), or ``None`` when
       the reply cannot be parsed into a number. The problem and the raw
       reply are printed in that case.
   """
   prompt = f"""Given the following Climate Change Denial Arguments Codebook:
{categories_codebook}
Classify the following claim into one of the categories. Pick the one that fits best - if multiple, pick the most relevant one.
Claim: {claim}
Output only the category number as a float in JSON format, like this: {{"category": 1.1}}"""

   response = client.chat.completions.create(
       model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
       messages=[
           {"role": "system", "content": "You are a climate change claim classification assistant. Classify the given claim according to the codebook."},
           {"role": "user", "content": prompt}
       ],
       temperature=0,
       response_format={"type": "json_object"}
   )
   content = response.choices[0].message.content
   try:
       result = json.loads(content)
       return float(result['category'])
   except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
       # TypeError covers empty (None) content and valid JSON of the wrong
       # shape, e.g. a null category, which would otherwise abort the apply().
       print(f"Error parsing response: {e}")
       print(f"Full response: {content}")
       return None

In [None]:
# Classify every claim through the OpenAI-style client and store the codes
# as floats (failed classifications become NaN).
df['openai_model_code'] = (
    df['text']
    .progress_apply(classify_claim_openai)
    .astype(float)
)

In [None]:
# Gwet's AC1 for every pairing of the human codes and the three models
ac1_comparisons = [
    ('human_human_ac1', 'original_code', 'replicated_code'),
    ('human_model_ac1', 'original_code', 'model_code'),
    ('human_gemma_ac1', 'original_code', 'new_model_code'),
    ('human_openai_ac1', 'original_code', 'openai_model_code'),
    ('model_gemma_ac1', 'model_code', 'new_model_code'),
    ('model_openai_ac1', 'model_code', 'openai_model_code'),
    ('gemma_openai_ac1', 'new_model_code', 'openai_model_code'),
]

results = {
    metric: gwet_ac1(df[left], df[right])
    for metric, left, right in ac1_comparisons
}

print("\nAgreement Metrics (Gwet's AC1):")
for metric, score in results.items():
    print(f"{metric}: {score:.3f}")

In [None]:
# Confusion matrices between all model pairs.
# Each tuple is (row column, column column, title shown above the matrix).
model_pairs = [
   ('model_code', 'new_model_code', 'Original-Gemma'),
   ('model_code', 'openai_model_code', 'Original-OpenAI'),
   ('new_model_code', 'openai_model_code', 'Gemma-OpenAI')
]

for col1, col2, name in model_pairs:
   row_codes = df[col1].astype(str)
   col_codes = df[col2].astype(str)

   # Pass the label order explicitly and reuse it as the row/column index,
   # so each cell can be traced back to a category code rather than a position.
   pair_labels = sorted(set(row_codes) | set(col_codes))
   conf = confusion_matrix(row_codes, col_codes, labels=pair_labels)
   conf_df = pd.DataFrame(
       conf,
       index=pd.Index(pair_labels, name=col1),
       columns=pd.Index(pair_labels, name=col2)
   )
   print(f"\nConfusion Matrix {name}:")
   print(conf_df)