In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from google.colab import userdata
from huggingface_hub import login
from google.colab import drive
import gc

In [2]:
# Read the Hugging Face access token from the Colab secret store (never hardcode it)
# and authenticate this session with the Hub.
# NOTE(review): presumably required to download the Llama checkpoint loaded below - confirm.
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)
# Mount Google Drive so files under /content/drive are readable from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Hub identifier shared by the tokenizer and the causal LM below.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# Tokenizer first; batched tokenization with padding=True needs a pad token,
# so fall back to the end-of-sequence token when the checkpoint defines none.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Causal LM in float16, with device placement delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]



In [4]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model.
# NOTE(review): its use is not visible in this part of the notebook; presumably it feeds
# the cosine_similarity import above - confirm against later cells.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Read the training CSV from the mounted Drive and pull the two columns used
# downstream into plain Python lists: `inputs` (argument text) and `labels`.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/lfud_logic_train.csv")
inputs, labels = (df[column].tolist() for column in ("input", "label"))

# Fallacy classes listed in id order: a name's position in the list is its integer id.
label2id = {
    name: idx
    for idx, name in enumerate([
        'faulty generalization',
        'false causality',
        'circular reasoning',
        'ad populum',
        'false dilemma',
        'fallacy of relevance',
        'ad hominem',
        'appeal to emotion',
        'fallacy of extension',
        'fallacy of credibility',
        'intentional fallacy',
        'deductive fallacy',
    ])
}
# Reverse lookup: integer id -> fallacy name.
id2label = dict(zip(label2id.values(), label2id.keys()))

In [6]:
def generate_batch_responses(prompts, temperature=0.7, max_new_tokens=200):
    """Generate one completion per prompt with the module-level ``model``/``tokenizer``.

    Args:
        prompts: A list (or other iterable) of prompt strings. A single string is
            treated as a batch of one. An empty batch returns ``[]`` without
            touching the model.
        temperature: Sampling temperature. Values ``<= 0`` (or ``None``) switch to
            greedy decoding, because sampling needs a strictly positive temperature.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        A list of decoded completions (prompt tokens stripped, special tokens
        removed), in the same order as ``prompts``.
    """
    if isinstance(prompts, str):
        prompts = [prompts]
    prompts = list(prompts)
    if not prompts:
        return []

    # Decoder-only models continue from the LAST position of every row, so shorter
    # prompts in a batch must be padded on the left. With right padding the model
    # would be asked to continue from pad tokens. Force left padding for this call
    # only and restore the caller's setting afterwards.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        encoded = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=2048)
    finally:
        tokenizer.padding_side = previous_padding_side
    encoded = {k: v.to(model.device) for k, v in encoded.items()}

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "num_return_sequences": 1,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    else:
        # Greedy decoding; sampling-only arguments are left out on purpose.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**encoded, **generation_kwargs)

    # Every row of the padded batch has the same prompt length, so everything
    # after that offset is newly generated text.
    prompt_length = encoded["input_ids"].shape[1]
    return [
        tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        for sequence in outputs
    ]


In [7]:
def detect_fallacy_prompt(argument):
    """Build the fallacy-classification prompt for a single argument.

    The prompt quotes ``argument``, lists every key of the module-level
    ``label2id`` as the allowed answers, and asks for brief reasoning followed
    by the fallacy type.
    """
    fallacy_choices = ', '.join(label2id.keys())
    # The second and third lines keep their 4-space indent so the prompt text
    # stays exactly as it has always been sent to the model.
    prompt_lines = [
        f'Analyze this argument and identify the logical fallacy: "{argument}"',
        f'    Choose from: {fallacy_choices}',
        '    Provide brief reasoning then state the fallacy type.',
    ]
    return '\n'.join(prompt_lines)

In [8]:
def extract_fallacy(response, labels=None):
    """Return the fallacy label the response settles on, or ``"none"``.

    The prompt asks for reasoning first and the fallacy type last, so when
    several known labels appear in the text the one mentioned last wins. Taking
    the first match in label order instead would make the result depend on the
    order of the label table rather than on the response.

    Args:
        response: Text generated by the model. Anything that is not a string
            yields ``"none"``.
        labels: Optional iterable of candidate label strings. Defaults to the
            keys of the module-level ``label2id``.

    Returns:
        The matching label exactly as it is spelled in ``labels``, or the
        string ``"none"`` when no label occurs in the response.
    """
    if labels is None:
        labels = label2id
    if not isinstance(response, str):
        return "none"

    # Lower-case and collapse runs of whitespace so a label that is wrapped
    # across lines or double-spaced is still recognised.
    text = " ".join(response.lower().split())

    best_label = "none"
    best_rank = (-1, 0)
    for fallacy in labels:
        needle = " ".join(fallacy.lower().split())
        if not needle:
            continue
        position = text.rfind(needle)
        if position < 0:
            continue
        # Later mention wins; on an equal start offset prefer the longer label.
        rank = (position, len(needle))
        if rank > best_rank:
            best_rank = rank
            best_label = fallacy
    return best_label

In [9]:
def calculate_uncertainty(responses):
    """Score disagreement among sampled responses for one argument.

    Each response is mapped to a label with ``extract_fallacy``; the score is
    the number of distinct labels divided by the number of responses. It is
    ``1 / n`` when all ``n`` responses agree and ``1.0`` when every response
    yields a different label.

    Args:
        responses: Iterable of generated response strings.

    Returns:
        A float in ``(0, 1]`` for non-empty input, or ``0.0`` when there are no
        responses (nothing to disagree about, and no division by zero).
    """
    # Materialise once so generators work and the length is known.
    responses = list(responses)
    if not responses:
        return 0.0
    distinct_labels = {extract_fallacy(r) for r in responses}
    return len(distinct_labels) / len(responses)