In [None]:
!pip install torch transformers pandas datasets peft bitsandbytes huggingface_hub tqdm

In [None]:
import os
import csv
import time
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
from huggingface_hub import login
from peft import PeftModel
from tqdm import tqdm

# Authentication
# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of embedding a credential (or a placeholder string) in the notebook.
# When the variable is unset, token=None lets `login` ask for the token itself.
login(token=os.environ.get("HF_TOKEN"))

# Configuration
batch_size = 8  # number of prompts per DataLoader batch / generate() call
input_csv_path = './election_data.csv'  # input rows to classify
output_csv_path = './mistral_predictions.csv'  # predictions are appended here after every batch

In [None]:
# Initialize model configuration: 4-bit quantized weights, float16 compute
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Load base model and tokenizer
print("Loading model and tokenizer...")
base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token  # Reuse EOS as the padding token so batches can be padded
# Batched generation with a decoder-only model must be left-padded so that the
# new tokens continue directly from each prompt; state it explicitly instead of
# relying on the tokenizer's default.
tokenizer.padding_side = "left"

# Load fine-tuned adapter on top of the quantized base model
print("Loading adapter...")
model = PeftModel.from_pretrained(
    base_model,
    "Dev-ZC/mistral-leaning-model-zaki",
    device_map="auto"
)
# Inference mode; the trailing semicolon keeps the cell from echoing the model repr
model.eval();


In [None]:
# Prompt templates
# Few-shot template used for inference; `{text}` is replaced by the statement to classify.
classification_prompt = """You are a political statement classifier. For each political statement, respond with exactly one label: 'Left-Leaning', 'Right-Leaning', or 'Neutral'.
Statement: "The government must ensure healthcare is a right for all citizens. Go blue vote blue. We support Kamala Harris and we DO NOT like Donald Trump"
Classification: Left-Leaning
Statement: "We need to secure the border and stop illegal immigration. Go red vote red. We support Donald Trump and we DO NOT like Kamala and Biden"
Classification: Right-Leaning
Statement: "Pizza is the best comfort food."
Classification: Neutral
Statement: {text}
Classify the following political statement as exactly one word - either 'Left-Leaning', 'Right-Leaning', or 'Neutral'. Only output that single classification word and nothing else."""

# Definition-only variant (no examples, no `{text}` placeholder).
# The "aligns with ..." clauses were swapped between the two definitions, which
# made each label's description contradict itself; each now names its own side.
classification_prompt_simple = """You are a political expert knowledgeable in U.S. politics. Your task is to classify the following social media video transcripts as leaning towards 'Left-Leaning', 'Right-Leaning', or 'Neutral' based on their content.
A Right-leaning transcript is one that strongly criticizes Left-leaning policies or figures, expresses support for conservative values, or aligns with Right-leaning political narratives.
A Left-leaning transcript is one that strongly criticizes Right-leaning policies or figures, expresses support for progressive values, or aligns with Left-leaning political narratives.
A Neutral transcript is one that does not explicitly align with either party, such as general news reports, discussions on global issues, or non-partisan commentary.
"""

# Create the output file with a header row if it is missing or empty, so the
# DictReader below always finds the 'index' column name on the first line
if not os.path.exists(output_csv_path) or os.path.getsize(output_csv_path) == 0:
    with open(output_csv_path, 'w', newline='') as f:
        csv.writer(f).writerow(['index', 'leaning'])

# Load the full dataset
df = pd.read_csv(input_csv_path)

# Load already processed indices so a restarted run can skip them
processed_indices = set()
unparsable_rows = 0
# newline='' lets the csv module handle line endings inside quoted fields itself
with open(output_csv_path, 'r', newline='') as f:
    for row in csv.DictReader(f):
        index_str = row['index']
        try:
            if 'tensor' in index_str:
                # Indices may have been written as torch scalars, e.g. "tensor(129624)";
                # keep only the number between the parentheses
                index_str = index_str.split('(')[1].split(')')[0]
            processed_indices.add(int(index_str))
        except (ValueError, IndexError, TypeError):
            # One malformed row must not abort the resume; its input row is
            # simply treated as unprocessed and classified again
            unparsable_rows += 1
print(f"Already processed: {len(processed_indices)} rows")
if unparsable_rows:
    print(f"Skipped {unparsable_rows} output rows with an unparsable index")

# ---< Partition selection
# The input is cut into TOTAL_PARTITIONS contiguous slices; this run handles
# the slice numbered CURRENT_PARTITION (0-based).
TOTAL_PARTITIONS = 5
CURRENT_PARTITION = 2  # Change this to 0, 1, 2, 3, or 4 for each instance

# Slice boundaries: every partition gets the same row count, and the last one
# additionally takes whatever the integer division leaves over
total_rows = len(df)
rows_per_partition = total_rows // TOTAL_PARTITIONS
start_idx = rows_per_partition * CURRENT_PARTITION
if CURRENT_PARTITION < TOTAL_PARTITIONS - 1:
    end_idx = start_idx + rows_per_partition
else:
    end_idx = total_rows

# Independent copy of this partition's rows
df_partition = df.iloc[start_idx:end_idx].copy()

# Keep only rows whose index label is not recorded as processed yet;
# reset_index() moves the original label into an "index" column
df_unprocessed = df_partition.loc[~df_partition.index.isin(processed_indices)].reset_index()

print(f"Processing partition {CURRENT_PARTITION+1}/{TOTAL_PARTITIONS}")
print(f"Rows {start_idx} to {end_idx-1}")
print(f"Processing {len(df_unprocessed)} unprocessed rows out of {len(df_partition)} in this partition")
# ---<

# Create dataset
def preprocess_function(examples, template=None):
    """Build one instruction prompt per transcript in a batch.

    Args:
        examples: Batch mapping of column name to list of values. Reads
            "whisper_voice_to_text" (the text to classify) and "index"
            (the row's original index label).
        template: Optional prompt template containing a ``{text}``
            placeholder. Defaults to the module-level ``classification_prompt``.

    Returns:
        dict with "prompt" (list of ``[INST] ... [/INST]`` strings) and
        "original_index" (the incoming "index" values, unchanged).
    """
    if template is None:
        template = classification_prompt
    texts = examples["whisper_voice_to_text"]
    # No literal "<s>" here: the tokenizer call that consumes these prompts uses
    # its default special-token handling and prepends BOS itself, so spelling it
    # out in the text as well would feed the model a duplicated BOS token.
    prompts = [f"[INST] {template.format(text=text)} [/INST]" for text in texts]
    return {"prompt": prompts, "original_index": examples["index"]}

# Convert DataFrame to Dataset and attach the prompt / original_index columns
dataset = Dataset.from_pandas(df_unprocessed)
dataset = dataset.map(preprocess_function, batched=True)
# When every row of the partition is already processed the dataset is empty.
# NOTE(review): `map` appears to return an empty dataset unchanged, i.e. without
# the "prompt"/"original_index" columns, in which case set_format would raise on
# the missing columns. Skipping it for an empty dataset is harmless either way.
if len(dataset) > 0:
    dataset.set_format(type="torch", columns=["prompt", "original_index"])

# Process in batches and save incrementally
def _extract_label(generated_text):
    """Return the first known label found in the text, else its first 20 characters."""
    for label in ("Left-Leaning", "Right-Leaning", "Neutral"):
        if label in generated_text:
            return label
    return generated_text[:20]  # Fallback: keep a short raw snippet for inspection


def classify_and_save_batch(batch):
    """Classify one batch of prompts and append the results to the output CSV.

    Args:
        batch: Mapping with "prompt" (list of prompt strings) and
            "original_index" (one index value per prompt).

    Returns:
        list[str]: One entry per prompt: 'Left-Leaning', 'Right-Leaning',
        'Neutral', or the first 20 characters of the raw completion when none
        of the labels appears in it.

    Side effects:
        Appends one ``index, label`` row per prompt to ``output_csv_path``.
    """
    prompts = batch["prompt"]
    indices = batch["original_index"]

    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=15,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    # generate() returns prompt tokens followed by the new tokens. Slice by token
    # count instead of by characters of the decoded string: decoding drops
    # special tokens (such as a "<s>" written in the prompt), so the decoded
    # prompt is not the same length as the prompt string, and a character offset
    # would cut into the generated label.
    prompt_token_count = inputs.input_ids.shape[1]
    completions = tokenizer.batch_decode(
        outputs[:, prompt_token_count:], skip_special_tokens=True
    )

    classifications = [_extract_label(text.strip()) for text in completions]

    # Save this batch's results to CSV
    with open(output_csv_path, 'a', newline='') as f:
        writer = csv.writer(f)
        for idx, classification in zip(indices, classifications):
            # int() turns a torch scalar into a plain number so the file holds
            # "129624" rather than "tensor(129624)"
            writer.writerow([int(idx), classification])

    return classifications

# Wrap the dataset in a DataLoader that yields fixed-size batches in the
# dataset's own order (no shuffling)
dataloader = torch.utils.data.DataLoader(dataset, shuffle=False, batch_size=batch_size)

# Run inference
print("Running inference with incremental saving...")
start_time = time.time()
all_results = []

for i, batch in enumerate(tqdm(dataloader)):
    # Each call also appends the batch's rows to the output CSV
    batch_results = classify_and_save_batch(batch)
    all_results.extend(batch_results)

    # Print progress update every 5 batches
    if (i+1) % 5 == 0 and all_results:
        elapsed = time.time() - start_time
        print(f"Batch {i+1}/{len(dataloader)}: {len(all_results)} samples processed, {elapsed:.2f}s total ({elapsed/len(all_results):.2f}s/sample)")

# Final report
elapsed = time.time() - start_time
if all_results:
    print(f"\n✅ Done! Total time: {elapsed:.2f} seconds ({elapsed/len(all_results):.2f}s/sample)")
else:
    # Nothing was processed (e.g. the whole partition was already done), so a
    # per-sample rate would divide by zero
    print(f"\n✅ Done! Total time: {elapsed:.2f} seconds (no unprocessed rows in this partition)")
print(f"Results saved to {output_csv_path}")