In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datasets import Dataset
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
import torch
from datasets import Dataset



In [None]:
from notebooks.train_model import train_roberta_for_candidate
from notebooks.EDA import cleaning_and_processing_testdata, cleaning_and_processing, clean_tweet

In [None]:
# Train for Obama: fine-tune via train_roberta_for_candidate (imported from notebooks.train_model).
# NOTE(review): presumably returns (trainer, evaluation results) and saves the model under
# ./models/roberta_obama_cardiff, which load_candidate_roberta_model reads below — confirm in train_model.
trainer_obama, obama_results = train_roberta_for_candidate("Obama")

In [None]:
# Train for Romney: fine-tune via train_roberta_for_candidate (imported from notebooks.train_model).
# NOTE(review): presumably returns (trainer, evaluation results) and saves the model under
# ./models/roberta_romney_cardiff, which load_candidate_roberta_model reads below — confirm in train_model.
trainer_romney, romney_results = train_roberta_for_candidate("Romney")

In [None]:
# id_to_label is already defined globally as {0: -1, 1: 0, 2: 1}
# clean_tweet and cleaning_and_processing functions are also globally defined.

def prepare_candidate_test_df(excel_path, candidate_name):
    """
    Load one candidate's sheet from an Excel file and prepare it for prediction.

    Parameters
    ----------
    excel_path : str or path-like
        Excel workbook to read.
    candidate_name : str
        Name of the sheet to load (e.g. "Obama" or "Romney").

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet_number``, ``tweet`` (HTML-like tags removed) and
        ``clean_tweet`` (``tweet`` passed through ``clean_tweet``), with a
        fresh 0..n-1 index.

    Notes
    -----
    Rows whose tweet cell is empty are dropped, so the result can be shorter
    than the sheet. ``tweet_number`` is kept so that callers can still map
    rows back to the original sheet; positional numbering will shift whenever
    a row is dropped.
    """

    # FINAL TEST DATA HAS NO HEADER → we provide names manually
    raw = pd.read_excel(
        excel_path,
        sheet_name=candidate_name,
        header=None,
        names=["tweet_number", "tweet"],
    )

    # Drop missing tweets BEFORE casting to str: astype(str) turns a missing
    # cell into a literal string such as "nan", after which dropna() can no
    # longer see it and the placeholder text would be classified as a tweet.
    df = raw.dropna(subset=["tweet"]).reset_index(drop=True)

    # remove any HTML-like tags BEFORE clean_tweet
    df["tweet"] = df["tweet"].astype(str).str.replace(r'<[^>]+>', '', regex=True)

    # use SAME cleaning function as training
    df["clean_tweet"] = df["tweet"].apply(clean_tweet)

    return df

def load_candidate_roberta_model(candidate_name):
    """
    Return the ``(tokenizer, model)`` pair fine-tuned for ``candidate_name``.

    Both objects are loaded from
    ``./models/roberta_<candidate_name lower-cased>_cardiff``, and ``eval()``
    is called on the model before it is handed back.
    """
    checkpoint_dir = "./models/roberta_{}_cardiff".format(candidate_name.lower())

    candidate_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    candidate_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

    # The model is only used for prediction in this notebook.
    candidate_model.eval()

    return candidate_tokenizer, candidate_model

def predict_labels_for_candidate(df, tokenizer, model, batch_size=32, id_to_label=None):
    """
    Classify every row of ``df["clean_tweet"]`` and attach the predicted label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tweet`` and ``clean_tweet`` columns. A ``pred_label``
        column is added to this frame in place, so pass a copy if the
        caller's frame must stay untouched.
    tokenizer : callable
        Called as ``tokenizer(list_of_str, padding=True, truncation=True,
        max_length=96, return_tensors="pt")``; must return a mapping of
        tensors that can be splatted into ``model``.
    model : callable
        Called as ``model(**encoded)``; the result must expose ``.logits``.
    batch_size : int, default 32
        Number of texts per forward pass; must be at least 1.
    id_to_label : dict, optional
        Maps the arg-max class id to the emitted label. Defaults to
        ``{0: -1, 1: 0, 2: 1}`` so the function no longer depends on a
        notebook-level global of the same name being defined beforehand.

    Returns
    -------
    pandas.DataFrame
        The ``tweet`` and ``pred_label`` columns of ``df``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if id_to_label is None:
        id_to_label = {0: -1, 1: 0, 2: 1}

    texts = df["clean_tweet"].astype(str).tolist()
    # Inputs must live on the same device as the model; objects without a
    # ``device`` attribute are fed the tensors exactly as the tokenizer made them.
    device = getattr(model, "device", None)
    all_pred_ids = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            enc = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=96,
                return_tensors="pt"
            )
            if device is not None:
                enc = {key: tensor.to(device) for key, tensor in enc.items()}

            logits = model(**enc).logits
            all_pred_ids.extend(torch.argmax(logits, dim=-1).cpu().tolist())

    # map 0,1,2 → -1,0,1 (or whatever id_to_label specifies)
    df["pred_label"] = [id_to_label[i] for i in all_pred_ids]
    return df[['tweet', 'pred_label']]

def run_candidate_pipeline_on_sample_for_output(candidate_name, sample_path):
    """
    Run the full prediction pipeline for one candidate and save the result.

    Steps: load and clean the candidate's sheet from ``sample_path``, load
    that candidate's fine-tuned model, predict a label per tweet, and write
    the predictions to ``<candidate_name lower-cased>_predictions.txt`` in
    the current working directory.

    The file has the form::

        (setf x *(
        (1 <label>)
        (2 <label>)
        ...
        ) )

    Numbering is positional (1-based row order of the prepared frame), not
    the sheet's ``tweet_number`` column; if ``prepare_candidate_test_df``
    drops rows, later numbers shift accordingly.

    Parameters
    ----------
    candidate_name : str
        Sheet name and model selector, e.g. "Obama" or "Romney".
    sample_path : str or path-like
        Excel workbook containing the candidate's sheet.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``predict_labels_for_candidate`` (``tweet`` and
        ``pred_label`` columns), so callers that assign the result receive
        the predictions instead of ``None``.
    """
    print(f"\n===== Running prediction pipeline for {candidate_name} =====")

    # Prepare test data
    df = prepare_candidate_test_df(sample_path, candidate_name)

    # Load model
    tokenizer, model = load_candidate_roberta_model(candidate_name)

    # Make predictions on a copy: the predictor adds a column to the frame it receives
    df_pred = predict_labels_for_candidate(df.copy(), tokenizer, model)

    # Format and save to .txt file
    output_filename = f"{candidate_name.lower()}_predictions.txt"
    with open(output_filename, "w") as f:
        f.write("(setf x *(\n")
        # Line numbers start from 1
        for line_no, label in enumerate(df_pred["pred_label"], start=1):
            f.write(f"({line_no} {label})\n")
        f.write(") )\n")
    print(f"Saved predictions for {candidate_name} to {output_filename}")

    return df_pred

# Main execution calls
# NOTE(review): obama_path and romney_path are not defined in any cell shown in this
# notebook, so unless they are set elsewhere these calls raise NameError on a fresh kernel.
# The previous note here referred to a single sample_path ('/content/sample-testdata.xlsx');
# define both path variables in a config cell above — TODO confirm the intended file(s).

# Run for Obama
obama_predictions = run_candidate_pipeline_on_sample_for_output("Obama", obama_path)

# Run for Romney
romney_predictions = run_candidate_pipeline_on_sample_for_output("Romney", romney_path)



===== Running prediction pipeline for Obama =====
Saved predictions for Obama to obama_predictions.txt

===== Running prediction pipeline for Romney =====
Saved predictions for Romney to romney_predictions.txt
