In [None]:
# Install required packages
!pip install transformers torch datasets scikit-learn xgboost lightgbm shap textblob nltk pandas numpy scipy joblib -q

In [None]:
# Mount Google Drive
# (google.colab is imported here, so this cell needs a Colab runtime.)
from google.colab import drive
drive.mount('/content/drive')

# Update the path to your file in Google Drive
# TRAIN_PATH is the CSV that load_and_prepare_data() reads; it must contain
# at least the "generation" and "model" columns.
TRAIN_PATH = '/content/drive/MyDrive/train_none.csv'

Mounted at /content/drive


In [None]:
import os
import re
import csv
import pickle
import joblib
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, confusion_matrix,
    classification_report, cohen_kappa_score
)
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack, save_npz, load_npz
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LogisticRegression

import shap
from xgboost import XGBClassifier
import lightgbm as lgb
from textblob import TextBlob

import nltk
# Fetch the NLTK stopword corpus that extract_preposition_and_stopword_features
# reads through nltk.corpus.stopwords.
try:
    nltk.download('stopwords', quiet=True)
except AttributeError:
    # Fallback if NLTK has issues
    # (runs the NLTK downloader in a separate Python process instead)
    !python -m nltk.downloader stopwords

# Transformers for BERT
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
from datasets import Dataset

# Directory for cached artefacts (TF-IDF vectorizer/matrices, label encoder, fitted models).
MODEL_DIR = "saved_models"
# CSV cache of the hand-crafted features written by extract_and_save_features().
FEATURES_CSV = "extracted_features.csv"

# NOTE(review): MAX_TRAIN and TOP_K_MODELS are not referenced in the cells
# visible here - confirm they are used further down or remove them.
MAX_TRAIN = 20000
TOP_K_MODELS = 10
# Values of the "model" column to keep; load_and_prepare_data() skips the filter when this is None.
MODELS = ["cohere-chat", "gpt4", "mistral-chat", "mpt-chat", "llama-chat"]

# Seed passed to train_test_split and to the classifiers trained below.
RANDOM_STATE = 5
# NOTE(review): presumably toggles a SHAP analysis later in the notebook - not used in the visible cells.
RUN_SHAP = False

# BERT Configuration
# NOTE(review): these values are not consumed in the visible cells; presumably
# they configure a BERT fine-tuning section further down - confirm.
BERT_MODEL_NAME = "bert-base-uncased"
BERT_MAX_LENGTH = 512
BERT_BATCH_SIZE = 16
BERT_EPOCHS = 3
BERT_LEARNING_RATE = 2e-5

# Create model directory
# exist_ok=True makes this cell safe to re-run.
os.makedirs(MODEL_DIR, exist_ok=True)

print("\nSetup complete!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Setup complete!
PyTorch version: 2.9.0+cu126
CUDA available: True


## Feature Extraction Functions

In [None]:
# Feature word sets
# Lookup sets used by the feature extractors below. Membership is tested
# against lower-cased tokens produced by the regex \b\w+\b.

# Function words counted by extract_function_word_features.
FUNCTION_WORDS = {
    "the","and","to","of","in","that","is","it","for","on","with",
    "as","was","at","by","an","be","this","from","or","are"
}

# Words counted as prepositions by extract_preposition_and_stopword_features.
PREPOSITIONS = {
    "about","above","across","after","against","along","among","around",
    "at","before","behind","below","beneath","beside","besides","between",
    "beyond","but","by","concerning","considering","despite","down","during",
    "except","for","from","in","inside","into","like","near","of","off",
    "on","onto","outside","over","past","regarding","round","since","through",
    "throughout","till","to","toward","towards","under","underneath","until",
    "up","upon","with","within","without"
}

# Keyword lists behind the left_word_ratio / right_word_ratio features
# (extract_political_leaning_features).
LEFT_WORDS = {
    "climate","equity","union","unions","welfare","redistribution","progressive",
    "socialism","socialist","labor","labour","feminism","feminist","diversity",
    "inclusion","green","regulation","regulations","taxing","taxes","public",
    "universal","healthcare","medicare","woke"
}

# NOTE(review): the hyphenated entries ("pro-life", "tax-cut",
# "small-government", "regulation-free", "family-values") can never match,
# because the \b\w+\b tokenisation splits words at the hyphen.
RIGHT_WORDS = {
    "patriot","patriotism","conservative","conservatism","freedom","liberty",
    "gun","guns","firearm","border","borders","immigration","immigrant",
    "military","defense","prolife","pro-life","nationalism","nationalist",
    "tax-cut","taxcuts","small-government","regulation-free","religious",
    "traditional","family-values"
}

# Spelling variants counted by extract_english_type_features.
# NOTE(review): "defense" is also in RIGHT_WORDS, so one occurrence raises
# both us_spelling_ratio and right_word_ratio.
US_SPELLINGS = {
    "color","colors","favorite","center","organize","organizes","organized",
    "analyze","analyzes","analyzed","defense","meter","liter","theater"
}

UK_SPELLINGS = {
    "colour","colours","favourite","centre","organise","organises","organised",
    "analyse","analyses","analysed","defence","metre","litre","theatre"
}

def count_syllables(word: str) -> int:
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    The word is lower-cased and stripped of everything except ``a``-``z``.
    Each maximal run of the letters ``aeiouy`` counts as one syllable, and a
    trailing ``e`` is treated as silent when more than one run was found.

    Args:
        word: A single token; may contain digits or punctuation.

    Returns:
        0 if no ASCII letters remain after stripping, otherwise at least 1.
    """
    word = re.sub(r'[^a-z]', '', word.lower())
    if not word:
        return 0
    syllables = len(re.findall(r'[aeiouy]+', word))
    # Silent final "e" ("make" -> 1). This also undercounts words such as
    # "table", which is a known limitation of the heuristic.
    if word.endswith("e") and syllables > 1:
        syllables -= 1
    # Words without any vowel group (e.g. "hmm") still count as one syllable.
    return max(syllables, 1)

def extract_lexical_features(text):
    """Compute surface-level lexical statistics for one document.

    Tokens are lower-cased ``\\w+`` runs; sentences are the non-blank pieces
    obtained by splitting on runs of ``.``, ``!`` and ``?``. All ``*_ratio``
    features that describe characters are divided by the character count
    (clamped to at least 1 so empty text yields 0.0).

    Args:
        text: The raw document string.

    Returns:
        A dict with 21 entries; the three counts are ints, everything else
        is a float.
    """
    tokens = re.findall(r"\b\w+\b", text.lower())
    sentence_chunks = [chunk for chunk in re.split(r"[.!?]+", text) if chunk.strip()]

    n_chars = len(text)
    n_tokens = len(tokens)
    n_sentences = max(len(sentence_chunks), 1)
    char_denominator = max(n_chars, 1)

    # A single zero keeps the numpy reductions defined for token-free text.
    token_lengths = [len(tok) for tok in tokens] if tokens else [0]
    token_counts = Counter(tokens)
    n_types = len(set(tokens))
    n_hapax = sum(1 for freq in token_counts.values() if freq == 1)
    n_punct = len(re.findall(r"[.,;:!?]", text))

    # Sentence lengths are measured in whitespace-separated pieces.
    pieces_per_sentence = [len(chunk.split()) for chunk in sentence_chunks]
    longest_sentence = max(pieces_per_sentence) if pieces_per_sentence else 0
    shortest_sentence = min(pieces_per_sentence) if pieces_per_sentence else 0

    def per_char(count):
        # Share of the document's characters represented by ``count``.
        return float(count / char_denominator)

    features = {
        "num_chars": n_chars,
        "num_words": n_tokens,
        "num_sentences": n_sentences,
    }
    features["avg_word_len"] = float(np.mean(token_lengths))
    features["std_word_len"] = float(np.std(token_lengths))
    features["min_word_len"] = float(np.min(token_lengths))
    features["max_word_len"] = float(np.max(token_lengths))
    features["avg_sentence_len"] = float(n_tokens / n_sentences)
    features["max_sentence_len"] = float(longest_sentence)
    features["min_sentence_len"] = float(shortest_sentence)
    features["vocab_size"] = float(n_types)
    features["type_token_ratio"] = float(n_types / n_tokens) if n_tokens else 0.0
    features["hapax_ratio"] = float(n_hapax / n_tokens) if n_tokens else 0.0
    features["uppercase_ratio"] = per_char(sum(1 for ch in text if ch.isupper()))
    features["punct_ratio"] = per_char(n_punct)
    features["comma_ratio"] = per_char(text.count(","))
    features["period_ratio"] = per_char(text.count("."))
    features["exclamation_ratio"] = per_char(text.count("!"))
    features["question_ratio"] = per_char(text.count("?"))
    features["digit_ratio"] = per_char(sum(1 for ch in text if ch.isdigit()))
    features["whitespace_ratio"] = per_char(text.count(" "))
    return features

def extract_function_word_features(text):
    """Measure how often the entries of ``FUNCTION_WORDS`` occur in ``text``.

    Args:
        text: The raw document string.

    Returns:
        A dict with the total number of function-word occurrences and that
        total divided by the token count (the divisor is 1 for token-free text).
    """
    tokens = re.findall(r"\b\w+\b", text.lower())
    denominator = len(tokens) or 1
    tallies = Counter(tokens)
    hits = sum(freq for token, freq in tallies.items() if token in FUNCTION_WORDS)
    return {
        "function_word_ratio": float(hits / denominator),
        "function_word_count": float(hits)
    }

def extract_structure_features(text):
    """Describe the line layout of ``text``.

    Args:
        text: The raw document string.

    Returns:
        A dict with the number of ``"\\n"``-separated lines and their mean
        length in characters, both as floats.
    """
    line_lengths = [len(piece) for piece in text.split("\n")]
    mean_length = np.mean(line_lengths) if line_lengths else 0.0
    return {
        "num_lines": float(len(line_lengths)),
        "avg_line_len": float(mean_length)
    }

def extract_statistical_features(text):
    """Compute the variance and skewness of token lengths in ``text``.

    Args:
        text: The raw document string.

    Returns:
        A dict with the population variance of the token lengths and their
        skewness (third central moment divided by variance ** 1.5). Both are
        0.0 for token-free text; the skew is 0.0 when the variance is zero.
    """
    token_lengths = [len(tok) for tok in re.findall(r"\b\w+\b", text.lower())]
    if not token_lengths:
        return {"word_len_variance": 0.0, "word_len_skew": 0.0}

    variance = float(np.var(token_lengths))
    if variance > 0:
        centre = np.mean(token_lengths)
        third_moment = np.mean([(length - centre)**3 for length in token_lengths])
        skew = float(third_moment / (variance**1.5))
    else:
        # All tokens have the same length: skewness is undefined, report 0.
        skew = 0.0

    return {
        "word_len_variance": variance,
        "word_len_skew": skew
    }

# Cache for the English stopword set; filled on first successful load so the
# corpus is not re-read and re-converted to a set for every document.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them at most once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        try:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        except (LookupError, OSError):
            # Corpus missing or unreadable: keep the best-effort behaviour
            # (stopword count of 0) but do not cache the failure, so a later
            # nltk.download('stopwords') is picked up.
            return frozenset()
    return _ENGLISH_STOPWORDS


def extract_preposition_and_stopword_features(words):
    """Compute the share of prepositions and English stopwords among ``words``.

    Args:
        words: Sequence of lower-cased tokens.

    Returns:
        A dict with ``preposition_ratio`` (tokens found in ``PREPOSITIONS``)
        and ``stopword_ratio`` (tokens found in NLTK's English stopword list),
        each divided by the token count (the divisor is 1 for an empty
        sequence). ``stopword_ratio`` is 0.0 when the stopword corpus cannot
        be loaded.
    """
    num_words = len(words) if words else 1
    stop_words = _english_stopwords()
    prep_count = sum(1 for w in words if w in PREPOSITIONS)
    stop_count = sum(1 for w in words if w in stop_words)
    return {
        "preposition_ratio": float(prep_count / num_words),
        "stopword_ratio": float(stop_count / num_words)
    }

def extract_sentiment_features(text):
    """Return TextBlob's polarity and subjectivity scores for ``text``.

    Args:
        text: The raw document string.

    Returns:
        A dict with ``sentiment_polarity`` and ``sentiment_subjectivity`` as
        floats. Both fall back to 0.0 if TextBlob raises for this text.
    """
    try:
        sentiment = TextBlob(text).sentiment
        polarity = sentiment.polarity
        subjectivity = sentiment.subjectivity
    except Exception:
        # Best-effort feature: a failing document gets neutral scores.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop a long extraction run.
        polarity = 0.0
        subjectivity = 0.0
    return {
        "sentiment_polarity": float(polarity),
        "sentiment_subjectivity": float(subjectivity)
    }

def extract_political_leaning_features(words):
    """Compute the share of tokens found in ``LEFT_WORDS`` and ``RIGHT_WORDS``.

    Args:
        words: Sequence of lower-cased tokens.

    Returns:
        A dict with ``left_word_ratio`` and ``right_word_ratio``; the divisor
        is the token count, or 1 for an empty sequence.
    """
    left_hits = 0
    right_hits = 0
    for token in words:
        if token in LEFT_WORDS:
            left_hits += 1
        if token in RIGHT_WORDS:
            right_hits += 1

    denominator = len(words) if words else 1
    return {
        "left_word_ratio": float(left_hits / denominator),
        "right_word_ratio": float(right_hits / denominator)
    }

def extract_word_distribution_features(words):
    """Compute the Shannon entropy (in bits) of the token frequency distribution.

    Args:
        words: Sequence of tokens.

    Returns:
        A dict with the single key ``word_entropy``; 0.0 for an empty sequence.
    """
    if not words:
        return {"word_entropy": 0.0}

    frequencies = Counter(words)
    n_tokens = sum(frequencies.values())

    weighted_log_sum = 0
    for freq in frequencies.values():
        share = freq / n_tokens
        if share > 0:
            weighted_log_sum = weighted_log_sum + share * np.log2(share)

    return {"word_entropy": float(-weighted_log_sum)}

def extract_english_type_features(words):
    """Compute the share of tokens found in ``US_SPELLINGS`` and ``UK_SPELLINGS``.

    Args:
        words: Sequence of lower-cased tokens.

    Returns:
        A dict with ``us_spelling_ratio`` and ``uk_spelling_ratio``; the
        divisor is the token count, or 1 for an empty sequence.
    """
    american_hits = 0
    british_hits = 0
    for token in words:
        if token in US_SPELLINGS:
            american_hits += 1
        if token in UK_SPELLINGS:
            british_hits += 1

    denominator = len(words) if words else 1
    return {
        "us_spelling_ratio": float(american_hits / denominator),
        "uk_spelling_ratio": float(british_hits / denominator)
    }

def extract_agreement_score_features(text):
    """Count occurrences of a fixed list of agreement/hedging phrases.

    Matching is a case-insensitive, non-overlapping substring count, so a
    phrase embedded in a longer word is counted too.

    Args:
        text: The raw document string.

    Returns:
        A dict with the single key ``agreement_phrase_count`` (float).
    """
    haystack = text.lower()
    total = 0
    for phrase in (
        "i agree", "i disagree", "absolutely", "definitely",
        "certainly", "perhaps", "maybe", "probably",
    ):
        total += haystack.count(phrase)
    return {"agreement_phrase_count": float(total)}

def extract_repeating_word_features(words):
    """Compute how often a token is immediately followed by an identical token.

    Args:
        words: Sequence of tokens.

    Returns:
        A dict with the single key ``repeated_word_ratio``: the number of
        adjacent equal pairs divided by the token count, or 0.0 when there
        are fewer than two tokens.
    """
    n_tokens = len(words)
    if n_tokens < 2:
        return {"repeated_word_ratio": 0.0}

    adjacent_equal = 0
    for position in range(1, n_tokens):
        if words[position - 1] == words[position]:
            adjacent_equal += 1
    return {"repeated_word_ratio": float(adjacent_equal / n_tokens)}

def extract_complexity_features(text):
    """Compute Flesch readability scores for ``text``.

    Tokens are lower-cased ``\\w+`` runs, sentences are the non-blank pieces
    obtained by splitting on runs of ``.``, ``!`` and ``?``, and syllables
    come from ``count_syllables``.

    Args:
        text: The raw document string.

    Returns:
        A dict with ``flesch_reading_ease``, ``flesch_kincaid_grade`` and
        ``avg_syllables_per_word``; all three are 0.0 when the text has no
        tokens or no sentences.
    """
    tokens = re.findall(r"\b\w+\b", text.lower())
    sentence_chunks = [chunk for chunk in re.split(r"[.!?]+", text) if chunk.strip()]

    if not (tokens and sentence_chunks):
        return dict.fromkeys(
            ("flesch_reading_ease", "flesch_kincaid_grade", "avg_syllables_per_word"),
            0.0,
        )

    syllables_per_word = sum(map(count_syllables, tokens)) / len(tokens)
    words_per_sentence = len(tokens) / len(sentence_chunks)

    reading_ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
    grade_level = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

    return {
        "flesch_reading_ease": float(reading_ease),
        "flesch_kincaid_grade": float(grade_level),
        "avg_syllables_per_word": float(syllables_per_word)
    }

def extract_all_features(text):
    """Run every feature extractor on ``text`` and merge the results.

    Args:
        text: The raw document string.

    Returns:
        One flat dict of feature name -> value. Extractors run in a fixed
        order, which determines the key order of the returned dict.
    """
    tokens = re.findall(r"\b\w+\b", text.lower())

    # (extractor, input) pairs; some extractors take the raw text, others
    # the pre-tokenised word list.
    pipeline = (
        (extract_lexical_features, text),
        (extract_function_word_features, text),
        (extract_structure_features, text),
        (extract_statistical_features, text),
        (extract_preposition_and_stopword_features, tokens),
        (extract_sentiment_features, text),
        (extract_political_leaning_features, tokens),
        (extract_word_distribution_features, tokens),
        (extract_english_type_features, tokens),
        (extract_agreement_score_features, text),
        (extract_repeating_word_features, tokens),
        (extract_complexity_features, text),
    )

    combined = {}
    for extractor, argument in pipeline:
        combined.update(extractor(argument))
    return combined

def build_feature_matrix(texts):
    """Extract the hand-crafted features for every document in ``texts``.

    Args:
        texts: Sized iterable of document strings.

    Returns:
        A DataFrame with one row per document and one column per feature;
        missing values are replaced with 0.0.
    """
    print(f"Extracting features from {len(texts)} texts...")
    rows = []
    for document in texts:
        rows.append(extract_all_features(document))
    return pd.DataFrame(rows).fillna(0.0)

print("Feature extraction functions defined!")

Feature extraction functions defined!


## Data Loading and Preparation

In [None]:
def load_and_prepare_data():
    """Load the RAID dataset from ``TRAIN_PATH`` and split it into train/test sets.

    The CSV is split 80/20, stratified on the ``model`` column and seeded with
    ``RANDOM_STATE``. The split is made on the full file; only afterwards are
    both parts restricted to the generators listed in ``MODELS`` (no filtering
    when ``MODELS`` is None).

    Returns:
        A tuple ``(train_df, test_df, train_texts, test_texts, train_labels,
        test_labels)``. The text Series hold the ``generation`` column cast to
        ``str``; the label Series hold the ``model`` column.

    Raises:
        ValueError: If the file lacks a ``generation`` or ``model`` column.
    """
    print(f"Loading dataset from: {TRAIN_PATH}")

    # Try reading with different parameters
    try:
        full_df = pd.read_csv(TRAIN_PATH)
    except Exception as e:
        print(f"Error reading with default settings: {e}")
        print("Trying alternative read methods...")
        try:
            # Retry as comma-separated with an explicit UTF-8 encoding
            full_df = pd.read_csv(TRAIN_PATH, sep=',', encoding='utf-8')
        except Exception:
            # Last resort: tab-separated. ``Exception`` rather than a bare
            # except, so interrupting a slow read does not start another one.
            full_df = pd.read_csv(TRAIN_PATH, sep='\t', encoding='utf-8')

    print(f"Successfully loaded {len(full_df)} rows")
    print(f"Columns: {list(full_df.columns)}")

    # A wrong separator typically produces a single wide column, which this
    # check turns into an explicit error.
    if "generation" not in full_df.columns or "model" not in full_df.columns:
        raise ValueError("Expected columns: generation, model. Check the CSV file.")

    print("\nDATASET SIZE INFO")
    print(f"Full dataset size: {len(full_df)}")

    # Split into train/test
    train_df, test_df = train_test_split(
        full_df,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=full_df["model"]
    )

    print(f"Train size BEFORE filtering: {len(train_df)}")
    print(f"Test size BEFORE filtering: {len(test_df)}")

    # Filter to specific models if specified
    if MODELS is not None:
        train_df = train_df[train_df["model"].isin(MODELS)]
        test_df = test_df[test_df["model"].isin(MODELS)]
        print(f"\nUsing models: {MODELS}")

    print(f"Train size AFTER filtering: {len(train_df)}")
    print(f"Test size AFTER filtering: {len(test_df)}")

    # astype(str) also turns missing generations into the literal "nan".
    train_texts = train_df["generation"].astype(str)
    train_labels = train_df["model"]

    test_texts = test_df["generation"].astype(str)
    test_labels = test_df["model"]

    print("\nTRAIN label distribution:")
    print(train_labels.value_counts())

    print("\nTEST label distribution:")
    print(test_labels.value_counts())

    return train_df, test_df, train_texts, test_texts, train_labels, test_labels

# Load the data
# (reads TRAIN_PATH, makes the 80/20 stratified split and applies the MODELS filter)
train_df, test_df, train_texts, test_texts, train_labels, test_labels = load_and_prepare_data()

Loading dataset from: /content/drive/MyDrive/train_none.csv
Successfully loaded 467985 rows
Columns: ['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation']

DATASET SIZE INFO
Full dataset size: 467985
Train size BEFORE filtering: 374388
Test size BEFORE filtering: 93597

Using models: ['cohere-chat', 'gpt4', 'mistral-chat', 'mpt-chat', 'llama-chat']
Train size AFTER filtering: 171149
Test size AFTER filtering: 42787

TRAIN label distribution:
model
mpt-chat        42787
mistral-chat    42787
llama-chat      42787
gpt4            21394
cohere-chat     21394
Name: count, dtype: int64

TEST label distribution:
model
mistral-chat    10697
mpt-chat        10697
llama-chat      10697
cohere-chat      5348
gpt4             5348
Name: count, dtype: int64


## Feature Extraction and CSV Export

In [None]:
def _split_cached_features(features_df, meta_cols, train_labels, test_labels):
    """Split a cached feature table into (train, test) frames, or return None if stale.

    The cache counts as stale when the bookkeeping columns are missing, when
    it contains rows that are neither 'train' nor 'test', or when the stored
    labels of either split differ (in length, order or value) from the labels
    of the current split.
    """
    if not set(meta_cols).issubset(features_df.columns):
        return None

    parts = []
    matched_rows = 0
    for split_name, labels in (('train', train_labels), ('test', test_labels)):
        rows = features_df[features_df['split'] == split_name]
        expected = [str(label) for label in labels]
        # Compared as strings because the CSV round-trip may change the dtype.
        if rows['label'].astype(str).tolist() != expected:
            return None
        matched_rows += len(rows)
        parts.append(rows.drop(columns=meta_cols).reset_index(drop=True))

    if matched_rows != len(features_df):
        return None
    return parts[0], parts[1]


def extract_and_save_features(train_texts, test_texts, train_labels, test_labels):
    """Return the hand-crafted feature matrices, using ``FEATURES_CSV`` as a cache.

    If the cache file exists and its ``split``/``label`` columns agree with the
    current train/test labels, the features are loaded from it. Otherwise they
    are extracted with ``build_feature_matrix`` and the cache is (re)written
    with two extra bookkeeping columns, ``label`` and ``split``.

    The label comparison is a best-effort staleness check: it catches a cache
    built from a different split or model selection, but not a cache whose
    texts changed while every label stayed the same.

    Args:
        train_texts: Training documents.
        test_texts: Test documents.
        train_labels: Labels aligned with ``train_texts``.
        test_labels: Labels aligned with ``test_texts``.

    Returns:
        ``(X_train_num, X_test_num)``: DataFrames of numeric features without
        the bookkeeping columns, each with a fresh 0..n-1 index.
    """
    features_path = FEATURES_CSV
    meta_cols = ['label', 'split']

    X_train_num = X_test_num = None

    if os.path.exists(features_path):
        print(f"Loading existing features from {features_path}")
        features_df = pd.read_csv(features_path)

        # Split back into train/test using the stored 'split' column
        cached = _split_cached_features(features_df, meta_cols, train_labels, test_labels)
        if cached is None:
            print(f"Cached features in {features_path} do not match the current data; re-extracting")
        else:
            X_train_num, X_test_num = cached

    if X_train_num is None:
        print("Extracting features...")
        X_train_num = build_feature_matrix(train_texts)
        X_test_num = build_feature_matrix(test_texts)

        # Combine and save. assign() returns copies, so the feature frames
        # themselves never carry the bookkeeping columns.
        print(f"Saving features to {features_path}")
        features_df = pd.concat(
            [
                X_train_num.assign(label=np.asarray(train_labels), split='train'),
                X_test_num.assign(label=np.asarray(test_labels), split='test'),
            ],
            axis=0,
        )
        features_df.to_csv(features_path, index=False)

    print(f"Feature count: {X_train_num.shape[1]}")
    print(f"Train features shape: {X_train_num.shape}")
    print(f"Test features shape: {X_test_num.shape}")

    return X_train_num, X_test_num

# Extract features
# (loaded from FEATURES_CSV when that file already exists)
X_train_num, X_test_num = extract_and_save_features(
    train_texts, test_texts, train_labels, test_labels
)

Extracting features...
Extracting features from 171149 texts...
Extracting features from 42787 texts...
Saving features to extracted_features.csv
Feature count: 41
Train features shape: (171149, 41)
Test features shape: (42787, 41)


## TF-IDF Feature Combination

In [None]:
def build_tfidf_features(train_texts, test_texts, X_train_num, X_test_num):
    """Build TF-IDF features and append the numeric features to them.

    The fitted vectorizer and both TF-IDF matrices are cached under
    ``MODEL_DIR``. The cache is used only when all three files exist and the
    cached matrices have as many rows as the numeric feature frames;
    otherwise the vectorizer is refitted on ``train_texts`` and the cache is
    overwritten.

    Args:
        train_texts: Training documents (the vectorizer is fitted on these only).
        test_texts: Test documents.
        X_train_num: Numeric features aligned row-for-row with ``train_texts``.
        X_test_num: Numeric features aligned row-for-row with ``test_texts``.

    Returns:
        ``(X_train_combined, X_test_combined, tfidf)``: two CSR matrices with
        the TF-IDF columns followed by the numeric columns, and the fitted
        ``TfidfVectorizer``.
    """
    tfidf_path = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
    train_tfidf_path = os.path.join(MODEL_DIR, "X_train_tfidf.npz")
    test_tfidf_path = os.path.join(MODEL_DIR, "X_test_tfidf.npz")
    cache_paths = (tfidf_path, train_tfidf_path, test_tfidf_path)

    tfidf = X_train_tfidf = X_test_tfidf = None

    if all(os.path.exists(path) for path in cache_paths):
        print("Loading saved TF-IDF features...")
        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load caches written by this notebook.
        tfidf = joblib.load(tfidf_path)
        X_train_tfidf = load_npz(train_tfidf_path)
        X_test_tfidf = load_npz(test_tfidf_path)

        # A cache built from a different split cannot be stacked with the
        # current numeric features, so discard it and rebuild.
        if (X_train_tfidf.shape[0] != X_train_num.shape[0]
                or X_test_tfidf.shape[0] != X_test_num.shape[0]):
            print("Saved TF-IDF features do not match the current data; rebuilding")
            tfidf = None

    if tfidf is None:
        print("Building TF-IDF features...")
        tfidf = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 3),
            min_df=2,
            max_df=0.9
        )
        X_train_tfidf = tfidf.fit_transform(train_texts)
        X_test_tfidf = tfidf.transform(test_texts)

        # Save TF-IDF
        joblib.dump(tfidf, tfidf_path)
        save_npz(train_tfidf_path, X_train_tfidf)
        save_npz(test_tfidf_path, X_test_tfidf)

    # Convert numerical features to sparse
    X_train_num_sp = csr_matrix(X_train_num.values)
    X_test_num_sp = csr_matrix(X_test_num.values)

    # Combine TF-IDF with numerical features
    X_train_combined = hstack([X_train_tfidf, X_train_num_sp], format="csr")
    X_test_combined = hstack([X_test_tfidf, X_test_num_sp], format="csr")

    print(f"TF-IDF train shape: {X_train_tfidf.shape}")
    print(f"Combined train shape: {X_train_combined.shape}")

    return X_train_combined, X_test_combined, tfidf

# Build TF-IDF features
# X_train / X_test are the sparse matrices (TF-IDF columns followed by the
# numeric feature columns) that the classifiers below are trained on.
X_train, X_test, tfidf = build_tfidf_features(
    train_texts, test_texts, X_train_num, X_test_num
)

Building TF-IDF features...
TF-IDF train shape: (171149, 5000)
Combined train shape: (171149, 5041)


## Label Encoding

In [None]:
# Encode labels
# The encoder is fitted on the training labels only; transform() on the test
# labels raises if the test split contains a class unseen during fitting.
le = LabelEncoder()
y_train = le.fit_transform(train_labels)
y_test = le.transform(test_labels)

print("\nClasses:")
print(list(le.classes_))
print(f"Number of classes: {len(le.classes_)}")

# Save label encoder
# (joblib.dump returns the list of written file names, which the notebook
# echoes as this cell's output)
joblib.dump(le, os.path.join(MODEL_DIR, "label_encoder.pkl"))


Classes:
['cohere-chat', 'gpt4', 'llama-chat', 'mistral-chat', 'mpt-chat']
Number of classes: 5


['saved_models/label_encoder.pkl']

## Traditional ML Models

In [None]:
def train_or_load_model(model_name, model_class, model_params, X_train, y_train):
    """Return a fitted model, reusing the copy cached under ``MODEL_DIR`` if present.

    Args:
        model_name: Stem of the cache file, ``<MODEL_DIR>/<model_name>.pkl``.
        model_class: Estimator class, instantiated as ``model_class(**model_params)``.
        model_params: Keyword arguments for the estimator constructor.
        X_train: Training feature matrix passed to ``fit``.
        y_train: Training targets passed to ``fit``.

    Returns:
        The estimator loaded from disk when the cache file exists; otherwise
        a newly fitted estimator, which is also written to the cache file.
    """
    model_path = os.path.join(MODEL_DIR, f"{model_name}.pkl")

    # Cache hit: the file is returned as stored, without refitting.
    if os.path.exists(model_path):
        print(f"Loading {model_name} from {model_path}")
        return joblib.load(model_path)

    print(f"Training {model_name}...")
    estimator = model_class(**model_params)
    estimator.fit(X_train, y_train)
    joblib.dump(estimator, model_path)
    print(f"Saved {model_name} to {model_path}")
    return estimator

# Dictionary to store all models and predictions
models = {}
predictions = {}
probabilities = {}

# One entry per classifier:
# (results key, banner text, cache file stem, estimator class, constructor kwargs).
# Models are trained and reported in list order.
CLASSIFIER_SPECS = [
    (
        'RandomForest', 'RANDOM FOREST', 'random_forest',
        RandomForestClassifier,
        {'n_estimators': 200, 'random_state': RANDOM_STATE, 'n_jobs': -1},
    ),
    (
        'XGBoost', 'XGBOOST', 'xgboost',
        XGBClassifier,
        {
            'n_estimators': 300,
            'max_depth': 6,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'objective': 'multi:softmax',
            'num_class': len(le.classes_),
            'eval_metric': 'mlogloss',
            'random_state': RANDOM_STATE
        },
    ),
    (
        'LightGBM', 'LIGHTGBM', 'lightgbm',
        lgb.LGBMClassifier,
        {
            'n_estimators': 300,
            'learning_rate': 0.1,
            'num_leaves': 31,
            'random_state': RANDOM_STATE,
            'verbose': -1
        },
    ),
    (
        'LogisticRegression', 'LOGISTIC REGRESSION', 'logistic_regression',
        LogisticRegression,
        {
            'max_iter': 1000,
            'random_state': RANDOM_STATE,
            'n_jobs': -1
        },
    ),
]


def fit_and_evaluate(key, banner, file_stem, model_class, model_params):
    """Train (or load) one classifier, score it on the test set and record the results.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and stores the fitted model, its test predictions and its class
    probabilities under ``key`` in ``models``, ``predictions`` and
    ``probabilities``.

    Args:
        key: Dictionary key under which the results are stored.
        banner: Heading printed above the metrics.
        file_stem: Cache file stem passed to ``train_or_load_model``.
        model_class: Estimator class to instantiate.
        model_params: Keyword arguments for the estimator constructor.
    """
    print("\n" + "="*50)
    print(banner)
    print("="*50)

    models[key] = train_or_load_model(
        file_stem, model_class, model_params, X_train, y_train
    )
    predictions[key] = models[key].predict(X_test)
    probabilities[key] = models[key].predict_proba(X_test)

    print(f"Accuracy: {accuracy_score(y_test, predictions[key]):.4f}")
    print(f"Macro F1: {f1_score(y_test, predictions[key], average='macro'):.4f}")


for spec in CLASSIFIER_SPECS:
    fit_and_evaluate(*spec)


RANDOM FOREST
Training random_forest...
Saved random_forest to saved_models/random_forest.pkl
Accuracy: 0.7786
Macro F1: 0.7768

XGBOOST
Training xgboost...
Saved xgboost to saved_models/xgboost.pkl
Accuracy: 0.8278
Macro F1: 0.8300

LIGHTGBM
Training lightgbm...
Saved lightgbm to saved_models/lightgbm.pkl




Accuracy: 0.8618
Macro F1: 0.8656

LOGISTIC REGRESSION
Training logistic_regression...
Saved logistic_regression to saved_models/logistic_regression.pkl
Accuracy: 0.4920
Macro F1: 0.4480
