In [None]:
# !jupyter nbconvert DLM.ipynb --to python

**Imports**
---

In [None]:
import os
import glob
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

**Hparams**
---

In [None]:
# Training hyperparameters
BATCH_SIZE = 32
EPOCHS = 100
MAX_LENGTH = 512 # C_H
MODEL_NAME = "roberta-base" # C_H
# Read as a global by build_detecting_llm_tf: how many of the last encoder
# layers stay trainable.
num_unfrozen_layers = 1

# Seed every RNG the notebook can touch so that the shuffle / train-val split
# and the classification-head initialisation repeat across
# "Restart Kernel -> Run All". (GPU kernels may still introduce small
# non-determinism.)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load the tokenizer that matches MODEL_NAME (the model itself is built later)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Keys selecting which dataset / output sub-directories are used below
PROMPING_MODEL_KEY = "mistral_7b_v03_instruct"
MARKING_MODEL_KEY = "mistral_7b_v03_instruct"
USING_MAX_DATASET = True # C_H

In [None]:
# # Fine-tuned
# MARKING_MODEL_KEY = f"fine_tuned_{MARKING_MODEL_KEY}"

In [None]:
# Experiment tag: "<max sequence length>_<backbone name>"
experiment = f"{MAX_LENGTH}_{MODEL_NAME}" # C_H

# Pick the 10k or the 1k dataset tree; inputs live under Datasets/, results under DLM/.
input_directory, output_directory = (
    (
        f"Datasets/10k/PLM_{PROMPING_MODEL_KEY}/{MARKING_MODEL_KEY}_data",
        f"DLM/10k/PLM_{PROMPING_MODEL_KEY}/{MARKING_MODEL_KEY}/{experiment}",
    )
    if USING_MAX_DATASET
    else (
        f"Datasets/1k/PLM_{PROMPING_MODEL_KEY}/{MARKING_MODEL_KEY}_data",
        f"DLM/1k/PLM_{PROMPING_MODEL_KEY}/{MARKING_MODEL_KEY}/{experiment}",
    )
)

In [None]:
# # # Distillation
# TEACHER_MODEL_KEY = "llama3_8b_instruct"

# if USING_MAX_DATASET:
#     input_directory = f"Datasets/10k/PLM_{PROMPING_MODEL_KEY}/distilled_{TEACHER_MODEL_KEY}_to_{MARKING_MODEL_KEY}_data"
#     output_directory = f"DLM/10k/PLM_{PROMPING_MODEL_KEY}/distilled_{TEACHER_MODEL_KEY}_to_{MARKING_MODEL_KEY}/{experiment}"
# else:
#     input_directory = f"Datasets/1k/PLM_{PROMPING_MODEL_KEY}/distilled_{TEACHER_MODEL_KEY}_to_{MARKING_MODEL_KEY}_data"
#     output_directory = f"DLM/1k/PLM_{PROMPING_MODEL_KEY}/distilled_{TEACHER_MODEL_KEY}_to_{MARKING_MODEL_KEY}/{experiment}"

In [None]:
# Define the model names
MODEL_NAMES = {
    # Working PLM models
    "mistral_7b_v03_instruct": "mistralai/Mistral-7B-Instruct-v0.3",  #✅ Works
    
    # MLM models (teacher)
    "deepseek_llm_chat": "deepseek-ai/deepseek-llm-7b-chat",  # ✅ Works
    "qwen2.5_7b_instruct": "Qwen/Qwen2.5-7B-Instruct",  # ✅ Works
    "llama3_8b_instruct": "meta-llama/Meta-Llama-3-8B-Instruct",  # ✅ Works
    "gemma_7b_it": "google/gemma-7b-it",  # ✅ Works
    "ministral_8b_instruct": "mistralai/Ministral-8B-Instruct-2410",  #✅ Works
    "glm_4_9b_chat": "THUDM/glm-4-9b-chat",  # ✅ Works
    "internlm2.5_7b_chat": "internlm/internlm2-chat-7b",  # ✅ Works

    "gpt4o: manual"
    
    # Student models
    "mistral_7b_v02_instruct": "mistralai/Mistral-7B-Instruct-v0.2",  # ✅ Works
    "qwen1.5_1.8b_instruct": "Qwen/Qwen1.5-1.8B-Chat",  # ✅ Works
    "tinyllama_1.1b_chat": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # ✅ Works
    "gemma_1.1_2b_it": "google/gemma-1.1-2b-it",  # ✅ Works
}

**Special case for GPT-4o**
---

In [None]:
# GPT-4o data lives in its own folder and its results go under DLM/300/.
if MARKING_MODEL_KEY == "gpt4o":
    input_directory, output_directory = (
        "Datasets/GPT-4o",
        f"DLM/300/PLM_{PROMPING_MODEL_KEY}/{MARKING_MODEL_KEY}/{experiment}",
    )

**Special case for testing**
---

In [None]:
# Ablation switch and the ablation sub-folder name.
# NOTE(review): `experiment` is rebound here unconditionally, replacing the
# "<MAX_LENGTH>_<MODEL_NAME>" tag set earlier — re-running earlier path cells
# after this one will pick up 'all'.
test_ablation, experiment = False, 'all'

# Attack switch and the index of the attack data folder ("attack_<index>").
test_attack, ATTACK_INDEX = False, 0

In [None]:
# Redirect input/output folders for ablation or attack runs; ablation wins if both flags are set.
if test_ablation:
    input_directory, output_directory = (
        f"ablation/PLM_{PROMPING_MODEL_KEY}/{MARKING_MODEL_KEY}_data/{experiment}",
        f"ablation/PLM_{PROMPING_MODEL_KEY}/{MARKING_MODEL_KEY}/{experiment}",
    )
elif test_attack:
    input_directory, output_directory = (
        f"attack/PLM_{PROMPING_MODEL_KEY}/{MARKING_MODEL_KEY}_data/attack_{ATTACK_INDEX}",
        f"attack/PLM_{PROMPING_MODEL_KEY}/{MARKING_MODEL_KEY}/attack_{ATTACK_INDEX}",
    )

**Dataset**
---

In [None]:
# Create the results folder (and any missing parents). exist_ok=True makes the
# cell idempotent and avoids the check-then-create race of os.path.exists().
os.makedirs(output_directory, exist_ok=True)

In [None]:
# Load and process dataset
# Sort the paths: glob order is filesystem-dependent, and a stable row order is
# required for a seeded shuffle to yield the same split on every run.
all_files = sorted(glob.glob(os.path.join(input_directory, "*.csv")))
if not all_files:
    # Fail with the offending path instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in '{input_directory}'")
dfs = [pd.read_csv(f) for f in all_files]
df = pd.concat(dfs, ignore_index=True)

# pd.read_csv turns empty cells into NaN, which the tokenizer rejects; drop
# any row where either response is missing so both classes stay balanced.
response_columns = ['NON-WATERMARKED RESPONSE', 'WATERMARKED RESPONSE']
rows_loaded = len(df)
df = df.dropna(subset=response_columns).reset_index(drop=True)
if len(df) < rows_loaded:
    print(f"Dropped {rows_loaded - len(df)} rows with a missing response.")

# All non-watermarked texts first, then all watermarked texts.
texts = df['NON-WATERMARKED RESPONSE'].astype(str).tolist() + df['WATERMARKED RESPONSE'].astype(str).tolist()
# One-hot labels: [0, 1] = non-watermarked, [1, 0] = watermarked.
labels = [[0, 1]] * len(df) + [[1, 0]] * len(df)

# Tokenize (every sequence padded / truncated to exactly MAX_LENGTH tokens)
encodings = tokenizer(texts, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors='tf')
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
labels = tf.convert_to_tensor(labels, dtype=tf.float32)

# Shuffle the dataset (inputs and labels in sync): one permutation is drawn
# and applied to all three tensors.
dataset_size = input_ids.shape[0]
indices = tf.random.shuffle(tf.range(dataset_size))
input_ids = tf.gather(input_ids, indices)
attention_mask = tf.gather(attention_mask, indices)
labels = tf.gather(labels, indices)

# Split into train and validation sets (first 80% train, remainder validation)
train_size = int(0.8 * dataset_size)
train_data = (input_ids[:train_size], attention_mask[:train_size], labels[:train_size])
val_data = (input_ids[train_size:], attention_mask[train_size:], labels[train_size:])

In [None]:
# Dataset
def create_dataset(inputs, masks, labels):
    """Wrap aligned tensors in a batched, prefetching tf.data pipeline.

    Each element is ((inputs, masks), labels); batches hold BATCH_SIZE
    elements (the last one may be smaller). No shuffling is applied here.
    """
    features = (inputs, masks)
    pipeline = tf.data.Dataset.from_tensor_slices((features, labels))
    pipeline = pipeline.batch(BATCH_SIZE)
    return pipeline.prefetch(tf.data.AUTOTUNE)
    
# Build the training and validation pipelines from the (ids, mask, labels) triples.
train_dataset, val_dataset = (
    create_dataset(*split) for split in (train_data, val_data)
)

**MultiGPU strategy**
---

In [None]:
# Distribution strategy used for training; the model is later built inside
# `strategy.scope()`. Per the TensorFlow docs, MirroredStrategy mirrors the
# variables across the devices visible on this machine.
strategy = tf.distribute.MirroredStrategy()

**Model**
---

In [None]:
def build_detecting_llm_tf(MODEL_NAME, MAX_LENGTH=512):
    """Build and compile the watermark-detection classifier.

    A pretrained transformer backbone is loaded, all but the last
    `num_unfrozen_layers` encoder layers (module-level global) are frozen, and
    a 2-unit sigmoid Dense head is placed on the first-token hidden state.

    Args:
        MODEL_NAME: Checkpoint identifier passed to `TFAutoModel.from_pretrained`.
        MAX_LENGTH: Fixed sequence length of the `input_ids` and
            `attention_mask` inputs.

    Returns:
        A compiled `tf.keras.Model` taking `[input_ids, attention_mask]` and
        producing a `(batch, 2)` tensor of sigmoid activations.
    """
    # Load model
    base_model = TFAutoModel.from_pretrained(MODEL_NAME)

    # Locate the encoder stack through the model's own `base_model_prefix`
    # ("roberta", "bert", ...) instead of hard-coding `.roberta`. The chained
    # getattr defaults mean an unfamiliar architecture reaches the fallback
    # below rather than raising AttributeError.
    backbone = getattr(base_model, getattr(base_model, "base_model_prefix", ""), None)
    encoder_layers = getattr(getattr(backbone, "encoder", None), "layer", None)

    if encoder_layers is not None:
        # Only the final `num_unfrozen_layers` encoder layers remain trainable.
        # NOTE(review): embeddings / pooler are not touched here and keep their
        # default trainable state — confirm that is intended.
        first_trainable = len(encoder_layers) - num_unfrozen_layers
        for i, layer in enumerate(encoder_layers):
            layer.trainable = i >= first_trainable
    else:
        print(f"Warning: Unknown model structure for '{MODEL_NAME}', freezing all layers.")
        for layer in base_model.layers:
            layer.trainable = False

    # Define inputs
    input_ids_layer = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids")
    attention_mask_layer = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="attention_mask")

    # CLS output → Dense head: output [0] is the per-token hidden states,
    # [:, 0, :] selects the first token of every sequence.
    cls_state = base_model(input_ids_layer, attention_mask=attention_mask_layer)[0][:, 0, :]
    # These are sigmoid activations, not logits.
    # NOTE(review): two independent sigmoids + binary_crossentropy on one-hot
    # labels; softmax + categorical_crossentropy may be the intended setup.
    probabilities = tf.keras.layers.Dense(2, activation='sigmoid')(cls_state)

    # Final model
    model = tf.keras.Model(inputs=[input_ids_layer, attention_mask_layer], outputs=probabilities)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Build inside the strategy scope so the model's variables are created under it.
# Use MODEL_NAME (not a literal) so the backbone always matches the tokenizer
# loaded from the same constant.
with strategy.scope():
    model = build_detecting_llm_tf(MODEL_NAME, MAX_LENGTH)

In [None]:
# Print the layer / parameter overview of the compiled model.
model.summary()

**Training**
---

In [None]:
class VerboseCallback(tf.keras.callbacks.Callback):
    """Print a line after every epoch saying whether `val_accuracy` improved.

    Attributes:
        best_val_acc: Highest `val_accuracy` seen so far (starts at 0).
        best_epoch: Zero-based epoch index at which it was reached.
    """

    def __init__(self):
        # Let the Keras base class initialise its own state first.
        super().__init__()
        self.best_val_acc = 0
        self.best_epoch = 0

    def on_epoch_end(self, epoch, logs=None):
        """Update the running best and report it.

        Args:
            epoch: Zero-based index of the epoch that just finished.
            logs: Metric dict supplied by Keras; may be None.
        """
        # `logs` defaults to None, so guard before calling .get on it.
        logs = logs or {}
        val_acc = logs.get("val_accuracy")
        if val_acc is not None and val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            self.best_epoch = epoch
            print(f"✅ New best val_accuracy: {val_acc:.4f} at epoch {epoch + 1}")
        else:
            print(f"val_accuracy did not improve from {self.best_val_acc:.4f}")

In [None]:
# Best weights are written next to the other experiment outputs.
checkpoint_path = os.path.join(output_directory, "best_DLM_weights.h5")

# All callbacks key off validation accuracy.
callbacks = [
    # Stop after 10 epochs without improvement and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=10,
        restore_best_weights=True,
    ),
    # Persist weights only, and only when val_accuracy improves.
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        monitor='val_accuracy',
        save_best_only=True,
        save_weights_only=True,
    ),
    # Human-readable per-epoch progress line.
    VerboseCallback(),
]

In [None]:
# Train for at most EPOCHS epochs, validating after each; `history` keeps the per-epoch metrics.
history = model.fit(
    x=train_dataset, validation_data=val_dataset, epochs=EPOCHS, callbacks=callbacks
)

**Plotting**
---

In [None]:
# Per-epoch accuracy curves recorded by model.fit
train_acc, val_acc = (history.history[key] for key in ('accuracy', 'val_accuracy'))
epochs_range = range(1, 1 + len(train_acc))

# Zero-based index of the best validation epoch, and its score
best_epoch = int(np.argmax(val_acc))
best_val_acc = val_acc[best_epoch]

# Draw both curves on one axes and mark the best validation point in red
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epochs_range, train_acc, label='Train Accuracy')
ax.plot(epochs_range, val_acc, label=f'Val Accuracy (best: {best_val_acc:.4f})')
ax.scatter(best_epoch + 1, best_val_acc, color='red', zorder=5)  # epochs are shown 1-indexed

# Labels and styling
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.set_title("Training and Validation Accuracy")
ax.legend()
ax.grid(True)

# Save alongside the other experiment outputs, then display
fig.savefig(os.path.join(output_directory, "accuracy_plot.png"))
plt.show()