In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
from sklearn.metrics import accuracy_score, roc_auc_score
from transformers import AutoModel

In [2]:
def get_activation(name):
    """Build a new activation module from its case-insensitive name.

    Recognised names are "relu", "gelu", "leakyrelu" (negative slope 0.1),
    "tanh" and "elu".

    Args:
        name: Activation name; it is lower-cased before the lookup.

    Returns:
        A freshly constructed ``nn.Module`` implementing the activation.

    Raises:
        ValueError: If the lower-cased name is not one of the recognised keys.
    """
    name = name.lower()
    # Map names to factories (not instances) so only the requested module is built.
    factories = {
        "relu": nn.ReLU,
        "gelu": nn.GELU,
        "leakyrelu": lambda: nn.LeakyReLU(negative_slope=0.1),
        "tanh": nn.Tanh,
        "elu": nn.ELU,
    }
    factory = factories.get(name)
    if factory is None:
        raise ValueError(f"Activation inconnue : {name}")
    return factory()

class DistilRoBERTaWithFeatures(nn.Module):
    """Binary classifier combining a pretrained text encoder with numeric features.

    The embedding at sequence position 0 of the encoder's last hidden state is
    passed through dropout, concatenated with the feature vector and fed to an
    MLP head that ends in a single output unit (one raw logit per sample).

    Args:
        text_model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
        num_features: Width of the ``features`` tensor concatenated to the
            text embedding.
        hidden_sizes: Output width of each hidden ``nn.Linear`` layer of the head.
        activations: Activation name (resolved by ``get_activation``) for each
            hidden layer.
        dropout_rates: Dropout probability applied after each hidden activation.

    Raises:
        ValueError: If ``activations`` or ``dropout_rates`` has fewer entries
            than ``hidden_sizes``. Extra entries are ignored.
    """

    def __init__(
        self,
        text_model_name: str = "distilroberta-base",
        num_features: int = 38,
        hidden_sizes: list[int] = [256],
        activations: list[str] = ["leakyrelu"],
        dropout_rates: list[float] = [0.2],
    ):
        # Copy into a fresh list: accepts tuples too and guarantees the shared
        # default lists are never mutated.
        hidden_sizes = list(hidden_sizes)
        n_hidden = len(hidden_sizes)
        # Validate before loading the backbone so a bad configuration fails
        # immediately with a clear message instead of an IndexError raised
        # after the pretrained weights have already been loaded.
        if len(activations) < n_hidden:
            raise ValueError(
                f"Expected at least {n_hidden} activations (one per hidden layer), "
                f"got {len(activations)}"
            )
        if len(dropout_rates) < n_hidden:
            raise ValueError(
                f"Expected at least {n_hidden} dropout rates (one per hidden layer), "
                f"got {len(dropout_rates)}"
            )

        super().__init__()
        self.text_model = AutoModel.from_pretrained(text_model_name)
        self.backbone_dropout = nn.Dropout(0.1)

        # Layer widths: [text embedding + features] -> hidden sizes -> 1 logit.
        dims = [self.text_model.config.hidden_size + num_features] + hidden_sizes + [1]
        layers: list[nn.Module] = []
        for i in range(len(dims) - 1):
            in_dim, out_dim = dims[i], dims[i + 1]
            layers.append(nn.Linear(in_dim, out_dim))
            # Only hidden layers get an activation and dropout; the final
            # Linear layer outputs the raw logit.
            if i < n_hidden:
                layers.append(get_activation(activations[i]))
                layers.append(nn.Dropout(dropout_rates[i]))
        self.classifier = nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask, features):
        """Return one raw (pre-sigmoid) logit per sample.

        Args:
            input_ids: Token ids forwarded to the text encoder.
            attention_mask: Attention mask forwarded to the text encoder.
            features: Numeric features concatenated to the text embedding along
                the last axis; assumed to be (batch, num_features) -- confirm
                against the caller.

        Returns:
            Tensor whose last dimension has size 1 (the final Linear layer has
            a single output unit).
        """
        out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_emb = out.last_hidden_state[:, 0, :]   # embedding of the first token
        cls_emb = self.backbone_dropout(cls_emb)

        x = torch.cat([cls_emb, features], dim=-1)
        logits = self.classifier(x)  # last dimension is 1, not a flat [batch_size]
        return logits


In [3]:
# import sys
# sys.path.append("../scripts/")
# from hybrid_distilroberta_model import DistilRoBERTaWithFeatures

In [4]:
# Read the train and test CSV files from the shared data directory
data_path = '../data/with_features/'
train_data = pd.read_csv(data_path + 'train.csv')
test_data = pd.read_csv(data_path + 'test.csv')

## `Human vs Mix`

In [5]:
# Numeric columns fed to the model alongside the abstract text.
# The order matters: it fixes the column order of the scaled feature matrix.
feature_cols = [
    # length / lexical statistics
    'num_characters', 'word_count', 'sentence_count', 'mean_sentence_length',
    'burstiness', 'stop_words_ratio', 'vocabulary_size', 'ttr',
    # one tf-idf column per term
    *(
        f'tfidf_{term}'
        for term in (
            'method', 'approach', 'proposed', 'paper', 'study', 'analysis',
            'using', 'application', 'potential', 'performance', 'network',
            'algorithm', 'feature', 'learning', 'data', 'model', 'control',
            'information', 'accuracy', 'technique',
        )
    ),
    'flesch',
    # part-of-speech proportion columns
    *(
        f'{tag}_prop'
        for tag in ('noun', 'det', 'adj', 'aux', 'pron', 'adv', 'punct', 'adp')
    ),
    'ppl',
]

In [6]:
# Build class-balanced human-vs-mix train and test frames.
def _balanced_binary_subset(frame, label_map, columns, seed=42):
    """Filter, encode and undersample one split so both classes are equally sized.

    Args:
        frame: DataFrame with a string 'label' column.
        label_map: Mapping from the label strings to keep to their integer code;
            its insertion order fixes the order of the classes in the result.
        columns: Columns to keep in the returned frame, in order.
        seed: ``random_state`` used when sampling each class.

    Returns:
        A new DataFrame with a fresh 0..n-1 index in which every class has as
        many rows as the smallest class of ``frame``.
    """
    # Keep only the requested classes and encode them as integers.
    kept = frame[frame['label'].isin(list(label_map))].copy()
    kept['label'] = kept['label'].map(label_map)

    # One sub-frame per class, in the order given by label_map.
    per_class = [kept[kept['label'] == class_id] for class_id in label_map.values()]

    # Undersample every class down to the size of the minority class.
    n_per_class = min(len(group) for group in per_class)
    sampled = [
        group.sample(n_per_class, replace=False, random_state=seed)
        for group in per_class
    ]

    return pd.concat(sampled, axis=0)[columns].reset_index(drop=True)


# The same pipeline is applied to both splits, so the test set is balanced too.
Z_train = _balanced_binary_subset(
    train_data, {'human': 0, 'mix': 1}, ['abstract'] + feature_cols + ['label']
)
Z_test = _balanced_binary_subset(
    test_data, {'human': 0, 'mix': 1}, ['abstract'] + feature_cols + ['label']
)

print(f"Train shape: {Z_train.shape}")
print(f"Test shape: {Z_test.shape}")

Train shape: (7222, 40)
Test shape: (1806, 40)


In [7]:
# Sanity check: row count per class in each split (0 = human, 1 = mix)
print(f"Train human: {(Z_train['label'] == 0).sum()}")
print(f"Train mix: {(Z_train['label'] == 1).sum()}")
print(f"Test human: {(Z_test['label'] == 0).sum()}")
print(f"Test mix: {(Z_test['label'] == 1).sum()}")

Train human: 3611
Train mix: 3611
Test human: 903
Test mix: 903


In [8]:
# Standardise the numeric features.
# The scaler statistics come from the training split only and are then
# reused unchanged on the test split.
scaler = StandardScaler().fit(Z_train[feature_cols])
X_train_feat = scaler.transform(Z_train[feature_cols])
X_test_feat = scaler.transform(Z_test[feature_cols])

In [9]:
# Tokenize abstracts
# Load the tokenizer from the same checkpoint as the model's default backbone
# ("distilroberta-base") so the tokenizer and the encoder cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

# Each split is tokenized in a single call: sequences are truncated to the
# tokenizer's maximum length and padded to the longest sequence of the split.
train_enc = tokenizer(Z_train["abstract"].tolist(), truncation=True, padding=True, return_tensors="pt")
test_enc = tokenizer(Z_test["abstract"].tolist(), truncation=True, padding=True, return_tensors="pt")

In [10]:
# Dataset class
class HybridDataset(Dataset):
    """Map-style dataset pairing tokenized text with numeric features and labels."""

    def __init__(self, encodings, features, labels):
        """Store the encodings and convert features/labels to float32 tensors.

        Args:
            encodings: Mapping indexable by 'input_ids' and 'attention_mask'.
            features: Array-like of numeric features, one row per sample.
            labels: Object exposing ``.values`` (e.g. a pandas Series) with one
                label per sample.
        """
        self.encodings = encodings
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors."""
        sample = {
            key: self.encodings[key][idx]
            for key in ('input_ids', 'attention_mask')
        }
        sample['features'] = self.features[idx]
        sample['labels'] = self.labels[idx]
        return sample

In [11]:
# Wrap each split in a dataset, then in a batched loader
train_dataset = HybridDataset(
    encodings=train_enc,
    features=X_train_feat,
    labels=Z_train["label"],
)
test_dataset = HybridDataset(
    encodings=test_enc,
    features=X_test_feat,
    labels=Z_test["label"],
)

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [12]:
# Initialize model
# Seed torch's random number generators before anything stochastic happens
# (head initialisation, dropout, DataLoader shuffling) so reruns are comparable.
# The value matches the random_state used for the undersampling.
# NOTE: some GPU kernels are still non-deterministic, so results may not be
# bit-for-bit identical across runs.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = DistilRoBERTaWithFeatures(num_features=len(feature_cols)).to(device)

# The model outputs raw logits, so the loss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()
# A single learning rate is used for the backbone and the classification head.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

Using device: cuda


  backends.update(_get_backends("networkx.backends"))


In [13]:
# Training configuration and best-checkpoint bookkeeping
num_epochs = 5  # number of passes over the training loader
# Highest AUC seen so far and the model weights recorded at that point.
best_auc, best_model_state = 0.0, None

In [14]:
# Training loop: one optimisation pass and one evaluation pass per epoch.
# NOTE(review): the best epoch is chosen on the test set, so the reported
# "best AUC" is optimistic -- TODO select on a held-out validation split.
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        features = batch["features"].to(device)
        # Add a trailing axis so the targets match the (batch, 1) logits.
        labels = batch["labels"].unsqueeze(1).to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, features)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses.
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Training Loss: {avg_train_loss:.4f}")

    # Evaluation
    model.eval()
    all_probs = []
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            features = batch["features"].to(device)

            logits = model(input_ids, attention_mask, features)
            # Flatten (batch, 1) -> (batch,) so scores, predictions and targets
            # are all 1-D when handed to the metric functions.
            probs = torch.sigmoid(logits).reshape(-1).cpu().numpy()
            preds = (probs > 0.5).astype(int)

            all_probs.extend(probs)
            all_preds.extend(preds)
            # The labels are only needed on the CPU for the metrics.
            all_targets.extend(batch["labels"].numpy())

    epoch_auc = roc_auc_score(all_targets, all_probs)
    epoch_acc = accuracy_score(all_targets, all_preds)
    print(f"Epoch {epoch+1} - Test Accuracy: {epoch_acc:.4f} | Test AUC: {epoch_auc:.4f}")

    # Save the best model based on AUC
    if epoch_auc > best_auc:
        best_auc = epoch_auc
        # state_dict() returns references to the live parameter tensors, which
        # later epochs keep updating in place. Clone them (onto the CPU) so the
        # snapshot really holds this epoch's weights.
        # Alternative: torch.save(model.state_dict(), "best_model.pt")
        best_model_state = {
            key: tensor.detach().cpu().clone()
            for key, tensor in model.state_dict().items()
        }
        print(f"--> New best model saved (AUC = {best_auc:.4f})")


Epoch 1/5


Training: 100%|██████████| 113/113 [01:39<00:00,  1.13it/s]


Epoch 1 - Training Loss: 0.6767


Evaluating: 100%|██████████| 29/29 [00:07<00:00,  3.65it/s]


Epoch 1 - Test Accuracy: 0.6866 | Test AUC: 0.7497
--> New best model saved (AUC = 0.7497)

Epoch 2/5


Training: 100%|██████████| 113/113 [01:41<00:00,  1.11it/s]


Epoch 2 - Training Loss: 0.5837


Evaluating: 100%|██████████| 29/29 [00:07<00:00,  3.64it/s]


Epoch 2 - Test Accuracy: 0.6949 | Test AUC: 0.7990
--> New best model saved (AUC = 0.7990)

Epoch 3/5


Training: 100%|██████████| 113/113 [01:41<00:00,  1.11it/s]


Epoch 3 - Training Loss: 0.4866


Evaluating: 100%|██████████| 29/29 [00:07<00:00,  3.64it/s]


Epoch 3 - Test Accuracy: 0.7375 | Test AUC: 0.8238
--> New best model saved (AUC = 0.8238)

Epoch 4/5


Training: 100%|██████████| 113/113 [01:41<00:00,  1.11it/s]


Epoch 4 - Training Loss: 0.3773


Evaluating: 100%|██████████| 29/29 [00:07<00:00,  3.64it/s]


Epoch 4 - Test Accuracy: 0.7331 | Test AUC: 0.8275
--> New best model saved (AUC = 0.8275)

Epoch 5/5


Training: 100%|██████████| 113/113 [01:41<00:00,  1.11it/s]


Epoch 5 - Training Loss: 0.2892


Evaluating: 100%|██████████| 29/29 [00:07<00:00,  3.64it/s]

Epoch 5 - Test Accuracy: 0.6866 | Test AUC: 0.8117





In [15]:
# Restore the weights recorded for the best epoch before the model is saved.
# best_model_state stays None when no epoch's AUC ever exceeded the initial
# best_auc (e.g. the AUC was NaN); fail with a clear message in that case
# instead of an opaque error from inside load_state_dict.
if best_model_state is None:
    raise RuntimeError("No best model state was recorded during training; nothing to restore.")
model.load_state_dict(best_model_state)
print(f"Training finished. Best AUC obtained on test: {best_auc:.4f}")

Training finished. Best AUC obtained on test: 0.8275


In [16]:
# Persist everything needed to reuse the model: weights, feature scaler, tokenizer.
from pathlib import Path

import joblib

# Create the output directory first; torch.save and joblib.dump do not create
# missing parent directories, so a fresh checkout would otherwise fail here.
model_dir = Path('../models/human_vs_mix')
model_dir.mkdir(parents=True, exist_ok=True)

# Save model
torch.save(model.state_dict(), str(model_dir / 'hybrid_model.pth'))
# Save scaler
joblib.dump(scaler, str(model_dir / 'scaler_hybrid.pkl'))
# Save tokenizer (kept as the last expression so the written file paths are displayed)
tokenizer.save_pretrained(str(model_dir / 'tokenizer_hybrid'))

('../models/human_vs_mix/tokenizer_hybrid/tokenizer_config.json',
 '../models/human_vs_mix/tokenizer_hybrid/special_tokens_map.json',
 '../models/human_vs_mix/tokenizer_hybrid/vocab.json',
 '../models/human_vs_mix/tokenizer_hybrid/merges.txt',
 '../models/human_vs_mix/tokenizer_hybrid/added_tokens.json',
 '../models/human_vs_mix/tokenizer_hybrid/tokenizer.json')