# Load Libraries

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

from sklearn.model_selection import train_test_split
import pandas as pd
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm

tqdm.pandas()

train_df = pd.read_csv('/kaggle/input/artificial-text-detection-homework/dev.csv')

train_X = train_df['Text']
train_y = train_df['Class'].apply(lambda x: 1 if x == 'M' else 0)



# Load The DistilBert for classification

In [3]:
# Load the DistilBert tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# Load the DistilBert model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Prepare Data

In [2]:
# Assuming train_X and train_y are your features and labels
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

In [4]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding = 'max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'labels': torch.tensor(label)}

In [5]:
# Create the datasets
train_dataset = SentimentDataset(X_train.to_list(), y_train.tolist(), tokenizer, 512)
valid_dataset = SentimentDataset(X_val.to_list(), y_val.to_list(), tokenizer, 512)

# Create the dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=True)

# Compile The Model

In [6]:
# Set up the loss function and optimizer
loss_fn = BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=1e-5)

# Set up the learning rate scheduler
total_steps = len(train_loader) * 10
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# Train (Finetune) DistilBertForClassification

In [7]:
max_epochs_without_improvement = 50
early_stopping_counter = 0
best_val_loss = float('inf')

for epoch in range(100):
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

    model.eval()
    for batch in valid_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float()
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        val_loss = outputs.loss

    print(f"Epoch: {epoch}, Loss:  {loss.item()}, Validation Loss:  {val_loss.item()}")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0
        # Save the best model
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
        }, "best_model.pth")
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= max_epochs_without_improvement:
        print(f"Early stopping at epoch {epoch} as validation loss did not improve for {max_epochs_without_improvement} consecutive epochs.")
        break

100%|██████████| 100/100 [00:44<00:00,  2.24it/s]


Epoch: 0, Loss:  0.06303806602954865, Validation Loss:  0.027110636234283447


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


Epoch: 1, Loss:  0.004777418449521065, Validation Loss:  0.01609455794095993


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


Epoch: 2, Loss:  0.005320049822330475, Validation Loss:  0.008808588609099388


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


Epoch: 3, Loss:  0.004376300144940615, Validation Loss:  0.0588519461452961


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


Epoch: 4, Loss:  0.004900057800114155, Validation Loss:  0.0015644796658307314


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


Epoch: 5, Loss:  0.008449956774711609, Validation Loss:  0.07390110194683075


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


Epoch: 6, Loss:  0.002575397491455078, Validation Loss:  0.05028006434440613


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


Epoch: 7, Loss:  0.0013817816507071257, Validation Loss:  0.0004626185691449791


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


Epoch: 8, Loss:  0.0023199047427624464, Validation Loss:  0.0007691933424212039


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


Epoch: 9, Loss:  0.0023950624745339155, Validation Loss:  0.0008488741586916149


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


Epoch: 10, Loss:  0.0030481191352009773, Validation Loss:  0.0008826056146062911


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


Epoch: 11, Loss:  0.009705883450806141, Validation Loss:  0.0017815993633121252


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


Epoch: 12, Loss:  0.001971648773178458, Validation Loss:  0.010741619393229485


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


Epoch: 13, Loss:  0.0032216475810855627, Validation Loss:  0.0009874040260910988


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


Epoch: 14, Loss:  0.002611996605992317, Validation Loss:  0.0005999378627166152


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


Epoch: 15, Loss:  0.0036652041599154472, Validation Loss:  0.014932917430996895


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


Epoch: 16, Loss:  0.0028073368594050407, Validation Loss:  0.0012263451935723424


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


Epoch: 17, Loss:  0.0022739688865840435, Validation Loss:  0.013907439075410366
Early stopping at epoch 17 as validation loss did not improve for 10 consecutive epochs.


# Predict on test

In [25]:
model_name = 'distilbert-base-uncased'
model_path = '/kaggle/working/best_model.pth'
# Define the model architecture
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels = 1)

# Load the trained model weights
checkpoint = torch.load(model_path)
model.load_state_dict(checkpoint['model_state_dict'], strict=False)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [26]:
def model_predit(text): 
    text = tokenizer(text, return_tensors='pt', truncation=True, max_length=512, padding='max_length').to(device)
#   get prediction
    predicted = model(**text).logits.item()
    
    return 'M' if int(predicted) == 1 else 'H'

In [27]:
df_test = pd.read_csv('/kaggle/input/artificial-text-detection-homework/test.csv')
df_test['Class'] = df_test['Text'].progress_apply(model_predit)

100%|██████████| 20000/20000 [03:58<00:00, 83.97it/s]


In [28]:
submission_df = pd.DataFrame({'ID': df_test['ID'], 'Class': df_test['Class']})
submission_df.to_csv('submission.csv', index = False)

# Upvote and Comment if you like this notebook😉