## Import and EDA

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')



In [2]:
path = '/kaggle/input/llm-detect-ai-generated-text'

In [3]:
# Read the three competition tables from the input directory.
data_dir = Path(path)
train_essay = pd.read_csv(data_dir / "train_essays.csv")
test_essay = pd.read_csv(data_dir / "test_essays.csv")
train_prompts = pd.read_csv(data_dir / "train_prompts.csv")

In [4]:
train_essay.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [5]:
train_essay['generated'].value_counts()

generated
0    1375
1       3
Name: count, dtype: int64

In [6]:
# Store the 0/1 target as float64 instead of integer.
train_essay['generated'] = train_essay['generated'].astype(float)

In [7]:
test_essay.head()

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [8]:
train_prompts.head()

Unnamed: 0,prompt_id,prompt_name,instructions,source_text
0,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,1,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...


In [9]:
train_essay.shape,train_prompts.shape, test_essay.shape

((1378, 4), (2, 4), (3, 3))

In [10]:
# Attach the prompt columns to every essay by joining on the shared prompt_id key.
df = train_essay.merge(train_prompts, on="prompt_id")

In [11]:
df.head()

Unnamed: 0,id,prompt_id,text,generated,prompt_name,instructions,source_text
0,0059830c,0,Cars. Cars have been around since they became ...,0.0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,005db917,0,Transportation is a large necessity in most co...,0.0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0.0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
3,00940276,0,How often do you ride in a car? Do you drive a...,0.0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0.0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."


In [12]:
df.describe(include="object")

Unnamed: 0,id,text,prompt_name,instructions,source_text
count,1378,1378,1378,1378,1378
unique,1378,1378,2,2,2
top,0059830c,Cars. Cars have been around since they became ...,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
freq,1,1,708,708,708


In [13]:
# Disabled alternative: also append the prompt name, instructions and source text to the input.
# df['input'] = 'TEXT: ' + df['text'] + '; PROMPT_NAME: ' + df['prompt_name'] + '; PROMPT_INSTRUCTION: ' + df['instructions'] + '; PROMPT_SOURCE: ' + df['source_text']
# Active input: only the essay text, prefixed with the 'TEXT: ' tag.
df['input'] = 'TEXT: ' + df['text'] 

## Tokenization

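The Hugging Face `datasets` library provides a `Dataset` object for holding data used with Transformers, so the DataFrame is converted to a `Dataset` before tokenizing.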

In [14]:
!pip install datasets



In [15]:
from datasets import Dataset, DatasetDict

ds = Dataset.from_pandas(df)

In [16]:
ds

Dataset({
    features: ['id', 'prompt_id', 'text', 'generated', 'prompt_name', 'instructions', 'source_text', 'input'],
    num_rows: 1378
})

In [17]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()

0

In [18]:
model_nm = 'microsoft/deberta-v3-small'

In [19]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tok = AutoTokenizer.from_pretrained(model_nm)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
tok.tokenize('Welcome to my notebook!')

['▁Welcome', '▁to', '▁my', '▁notebook', '!']

In [21]:
def tokenize_input(data):
    """Run the notebook-level tokenizer `tok` over the 'input' field of `data`.

    `data` is whatever `Dataset.map` passes in (a row, or a batch of rows when
    batched=True); the tokenizer output is returned unchanged.
    """
    texts = data["input"]
    return tok(texts)

In [22]:
# map applies tokenize_input across the whole dataset; with batched=True the
# function receives batches of rows instead of one row at a time.
tok_ds = ds.map(tokenize_input, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [23]:
row = tok_ds[0]
# Uncomment the following line to look at the input ids
# row['input'], row['input_ids']

In [24]:
tok.vocab['cars']

34103

In [25]:
tok_ds

Dataset({
    features: ['id', 'prompt_id', 'text', 'generated', 'prompt_name', 'instructions', 'source_text', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1378
})

In [26]:
# Transformers expects the target column to be named 'labels', so rename 'generated'.
tok_ds = tok_ds.rename_columns({'generated': 'labels'})

In [27]:
tok_ds

Dataset({
    features: ['id', 'prompt_id', 'text', 'labels', 'prompt_name', 'instructions', 'source_text', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1378
})

## Splitting the data into test and validation sets

In [28]:
eval_df = test_essay.copy()

In [29]:
eval_df.describe(include="object")

Unnamed: 0,id,text
count,3,3
unique,3,3
top,0000aaaa,Aaa bbb ccc.
freq,1,1


In [30]:
# Hold out 30% of the tokenized essays for validation; the fixed seed keeps the split repeatable.
dds = tok_ds.train_test_split(test_size=0.3, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt_id', 'text', 'labels', 'prompt_name', 'instructions', 'source_text', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 964
    })
    test: Dataset({
        features: ['id', 'prompt_id', 'text', 'labels', 'prompt_name', 'instructions', 'source_text', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 414
    })
})

In [31]:
eval_df['input'] = 'TEXT: ' + eval_df['text'] 

In [32]:
eval_df.head(2)

Unnamed: 0,id,prompt_id,text,input
0,0000aaaa,2,Aaa bbb ccc.,TEXT: Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.,TEXT: Bbb ccc ddd.


In [33]:
# Wrap the test frame in a Dataset, then tokenize it with the same function used for training.
eval_raw_ds = Dataset.from_pandas(eval_df)
eval_ds = eval_raw_ds.map(tokenize_input, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [34]:
eval_ds

Dataset({
    features: ['id', 'prompt_id', 'text', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3
})

## Training our model

In [35]:
from transformers import TrainingArguments, Trainer

In [36]:
batch_size=2  # per-device training batch size; evaluation uses twice this value
epochs=2  # number of training epochs
learning_rate=2e-5  # learning rate passed to TrainingArguments

In [37]:
# Training configuration: checkpoints go to 'outputs', evaluation runs once per epoch,
# and the learning rate follows a cosine schedule after a 10% warmup.
args = TrainingArguments('outputs',
                         learning_rate=learning_rate,
                         warmup_ratio=0.1,
                         lr_scheduler_type='cosine',
                         # fp16 mixed precision needs a CUDA device; fall back to
                         # full precision so this cell still runs on CPU.
                         fp16=torch.cuda.is_available(),
                         evaluation_strategy="epoch",
                         per_device_train_batch_size=batch_size,
                         per_device_eval_batch_size=batch_size*2,
                         num_train_epochs=epochs,
                         weight_decay=0.01,
                         report_to='none',
                         # Must match the key returned by compute_metrics ("auc");
                         # the Trainer looks the metric up as "eval_<name>".
                         metric_for_best_model="auc")

In [38]:
# num_labels=1 gives the classification head a single output, i.e. one score per essay.
# The checkpoint has no trained classifier/pooler weights, so those layers start freshly initialized.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

Downloading pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
def compute_metrics(eval_pred):
    """Compute ROC AUC for the Trainer's evaluation loop.

    Args:
        eval_pred: object exposing `label_ids` (true labels) and `predictions`
            (model outputs) as NumPy arrays.

    Returns:
        dict: {"auc": score}. The score is NaN when the labels contain fewer
        than two distinct classes, because ROC AUC is undefined in that case.
    """
    labels = eval_pred.label_ids
    preds = eval_pred.predictions

    if len(preds.shape) > 1 and preds.shape[1] > 1:
        if preds.ndim == 2 and preds.shape[1] == 2:
            # Two-column output: rank by the class-1 minus class-0 margin. This orders
            # examples exactly like the positive-class probability, whether the columns
            # hold raw logits or softmax probabilities (column 1 alone does not for logits).
            preds = preds[:, 1] - preds[:, 0]
        else:
            # More than two columns: keep using column 1 as the positive-class score.
            preds = preds[:, 1]
    elif preds.ndim == 2:
        # Single-output head: flatten (n, 1) to (n,).
        preds = preds[:, 0]

    # With very few positives a random split can leave the evaluation set with a
    # single class; report NaN instead of letting the metric abort training.
    if np.unique(labels).size < 2:
        return {"auc": float("nan")}

    # Calculate AUC-ROC
    auc = metrics.roc_auc_score(labels, preds)

    return {"auc": auc}

In [40]:
# Wire the model, training arguments, data splits, tokenizer and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tok,
    compute_metrics=compute_metrics,
)

In [41]:
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Auc
1,No log,0.004801,0.897451
2,0.006700,0.004764,0.997573


TrainOutput(global_step=964, training_loss=0.005091707365146811, metrics={'train_runtime': 238.2834, 'train_samples_per_second': 8.091, 'train_steps_per_second': 4.046, 'total_flos': 382431902978880.0, 'train_loss': 0.005091707365146811, 'epoch': 2.0})

In [42]:
predictions = trainer.predict(eval_ds)

In [43]:
predictions

PredictionOutput(predictions=array([[0.01020813],
       [0.01004028],
       [0.00940704]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.0221, 'test_samples_per_second': 136.023, 'test_steps_per_second': 45.341})

In [44]:
# Start from the test ids, attach the model scores, and write the file without the index.
submission = test_essay[['id']].copy()
submission['generated'] = predictions.predictions
submission.to_csv('submission.csv', index=False)

In [45]:
submission

Unnamed: 0,id,generated
0,0000aaaa,0.010208
1,1111bbbb,0.01004
2,2222cccc,0.009407
