In [None]:
#This notebook is by Anastasia Ruzmaikina for Kaggle Competition LLM - Detect AI Generated Text

In recent years, large language models (LLMs) have become increasingly sophisticated, capable of generating text that is difficult to distinguish from human-written text. In this competition, we hope to foster open research and transparency on AI detection techniques applicable in the real world.

This competition challenges participants to develop a machine learning model that can accurately detect whether an essay was written by a student or an LLM. The competition dataset comprises a mix of student-written essays and essays generated by a variety of LLMs.

Can you help build a model to identify which essay was written by middle and high school students, and which was written using a large language model? With the spread of LLMs, many people fear they will replace or alter work that would usually be done by humans. Educators are especially concerned about their impact on students’ skill development, though many remain optimistic that LLMs will ultimately be a useful tool to help students improve their writing skills.

At the forefront of academic concerns about LLMs is their potential to enable plagiarism. LLMs are trained on a massive dataset of text and code, which means that they are able to generate text that is very similar to human-written text. For example, students could use LLMs to generate essays that are not their own, missing crucial learning keystones. Your work on this competition can help identify telltale LLM artifacts and advance the state of the art in LLM text detection. By using texts of moderate length on a variety of subjects and multiple, unknown generative models, we aim to replicate typical detection scenarios and incentivize learning features that generalize across models.



In this notebook I use Microsoft Deberta-V3-Small to classify the essays as AI written or human written. The accuracy score for this notebook is 57%

In [10]:
#!pip install gdown
#import gdown
#url = 'https://drive.google.com/uc?id=1-7aCHayx5r2UlYEPHSlH9jjXy1nqgWzA'
#output = 'modelnew.h5'
#url = 'https://drive.google.com/uc?id=
#output =
#gdown.download(url,output, quiet=False)
! pip install -q datasets

#tokz = AutoTokenizer.from_pretrained(model_nm)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import keras_core as keras
import keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import requests
from keras.activations import softmax

print("TensorFlow version:", tf.__version__)
print("KerasNLP version:", keras_nlp.__version__)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
model_nm = '/kaggle/input/microsoftdeberta-v3-small'
df_train1 = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
#df_train1 = df_train1.loc[df_train1['prompt_id'] == 0]
print(df_train1)
df_train2 = pd.read_csv("/kaggle/input/generated2/generated.csv")
df_train = pd.concat([df_train1, df_train2], axis=0)
df_test = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
df_prompt = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv")
sample_submission = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv")
from datasets import load_dataset
#ds = load_dataset('csv', data_files=['/kaggle/input/llm-detect-ai-generated-text/train_essays.csv', '/kaggle/input/generated2/generated.csv', '/kaggle/input/generated1/generated1.csv'])
#print(ds)
print(len(df_train))
print(len(df_test))
print(df_prompt)
print(sample_submission)
df_train["text"] = df_train["text"].str.lower()
#df_train.set_index('id', inplace=True)
#df_train['generated'] = df_train['generated'].map({1:'yes', 0:'no'})
df_train['prompt_id'] = df_train['prompt_id'].map({1:'electoral', 0:'cars'})
df_train["text"] = df_train["text"].str.replace("#", "")
df_test["text"] = df_test["text"].str.replace("#", "" )
df_test["text"] = df_test["text"].str.lower()
#df_test['generated_text'] = df_test['generated'].map({1:'yes', 0:'no'})
df_test['prompt_id'] = df_test['prompt_id'].map({1:'electoral', 0:'cars'})
#df_test.set_index('id', inplace=True)
print('Training Set Shape = {}'.format(df_train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(df_train.memory_usage().sum() / 1024**2))
print('Test Set Shape = {}'.format(df_test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(df_test.memory_usage().sum() / 1024**2))
print(df_train.head())
print(df_test.head())
print(df_train["text"])


from sklearn.model_selection import train_test_split
from datasets import Dataset,DatasetDict
df_train['input'] = df_train.text #'TEXT: ' + df_train.text + ';  ANC: '+ df_train.prompt_id# + '; ANC2: '+ df_train.id
#TEXT2: ' + df_train.generated + ';
df_train1 = df_train.drop(['id'], axis=1)
ds = Dataset.from_pandas(df_train1)
print(ds)

#!pip install --upgrade transformers
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
#from transformers import DebertaV3Model
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments,Trainer
#model_nm = '/kaggle/input/debertav3small'
tokz = AutoTokenizer.from_pretrained(model_nm)
def tok_func(x): return tokz(x["input"])
tok_ds = ds.map(tok_func, batched=True)
tok_ds = tok_ds.rename_columns({'generated':'labels'})
dds = tok_ds.train_test_split(0.15, seed=420)
print(dds)
df_test['input'] = df_test.text #'TEXT: ' + df_test.text + '; ANC: ' + df_test.prompt_id#+ '; ANC2: '+ df_test.id
df_test1 = df_test.drop(['id'], axis=1)
eval_ds = Dataset.from_pandas(df_test1).map(tok_func, batched=True)
bs = 1
epochs = 2
lr = 4.15e-6
#classifier = tf.keras.models.load_model('/kaggle/working/modelnew.h5')
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=2)
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokz)#, compute_metrics=compute_metrics)
trainer.train();
#pipe = TextClassificationPipeline(model=model, tokenizer=tokz)
#prediction = pipe("The text to predict", return_all_scores=True)
#print(prediction)
preds = trainer.predict(eval_ds).predictions.astype(float)
print(preds)
preds = np.clip(preds, 0, 1)

# Make predictions
#predictions = classifier.predict(X_test)

# Evaluate the model (optional)
#classifier.evaluate(X_test)
submission = df_test.id.copy().to_frame()
submission["generated"] = np.argmax(preds, axis=1)#classifier.predict(X_test)
#submission["generated"] = submission["generated"].round(1)
submission.to_csv("/kaggle/working/submission.csv", index=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


TensorFlow version: 2.13.0
KerasNLP version: 0.7.0.dev3
/kaggle/input/generated-new/generatednew.csv
/kaggle/input/generated2/generated.csv
/kaggle/input/llama2-7b-hf/Llama2-7b-hf/config.json
/kaggle/input/llama2-7b-hf/Llama2-7b-hf/pytorch_model-00002-of-00002.bin
/kaggle/input/llama2-7b-hf/Llama2-7b-hf/tokenizer.json
/kaggle/input/llama2-7b-hf/Llama2-7b-hf/tokenizer_config.json
/kaggle/input/llama2-7b-hf/Llama2-7b-hf/pytorch_model.bin.index.json
/kaggle/input/llama2-7b-hf/Llama2-7b-hf/pytorch_model-00001-of-00002.bin
/kaggle/input/llama2-7b-hf/Llama2-7b-hf/special_tokens_map.json
/kaggle/input/llama2-7b-hf/Llama2-7b-hf/tokenizer.model
/kaggle/input/llama2-7b-hf/Llama2-7b-hf/generation_config.json
/kaggle/input/microsoftdeberta-v3-small/spm.model
/kaggle/input/microsoftdeberta-v3-small/config.json
/kaggle/input/microsoftdeberta-v3-small/README.md
/kaggle/input/microsoftdeberta-v3-small/tf_model.h5
/kaggle/input/microsoftdeberta-v3-small/tokenizer_config.json
/kaggle/input/microsoftdebe



  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['prompt_id', 'text', 'labels', 'input', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2437
    })
    test: Dataset({
        features: ['prompt_id', 'text', 'labels', 'input', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 431
    })
})


  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/microsoftdeberta-v3-small and are newly initialized: ['pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.0445,0.000502
2,0.0001,0.017202


[[-0.91503906  1.14355469]
 [-0.82226562  1.03125   ]
 [-0.86914062  1.08496094]]


In [12]:
import os

def remove_folder_contents(folder):
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                remove_folder_contents(file_path)
                os.rmdir(file_path)
        except Exception as e:
            print(e)

folder_path = '/kaggle/working'
#remove_folder_contents(folder_path)
#os.rmdir(folder_path)