# Dependencies

In [1]:
import pandas as pd

import torch
from tqdm import tqdm
from datetime import datetime
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW

import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

# Data

### Detect AI Generated Text data (https://www.kaggle.com/competitions/llm-detect-ai-generated-text/data)
### DAIGT Proper Train Dataset (https://www.kaggle.com/datasets/thedrcat/daigt-proper-train-dataset/data)
### DAIGT External Dataset (https://www.kaggle.com/datasets/alejopaullier/daigt-external-dataset)
### ArguGPT (https://arxiv.org/abs/2304.07666)
### The Imitation Game (https://arxiv.org/abs/2307.12166)
### artem9k/ai-text-detection-pile Dataset (https://huggingface.co/datasets/artem9k/ai-text-detection-pile)

In [2]:
import pandas as pd

In [3]:
# Detect AI Generated Text data 
df_train_essays = pd.read_csv("./data/train_essays.csv")
df_train_essays.drop_duplicates(inplace=True)
df_train_essays.dropna(subset=['text'], inplace=True)

# DAIGT Proper Train Dataset
df_train_drcat_04 = pd.read_csv("./data_DAIGT/train_drcat_04.csv")
df_train_drcat_04.rename(columns = {"label":"generated"}, inplace=True)
df_train_drcat_04.drop_duplicates(inplace=True)
df_train_drcat_04.dropna(subset=['text'], inplace=True)

# DAIGT External Dataset
df_train_drcat_ext = pd.read_csv("./data_DAIGT/daigt_external_dataset.csv")
df_human = df_train_drcat_ext[['text']].rename(columns={'text': 'text'})
df_human['generated'] = 0
df_ai = df_train_drcat_ext[['source_text']].rename(columns={'source_text': 'text'})
df_ai['generated'] = 1
df_train_drcat_ext = pd.concat([df_human, df_ai], ignore_index=True) # concatenate
df_train_drcat_ext.drop_duplicates(inplace=True)
df_train_drcat_ext.dropna(subset=['text'], inplace=True)

# ArguGPT
df_train_arguGPT_train = pd.read_csv("./ArguGPT/machine-train.csv")
df_ai_train = df_train_arguGPT_train[['text']]
df_ai_train['generated'] = 1
df_train_arguGPT_test = pd.read_csv("./ArguGPT/machine-test.csv")
df_ai_test = df_train_arguGPT_test[['text']]
df_ai_test['generated'] = 1
df_train_arguGPT = pd.concat([df_ai_train, df_ai_test], ignore_index=True) # concatenate
df_train_arguGPT.drop_duplicates(inplace=True)
df_train_arguGPT.dropna(subset=['text'], inplace=True)

# The Imitation game
df_ChatGPT_essay = pd.read_csv("./ChatGPT/ChatGPT_essay.csv")
df_chatgpt = df_ChatGPT_essay[['responses']].rename(columns={'responses': 'text'})
df_chatgpt['generated'] = 1
df_Human_essay_1 = pd.read_csv("./Human/human_essay_1.csv")
df_Human_essay_2 = pd.read_csv("./Human/human_essay_2.csv")
df_human = pd.concat([df_Human_essay_1[["essays"]].rename(columns={'essays': 'text'}), 
                      df_Human_essay_2[["text"]]], ignore_index=True)
df_human['generated'] = 0
df_chatgpt_human = pd.concat([df_chatgpt, df_human], ignore_index=True) # concatenate
df_chatgpt_human.drop_duplicates(inplace=True)
df_chatgpt_human.dropna(subset=['text'], inplace=True)

# artem9k detection pile
from datasets import load_dataset
huggingFace_dataset = load_dataset("artem9k/ai-text-detection-pile")
df_huggingFace_dataset = huggingFace_dataset["train"].to_pandas()
df_huggingFace_dataset.drop(columns=['id'], inplace=True)
df_huggingFace_dataset.rename(columns = {"source":"generated"}, inplace=True)
mapping = {'human': 0, 'ai': 1}
df_huggingFace_dataset['generated'] = df_huggingFace_dataset['generated'].replace(mapping)
count_generated = df_huggingFace_dataset['generated'].value_counts()
## Assuming count_generated[0] is the count for 'generated' == 0 and count_generated[1] is for 'generated' == 1
df_huggingFace_human = df_huggingFace_dataset[df_huggingFace_dataset['generated'] == 0].head(min(100000, count_generated[0]))
df_huggingFace_ai = df_huggingFace_dataset[df_huggingFace_dataset['generated'] == 1].head(min(100000, count_generated[1]))
## Concatenate the two DataFrames
df_huggingFace_dataset = pd.concat([df_huggingFace_human, df_huggingFace_ai]) # concatenate
df_huggingFace_dataset.drop_duplicates(inplace=True)
df_huggingFace_dataset.dropna(subset=['text'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ai_train['generated'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ai_test['generated'] = 1
  df_huggingFace_dataset['generated'] = df_huggingFace_dataset['generated'].replace(mapping)


### combine data

In [4]:
df_train_essays_final = pd.concat(
    [df_huggingFace_dataset[["text", "generated"]], 
     df_chatgpt_human[["text", "generated"]], 
     df_train_arguGPT[["text", "generated"]], 
     df_train_drcat_ext[["text", "generated"]], 
     df_train_drcat_04[["text", "generated"]], 
     df_train_essays[["text", "generated"]]],
     ignore_index=True)
df_train_essays_final.head()

Unnamed: 0,text,generated
0,12 Years a Slave: An Analysis of the Film Essa...,0
1,20+ Social Media Post Ideas to Radically Simpl...,0
2,2022 Russian Invasion of Ukraine in Global Med...,0
3,533 U.S. 27 (2001) Kyllo v. United States: The...,0
4,A Charles Schwab Corporation Case Essay\n\nCha...,0


In [5]:
df_train_essays_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256882 entries, 0 to 256881
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   text       256882 non-null  object
 1   generated  256882 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.9+ MB


In [7]:
df_train_essays_final['generated'].value_counts()

generated
0    136226
1    120656
Name: count, dtype: int64

## data augmentation

In [16]:
import random
import re

# cap random characters
def random_capitalization(paragraph): 
    return ''.join(c.upper() if random.random() < 0.2 else c for c in paragraph)

# in a paragraph, manipulate a word 20% of the time, which one of the following operation
# 1. randomly delete a character in a word
# 2. randomly insert a char into a word
# 3. randomly swap 2 chars in a word
def manipulate_text(paragraph):
    words = paragraph.split()
    manipulated_words = []
    for word in words:
        if random.random() <= 0.2:
            word = char_manipulation(word)
        manipulated_words.append(word)
    
    return ' '.join(manipulated_words)

def char_manipulation(word):
    operations = [
        lambda w: w[:idx] + w[idx+1:] if len(w) > 0 else '',  # Delete at a random char
        lambda w: w[:idx] + random.choice('abcdefghijklmnopqrstuvwxyz') + w[idx:] if len(w) > 0 else random.choice('abcdefghijklmnopqrstuvwxyz'),  # Insert at a random index
        lambda w: swap_random_chars(w) if len(w) > 1 else w  # Swapping two random characters
    ]
    idx = random.randint(0, len(word))
    return random.choice(operations)(word)

def swap_random_chars(word):
    a, b = random.sample(range(len(word)), 2)
    word_list = list(word)
    word_list[a], word_list[b] = word_list[b], word_list[a]
    return ''.join(word_list)

# randomly shuffle the paragraph's sentences
def shuffle_sentences(paragraph):
    sentences = re.split(r'(?<=[.!?]) +', paragraph)
    random.shuffle(sentences)
    shuffled_paragraph = ' '.join(sentences).strip()
    return shuffled_paragraph

In [17]:
def apply_text_transformation(df, sample_rate, func, seed=1):
    """Applies a text transformation function to a fraction of rows in a DataFrame column 'text'."""
    # Sample indices to transform
    sampled_indices = df.sample(frac=sample_rate, random_state=seed).index

    # Copy the sampled data
    df_augmented = df.loc[sampled_indices].copy()

    # Apply transformation
    df_augmented['text'] = df_augmented['text'].apply(func)

    # Update the og df
    df.update(df_augmented)

sample_rate = 0.1

# Shuffle sentences
apply_text_transformation(df_train_essays_final, sample_rate, shuffle_sentences)

# Character manipulation
apply_text_transformation(df_train_essays_final, sample_rate, manipulate_text)

# Random capitalization
apply_text_transformation(df_train_essays_final, sample_rate, random_capitalization)

In [18]:
s = '''Once upon a time, in a quaint little town nestled among the rolling hills, there lived a man named William. 
He was blessed with six sons, each one bringing immeasurable joy to his life. 
His house was always a cacophony of laughter, mischief, and love. 
However, William never could have imagined that his seventh son would hold such a grim destiny.'''

w = 'hello'

capitalization = random_capitalization(s)
char_mani = manipulate_text(s)
shuffle_s = shuffle_sentences(s)
print(capitalization)
print()
print(char_mani)
print()
print(shuffle_s)

OncE upon a time, in a quAiNt little Town nestLed amONg the roLlinG hiLls, tHere Lived a man NAmed William. 
He was BlesSed wItH SIX sons, EAch One bRinging immeasUrable joy to his lIfe. 
HIs HouSe was always a caCOphony of laughter, Mischief, aNd loVe. 
HOwEver, William NEvEr could hAve imagIned thaT his seventh soN would hOld such a Grim deStiNy.

Once upon ak time, in a quaint little town nestdel among the rolling hills, there live a man ndmea William. He aws blessed with six sons, each one bringing immeasurable joy to his l.fei His house was always a cacophony of laughter, mischief, and love. However, William never could hkave imagirned that hsi seventh son ould hold such a grim destiny.

He was blessed with six sons, each one bringing immeasurable joy to his life. 
However, William never could have imagined that his seventh son would hold such a grim destiny. Once upon a time, in a quaint little town nestled among the rolling hills, there lived a man named William. 
His house was 

# Train model

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_du1 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", 
                                                                num_labels=2,
                                                                dropout=0.5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model_du1.to(device)

cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.5, inplace=False)
 

In [11]:
SEQ_LENGTH = 512

from sklearn.model_selection import train_test_split
X_train, X_tmp, y_train, y_tmp = train_test_split(df_train_essays_final["text"],
                                                    df_train_essays_final["generated"],
                                                    test_size=0.20,
                                                    random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_tmp,
                                                y_tmp,
                                                test_size=0.50,
                                                random_state=42)

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        
        return text, label

train_data = TextDataset(X_train, y_train)
val_data = TextDataset(X_val, y_val)
test_data = TextDataset(X_test, y_test)

def collate_batch(batch):
    texts, labels = zip(*batch)
    tokens = tokenizer(list(texts), padding=True, truncation=True, return_tensors="pt")
    labels = torch.tensor(labels)
    return tokens, labels

In [12]:
train_data[0]

('Cross-Cultural Promotion. Hazelton International Report\n\nIntroduction\n\nFor construction projects, careful planning and system thinking have a great role in effective project development and implementation. One successful function of system thinking is in the area of project management. In this case, a project is divided into manageable pieces to facilitate its completion and management, then the pieces are put back together using interface management, and the system should work properly. The important lesson is that it is not possible to optimize the parts of the system and expect that the whole system will function optimally. The optimization of the whole system takes precedence and requires that the subsystems (i.e., the manageable parts of the project) are therefore sub-optimized. It is applicable to a project, a set of projects that constitute a program, a set of programs that constitute an organizational plan, a set of organizational plans that follow selected strategies, a 

In [13]:
# from torch.optim import AdamW

learning_rate = 7e-5
num_epochs = 2 # change later
batch_size = 64 # change later

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

valid_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

# it is recommended to use AdamW for distilBERT models

# should we include scheduler?




In [14]:
optimizer = AdamW(model_du1.parameters(), lr=learning_rate)

model_du1.train()

train_loss_values = []
valid_loss_values = []
steps = []
val_total_count = 50
graph_every = 200

for epoch in range(num_epochs):
    # Training phase 
    epoch_iterator = tqdm(train_loader, desc="Iteration")
    # calculate loss on an epoch level 

    for step, (tokens, labels) in enumerate(epoch_iterator):
        input_ids = tokens['input_ids'].to(model_du1.device)
        attention_mask = tokens['attention_mask'].to(model_du1.device)
        labels = labels.to(model_du1.device)

        optimizer.zero_grad()
        outputs = model_du1(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        if (step + 1) % graph_every == 0: # exclude step 0
            print('hi')
            train_loss_values.append(loss.item())
            steps.append(step + epoch * len(epoch_iterator))

            # Validation
            model_du1.eval()
            total_val_loss = 0
            val_count = 0
            with torch.no_grad():
                for val_tokens, val_labels in valid_loader:
                    val_input_ids = val_tokens['input_ids'].to(model_du1.device)
                    val_attention_mask = val_tokens['attention_mask'].to(model_du1.device)
                    val_labels = val_labels.to(model_du1.device)
                    val_outputs = model_du1(val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
                    val_loss = val_outputs.loss
                    total_val_loss += val_loss.item()
                    val_count += 1
                    if val_count > val_total_count:
                        break

            avg_val_loss = total_val_loss / val_total_count
            valid_loss_values.append(avg_val_loss)


            # Switch back to training mode
            model_du1.train()

            # Plotting
            plt.figure(figsize=(10, 6))
            plt.plot(steps, train_loss_values, label='Training Loss')
            plt.plot(steps, valid_loss_values, label='Validation Loss', linestyle='--')
            plt.xlabel('Step')
            plt.ylabel('Loss')
            plt.title(f'Training & Validation Loss Every {graph_every} Steps')
            plt.legend()
            plt.show()
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_filename = f'./finetuned_uncased_250kdata_{timestamp}_{epoch}'
    model_du1.save_pretrained(model_filename)
    print(f"Model saved to {model_filename}")


# Save the model with the learning rate in the filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_filename = f'./finetuned_uncased_250kdata_{timestamp}_final'
model_du1.save_pretrained(model_filename)
print(f"Model saved to {model_filename}")


Iteration:   1%|          | 38/3212 [00:32<45:24,  1.16it/s] 


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "d:\Anaconda3\envs\school\lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\berna\AppData\Local\Temp\ipykernel_39404\831553052.py", line 16, in <module>
    for step, (tokens, labels) in enumerate(epoch_iterator):
  File "d:\Anaconda3\envs\school\lib\site-packages\tqdm\std.py", line 1181, in __iter__
    for obj in iterable:
  File "d:\Anaconda3\envs\school\lib\site-packages\torch\utils\data\dataloader.py", line 631, in __next__
    data = self._next_data()
  File "d:\Anaconda3\envs\school\lib\site-packages\torch\utils\data\dataloader.py", line 675, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "d:\Anaconda3\envs\school\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "C:\Users\berna\AppData\Local\Temp\ipykernel_39404\1151847221.py", line 34

In [None]:



test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)


def calculate_accuracy(model, data_loader):
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    size_test = 10
    test_count = 0

    with torch.no_grad(): 
        for batch in tqdm(data_loader):
            input_ids = batch[0]['input_ids'].to(model.device)
            attention_mask = batch[0]['attention_mask'].to(model.device)
            labels = batch[1].to(model.device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)

            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)
            test_count += 1
            if test_count > size_test:
                break

    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate test accuracy
chosen_model = "finetuned_uncased_250kdata_20240418_155702_final"
model_test = DistilBertForSequenceClassification.from_pretrained(chosen_model)
test_accuracy = calculate_accuracy(model_test, test_loader)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

  2%|▏         | 10/402 [02:05<1:22:06, 12.57s/it]

Test Accuracy: 99.86%



