In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Training_Essay_Data.csv')

In [3]:
data.head()

Unnamed: 0,text,generated
0,Car-free cities have become a subject of incre...,1
1,"Car Free Cities Car-free cities, a concept ga...",1
2,A Sustainable Urban Future Car-free cities ...,1
3,Pioneering Sustainable Urban Living In an e...,1
4,The Path to Sustainable Urban Living In an ...,1


In [4]:
ai_text_generated = data['generated'].value_counts()


In [5]:
ai_text_generated

generated
0    17508
1    11637
Name: count, dtype: int64

In [6]:
# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


In [7]:
#Tokenize the dataset
input_ids = []
attention_mask = []
for index, text in enumerate(data['text']):
    tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=256)
    input_ids.append(tokenized["input_ids"])
    attention_mask.append(tokenized["attention_mask"])      


In [8]:

input_ids = torch.tensor(input_ids, dtype=torch.long)
attention_mask = torch.tensor(attention_mask, dtype=torch.long)
labels = torch.tensor(data["generated"].values, dtype=torch.long)


In [9]:
print(input_ids)

tensor([[  101,  8185,   118,  ...,  7562,  2116,   102],
        [  101,  8185,  4299,  ...,  1468,   119,   102],
        [  101,   138, 21779,  ...,  1105,  2043,   102],
        ...,
        [  101,  1247,   112,  ...,  1103,  5072,   102],
        [  101,  1249,  1195,  ...,  3702,  1403,   102],
        [  101, 16644,  1138,  ...,  1103, 11738,   102]])


In [10]:
print(attention_mask)

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Split set
batch_size = 16
tokenized_dataset = torch.utils.data.TensorDataset(input_ids, attention_mask, labels)
train_data, test_data = train_test_split(tokenized_dataset, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)



In [13]:
# Create DataLoaders for training, validation, and test
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [14]:
for batch_idx, batch in enumerate(train_dataloader):
    if batch_idx == 0:
        input_ids, attention_mask, labels = batch
        # Print or process the first batch here
        print("Batch 0 - Input IDs:", input_ids)
        print("Batch 0 - Attention Mask:", attention_mask)
        print("Batch 0 - Labels:", labels)
        break  # Stop after processing the first batch


Batch 0 - Input IDs: tensor([[  101, 17408,   131,  ...,  6818,  1116,   102],
        [  101,  9953, 21804,  ...,  3337, 25677,   102],
        [  101,  1109,  8366,  ..., 11270,  1654,   102],
        ...,
        [  101,  1109,  1642,  ...,  1437,  1128,   102],
        [  101, 11772,   131,  ...,  2108, 10238,   102],
        [  101,   146,  1341,  ...,     0,     0,     0]])
Batch 0 - Attention Mask: tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])
Batch 0 - Labels: tensor([1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0])


In [15]:
from transformers import BertModel, BertTokenizer
import torch.nn as nn

class AITextDetectionModel(nn.Module):
    def __init__(self, num_classes=2):
        super(AITextDetectionModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits


In [16]:
device = torch.device("cuda")

In [17]:
# instantiate your model
AI_text_model = AITextDetectionModel(num_classes=2).to(device)

In [18]:
print(AI_text_model)

AITextDetectionModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [19]:
# define loss function
criterion = nn.CrossEntropyLoss()

# define optimizer
optimizer = torch.optim.Adam(AI_text_model.parameters(), lr=0.01)

In [20]:
from tqdm import tqdm
num_epochs = 5
print_interval = 500  

for epoch in range(num_epochs):
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    counter = 0  

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        # Unpack the batch into input_ids, attention_mask, and labels
        input_ids, attention_mask, labels = [tensor.to(device) for tensor in batch]

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = AI_text_model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

        # print running loss for each batch after every 500 iterations
        running_loss += loss.item()
        counter += 1

        if counter % print_interval == 0 or counter == len(train_dataloader):
            avg_loss = running_loss / counter
            avg_acc = correct_predictions / total_predictions
            tqdm.write(f'Train Loss: {avg_loss:.3f}, Train Acc: {avg_acc:.3f}', end='\r')

    # Print at the end of each epoch
    tqdm.write(f'Epoch {epoch+1}, Train Loss: {avg_loss:.3f}, Train Acc: {avg_acc:.3f}')
    print(f"Epoch {epoch+1} finished")


Epoch 1/5:  34%|███▍      | 500/1458 [2:33:35<4:54:58, 18.47s/it]

Train Loss: 1.048, Train Acc: 0.528

Epoch 1/5:  69%|██████▊   | 1000/1458 [5:06:55<2:19:40, 18.30s/it]

Train Loss: 1.001, Train Acc: 0.525

Epoch 1/5: 100%|██████████| 1458/1458 [7:26:44<00:00, 18.38s/it]  


Epoch 1, Train Loss: 0.974, Train Acc: 0.524
Epoch 1 finished


Epoch 2/5:  34%|███▍      | 500/1458 [2:32:57<4:53:01, 18.35s/it]

Train Loss: 1.099, Train Acc: 0.520

Epoch 2/5:  69%|██████▊   | 1000/1458 [5:06:01<2:20:00, 18.34s/it]

Train Loss: 1.029, Train Acc: 0.521

Epoch 2/5: 100%|██████████| 1458/1458 [7:25:42<00:00, 18.34s/it]  


Epoch 2, Train Loss: 1.025, Train Acc: 0.522
Epoch 2 finished


Epoch 3/5:  34%|███▍      | 500/1458 [2:32:46<4:52:55, 18.35s/it]

Train Loss: 0.932, Train Acc: 0.536

Epoch 3/5:  69%|██████▊   | 1000/1458 [5:05:02<2:18:58, 18.21s/it]

Train Loss: 0.957, Train Acc: 0.536

Epoch 3/5: 100%|██████████| 1458/1458 [7:23:43<00:00, 18.26s/it]  


Epoch 3, Train Loss: 0.986, Train Acc: 0.531
Epoch 3 finished


Epoch 4/5:  34%|███▍      | 500/1458 [2:32:19<4:49:32, 18.13s/it]

Train Loss: 1.057, Train Acc: 0.523

Epoch 4/5:  69%|██████▊   | 1000/1458 [5:04:37<2:21:33, 18.55s/it]

Train Loss: 1.024, Train Acc: 0.529

Epoch 4/5: 100%|██████████| 1458/1458 [7:26:00<00:00, 18.35s/it]  


Epoch 4, Train Loss: 1.016, Train Acc: 0.525
Epoch 4 finished


Epoch 5/5:  34%|███▍      | 500/1458 [2:33:26<4:52:17, 18.31s/it]

Train Loss: 1.020, Train Acc: 0.530

Epoch 5/5:  69%|██████▊   | 1000/1458 [5:07:34<2:20:04, 18.35s/it]

Train Loss: 1.041, Train Acc: 0.520

Epoch 5/5: 100%|██████████| 1458/1458 [7:28:04<00:00, 18.44s/it]  

Epoch 5, Train Loss: 1.013, Train Acc: 0.524
Epoch 5 finished





In [21]:
# Validation loop
with torch.no_grad():
    AI_text_model.eval()  # Set the model to evaluation mode
    valid_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    for batch in val_dataloader:
        # Unpack the batch into input_ids, attention_mask, and labels
        input_ids, attention_mask, labels = [tensor.to(device) for tensor in batch]

        # forward
        outputs = AI_text_model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        # calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

        # calculate running loss
        valid_loss += loss.item()

    avg_loss = valid_loss / len(val_dataloader)
    avg_acc = correct_predictions / total_predictions
    print(f'Validation Loss: {avg_loss:.3f}, Validation Acc: {avg_acc:.3f}')

Validation Loss: 1.359, Validation Acc: 0.612


In [22]:
# Test loop
with torch.no_grad():
    AI_text_model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    for batch in test_dataloader:
        # Unpack the batch into input_ids, attention_mask, and labels
        input_ids, attention_mask, labels = [tensor.to(device) for tensor in batch]

        # forward
        outputs = AI_text_model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        # calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

        # calculate running loss
        test_loss += loss.item()

    avg_loss = test_loss / len(test_dataloader)
    avg_acc = correct_predictions / total_predictions
    print(f'Test Loss: {avg_loss:.3f}, Test Acc: {avg_acc:.3f}')

Test Loss: 1.398, Test Acc: 0.602


In [23]:
# save the model
torch.save(AI_text_model.state_dict(), 'ai_text_model.pth')

In [28]:


# Sample text to evaluate
sample_text = "  Our brain is so powerful that it can easily imagine scenarios and make use of our senses. The job of such an essay is to appeal to our senses in a way that it creates an image in our minds. Hence a descriptive essay plays with at least one of our five senses (touch, smell, taste, hearing, sight)."

# List to store input IDs and attention masks
input_ids = []
attention_mask = []

# Tokenize and preprocess the sample text
tokenized = tokenizer(sample_text, padding="max_length", truncation=True, max_length=512)
input_ids.append(tokenized["input_ids"])
attention_mask.append(tokenized["attention_mask"])

# Convert input_ids and attention_mask to PyTorch Tensors
input_ids = torch.tensor(input_ids, dtype=torch.long)
attention_mask = torch.tensor(attention_mask, dtype=torch.long)

# Set the model to evaluation mode
AI_text_model.eval()

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

# Forward pass
with torch.no_grad():
    outputs = AI_text_model(input_ids=input_ids, attention_mask=attention_mask)
    predicted_class = torch.argmax(outputs, dim=1).item()

# Define class labels (0 for fake, 1 for real)
class_labels = ["NOT AI", "AI TEXT"]

# Get the predicted label
predicted_label = class_labels[predicted_class]

# Get the probability scores
probability_scores = torch.softmax(outputs, dim=1)
fake_probability = probability_scores[0][0].item()
real_probability = probability_scores[0][1].item()

# Print the result
print(f"Sample text: {sample_text}")
print(f"Predicted label: {predicted_label}")
print(f"Confidence - NOT AI: {fake_probability * 100:.2f}%")
print(f"Confidence - AI TEXT: {real_probability * 100:.2f}%")


Sample text:   Our brain is so powerful that it can easily imagine scenarios and make use of our senses. The job of such an essay is to appeal to our senses in a way that it creates an image in our minds. Hence a descriptive essay plays with at least one of our five senses (touch, smell, taste, hearing, sight).
Predicted label: NOT AI
Confidence - NOT AI: 96.81%
Confidence - AI TEXT: 3.19%
