In [37]:
import json
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from torch.nn.functional import softmax
from sentence_transformers import SentenceTransformer
import pandas as pd



In [38]:
df = pd.read_csv('Training_Essay_Data.csv')


In [39]:
df

Unnamed: 0,text,generated
0,Car-free cities have become a subject of incre...,1
1,"Car Free Cities Car-free cities, a concept ga...",1
2,A Sustainable Urban Future Car-free cities ...,1
3,Pioneering Sustainable Urban Living In an e...,1
4,The Path to Sustainable Urban Living In an ...,1
...,...,...
29140,There has been a fuss about the Elector Colleg...,0
29141,Limiting car usage has many advantages. Such a...,0
29142,There's a new trend that has been developing f...,0
29143,As we all know cars are a big part of our soci...,0


In [40]:
df.shape

(29145, 2)

In [41]:
df.isna().sum()

Unnamed: 0,0
text,0
generated,0


In [42]:
df['text'].duplicated().sum()

1805

In [43]:
df.drop_duplicates(subset=['text'],inplace=True)

In [44]:
df.shape

(27340, 2)

In [45]:
df.isna().sum()

Unnamed: 0,0
text,0
generated,0


In [51]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def encode_intent(text):
    """Tokenizes text and returns input IDs & attention mask."""
    encoding = bert_tokenizer(text, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
    return encoding["input_ids"].squeeze(), encoding["attention_mask"].squeeze()


In [52]:
data = []

In [53]:
for x in df['text']:
  data.append(encode_intent(x))

In [76]:
from sklearn.model_selection import train_test_split

In [77]:
X_train, X_test, y_train, y_test = train_test_split(data, df['generated'], test_size=0.2, random_state=42)

In [78]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


In [79]:
class CustomDataset(Dataset):

  def __init__(self, X_train,labels):

    self.X_train = X_train
    self.labels=labels

  def __len__(self):

    return len(self.labels)

  def __getitem__(self, index):
    input_ids, attention_mask = self.X_train[index]  # Unpack tuple
    return input_ids, attention_mask, self.labels[index]


In [80]:
y_train = torch.tensor(y_train.values, dtype=torch.long)


In [64]:
X_train[0]

(tensor([  101,  4442, 11640,  2031,  2019,  7461,  2006,  3071,  1005,  1055,
          2166,  1012,  2111,  2224,  2037,  3526,  3042,  2005,  2471,  2673,
          2107,  2004,  1024,  3793,  2075,  1010,  3331,  1010,  4041,  2477,
          1010,  4385,  1012,  2070,  2111,  2064,  2079,  1037,  2843,  2007,
          2037,  3526,  3042,  1999,  2037,  2192,  1012,  3071,  2003,  2467,
          4208,  2006,  2037,  3042,  2138, 11640,  2024,  2590,  2000,  3071,
          1012,  2108,  2006,  3526, 11640,  2096,  4439,  2003,  1037,  3809,
          3808,  3891,  1012,  2043,  2115,  2006,  2115,  3526,  3042,  2096,
          4439,  2017,  5293,  2008,  2115,  4439,  2138,  2115,  2667,  2000,
          3793,  1012,  2108,  2006,  3526, 11640,  2096,  4439,  2003,  2025,
          3647,  2017,  2064,  2031,  1037,  2482,  5823,  1998,  3480,  1996,
          2060,  2711,  2017,  2718,  1012,  2087,  1997,  2482, 19119,  2272,
          2013,  2043,  2111,  2006,  2037,  3042,  

In [81]:
dataset=CustomDataset(X_train,y_train)
train_loader=DataLoader(dataset,batch_size=16,shuffle=True)

In [82]:
from transformers import AdamW


In [83]:

# Load Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

# Optimizer & Mixed Precision Scaling
optimizer = AdamW(bert_model.parameters(), lr=5e-5, weight_decay=0.01)
scaler = torch.cuda.amp.GradScaler()  # Enables Mixed Precision for speed

# Training Loop
epochs = 3
bert_model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device, non_blocking=True) for x in batch]

        optimizer.zero_grad()

        # Mixed Precision Training
        with torch.cuda.amp.autocast():
            outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Backpropagation with Gradient Clipping
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)  # Prevent gradient explosion
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} completed. Avg Loss: {total_loss / len(train_loader):.4f}")




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()  # Enables Mixed Precision for speed
  with torch.cuda.amp.autocast():


Epoch 1 completed. Avg Loss: 0.4986
Epoch 2 completed. Avg Loss: 0.1012
Epoch 3 completed. Avg Loss: 0.0532


In [84]:
# Save model
bert_model.save_pretrained("AI_DETECTOR")
bert_tokenizer.save_pretrained("AI_DETECTOR")


('AI_DETECTOR/tokenizer_config.json',
 'AI_DETECTOR/special_tokens_map.json',
 'AI_DETECTOR/vocab.txt',
 'AI_DETECTOR/added_tokens.json')

In [85]:
!zip -r AI_DETECTOR.zip AI_DETECTOR


  adding: AI_DETECTOR/ (stored 0%)
  adding: AI_DETECTOR/config.json (deflated 49%)
  adding: AI_DETECTOR/tokenizer_config.json (deflated 75%)
  adding: AI_DETECTOR/model.safetensors (deflated 7%)
  adding: AI_DETECTOR/special_tokens_map.json (deflated 42%)
  adding: AI_DETECTOR/vocab.txt (deflated 53%)


In [86]:
import os

file_path = "AI_DETECTOR.zip"
file_size = os.path.getsize(file_path)  # Size in bytes
print(f"File Size: {file_size / (1024 * 1024):.2f} MB")  # Convert to MB


File Size: 386.58 MB


In [92]:
!cp -r /content/AI_DETECTOR /content/drive/MyDrive/AI/


In [93]:
!ls /content/drive/MyDrive/AI/


config.json  model.safetensors	special_tokens_map.json  tokenizer_config.json	vocab.txt


In [88]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification

# Load Model & Tokenizer from Saved Directory
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = BertForSequenceClassification.from_pretrained("AI_DETECTOR").to(device)
bert_tokenizer = BertTokenizer.from_pretrained("AI_DETECTOR")

bert_model.eval()  # Set model to evaluation mode

def encode_intent(text):
    """Tokenizes text and returns input IDs & attention mask."""
    encoding = bert_tokenizer(text, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
    return encoding["input_ids"].squeeze(), encoding["attention_mask"].squeeze()

def predict(text):
    """Predicts whether a given text is AI-generated or human-written with confidence scores."""
    input_ids, attention_mask = encode_intent(text)

    input_ids = input_ids.unsqueeze(0).to(device)
    attention_mask = attention_mask.unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = F.softmax(logits, dim=1).squeeze().cpu().numpy()  # Apply softmax

    ai_confidence = probs[1] * 100  # Probability of AI-generated class
    human_confidence = probs[0] * 100  # Probability of Human-written class

    return f"🔹 AI-Generated: {ai_confidence:.2f}% | 🔹 Human-Written: {human_confidence:.2f}%"

# 🔹 Predict on a Sample Text
sample_text = '''Artificial intelligence has revolutionized multiple industries, enhancing efficiency and decision-making. Large language models, such as GPT and BERT, enable natural language processing at an unprecedented scale. These models analyze vast amounts of textual data, generating human-like responses. However, concerns regarding AI-generated misinformation and ethical considerations continue to emerge. As AI advances, responsible usage and regulation become critical for maintaining credibility and transparency in digital communications.'''

print("Prediction:", predict(sample_text))


Prediction: 🔹 AI-Generated: 100.00% | 🔹 Human-Written: 0.00%


In [None]:
X_test[0][0].type()

In [96]:
from sklearn.metrics import accuracy_score


In [97]:
# Convert `y_test` properly
if isinstance(y_test, pd.Series):  # If y_test is a Pandas Series, convert it to NumPy
    y_test = y_test.reset_index(drop=True)  # Reset index to avoid KeyError

# Prepare Test Data
input_ids, attention_masks = [], []
for i in range(len(X_test)):
    input_ids.append(X_test[i][0])  # Extract input_ids
    attention_masks.append(X_test[i][1])  # Extract attention_mask

# Create DataLoader
batch_size = 16
test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Evaluate Model
bert_model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        outputs = bert_model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Calculate Accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9771
