In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AdamW
import ast
import nltk

In [None]:
df = pd.read_csv('hc3.csv')
human_paragraphs = [''.join(ast.literal_eval(human_paragraph)).replace('\n', '').split('.') for human_paragraph in list(df['human_answers'])]
chatgpt_paragraphs = [''.join(ast.literal_eval(chatgpt_paragraph)).replace('\n', '').split('.') for chatgpt_paragraph in list(df['chatgpt_answers'])]

In [None]:
def create_overlapping_sequences(paragraphs, num_sentences):
    combined = []
    human_combined = []
    chatgpt_combined = []
    for paragraph in paragraphs:
        sentences = paragraph
        for i in range(len(sentences) - num_sentences + 1):
            combined.append(' '.join(sentences[i:i+num_sentences]).strip())
    return combined

human_combined = create_overlapping_sequences(human_paragraphs, 3)
chatgpt_combined = create_overlapping_sequences(chatgpt_paragraphs, 3)

In [None]:
human_df = pd.DataFrame({'text': human_combined})

In [None]:
ai_df = pd.DataFrame({'text': chatgpt_combined})

In [4]:
from tqdm import tqdm

class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.data = []
        for i, row in tqdm(data.iterrows(), total=len(data)):
            encoded = self.tokenizer.encode_plus(row['text'], add_special_tokens=True, padding='max_length',
                                                 truncation=True, max_length=max_length, return_tensors='pt')
            self.data.append((encoded['input_ids'], encoded['attention_mask'], row['label']))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids, attention_mask, label = self.data[idx]
        return {'input_ids': input_ids.squeeze(0), 'attention_mask': attention_mask.squeeze(0), 'label': label}

In [5]:
# Split the data into train, validation and test sets
def train_val_test_split(ai_df, human_df, val_size=0.2, test_size=0.2):
    ai_df['label'] = 0
    human_df['label'] = 1
    data = pd.concat([ai_df, human_df], ignore_index=True, sort=False)
    data = data.sample(frac=1, random_state=42).reset_index(drop=True) # Shuffle the data
    split1 = int(len(data) * (1 - (val_size + test_size)))
    split2 = int(len(data) * (1 - test_size))
    train_data = data[:split1]
    val_data = data[split1:split2]
    test_data = data[split2:]
    return train_data, val_data, test_data

In [None]:
# Load the data
train_data, val_data, test_data = train_val_test_split(ai_df, human_df, val_size=0.2, test_size=0.2)

In [7]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define the optimizer and learning rate scheduler
# optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay = 0.01)

optimizer = Adafactor(model.parameters(), lr=1e-3, relative_step=True)

# Define the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Define the training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
import pickle

# Create instances of SentimentDataset for train and eval datasets
train_dataset = SentimentDataset(train_data, tokenizer, max_length=128)
val_dataset = SentimentDataset(val_data, tokenizer, max_length=128)
test_dataset = SentimentDataset(test_data, tokenizer, max_length=128)

# Save the datasets as serialized objects
with open('train_dataset.pickle', 'wb') as f:
    pickle.dump(train_dataset, f)

with open('val_dataset.pickle', 'wb') as f:
    pickle.dump(val_dataset, f)

with open('test_dataset.pickle', 'wb') as f:
    pickle.dump(test_dataset, f)

In [8]:
import pickle

with open('train_dataset.pickle', 'rb') as f:
    train_dataset = pickle.load(f)

with open('val_dataset.pickle', 'rb') as f:
    val_dataset = pickle.load(f)

with open('test_dataset.pickle', 'rb') as f:
    test_dataset = pickle.load(f)

### Training the Model

In [None]:
num_epochs = 5
best_val_accuracy = 0
for epoch in range(num_epochs):
    model.train()
    train_preds = []
    train_labels = []
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = int(0.1 * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
    
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
                
        optimizer.step()
        scheduler.step()  # Add this line to update the learning rate
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        train_preds.extend(predictions.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())
        
        progress_bar.set_postfix({'loss': loss.item()})
    
    train_acc = accuracy_score(train_labels, train_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Train Accuracy: {train_acc*100:.2f}%')
    
    model.eval()
    
    eval_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
    total_correct = 0
    total_samples = 0
    for batch in eval_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            total_correct += (predictions == labels).sum().item()
            total_samples += len(labels)
            
    accuracy = total_correct / total_samples
    print(f'Test Accuracy: {accuracy*100:.2f}%')
    
    # Evaluate on validation set
    model.eval()
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
    total_correct = 0
    total_samples = 0
    val_preds = []
    val_labels = []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            total_correct += (predictions == labels).sum().item()
            total_samples += len(labels)
            val_preds.extend(predictions.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_labels, val_preds)
    print(f'Val Accuracy: {val_acc*100:.2f}%')
    
    # Save the best model based on validation accuracy
    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        torch.save(model.state_dict(), 'best_model.pth')
    
print(f'Best Validation Accuracy: {best_val_accuracy*100:.2f}%')

In [9]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mashwathb24[0m ([33mnyu-tandon[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [10]:
import os
os.environ['WANDB_NOTEBOOK_NAME'] = 'BERT_final.ipynb'

In [None]:

sweep_config = {
    'name': 'bert-sweep',
    'method': 'random',
    'metric': {
        
    'name': 'accuracy',
    'goal': 'maximize'
        
    },
    'parameters': {
        'learning_rate': {
            'min': 1e-5,
            'max': 5e-5
    },
        'batch_size': {
            'values': [16, 32, 64, 128]
    },
        'num_epochs': {
            'values': [3, 5, 10]
    }
}
    }

In [None]:
sweep_id = wandb.sweep(sweep_config, project='bert-sentiment-classification')

#### BERT with wandB

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import wandb


def train(config=None):
    if config is None:
        config = {
            'learning_rate': 5e-5,
            'num_epochs': 5,
            'batch_size': 64
        }

    # Initialize the tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Define the training loop
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=config['learning_rate'])
    loss_fn = torch.nn.CrossEntropyLoss()

    num_epochs = config['num_epochs']
    best_val_accuracy = 0
    for epoch in range(num_epochs):
        model.train()
        train_preds = []
        train_labels = []
        train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()

            optimizer.step()

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            train_preds.extend(predictions.cpu().numpy())
            train_labels.extend(labels.cpu().numpy())

            progress_bar.set_postfix({'loss': loss.item()})

        train_acc = accuracy_score(train_labels, train_preds)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Train Accuracy: {train_acc*100:.2f}%')

        model.eval()
        val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)
        total_correct = 0
        total_samples = 0
        val_preds = []
        val_labels = []
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=1)
                total_correct += (predictions == labels).sum().item()
                total_samples += len(labels)
                val_preds.extend(predictions.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        val_acc = accuracy_score(val_labels, val_preds)
        print(f'Val Accuracy: {val_acc*100:.2f}%')

        # Save the best model based on validation accuracy
        if val_acc > best_val_accuracy:
            best_val_accuracy = val_acc
            torch.save(model.state_dict(), 'best_model.pth')

        wandb.log({'epoch': epoch, 'loss': loss.item(), 'train_accuracy': train_acc, 'val_accuracy': val_acc})

    print(f'Best Validation Accuracy: {best_val_accuracy*100:.2f}%')

def sweep():
    # Define the hyperparameters to tune using the wandb config object
    config_defaults = {
        'learning_rate': 5e-5,
        'num_epochs': 5,
        'batch_size': 64
    }
    wandb.init(config=config_defaults)

    # Retrieve the hyperparameter values from wandb
    config = wandb.config

    # Train the model with the given hyperparameters
    train(config)
    
wandb.agent(sweep_id, function=sweep)

Create sweep with ID: gr39xj51
Sweep URL: https://wandb.ai/nyu-tandon/bert-sentiment-classification/sweeps/gr39xj51


[34m[1mwandb[0m: Agent Starting Run: y2pxx6ty with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 3.4297843990743166e-05
[34m[1mwandb[0m: 	num_epochs: 5


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/5, Loss: 0.0036, Train Accuracy: 98.56%
Val Accuracy: 99.00%


Epoch 2/5: 100%|██████████| 3563/3563 [30:25<00:00,  1.95it/s, loss=0.000656]


Epoch 2/5, Loss: 0.0007, Train Accuracy: 99.64%
Val Accuracy: 99.05%


Epoch 3/5: 100%|██████████| 3563/3563 [27:09<00:00,  2.19it/s, loss=0.000112]


Epoch 3/5, Loss: 0.0001, Train Accuracy: 99.75%
Val Accuracy: 98.21%


Epoch 4/5: 100%|██████████| 3563/3563 [27:10<00:00,  2.18it/s, loss=2.58e-5] 


Epoch 4/5, Loss: 0.0000, Train Accuracy: 99.81%
Val Accuracy: 98.56%


Epoch 5/5: 100%|██████████| 3563/3563 [37:43<00:00,  1.57it/s, loss=0.000372]


Epoch 5/5, Loss: 0.0004, Train Accuracy: 99.84%
Val Accuracy: 98.78%
Best Validation Accuracy: 99.05%


0,1
epoch,▁▃▅▆█
loss,█▂▁▁▂
train_accuracy,▁▇███
val_accuracy,██▁▄▆

0,1
epoch,4.0
loss,0.00037
train_accuracy,0.99839
val_accuracy,0.98781


[34m[1mwandb[0m: Agent Starting Run: ctys20ot with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 4.348854275205252e-05
[34m[1mwandb[0m: 	num_epochs: 5


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/5, Loss: 0.0054, Train Accuracy: 98.45%
Val Accuracy: 97.46%


Epoch 2/5: 100%|██████████| 7125/7125 [29:06<00:00,  4.08it/s, loss=0.0001]  


Epoch 2/5, Loss: 0.0001, Train Accuracy: 99.49%
Val Accuracy: 97.91%


Epoch 3/5:   7%|▋         | 480/7125 [01:57<27:06,  4.09it/s, loss=0.000554]

### Loading Best Model

In [None]:
# Loading Best Model

# Bert Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# #GPT2 Tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load('models/bert_model_5epochs.pth'))

from transformers import AdamW, get_linear_schedule_with_warmup

# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Define the training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

### Testing the model

In [None]:
def get_sentiment(text):
    encoded = tokenizer.encode_plus(text, add_special_tokens=True, padding='max_length',
                                     truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
        logits = outputs.logits
        probabilities = torch.sigmoid(logits)
        predictions = torch.argmax(probabilities, dim=1)
        ai_probability = probabilities.detach().cpu().numpy()[0][0]
        human_probability = probabilities.detach().cpu().numpy()[0][1]
        total_probability = ai_probability + human_probability
        ai_percentage = ai_probability / total_probability
        human_percentage = human_probability / total_probability
        print("\n")
        print(f'Percentage of AI content: {ai_percentage*100:.2f}%')
        print(f'Percentage of Human content: {human_percentage*100:.2f}%')
        print("\n")
        sentiment = 'AI-generated' if predictions.item() == 0 else 'Human-generated'
    return sentiment

# Get user input
while True:
    print("\n")
    text = input("Enter a text to classify its sentiment (type 'quit' to exit): \n \n")
    if text.lower() == 'quit':
        break
    sentiment = get_sentiment(text)
    print(f"Sentiment: {sentiment}")
