In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import time
import datetime
import random
from transformers import AutoConfig
import random
seed_val = 42  # Define a seed value for random number generation
random.seed(seed_val)  # Set the seed for Python's built-in random module
np.random.seed(seed_val)  # Set the seed for NumPy's random module
torch.manual_seed(seed_val)  # Set the seed for PyTorch's random number generator
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_val)  # Set the seed for all GPUs if CUDA is available

In [4]:
import json
data_list = []
file_path = "/kaggle/input/subtaskB_train.jsonl"
with open(file_path, 'r') as file:
    for line in file:
        json_object = json.loads(line)
        data_list.append(json_object)

test_list = []
file_path_test = "/kaggle/input/subtaskB_dev.jsonl"
with open(file_path_test, 'r') as file:
    for line in file:
        json_object = json.loads(line)
        test_list.append(json_object)

random.shuffle(data_list)
random.shuffle(test_list)

In [5]:
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [6]:
import pandas as pd

df = pd.DataFrame(data_list)
model_counts = df.groupby('model').size().reset_index(name='Count')
del df
label_map = {}
label_list = list(model_counts.model)
label_list.append("Fake")
for (i, label) in enumerate(label_list):
    label_map[label] = i
inverted_label_map = {value: key for key, value in label_map.items()}
display(model_counts)

Unnamed: 0,model,Count
0,bloomz,11998
1,chatGPT,11995
2,cohere,11336
3,davinci,11999
4,dolly,11702
5,human,11997


In [7]:
labeled_examples = []
unlabeled_examples = []
test_examples = []

for expl in data_list[: int(len(data_list) * 0.01)]:
    labeled_examples.append((expl['text'], expl["model"]))

for expl in data_list[int(len(data_list) * 0.01): int(len(data_list) * 0.05)]:
    unlabeled_examples.append((expl['text'], 'Fake'))

for expl in test_list:
    test_examples.append((expl['text'], expl["model"]))


In [8]:
from transformers import BertModel, BertTokenizer

class AdapterLayer(nn.Module):
    def __init__(self, input_size, adapter_size):
        super(AdapterLayer, self).__init__()
        self.down_project = nn.Linear(input_size, adapter_size)
        self.up_project = nn.Linear(adapter_size, input_size)

    def forward(self, x):
        down_projected = self.down_project(x)
        activated = nn.functional.relu(down_projected)
        up_projected = self.up_project(activated)
        return x + up_projected

class BertWithAdapters(BertModel):
    def __init__(self, config):
        super(BertWithAdapters, self).__init__(config)
        self.adapters = nn.ModuleList([AdapterLayer(config.hidden_size, adapter_size=64) for _ in range(config.num_hidden_layers)])
        self.logit = nn.Linear(768, 6)
        self.softmax = nn.Softmax(dim=-1)
        
    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = super().forward(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state

        for adapter_layer in self.adapters:
            sequence_output = adapter_layer(sequence_output)
        output = sequence_output.mean(dim=1)
        return output

model_name = "bert-base-cased"
transformer = BertWithAdapters.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertWithAdapters were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['bert.adapters.0.down_project.bias', 'bert.adapters.0.down_project.weight', 'bert.adapters.0.up_project.bias', 'bert.adapters.0.up_project.weight', 'bert.adapters.1.down_project.bias', 'bert.adapters.1.down_project.weight', 'bert.adapters.1.up_project.bias', 'bert.adapters.1.up_project.weight', 'bert.adapters.10.down_project.bias', 'bert.adapters.10.down_project.weight', 'bert.adapters.10.up_project.bias', 'bert.adapters.10.up_project.weight', 'bert.adapters.11.down_project.bias', 'bert.adapters.11.down_project.weight', 'bert.adapters.11.up_project.bias', 'bert.adapters.11.up_project.weight', 'bert.adapters.2.down_project.bias', 'bert.adapters.2.down_project.weight', 'bert.adapters.2.up_project.bias', 'bert.adapters.2.up_project.weight', 'bert.adapters.3.down_project.bias', 'bert.adapters.3.down_project.weight', 'bert.adapters.3.up_project.bias', 'bert.adapters

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [9]:
import math
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

def generate_data_loader(input_examples, label_masks, label_map, do_shuffle=False, balance_label_examples=False, batch_size=16):
    '''
    Generate a DataLoader given the input examples, optionally masked if they are not labeled.

    Args:
        input_examples (list): List of input examples.
        label_masks (list): List of label masks.
        label_map (dict): Mapping of labels to IDs.
        do_shuffle (bool, optional): Whether to shuffle the data. Defaults to False.
        balance_label_examples (bool, optional): Whether to balance labeled examples. Defaults to False.
        batch_size (int, optional): Batch size for DataLoader. Defaults to 16.

    Returns:
        DataLoader: DataLoader object for training or validation.
    '''
    examples = []

    # Count the percentage of labeled examples
    num_labeled_examples = sum(label_masks)
    label_mask_rate = num_labeled_examples / len(input_examples)

    # Apply balance if required
    for index, ex in enumerate(input_examples):
        if label_mask_rate == 1 or not balance_label_examples:
            examples.append((ex, label_masks[index]))
        else:
            if label_masks[index]:
                balance = int(1 / label_mask_rate)
                balance = int(math.log(balance, 2))
                balance = max(balance, 1)
                for _ in range(balance):
                    examples.append((ex, label_masks[index]))
            else:
                examples.append((ex, label_masks[index]))

    # Tokenization
    input_ids = []
    input_mask_array = []
    label_mask_array = []
    label_id_array = []

    for text, label_mask in examples:
        encoded_sent = tokenizer.encode(text[0], add_special_tokens=True, max_length=256, padding="max_length", truncation=True)
        input_ids.append(encoded_sent)
        label_id_array.append(label_map[text[1]])
        label_mask_array.append(label_mask)

    # Attention masks (to ignore padded input wordpieces)
    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        input_mask_array.append(att_mask)

    # Conversion to Tensor
    input_ids = torch.tensor(input_ids)
    input_mask_array = torch.tensor(input_mask_array)
    label_id_array = torch.tensor(label_id_array, dtype=torch.long)
    label_mask_array = torch.tensor(label_mask_array)

    # Build the TensorDataset
    dataset = TensorDataset(input_ids, input_mask_array, label_id_array, label_mask_array)

    # Define the sampler based on shuffle option
    sampler = RandomSampler(dataset) if do_shuffle else SequentialSampler(dataset)

    # Build the DataLoader
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a formatted string in the format hh:mm:ss.

    Args:
        elapsed (float): Time in seconds.

    Returns:
        str: Formatted time string.
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [10]:
# Load the train dataset
train_examples = labeled_examples
# The labeled (train) dataset is assigned with a mask set to True
train_label_masks = np.ones(len(labeled_examples), dtype=bool)

# If unlabeled examples are available
if unlabeled_examples:
    train_examples = train_examples + unlabeled_examples
    # The unlabeled (train) dataset is assigned with a mask set to False
    tmp_masks = np.zeros(len(unlabeled_examples), dtype=bool)
    train_label_masks = np.concatenate([train_label_masks, tmp_masks])

# Generate the training DataLoader
train_dataloader = generate_data_loader(train_examples, train_label_masks, label_map, do_shuffle=True, balance_label_examples=True)

# Load the test dataset
# The labeled (test) dataset is assigned with a mask set to True
test_label_masks = np.ones(len(test_examples), dtype=bool)

# Generate the test DataLoader
test_dataloader = generate_data_loader(test_examples, test_label_masks, label_map)


  label_mask_array = torch.tensor(label_mask_array)


In [11]:
# Generator class for generating data
class Generator(nn.Module):
    def __init__(self, noise_size=100, output_size=512, hidden_sizes=[512], dropout_rate=0.1):
        super(Generator, self).__init__()
        layers = []
        hidden_sizes = [noise_size] + hidden_sizes
        for i in range(len(hidden_sizes) - 1):
            layers.extend([nn.Linear(hidden_sizes[i], hidden_sizes[i+1]), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)])

        layers.append(nn.Linear(hidden_sizes[-1], output_size))
        self.layers = nn.Sequential(*layers)

    def forward(self, noise):
        output_rep = self.layers(noise)
        return output_rep

# Discriminator class for discriminating generated data
class Discriminator(nn.Module):
    def __init__(self, input_size=512, hidden_sizes=[512], num_labels=6, dropout_rate=0.1):
        super(Discriminator, self).__init__()
        self.input_dropout = nn.Dropout(p=dropout_rate)
        layers = []
        hidden_sizes = [input_size] + hidden_sizes
        for i in range(len(hidden_sizes) - 1):
            layers.extend([nn.Linear(hidden_sizes[i], hidden_sizes[i+1]), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)])

        self.layers = nn.Sequential(*layers)  # Flattening
        self.logit = nn.Linear(hidden_sizes[-1], num_labels + 1)  # +1 for the probability of this sample being fake/real.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_rep):
        input_rep = self.input_dropout(input_rep)
        last_rep = self.layers(input_rep)
        logits = self.logit(last_rep)
        probs = self.softmax(logits)
        return last_rep, logits, probs

# Define Discriminator and Generator
config = AutoConfig.from_pretrained(model_name)
hidden_size = int(config.hidden_size)
hidden_levels_g = [hidden_size]
hidden_levels_d = [hidden_size]

dropout_rate = 0.2
generator = Generator(output_size=hidden_size, hidden_sizes=hidden_levels_g, dropout_rate=dropout_rate)
discriminator = Discriminator(input_size=hidden_size, hidden_sizes=hidden_levels_d, num_labels=len(label_list), dropout_rate=dropout_rate)

# transfer models to GPU
generator.cuda()
discriminator.cuda()
transformer.cuda()
transformer = torch.nn.DataParallel(transformer)


In [12]:
training_stats = []
epsilon = 1e-8  # Small value added for numerical stability
num_epochs = 10  # Number of training epochs
print_each_n_step = 10  # Print progress every n steps
apply_scheduler = False  # Flag to apply learning rate scheduler
warmup_proportion = 0.1  # Percentage of warmup steps for the scheduler
batch_size = 8
num_train_epochs = 10
# Measure total training time
total_time = time.time()

# Define model parameters
transformer_vars = [i for i in transformer.parameters()]
d_vars = transformer_vars + [v for v in discriminator.parameters()]
g_vars = [v for v in generator.parameters()]

# Define optimizers
dis_optimizer = torch.optim.AdamW(d_vars, lr=1e-5)
gen_optimizer = torch.optim.AdamW(g_vars, lr=1e-5)

if apply_scheduler:
    num_train_examples = len(train_examples)
    num_train_steps = int(num_train_examples / batch_size * num_train_epochs)
    num_warmup_steps = int(num_train_steps * warmup_proportion)

    # Get the learning rate scheduler
    scheduler_d = get_constant_schedule_with_warmup(dis_optimizer, num_warmup_steps=num_warmup_steps)
    scheduler_g = get_constant_schedule_with_warmup(gen_optimizer, num_warmup_steps=num_warmup_steps)

# Training loop
for epoch_num in range(num_epochs):
    print("")
    print(f' Epoch = {epoch_num + 1}')
    start_time = time.time()

    total_loss_g = 0  # Initialize total generator loss
    total_loss_d = 0  # Initialize total discriminator loss

    # Set models to training mode
    transformer.train()
    generator.train()
    discriminator.train()

    # Iterate over batches in the training dataloader
    for step, batch in enumerate(train_dataloader):

        if step % print_each_n_step == 0 and not step == 0:
            elapsed = format_time(time.time() - start_time)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        b_label_mask = batch[3].to(device)

        real_batch_size = b_input_ids.shape[0]

        # Generate fake data
        noise = torch.randn(real_batch_size, 100, device=device)
        gen_rep = generator(noise)

        # Concatenate real and fake data for discriminator input
        disciminator_input = torch.cat([transformer(b_input_ids, attention_mask=b_input_mask), gen_rep], dim=0)
        features, logits, probs = discriminator(disciminator_input)

        # Split discriminator's output for real and fake data
        features_list = torch.split(features, real_batch_size)
        D_real_features = features_list[0]
        D_fake_features = features_list[1]

        logits_list = torch.split(logits, real_batch_size)
        D_real_logits = logits_list[0]
        D_fake_logits = logits_list[1]

        probs_list = torch.split(probs, real_batch_size)
        D_real_probs = probs_list[0]
        D_fake_probs = probs_list[1]

        # Calculate losses
        g_loss_d = -1 * torch.mean(torch.log(1 - D_fake_probs[:, -1] + epsilon))
        g_feat_reg = torch.mean(torch.pow(torch.mean(D_real_features, dim=0) - torch.mean(D_fake_features, dim=0), 2))
        g_loss = g_loss_d + g_feat_reg

        logits = D_real_logits[:, 0:-1]
        log_probs = F.log_softmax(logits, dim=-1)
        label2one_hot = torch.nn.functional.one_hot(b_labels, len(label_list))
        per_example_loss = -torch.sum(label2one_hot * log_probs, dim=-1)
        per_example_loss = torch.masked_select(per_example_loss, b_label_mask.to(device))
        labeled_example_count = per_example_loss.type(torch.float32).numel()

        if labeled_example_count == 0:
            D_L_Supervised = 0
        else:
            D_L_Supervised = torch.div(torch.sum(per_example_loss.to(device)), labeled_example_count)

        D_L_unsupervised1U = -1 * torch.mean(torch.log(1 - D_real_probs[:, -1] + epsilon))
        D_L_unsupervised2U = -1 * torch.mean(torch.log(D_fake_probs[:, -1] + epsilon))
        d_loss = D_L_Supervised + D_L_unsupervised1U + D_L_unsupervised2U

        gen_optimizer.zero_grad()
        dis_optimizer.zero_grad()

        # Backward pass and optimization step
        g_loss.backward(retain_graph=True)
        d_loss.backward()

        gen_optimizer.step()
        dis_optimizer.step()

        # Accumulate losses
        total_loss_g += g_loss.item()
        total_loss_d += d_loss.item()

        # Update learning rate with scheduler if applied
        if apply_scheduler:
            scheduler_d.step()
            scheduler_g.step()

    # Calculate average losses
    avg_train_loss_g = total_loss_g / len(train_dataloader)
    avg_train_loss_d = total_loss_d / len(train_dataloader)

    training_time = format_time(time.time() - start_time)

    print("Training")
    print("  Average training loss generetor: {0:.3f}".format(avg_train_loss_g))
    print("  Average training loss discriminator: {0:.3f}".format(avg_train_loss_d))
    print("  Training epcoh took: {:}".format(training_time))

    # Validation
    print("Validation")

    start_time = time.time()

    transformer.eval()
    discriminator.eval()
    generator.eval()

    total_test_accuracy = 0
    total_test_loss = 0
    nb_test_steps = 0

    all_preds = []
    all_labels_ids = []
    nll_loss = torch.nn.CrossEntropyLoss(ignore_index=-1)

    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            model_outputs = transformer(b_input_ids, attention_mask=b_input_mask)
            _, logits, probs = discriminator(model_outputs)
            filtered_logits = logits[:, 0:-1]
            total_test_loss += nll_loss(filtered_logits, b_labels)

        _, preds = torch.max(filtered_logits, 1)
        all_preds += preds.detach().cpu()
        all_labels_ids += b_labels.detach().cpu()

    all_preds = torch.stack(all_preds).numpy()
    all_labels_ids = torch.stack(all_labels_ids).numpy()
    test_accuracy = np.sum(all_preds == all_labels_ids) / len(all_preds)
    print("  Accuracy: {0:.3f}".format(test_accuracy))

    avg_test_loss = total_test_loss / len(test_dataloader)
    avg_test_loss = avg_test_loss.item()

    test_time = format_time(time.time() - start_time)

    print("  Test Loss: {0:.3f}".format(avg_test_loss))
    print("  Test took: {:}".format(test_time))

    training_stats.append(
        {
            'epoch': epoch_num + 1,
            'Training Loss generator': avg_train_loss_g,
            'Training Loss discriminator': avg_train_loss_d,
            'Valid. Loss': avg_test_loss,
            'Valid. Accur.': test_accuracy,
            'Training Time': training_time,
            'Test Time': test_time
        }
    )



 Epoch = 1
  Batch    10  of    267.    Elapsed: 0:00:07.
  Batch    20  of    267.    Elapsed: 0:00:14.
  Batch    30  of    267.    Elapsed: 0:00:20.
  Batch    40  of    267.    Elapsed: 0:00:26.
  Batch    50  of    267.    Elapsed: 0:00:33.
  Batch    60  of    267.    Elapsed: 0:00:39.
  Batch    70  of    267.    Elapsed: 0:00:45.
  Batch    80  of    267.    Elapsed: 0:00:52.
  Batch    90  of    267.    Elapsed: 0:00:58.
  Batch   100  of    267.    Elapsed: 0:01:04.
  Batch   110  of    267.    Elapsed: 0:01:11.
  Batch   120  of    267.    Elapsed: 0:01:17.
  Batch   130  of    267.    Elapsed: 0:01:24.
  Batch   140  of    267.    Elapsed: 0:01:30.
  Batch   150  of    267.    Elapsed: 0:01:36.
  Batch   160  of    267.    Elapsed: 0:01:43.
  Batch   170  of    267.    Elapsed: 0:01:49.
  Batch   180  of    267.    Elapsed: 0:01:55.
  Batch   190  of    267.    Elapsed: 0:02:02.
  Batch   200  of    267.    Elapsed: 0:02:08.
  Batch   210  of    267.    Elapsed: 0:02:14.
 

In [15]:
for stat in training_stats:
    print(stat)

{'epoch': 1, 'Training Loss generator': 0.5251315175817254, 'Training Loss discriminator': 2.8015501624189514, 'Valid. Loss': 1.449839472770691, 'Valid. Accur.': 0.399, 'Training Time': '0:02:50', 'Test Time': '0:00:23'}
{'epoch': 2, 'Training Loss generator': 0.7474294856246491, 'Training Loss discriminator': 1.5577362373973547, 'Valid. Loss': 1.4101380109786987, 'Valid. Accur.': 0.44466666666666665, 'Training Time': '0:02:49', 'Test Time': '0:00:23'}
{'epoch': 3, 'Training Loss generator': 0.7402147367205959, 'Training Loss discriminator': 1.0584569765387404, 'Valid. Loss': 2.1029582023620605, 'Valid. Accur.': 0.4686666666666667, 'Training Time': '0:02:49', 'Test Time': '0:00:23'}
{'epoch': 4, 'Training Loss generator': 0.7336423379205139, 'Training Loss discriminator': 0.8370720717344391, 'Valid. Loss': 2.3337581157684326, 'Valid. Accur.': 0.49766666666666665, 'Training Time': '0:02:49', 'Test Time': '0:00:23'}
{'epoch': 5, 'Training Loss generator': 0.7295457709594613, 'Training Lo