# GAN 

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
from sklearn.utils import resample
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, RobertaForSequenceClassification, AutoTokenizer

In [3]:
import sys
# sys.path.append("/mnt/data/bobae/ai_text_detection")
sys.path.append("/Users/bobaebak/git/ai_text_detection")

from utils.cuda_helper import *

### Set GPU

In [None]:
print_gpu_usage()

In [None]:
print_gpu_devie_name()

In [4]:
# Pick the best available backend: Apple MPS first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

num_gpus = torch.cuda.device_count()

# With several CUDA devices present, pin everything to the first one.
if num_gpus > 1:
    device = "cuda:0"

In [5]:
device

'mps'

In [None]:
print_total_gpu_memory()

### Load the JSON data files and convert them to a DataFrame
- real -> human-written text: target 1
- fake -> AI-generated text: target 0


In [6]:
# Human-written data.
# Every file is opened explicitly as UTF-8 so that decoding does not depend on
# the platform's default locale encoding.

# loads the TOEFL dataset (the file on disk is named "tofel.json")
with open('../dataset/human/tofel.json', "r", encoding="utf-8") as f:
    h_tofel_dataset = json.load(f)

# loads the arXiv dataset
with open('../dataset/human/arxiv.json', "r", encoding="utf-8") as f:
    h_arxiv_dataset = json.load(f)

# loads the student essays
with open('../dataset/human/student_essay.json', "r", encoding="utf-8") as f:
    h_essay_dataset = json.load(f)

# loads the student computer-science essays
with open('../dataset/human/student_cs_essay.json', "r", encoding="utf-8") as f:
    h_essay_cs_dataset = json.load(f)

In [7]:
# AI-generated data (GPT-2 medium and GPT-3.5 variants, going by the file names).
# Every file is opened explicitly as UTF-8 so that decoding does not depend on
# the platform's default locale encoding.

# loads the GPT-2 medium TOEFL dataset
with open('../dataset/ai/gpt2medium_tofel.json', "r", encoding="utf-8") as f:
    gpt_tofel_dataset = json.load(f)

# loads the GPT-2 medium arXiv dataset
with open('../dataset/ai/gpt2medium_arxiv.json', "r", encoding="utf-8") as f:
    gpt_arxiv_dataset = json.load(f)

# loads the GPT-2 medium student essays
with open('../dataset/ai/gpt2medium_essay.json', "r", encoding="utf-8") as f:
    gpt_essay_dataset = json.load(f)

# loads the GPT-2 medium student computer-science essays
with open('../dataset/ai/gpt2medium_essay_cs.json', "r", encoding="utf-8") as f:
    gpt_essay_cs_dataset = json.load(f)

# loads the GPT-3.5 TOEFL dataset
with open('../dataset/ai/gpt35_tofel.json', "r", encoding="utf-8") as f:
    gpt_35_tofel_dataset = json.load(f)

# loads the GPT-3.5 student computer-science essays
with open('../dataset/ai/gpt35_essay_cs.json', "r", encoding="utf-8") as f:
    gpt_35_essay_cs_dataset = json.load(f)

In [8]:
# Flatten all human-written corpora into a single list of records,
# keeping the order TOEFL -> arXiv -> essays -> CS essays.
h_dataset = [
    *h_tofel_dataset,
    *h_arxiv_dataset,
    *h_essay_dataset,
    *h_essay_cs_dataset,
]

len(h_dataset)

2478

In [9]:
# Flatten the AI-generated corpora in the same source order as the human list
# (GPT-3.5 TOEFL, GPT-2 arXiv, GPT-2 essays, GPT-3.5 CS essays).
gpt_dataset = [
    *gpt_35_tofel_dataset,
    *gpt_arxiv_dataset,
    *gpt_essay_dataset,
    *gpt_35_essay_cs_dataset,
]

len(gpt_dataset)

2478

In [10]:
len(h_essay_cs_dataset), len(gpt_35_essay_cs_dataset)

(387, 387)

In [12]:
# Column-wise view of the corpora: human ("h_") columns carry target 1,
# machine ("m_") columns carry target 0. Row i pairs the i-th record of
# h_dataset with the i-th record of gpt_dataset.
dct = dict(
    h_text=[record['input'] for record in h_dataset],
    h_label=[record['label'] for record in h_dataset],
    h_target=np.ones(len(h_dataset), dtype=int),
    m_text=[record['input'] for record in gpt_dataset],
    m_label=[record['label'] for record in gpt_dataset],
    m_target=np.zeros(len(gpt_dataset), dtype=int),
)

df = pd.DataFrame(dct)
df

Unnamed: 0,h_text,h_label,h_target,m_text,m_label,m_target
0,"I prefer to cook at home. First of all, it is ...",human,1,I enjoy cooking at home for several reasons. F...,ai,0
1,The place I would like to visit most is the ou...,human,1,The place I would most like to visit is outer ...,ai,0
2,I imagine my life ten years in the future to b...,human,1,I envision my life a decade from now as comple...,ai,0
3,"Personally, I would like to say that the schoo...",human,1,"Personally, I have to say that the school that...",ai,0
4,I believe that I will follow my interest. I'm ...,human,1,I'm confident I will pursue my passion. I'm no...,ai,0
...,...,...,...,...,...,...
2473,"Computer Viruses: Spreading, Multiplying and D...",human,1,"Computer Viruses: Spreading, Multiplying, and ...",ai,0
2474,Purchasing or Leasing Computer Equipment: Adva...,human,1,Purchasing vs Leasing Computer Equipment: Pros...,ai,0
2475,Print and Broadcast Computer Advertisements Es...,human,1,Table of Contents\n 1. Print advertisement\n 2...,ai,0
2476,How computers influence our life Essay\n\nIntr...,human,1,Influence of Computers on Our Lives\n\nIntrodu...,ai,0


# Generator (Paraphraser)

In [13]:
class Generator():
    """T5-based paraphraser used as the GAN generator.

    Holds a pretrained ``T5ForConditionalGeneration`` model, its tokenizer and
    an Adam optimizer over the model parameters. Tensors and the model are
    placed on the module-level ``device``.
    """

    def __init__(self, model_name="t5-large", lr=0.1):
        """Load the pretrained model and tokenizer and build the optimizer.

        Args:
            model_name: Hugging Face checkpoint name passed to ``from_pretrained``.
            lr: Learning rate handed to Adam.
        """
        self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

        # hyperparameters
        # NOTE(review): the default lr=0.1 is kept for interface compatibility;
        # it looks very large for fine-tuning a pretrained model - confirm.
        self.lr = lr
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

    def generate_text(self, input):
        """Paraphrase the given prompt(s) and return the decoded strings.

        Args:
            input: A prompt string or a list of prompt strings.

        Returns:
            list[str]: One decoded sequence per prompt, special tokens removed.
        """
        arguments = {"padding": True, "truncation": True, "return_tensors": "pt"}
        # encode and obtain ids
        encoded_inputs = self.tokenizer(input, **arguments).to(device)

        # Batches are padded, so the attention mask is passed explicitly to
        # make sure padding positions are ignored during generation.
        outputs = self.model.generate(
            encoded_inputs.input_ids,
            attention_mask=encoded_inputs.attention_mask,
            max_new_tokens=200,
        )

        sequences_list = outputs.tolist()

        # decode
        decoded_outputs = self.tokenizer.batch_decode(sequences_list, skip_special_tokens=True)
        return decoded_outputs

    def train_generator(self, fake_texts, discriminator, criterion):
        """Score generated texts with the discriminator and run an optimizer step.

        The loss rewards the discriminator's "human" logit (column 1) being
        high for the generated texts.

        NOTE(review): ``fake_texts`` are decoded strings, so the loss graph
        only contains the discriminator's parameters. ``loss.backward()``
        therefore produces no gradients for ``self.model`` and
        ``self.optimizer.step()`` leaves the generator unchanged; it also
        accumulates gradients on the discriminator's parameters. A
        differentiable or policy-gradient objective would be needed to really
        train the generator.

        Args:
            fake_texts: List of generated strings.
            discriminator: Object exposing ``tokenizer`` and ``model``.
            criterion: Loss callable taking ``(logits, targets)``.

        Returns:
            float: The scalar loss value.
        """
        labels = torch.ones(len(fake_texts), device=device)

        fake_inputs = discriminator.tokenizer(
            fake_texts, return_tensors='pt', truncation=True, padding=True, max_length=512
        ).to(device)
        fake_outputs = discriminator.model(**fake_inputs)

        loss = criterion(fake_outputs.logits[:, 1], labels)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


# Discriminator

In [14]:
class Discriminator():
    """RoBERTa sequence classifier used as the GAN discriminator.

    Label convention: class index 1 means human-written text, class index 0
    means AI-generated text. Tensors and the model are placed on the
    module-level ``device``.
    """

    def __init__(self, model_name="roberta-base", lr=0.1):
        """Load the pretrained classifier and tokenizer and build the optimizer.

        Args:
            model_name: Hugging Face checkpoint name passed to ``from_pretrained``.
            lr: Learning rate handed to Adam.
        """
        self.model = RobertaForSequenceClassification.from_pretrained(model_name).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # hyperparameters
        # NOTE(review): the default lr=0.1 is kept for interface compatibility;
        # it looks very large for fine-tuning a pretrained model - confirm.
        self.lr = lr
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

    def classify_text(self, input):
        """Classify text as human-written (1) or AI-generated (0).

        Args:
            input: The text to classify.

        Returns:
            tuple: ``(logits, probabilities)`` where ``probabilities`` is the
            softmax of ``logits`` over the last dimension.
        """
        # Pad to a fixed 512 tokens and truncate longer texts; without
        # truncation, inputs longer than the model's maximum length fail.
        encoded_inputs = self.tokenizer(
            input,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_token_type_ids=True,
            return_tensors="pt",
        )
        ids = encoded_inputs['input_ids'].to(device, dtype=torch.long)
        mask = encoded_inputs['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encoded_inputs["token_type_ids"].to(device, dtype=torch.long)

        outputs = self.model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=-1)
        return logits, probabilities

    def train_discriminator(self, real_texts, fake_texts, criterion):
        """Run one discriminator update on a batch of real and fake texts.

        Both losses are computed on the "human" logit (column 1): it is pushed
        towards target 1 for real texts and towards target 0 for fake texts.
        This matches the generator's loss, which uses the same logit column.

        Args:
            real_texts: List of human-written strings.
            fake_texts: List of AI-generated strings.
            criterion: Loss callable taking ``(logits, targets)``.

        Returns:
            float: The mean of the real and fake loss values.
        """
        real_labels = torch.ones(len(real_texts), device=device)
        fake_labels = torch.zeros(len(fake_texts), device=device)

        tokenizer_kwargs = {"return_tensors": 'pt', "truncation": True, "padding": True, "max_length": 512}
        real_inputs = self.tokenizer(real_texts, **tokenizer_kwargs).to(device)
        fake_inputs = self.tokenizer(fake_texts, **tokenizer_kwargs).to(device)

        real_outputs = self.model(**real_inputs)
        fake_outputs = self.model(**fake_inputs)

        real_loss = criterion(real_outputs.logits[:, 1], real_labels)
        # Column 1 (human) with target 0; using column 0 here would lower the
        # AI logit for fake text and so reward the wrong prediction.
        fake_loss = criterion(fake_outputs.logits[:, 1], fake_labels)
        loss = (real_loss + fake_loss) / 2

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

# Simple Version

In [17]:
# Setup for the small smoke-test loop below.
num_epochs=2
# Default constructors load "t5-large" and "roberta-base" respectively.
generator = Generator()
discriminator = Discriminator()
# Binary cross-entropy applied directly to a raw logit.
criterion = torch.nn.BCEWithLogitsLoss()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in tqdm(range(num_epochs)):
    # Two human-written texts and two AI-written texts to be paraphrased.
    real_texts = [item['input'] for item in h_tofel_dataset[:2]]
    prompts = ["Paraphrase: "+item['input'] for item in gpt_tofel_dataset[:2]]
    fake_texts = generator.generate_text(prompts)

    # Train Discriminator on the generator's output. The raw prompts must not
    # be used as the fake side: they carry the "Paraphrase: " prefix and are
    # not what the generator produced.
    d_loss = discriminator.train_discriminator(real_texts, fake_texts, criterion)
    print(f"Epoch {epoch + 1}, Discriminator Loss: {d_loss}")

    # Train Generator
    g_loss = generator.train_generator(fake_texts, discriminator, criterion)
    print(f"Epoch {epoch + 1}, Generator Loss: {g_loss}")


# With Dataset & DataLoader

In [22]:
class GANDataset(Dataset):
    """Paired dataset of human-written and machine-generated texts.

    Reads the ``h_text``, ``h_target``, ``m_text`` and ``m_target`` columns of
    the given DataFrame. Item ``i`` is the i-th row of the frame by position.
    """

    def __init__(self, df):
        """Keep references to the four columns used by ``__getitem__``."""
        self.h_text = df['h_text']
        self.h_target = df['h_target']
        self.m_text = df['m_text']
        self.m_target = df['m_target']

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.h_text)

    def __getitem__(self, index):
        """Return the pair at position ``index``.

        Texts are whitespace-normalised (runs of whitespace collapse to one
        space, ends stripped); targets are returned as float tensors.
        """
        # Positional access (.iloc) keeps this correct even when the frame's
        # index is not 0..n-1, e.g. after sampling without reset_index().
        h_text = " ".join(str(self.h_text.iloc[index]).split())
        m_text = " ".join(str(self.m_text.iloc[index]).split())

        return {
            'real_texts': h_text,
            'real_target': torch.tensor(self.h_target.iloc[index], dtype=torch.float),
            'fake_texts': m_text,
            'fake_target': torch.tensor(self.m_target.iloc[index], dtype=torch.float),
        }

In [23]:
# Token-length cap; not used by GANDataset, only handed to SentimentData in the backup section.
MAX_LEN = 512
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 4
# Batch size of the test DataLoader.
VALID_BATCH_SIZE = 8

In [24]:
# 80/20 train/test split with a fixed seed for reproducibility.
train_size = 0.8
train_data=df.sample(frac=train_size, random_state=42)
# The test rows are looked up via the ORIGINAL index of the sampled rows, so
# train_data must only be re-indexed after this line.
test_data=df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

#### Dataset: wrap both splits so that they yield raw text pairs
training_set = GANDataset(train_data)
testing_set = GANDataset(test_data)

#### DataLoader: both loaders shuffle and load in the main process (num_workers=0)
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (2478, 6)
TRAIN Dataset: (1982, 6)
TEST Dataset: (496, 6)


In [27]:
next(iter(training_loader))

{'real_texts': ['Yelling in Past and Modern Society Essay Yelling, once associated with toughness, strength, and power, is now seen as imposing and aggressive, a negative side consequence of arrogance and unearthed advantage. With the widespread public reaction towards the online clip of Republican Majority Leader Jake Corman yelling at his Democratic opponent Kate Muth, the precarious position of yelling in modern American consciousness became apparent. Screaming is currently associated with toxic masculinity and anger issues that frequently stem from it. People who lose control and start yelling are met with a mixture of surprise and humiliation. It’s not an appropriate course of action for anyone who wishes to achieve public respect and be taken seriously. A large part of this movement is undoubtedly due to a wider trend away from conduct that promotes abusive and blatantly patriarchal structures. While there is no proof that men yell more than women, a man yelling is nearly always 

# Training

In [28]:
def train(epochs, generator, discriminator, training_loader, testing_loader):
    """Alternate discriminator and generator updates, with a tqdm progress bar.

    For every batch the AI-written texts are paraphrased by the generator, the
    discriminator is trained on human texts versus these paraphrases, and then
    the generator step is run on the same paraphrases.

    The loss function is read from the module-level ``criterion``.

    Args:
        epochs: Number of passes over ``training_loader``.
        generator: Object exposing ``generate_text`` and ``train_generator``.
        discriminator: Object exposing ``train_discriminator``.
        training_loader: Iterable of batches with ``'real_texts'`` and
            ``'fake_texts'`` lists of strings.
        testing_loader: Currently unused; kept for interface compatibility.
    """
    for epoch in range(1, epochs+1):
        with tqdm(training_loader, desc=f"Epoch {epoch}") as tepoch:
            for data in tepoch:
                real_texts = data['real_texts']
                # The generator paraphrases AI-written text, so the prompts
                # are built from the batch's machine texts, not the human ones.
                prompts = ["Paraphrase: " + ai_text for ai_text in data['fake_texts']]
                fake_texts = generator.generate_text(prompts)

                # Train Discriminator on the generated paraphrases (not on
                # the prompts themselves).
                d_loss = discriminator.train_discriminator(real_texts, fake_texts, criterion)
                print(f"Epoch {epoch}, Discriminator Loss: {d_loss}")

                # Train Generator
                g_loss = generator.train_generator(fake_texts, discriminator, criterion)
                print(f"Epoch {epoch}, Generator Loss: {g_loss}")

                tepoch.set_postfix(discriminator_loss=d_loss, generator_loss=g_loss)

In [31]:
def train(epochs, generator, discriminator, training_loader, testing_loader):
    """Alternate discriminator and generator updates (no progress bar).

    For every batch the AI-written texts are paraphrased by the generator, the
    discriminator is trained on human texts versus these paraphrases, and then
    the generator step is run on the same paraphrases.

    The loss function is read from the module-level ``criterion``.

    Args:
        epochs: Number of passes over ``training_loader``.
        generator: Object exposing ``generate_text`` and ``train_generator``.
        discriminator: Object exposing ``train_discriminator``.
        training_loader: Iterable of batches with ``'real_texts'`` and
            ``'fake_texts'`` lists of strings.
        testing_loader: Currently unused; kept for interface compatibility.
    """
    for epoch in range(1, epochs+1):
        for data in training_loader:
            real_texts = data['real_texts']
            # The generator paraphrases AI-written text, so the prompts are
            # built from the batch's machine texts, not the human ones.
            prompts = ["Paraphrase: " + ai_text for ai_text in data['fake_texts']]
            fake_texts = generator.generate_text(prompts)

            # Train Discriminator on the generated paraphrases (not on the
            # prompts themselves).
            d_loss = discriminator.train_discriminator(real_texts, fake_texts, criterion)
            print(f"Epoch {epoch}, Discriminator Loss: {d_loss}")

            # Train Generator
            g_loss = generator.train_generator(fake_texts, discriminator, criterion)
            print(f"Epoch {epoch}, Generator Loss: {g_loss}")

In [None]:
# Number of passes over the training loader.
epochs = 5

# Fresh models for the full run; `criterion` is read as a global inside train().
generator = Generator()
discriminator = Discriminator()
criterion = torch.nn.BCEWithLogitsLoss()

In [32]:
train(epochs, generator, discriminator, training_loader, testing_loader)

KeyboardInterrupt: 

In [65]:
# def train(discriminator, generator, data_iter, num_epochs, latent_dim, data):
#     loss = torch.nn.BCEWithLogitsLoss(reduction='sum')

#     for w in discriminator.model.parameters():
#         nn.init.normal_(w, 0, 0.02)
#     for w in generator.model.parameters():
#         nn.init.normal_(w, 0, 0.02)

#     animator = d2l.Animator(xlabel='epoch', ylabel='loss',
#                             xlim=[1, num_epochs], nrows=2, figsize=(5, 5),
#                             legend=['discriminator', 'generator'])
#     animator.fig.subplots_adjust(hspace=0.3)
#     for epoch in range(num_epochs):
#         # Train one epoch
#         timer = d2l.Timer()
#         metric = d2l.Accumulator(3)  # loss_D, loss_G, num_examples
#         for (X,) in data_iter:
#             batch_size = X.shape[0]
#             Z = torch.normal(0, 1, size=(batch_size, latent_dim))
#             metric.add(discriminator.update_model(X, Z, generator, loss),
#                        generator.update_model(Z, discriminator, loss),
#                        batch_size)
#         # Visualize generated examples
#         Z = torch.normal(0, 1, size=(100, latent_dim))
#         fake_X = generator(Z).detach().numpy()
#         animator.axes[1].cla()
#         animator.axes[1].scatter(data[:, 0], data[:, 1])
#         animator.axes[1].scatter(fake_X[:, 0], fake_X[:, 1])
#         animator.axes[1].legend(['real', 'generated'])
#         # Show the losses
#         loss_D, loss_G = metric[0]/metric[2], metric[1]/metric[2]
#         animator.add(epoch + 1, (loss_D, loss_G))
#     print(f'loss_D {loss_D:.3f}, loss_G {loss_G:.3f}, '
#           f'{metric[2] / timer.stop():.1f} examples/sec')

# MaliGAN

In [None]:
class LSTMGenerator(nn.Module):
    """Single-layer LSTM language model used as a sequence generator.

    NOTE(review): this class reads ``cfg`` (``start_letter``, ``batch_size``,
    ``gen_init``) and may call ``truncated_normal_``; neither is defined in
    this notebook, and ``cfg`` must exist before the class body is executed
    because it appears in default argument values.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, max_seq_len, padding_idx, gpu=False):
        super().__init__()
        self.name = 'vanilla'

        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.padding_idx = padding_idx
        self.gpu = gpu

        self.temperature = 1.0

        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.lstm2out = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

        self.init_params()

    def forward(self, inp, hidden, need_hidden=False):
        """
        Embeds input and applies LSTM
        :param inp: batch_size * seq_len (or batch_size for a single step)
        :param hidden: (h, c)
        :param need_hidden: if return hidden, use for sampling
        :return: log-probabilities of shape (batch_size * seq_len) * vocab_size,
                 plus the new hidden state when need_hidden is True
        """
        emb = self.embeddings(inp)  # batch_size * len * embedding_dim
        if len(inp.size()) == 1:
            emb = emb.unsqueeze(1)  # batch_size * 1 * embedding_dim

        out, hidden = self.lstm(emb, hidden)  # out: batch_size * seq_len * hidden_dim
        out = out.contiguous().view(-1, self.hidden_dim)  # out: (batch_size * len) * hidden_dim
        out = self.lstm2out(out)  # (batch_size * seq_len) * vocab_size
        pred = self.softmax(out)

        if need_hidden:
            return pred, hidden
        return pred

    def sample(self, num_samples, batch_size, start_letter=cfg.start_letter):
        """
        Samples the network and returns num_samples samples of length max_seq_len.
        :param num_samples: number of sequences to return
        :param batch_size: number of sequences generated per batch
        :param start_letter: token id fed as the first input of every sequence
        :return samples: num_samples * max_seq_length (a sampled sequence in each row)
        """
        # Ceiling division: just enough batches to cover num_samples, without
        # a surplus batch when num_samples is a multiple of batch_size.
        num_batch = (num_samples + batch_size - 1) // batch_size
        samples = torch.zeros(num_batch * batch_size, self.max_seq_len).long()

        # Generate sentences with multinomial sampling strategy
        for b in range(num_batch):
            hidden = self.init_hidden(batch_size)
            inp = torch.LongTensor([start_letter] * batch_size)
            if self.gpu:
                inp = inp.cuda()

            for i in range(self.max_seq_len):
                out, hidden = self.forward(inp, hidden, need_hidden=True)  # out: batch_size * vocab_size
                # forward() returns log-probabilities, hence the exp().
                next_token = torch.multinomial(torch.exp(out), 1)  # batch_size * 1 (sampling from each row)
                samples[b * batch_size:(b + 1) * batch_size, i] = next_token.view(-1)
                inp = next_token.view(-1)
        samples = samples[:num_samples]

        return samples

    def init_params(self):
        """Initialise every trainable parameter according to cfg.gen_init."""
        import math  # local import: `math` is not imported at the top of this file

        for param in self.parameters():
            if param.requires_grad and len(param.shape) > 0:
                stddev = 1 / math.sqrt(param.shape[0])
                if cfg.gen_init == 'uniform':
                    torch.nn.init.uniform_(param, a=-0.05, b=0.05)
                elif cfg.gen_init == 'normal':
                    torch.nn.init.normal_(param, std=stddev)
                elif cfg.gen_init == 'truncated_normal':
                    truncated_normal_(param, std=stddev)

    def init_oracle(self):
        """Re-initialise every trainable parameter from a standard normal."""
        for param in self.parameters():
            if param.requires_grad:
                torch.nn.init.normal_(param, mean=0, std=1)

    def init_hidden(self, batch_size=cfg.batch_size):
        """Return zero-filled (h, c) states of shape 1 * batch_size * hidden_dim."""
        h = torch.zeros(1, batch_size, self.hidden_dim)
        c = torch.zeros(1, batch_size, self.hidden_dim)

        if self.gpu:
            return h.cuda(), c.cuda()
        return h, c

In [None]:
class CNNDiscriminator(nn.Module):
    """Convolutional text classifier producing two logits per sequence.

    NOTE(review): ``init_params`` reads ``cfg.dis_init`` and may call
    ``truncated_normal_``; neither is defined in this notebook.
    """

    def __init__(self, embed_dim, vocab_size, filter_sizes, num_filters, padding_idx, gpu=False,
                 dropout=0.2):
        super().__init__()
        self.embedding_dim = embed_dim
        self.vocab_size = vocab_size
        self.padding_idx = padding_idx
        self.feature_dim = sum(num_filters)
        self.gpu = gpu

        self.embeddings = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        # One convolution per (num_filters[i], filter_sizes[i]) pair, each
        # spanning the full embedding width.
        self.convs = nn.ModuleList([
            nn.Conv2d(1, n, (f, embed_dim)) for (n, f) in zip(num_filters, filter_sizes)
        ])
        self.highway = nn.Linear(self.feature_dim, self.feature_dim)
        self.feature2out = nn.Linear(self.feature_dim, 2)
        self.dropout = nn.Dropout(dropout)

        self.init_params()

    def forward(self, inp):
        """
        Get final predictions of discriminator
        :param inp: batch_size * seq_len
        :return: pred: batch_size * 2
        """
        feature = self.get_feature(inp)
        pred = self.feature2out(self.dropout(feature))

        return pred

    def get_feature(self, inp):
        """
        Get feature vector of given sentences
        :param inp: batch_size * max_seq_len
        :return: batch_size * feature_dim
        """
        emb = self.embeddings(inp).unsqueeze(1)  # batch_size * 1 * max_seq_len * embed_dim
        convs = [F.relu(conv(emb)).squeeze(3) for conv in self.convs]  # [batch_size * num_filter * length]
        pools = [F.max_pool1d(conv, conv.size(2)).squeeze(2) for conv in convs]  # [batch_size * num_filter]
        pred = torch.cat(pools, 1)  # tensor: batch_size * feature_dim
        highway = self.highway(pred)
        # Highway gate: mix the transformed and the untransformed features.
        pred = torch.sigmoid(highway) * F.relu(highway) + (1. - torch.sigmoid(highway)) * pred  # highway

        return pred

    def init_params(self):
        """Initialise every trainable parameter according to cfg.dis_init."""
        import math  # local import: `math` is not imported at the top of this file

        for param in self.parameters():
            if param.requires_grad and len(param.shape) > 0:
                stddev = 1 / math.sqrt(param.shape[0])
                if cfg.dis_init == 'uniform':
                    torch.nn.init.uniform_(param, a=-0.05, b=0.05)
                elif cfg.dis_init == 'normal':
                    torch.nn.init.normal_(param, std=stddev)
                elif cfg.dis_init == 'truncated_normal':
                    truncated_normal_(param, std=stddev)

In [None]:
class MaliGAN_G(LSTMGenerator):
    """LSTM generator with the MaliGAN adversarial (policy) loss."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, max_seq_len, padding_idx, gpu=False):
        super().__init__(embedding_dim, hidden_dim, vocab_size, max_seq_len, padding_idx, gpu)
        self.name = 'maligan'

    def adv_loss(self, inp, target, reward):
        """
        Returns a MaliGAN loss

        :param inp: batch_size x seq_len, inp should be target with <s> (start letter) prepended
        :param target: batch_size x seq_len
        :param reward: batch_size (discriminator reward for each sentence, applied to each token of the
                       corresponding sentence); a batch_size x seq_len tensor is accepted as well
        :return loss: policy loss
        """

        batch_size, seq_len = inp.size()
        hidden = self.init_hidden(batch_size)

        # Reshape with the actual input length so inputs shorter than
        # max_seq_len are handled too.
        out = self.forward(inp, hidden).view(batch_size, seq_len, self.vocab_size)
        # Log-probability of each target token; equivalent to summing
        # out * one_hot(target) over the vocabulary axis.
        pred = out.gather(-1, target.unsqueeze(-1)).squeeze(-1)  # batch_size * seq_len

        # A per-sentence reward must broadcast over the token axis, not be
        # matched against it.
        if reward.dim() == 1:
            reward = reward.unsqueeze(1)  # batch_size * 1
        loss = -torch.sum(pred * reward)

        return loss

In [None]:
class MaliGAN_D(CNNDiscriminator):
    """CNN discriminator configured for MaliGAN.

    NOTE(review): ``dis_filter_sizes`` and ``dis_num_filters`` are read as
    module-level names and are not defined in this notebook - they have to
    exist before this class is instantiated.
    """

    def __init__(self, embed_dim, vocab_size, padding_idx, gpu=False, dropout=0.25):
        super().__init__(
            embed_dim,
            vocab_size,
            dis_filter_sizes,
            dis_num_filters,
            padding_idx,
            gpu,
            dropout,
        )

# Backup

In [25]:
class SentimentData(Dataset):
    """Paired human/AI dataset that pre-tokenizes each text for both models.

    Every text is encoded twice: once with the generator's tokenizer and once
    with the discriminator's tokenizer. All encodings are padded and truncated
    to exactly ``max_len`` tokens so that batches can be stacked.
    """

    def __init__(self, df, tokenizer_G, tokenizer_D, max_len):
        """Store the tokenizers, the length cap and the four DataFrame columns.

        Args:
            df: DataFrame with ``h_text``, ``h_target``, ``m_text``, ``m_target``.
            tokenizer_G: Tokenizer of the generator model.
            tokenizer_D: Tokenizer of the discriminator model.
            max_len: Fixed token length of every encoding.
        """
        self.tokenizer_G = tokenizer_G
        self.tokenizer_D = tokenizer_D
        self.max_len = max_len
        self.h_text = df['h_text']
        self.h_target = df['h_target']
        self.m_text = df['m_text']
        self.m_target = df['m_target']

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.h_text)

    def _encode(self, text, target, prefix):
        """Tokenize one text for both models; keys are prefixed with ``prefix``."""
        text = " ".join(str(text).split())

        # Without truncation, texts longer than max_len would yield tensors
        # of different lengths and break the DataLoader's default collation.
        arguments = {
            "add_special_tokens": True,
            "padding": "max_length",
            "truncation": True,
            "max_length": self.max_len,
            "return_token_type_ids": True,
        }
        inputs_G = self.tokenizer_G(text, **arguments)
        inputs_D = self.tokenizer_D(text, **arguments)

        return {
            f'{prefix}_ids_G': torch.tensor(inputs_G['input_ids'], dtype=torch.long),
            f'{prefix}_mask_G': torch.tensor(inputs_G['attention_mask'], dtype=torch.long),
            f'{prefix}_ids_D': torch.tensor(inputs_D['input_ids'], dtype=torch.long),
            f'{prefix}_mask_D': torch.tensor(inputs_D['attention_mask'], dtype=torch.long),
            f'{prefix}_token_type_ids_D': torch.tensor(inputs_D["token_type_ids"], dtype=torch.long),
            f'{prefix}_target': torch.tensor(target, dtype=torch.float),
        }

    def set_human(self, index):
        """Return the encoded human text at position ``index`` (keys ``h_*``)."""
        # .iloc keeps access positional even if the frame's index is not 0..n-1.
        return self._encode(self.h_text.iloc[index], self.h_target.iloc[index], 'h')

    def set_ai(self, index):
        """Return the encoded AI text at position ``index`` (keys ``m_*``)."""
        return self._encode(self.m_text.iloc[index], self.m_target.iloc[index], 'm')

    def __getitem__(self, index):
        """Return both encodings of the pair at position ``index``."""
        human_ = self.set_human(index)
        ai_ = self.set_ai(index)
        return {
            'real_texts': human_,
            'fake_texts': ai_
        }

In [None]:
# Token-length cap handed to SentimentData below.
MAX_LEN = 512
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 4
# Batch size of the test DataLoader.
VALID_BATCH_SIZE = 8

In [None]:
# Tokenizers matching the default generator ("t5-large") and discriminator ("roberta-base") checkpoints.
tokenizer_G = T5Tokenizer.from_pretrained("t5-large")
tokenizer_D = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
# 80/20 train/test split with a fixed seed for reproducibility.
train_size = 0.8
train_data=df.sample(frac=train_size, random_state=42)
# The test rows are looked up via the ORIGINAL index of the sampled rows, so
# train_data must only be re-indexed after this line.
test_data=df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

#### Dataset: pre-tokenized variant; this rebinds training_set / testing_set
training_set = SentimentData(train_data, tokenizer_G, tokenizer_D, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer_G, tokenizer_D, MAX_LEN)

#### DataLoader: both loaders shuffle and load in the main process (num_workers=0)
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# class_counts = df['target'].value_counts()
# majority_class = class_counts.idxmax()
# minority_class = class_counts.idxmin()

# # Separate majority and minority classes
# majority_df = df[df['target'] == majority_class]
# minority_df = df[df['target'] == minority_class]

# # Undersample majority class
# undersampled_majority_df = resample(majority_df,
#                                     replace=False,  # Sample without replacement
#                                     n_samples=len(minority_df),  # Match minority class size
#                                     random_state=42)  # For reproducibility

# # Combine minority class with undersampled majority class
# undersampled_df = pd.concat([undersampled_majority_df, minority_df])
# undersampled_df = undersampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
# df = undersampled_df
# df

In [None]:
# human_df = df[df['label'] == "human"]
# ai_df = df[df['label'] == "ai"]

# human_df = human_df.sample(frac=1).reset_index(drop=True)
# ai_df = ai_df.sample(frac=1).reset_index(drop=True)

# train_ratio = 0.8
# test_ratio = 0.2

# total_size = len(human_df)
# train_size = int(train_ratio * total_size)
# test_size = total_size - train_size

# print(total_size, train_size, test_size)


# human_train_df = human_df[:train_size]
# human_valid_df = human_df[train_size:]
# ai_train_df = ai_df[:train_size]
# ai_valid_df = ai_df[train_size:]
