# TextGAN for Synthetic Text Generation



In [None]:
#reading the data
import pandas as pd

In [None]:
# Read the labelled tweets from the CSV in the working directory.
df = pd.read_csv('Twitter_Data.csv')
# A bare trailing expression makes the notebook render a preview of the frame.
df

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [None]:
# Count the rows per `category` value to check for class imbalance.
# value_counts() leaves NaN out by default, so rows with a missing category are not counted here.
print(df['category'].value_counts())

 1.0    13569
 0.0    10942
-1.0     7492
Name: category, dtype: int64


In [None]:
# Preview the text column that the dataset class later feeds to the tokenizer.
df["clean_text"]

0        when modi promised “minimum government maximum...
1        talk all the nonsense and continue all the dra...
2        what did just say vote for modi  welcome bjp t...
3        asking his supporters prefix chowkidar their n...
4        answer who among these the most powerful world...
                               ...                        
31999      yup looks more like older version raga den modi
32000    modi have punished lots corrupt but cant remember
32001    religion peace agenda clear hai chahe padhe li...
32002    chowkidar narendra modi was doing such stellar...
32003    joshi should fielded joint opposition candidat...
Name: clean_text, Length: 32004, dtype: object

In [None]:
# Drop rows that are missing either the tweet text or its label.
# The class counts printed earlier sum to one fewer than the row count shown for
# the text column, which suggests at least one row has a missing `category`;
# such a row has no usable label, so it is removed together with empty tweets.
df = df.dropna(subset=['clean_text', 'category'])

In [None]:
#data pre processing
import re

def clean_text(text):
    """Strip URLs and @mentions from a tweet and unwrap its hashtags.

    Args:
        text: The raw tweet as a string.

    Returns:
        The tweet with every ``http...`` token and every ``@name`` token
        removed, each ``#tag`` replaced by ``tag`` (the word is kept, only the
        ``#`` is dropped), and leading/trailing whitespace trimmed. Whitespace
        left behind inside the text by the removals is not collapsed.
    """
    substitutions = (
        (r'http\S+', ''),        # URLs
        (r'@[^\s]+', ''),        # @mentions
        (r'#([^\s]+)', r'\1'),   # hashtags: keep the word, drop the '#'
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Keep only the label and text columns and clean every tweet.
# `assign` builds a new frame instead of writing into a column slice, which
# avoids pandas' SettingWithCopyWarning for this select-then-assign pattern.
df = (
    df[['category', 'clean_text']]
    .assign(clean_text=lambda frame: frame['clean_text'].apply(clean_text))
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

# Define the device (CPU or GPU): CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the tokenizer and the pre-trained BERT model.
# from_pretrained fetches the 'bert-base-uncased' files if they are not cached locally.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): `bert` is moved to the device here but is not referenced by any
# later cell visible in this notebook — confirm whether it is still needed.
bert = BertModel.from_pretrained('bert-base-uncased').to(device)

# Define the maximum length of the input text and the batch size.
# NOTE(review): the datasets below are built with a literal max_len=784, so this
# `max_len` value is not what the tokenizer actually receives.
max_len = 128
batch_size = 32

# Define a custom dataset class
class TwitterDataset(Dataset):
    """Map-style dataset that tokenises one tweet per item.

    Args:
        df: DataFrame with a ``clean_text`` column (the tweet) and a
            ``category`` column (the sentiment label).
        max_len: Length every tweet is padded or truncated to.
        tokenizer: Object exposing a BERT-style ``encode_plus`` method that
            returns ``input_ids``, ``attention_mask`` and ``token_type_ids``.

    Each item is the tuple ``(input_ids, attention_mask, token_type_ids, label)``.
    The first three are the tokenizer's tensors with the leading batch
    dimension removed; ``label`` is a scalar ``torch.long`` tensor.
    """

    # The CSV stores sentiment as -1.0 / 0.0 / 1.0, so those values must map to
    # the class indices 0 / 1 / 2. The string spellings are kept so frames that
    # use 'Negative' / 'Neutral' / 'Positive' still work. Float keys such as
    # -1.0 hash and compare equal to the int keys used here.
    LABEL_TO_INDEX = {
        -1: 0, 0: 1, 1: 2,
        'Negative': 0, 'Neutral': 1, 'Positive': 2,
    }
    # Anything unrecognised (including NaN) falls back to the neutral class,
    # which is what the previous if/elif/else chain did.
    DEFAULT_LABEL = 1

    def __init__(self, df, max_len, tokenizer):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenise the tweet at positional ``index`` and return it with its label."""
        # Fetch the row once instead of once per column.
        row = self.df.iloc[index]

        # Tokenize the tweet, padding/truncating to exactly max_len tokens.
        inputs = self.tokenizer.encode_plus(
            row['clean_text'],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Convert the label to a class index.
        label = self.LABEL_TO_INDEX.get(row['category'], self.DEFAULT_LABEL)

        # squeeze(0) removes only the batch dimension added by return_tensors='pt';
        # a bare squeeze() would also collapse a length-1 sequence dimension.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        token_type_ids = inputs['token_type_ids'].squeeze(0)

        return input_ids, attention_mask, token_type_ids, torch.tensor(label, dtype=torch.long)

# Build the train/validation datasets and wrap each in a DataLoader.
# Every tweet is padded/truncated to 784 tokens (= 28*28, the width the
# discriminator flattens its input to).
train_dataset, val_dataset = (
    TwitterDataset(split_df, max_len=784, tokenizer=tokenizer)
    for split_df in (train_df, val_df)
)
# Only the training loader shuffles its samples.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

# Define the generator model
class Generator(nn.Module):
    """Fully connected generator mapping a noise vector to a sample in (0, 1).

    Args:
        latent_dim: Size of the input noise vector.
        output_dim: Size of each generated sample.
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        self.latent_dim = latent_dim
        self.output_dim = output_dim
        # Hidden widths grow 128 -> 256 -> 512 before the projection to output_dim.
        self.fc1 = nn.Linear(latent_dim, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 512)
        self.fc4 = nn.Linear(512, output_dim)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply three ReLU-activated layers, then a sigmoid-activated output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        return self.sigmoid(self.fc4(hidden))
class Discriminator(nn.Module):
    """Fully connected discriminator that scores flattened samples.

    Args:
        input_dim: Number of features per sample after flattening.
        hidden_dim: Width of the two hidden layers.
        output_dim: Number of output scores per sample.

    The output layer has no activation, so ``forward`` returns raw scores
    (logits) rather than probabilities.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Discriminator, self).__init__()
        # Remembered so forward() can flatten to the configured width instead
        # of a hard-coded 28*28.
        self.input_dim = input_dim
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Flatten ``x`` to ``(-1, input_dim)`` and return one row of scores per sample."""
        x = x.view(-1, self.input_dim)
        # torch.relu is used so the class does not depend on the `F` alias,
        # which is imported only in a later cell.
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# NOTE(review): `real_data` is not assigned in any cell visible in this notebook,
# so on a fresh kernel this line presumably raises NameError — confirm or remove.
print(real_data['input_ids'].shape)

torch.Size([32, 128])


In [None]:
# NOTE(review): `fake_data` is first assigned inside the training loop in a later
# cell, so this cell only works after that cell has already been run.
print(fake_data.shape)

torch.Size([32, 784])


In [None]:
# Display the generator output (sigmoid-activated, so every value lies between 0 and 1).
# NOTE(review): relies on `fake_data` from the training cell further down.
fake_data

tensor([[4.1309e-06, 9.9975e-01, 1.0000e+00,  ..., 4.0488e-08, 2.9373e-07,
         4.8612e-08],
        [1.8430e-05, 9.9945e-01, 1.0000e+00,  ..., 2.7292e-07, 1.6756e-06,
         3.0052e-07],
        [1.4357e-06, 9.9988e-01, 1.0000e+00,  ..., 9.8435e-09, 8.3949e-08,
         1.3186e-08],
        ...,
        [1.3064e-05, 9.9953e-01, 1.0000e+00,  ..., 1.5409e-07, 9.4969e-07,
         1.8482e-07],
        [6.6278e-06, 9.9966e-01, 1.0000e+00,  ..., 7.6031e-08, 4.9770e-07,
         1.0738e-07],
        [1.5767e-04, 9.9727e-01, 9.9995e-01,  ..., 5.6497e-06, 2.3490e-05,
         7.4585e-06]], grad_fn=<SigmoidBackward0>)

In [None]:
import torch.nn.functional as F

# Sizes shared by the models and the training loop.
LATENT_DIM = 100      # length of the generator's noise input
DATA_DIM = 28 * 28    # 784, the token length every tweet is padded to

# Seed PyTorch's global RNG so weight init, shuffling and noise are repeatable.
torch.manual_seed(42)

# Define the generator and discriminator
generator = Generator(latent_dim=LATENT_DIM, output_dim=DATA_DIM).to(device)
discriminator = Discriminator(input_dim=DATA_DIM, hidden_dim=256, output_dim=1).to(device)

# The discriminator returns raw scores, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()

# One Adam optimizer per network, same hyper-parameters for both.
d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))
g_optimizer = optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))

# Define the number of epochs to train for
num_epochs = 1

# Loop over the epochs
for epoch in range(num_epochs):
    # Loop over the batches in the data loader
    for batch_idx, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_dataloader):
        # Move the data to the device (CPU or GPU).
        # The linear layers need floating-point input, hence the cast.
        input_ids = input_ids.to(device).float()
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)

        # NOTE(review): the real samples are raw token ids cast to float, while the
        # generator's sigmoid output is confined to (0, 1). The discriminator can
        # likely tell them apart by scale alone — consider normalising the ids or
        # working in an embedding space.
        real_batch = input_ids.view(-1, DATA_DIM)
        # The last batch of an epoch can be smaller than batch_size; generate the
        # same number of fake samples so both halves of the loss are balanced.
        cur_batch_size = real_batch.size(0)

        ##############################
        # Train the discriminator
        ##############################

        # Reset the gradients for the discriminator
        d_optimizer.zero_grad()

        # Generate a batch of fake data (noise is created directly on the device)
        z = torch.randn(cur_batch_size, LATENT_DIM, device=device)
        fake_data = generator(z)

        # Get the discriminator's output for the real and fake data.
        # detach() keeps this step from back-propagating into the generator.
        d_real = discriminator(real_batch)
        d_fake = discriminator(fake_data.detach())

        # Calculate the loss for the discriminator: real -> 1, fake -> 0
        d_real_loss = criterion(d_real, torch.ones_like(d_real))
        d_fake_loss = criterion(d_fake, torch.zeros_like(d_fake))
        d_loss = (d_real_loss + d_fake_loss) / 2

        # Backpropagate the gradients and update the weights for the discriminator
        d_loss.backward()
        d_optimizer.step()

        ##############################
        # Train the generator
        ##############################

        # Reset the gradients for the generator
        g_optimizer.zero_grad()

        # Generate a new batch of fake data
        z = torch.randn(cur_batch_size, LATENT_DIM, device=device)
        fake_data = generator(z)

        # Get the discriminator's output for the fake data (no detach: gradients
        # must flow back into the generator here)
        d_fake = discriminator(fake_data)

        # Calculate the loss for the generator: it wants fakes scored as real
        g_loss = criterion(d_fake, torch.ones_like(d_fake))

        # Backpropagate the gradients and update the weights for the generator
        g_loss.backward()
        g_optimizer.step()

        # Print out some statistics every few batches
        if (batch_idx + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_dataloader)}], D_loss: {d_loss.item():.4f}, G_loss: {g_loss.item():.4f}")



Epoch [1/1], Batch [10/801], D_loss: 0.2418, G_loss: 0.9905
Epoch [1/1], Batch [20/801], D_loss: 0.1998, G_loss: 1.1477
Epoch [1/1], Batch [30/801], D_loss: 0.2738, G_loss: 0.8730
Epoch [1/1], Batch [40/801], D_loss: 0.3044, G_loss: 0.7943
Epoch [1/1], Batch [50/801], D_loss: 0.3187, G_loss: 0.7493
Epoch [1/1], Batch [60/801], D_loss: 0.3280, G_loss: 0.7332
Epoch [1/1], Batch [70/801], D_loss: 0.3285, G_loss: 0.7304
Epoch [1/1], Batch [80/801], D_loss: 0.3338, G_loss: 0.7190
Epoch [1/1], Batch [90/801], D_loss: 0.3317, G_loss: 0.7243
Epoch [1/1], Batch [100/801], D_loss: 0.3288, G_loss: 0.7303
Epoch [1/1], Batch [110/801], D_loss: 0.3278, G_loss: 0.7336
Epoch [1/1], Batch [120/801], D_loss: 0.3273, G_loss: 0.7333
Epoch [1/1], Batch [130/801], D_loss: 0.3240, G_loss: 0.7412
Epoch [1/1], Batch [140/801], D_loss: 0.3218, G_loss: 0.7461
Epoch [1/1], Batch [150/801], D_loss: 0.3199, G_loss: 0.7505
Epoch [1/1], Batch [160/801], D_loss: 0.3169, G_loss: 0.7572
Epoch [1/1], Batch [170/801], D_l

In [None]:

# One-off check of the discriminator loss on a real batch and a fresh fake batch.
# NOTE(review): `input_ids` is whatever batch the training loop last assigned,
# so this cell depends on that loop having run.

# Sample noise and generate the fake batch
z = torch.randn(batch_size, 100).to(device)
fake_data = generator(z)

# Score the real batch and the (detached) fake batch
d_real = discriminator(input_ids.view(-1, 28*28))
d_fake = discriminator(fake_data.detach().view(-1, 28*28))

# Binary cross-entropy against all-ones targets for real, all-zeros for fake
loss_real = criterion(d_real, torch.ones_like(d_real))
loss_fake = criterion(d_fake, torch.zeros_like(d_fake))

# Total is the plain sum of the two parts (not averaged as in the training loop)
total_loss = loss_real + loss_fake


In [None]:
# Display the summed real + fake discriminator loss computed in the previous cell.
total_loss

tensor(0.3940, grad_fn=<AddBackward0>)