In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from joblib import load
import scipy
from sklearn.metrics import classification_report
import joblib
from joblib import load

### Preparing to reproduce negative category values

In [2]:
device = 'cpu'

In [3]:
df = pd.read_csv("twitter_cleaned.csv")
df.dropna(inplace=True)

In [5]:
df = df.sample(frac=1).reset_index() #shuffling data in pandas

In [8]:
df.groupby(['label']).size()

label
negative     7748
neutral     22255
positive    19672
dtype: int64

In [9]:
df_class_nega = df[df["category"]==0] 
df_class_nega

Unnamed: 0,index,label,message,category
1,25286,negative,misleading headline but may this divided ancho...,0
12,17637,negative,eid holiday completely compromised by guest am...,0
16,48750,negative,extremely jealous of owains day off tomorrow i...,0
19,49416,negative,why every time twitter email me saying we foun...,0
23,21588,negative,also i see dustin johnson is leading after the...,0
...,...,...,...,...
49619,1791,negative,pritishnandy yakub arranged fund for the blast...,0
49620,11761,negative,well done chris evans i did not think it wa po...,0
49634,13455,negative,i never thought kanye west announcing a crack ...,0
49656,6660,negative,i will be so good at the military your head wi...,0


In [10]:
tfidf_model = load("tfidf_vectorizer.joblib")


In [11]:
mat_negative = tfidf_model.transform(df_class_nega['message'])

In [12]:
class VariationalAutoencoder(nn.Module):
    def __init__(self, num_features=400, num_dim=2000):
        super(VariationalAutoencoder, self).__init__()
        
        self.num_features = num_features
        self.num_dim = num_dim
        
        self.encoder_layer_1 = nn.Linear(in_features=self.num_dim, out_features=800)
        self.encoder_layer_2 = nn.Linear(in_features=800, out_features=(self.num_features * 2))
        
        self.decoder_layer_1 = nn.Linear(in_features=self.num_features, out_features=800)
        self.decoder_layer_2 = nn.Linear(in_features=800, out_features=self.num_dim)
        
        self.relu = nn.ReLU() # hidden layers
        self.sigmoid = nn.Sigmoid() # output layer
        
    def reparameterize(self, mu, log_var):
        """
        :param mu: mean from the encoder's latent space
        :param log_var: log variance from the encoder's latent space
        """
        std = torch.exp(0.5*log_var) # standard deviation
        eps = torch.randn_like(std)  # `randn_like` as we need the same size
        sample = mu + (eps * std)    # sampling as if coming from the input space
        
        return sample
    
    def encode(self, x):
        # encoding
        #x = F.relu(self.encoder_layer_1(x))
        x = self.encoder_layer_1(x)
        x = self.relu(x)
        x = self.encoder_layer_2(x).view(-1, 2, self.num_features)
        
        # get `mu` and `log_var`
        mu = x[:, 0, :] # the first feature values as mean
        log_var = x[:, 1, :] # the other feature values as variance
        
        # get the latent vector through reparameterization
        z = self.reparameterize(mu, log_var)
        
        return z, mu, log_var
    
    def decode(self, z, mu, log_var):
        # decoding
        #x = F.relu(self.decoder_layer_1(z))
        x = self.decoder_layer_1(z)
        x = self.relu(x)
        
        #reconstruction = torch.sigmoid(self.decoder_layer_2(x))
        reconstruction = self.decoder_layer_2(x)
        reconstruction = self.sigmoid(reconstruction)
        
        return reconstruction, mu, log_var
    
    # Utility function to generate new data based on:
    # mu: The average that you want to have (should be the same size as num_features)
    # log_var: The variance that you want to have (should be the same size as num_features)
    def sample(self, mu, log_var):
        z = self.reparameterize(mu, log_var)
        reconstruction, mu, log_var = self.decode(z, mu, log_var)
        
        return reconstruction
        
    
    def forward(self, x):
        
        z, mu, log_var = self.encode(x)
        reconstruction, mu, log_var = self.decode(z, mu, log_var)
        
        return reconstruction, mu, log_var

In [13]:
class AutoencoderDataset(Dataset):
    def __init__(self, x):
        self.x = x
        
    def __len__(self):
        return len(self.x)
    
    # Requires you to return data as a pair of _x, _y
    def __getitem__(self, index):
        return self.x[index], self.x[index]

In [14]:
def final_loss(bce_loss, mu, logvar):
    """
    This function will add the reconstruction loss (BCELoss) and the 
    KL-Divergence.
    KL-Divergence = 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    :param bce_loss: recontruction loss
    :param mu: the mean from the latent vector
    :param logvar: log variance from the latent vector
    """
    BCE = bce_loss 
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

In [15]:
num_features = 400
model = VariationalAutoencoder(num_features=num_features)

In [16]:
learning_rate = 0.001
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
batch_size = 16

In [17]:
def train_fn(loader, model, optimizer, loss_fn, batch_size):
    loop = tqdm(loader)
    
    count = 0
    ave_loss = 0.00
    
    # Loop per batch
    for batch_idx, (data, targets) in enumerate(loop):
        reconstruction, mu, logvar = model.forward(data)
        
        loss = loss_fn(reconstruction, targets)
        
        loss = final_loss(loss, mu, logvar)
        
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
        
        loop.set_postfix(loss=loss.item())
        
        ave_loss += loss.item()
        count += 1
        
    ave_loss = ave_loss / count
    
    return ave_loss

In [20]:
x = torch.tensor(scipy.sparse.csr_matrix.todense(mat_negative)).float()

# use Dataloader for Autoencoder 
custom_dataset = AutoencoderDataset(x)

train_loader = DataLoader(
    custom_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False
)

In [21]:
epochs = 20
losses = []

for epoch in range(epochs):
    print("Epoch: {}".format(epoch))
    
    ave_loss = train_fn(
        train_loader,
        model,
        optimizer,
        loss_fn,
        batch_size
    )
    
    losses.append(ave_loss)
    
    print("Ave Loss: {}".format(ave_loss))
    

Epoch: 0


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:44<00:00, 10.95it/s, loss=0.0134]


Ave Loss: 0.05047375777371458
Epoch: 1


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:41<00:00, 11.58it/s, loss=0.0132]


Ave Loss: 0.012379619227626275
Epoch: 2


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:49<00:00,  9.71it/s, loss=0.0132]


Ave Loss: 0.011702482388882907
Epoch: 3


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:51<00:00,  9.34it/s, loss=0.0131]


Ave Loss: 0.011650008293464012
Epoch: 4


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:52<00:00,  9.17it/s, loss=0.0131]


Ave Loss: 0.01166474326638524
Epoch: 5


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:52<00:00,  9.22it/s, loss=0.0132]


Ave Loss: 0.011695637363825262
Epoch: 6


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:56<00:00,  8.59it/s, loss=0.0135]


Ave Loss: 0.011735415182162806
Epoch: 7


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:54<00:00,  8.87it/s, loss=0.0131]


Ave Loss: 0.011775967665016651
Epoch: 8


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:52<00:00,  9.30it/s, loss=0.0134]


Ave Loss: 0.011825546624196558
Epoch: 9


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:55<00:00,  8.71it/s, loss=0.0134]


Ave Loss: 0.011864930710073598
Epoch: 10


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:49<00:00,  9.70it/s, loss=0.0135]


Ave Loss: 0.011905049240773485
Epoch: 11


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:54<00:00,  8.93it/s, loss=0.0136]


Ave Loss: 0.011935234271450755
Epoch: 12


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:53<00:00,  8.99it/s, loss=0.013]


Ave Loss: 0.011938508991728124
Epoch: 13


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:55<00:00,  8.79it/s, loss=0.0138]


Ave Loss: 0.011952210968533128
Epoch: 14


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:52<00:00,  9.26it/s, loss=0.0132]


Ave Loss: 0.011975367935662416
Epoch: 15


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:51<00:00,  9.49it/s, loss=0.0133]


Ave Loss: 0.011973863323555165
Epoch: 16


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:51<00:00,  9.41it/s, loss=0.0142]


Ave Loss: 0.012005700443669692
Epoch: 17


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:48<00:00,  9.90it/s, loss=0.0133]


Ave Loss: 0.012014317191830002
Epoch: 18


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:48<00:00,  9.91it/s, loss=0.0135]


Ave Loss: 0.012009223099298699
Epoch: 19


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:47<00:00, 10.31it/s, loss=0.0139]

Ave Loss: 0.012022775171574244





In [22]:
with open("negative_autoencoder.joblib", "wb+") as filename:
    joblib.dump(model, filename)

In [23]:
# Create a vector of zero tensors representing 0 average per num_feature (right in the middle of the dist)
sampled_mu = torch.Tensor([np.zeros(num_features)])

# Create a vector of zero tensors representing 0 standard deviations away from the mean to create variations
# Change this is you want to sample away from the mean to create "off-quality" data
sampled_logvar = torch.Tensor([np.zeros(num_features)])

  sampled_mu = torch.Tensor([np.zeros(num_features)])


In [24]:
data_gen = []
for i in range(15000):
    reconstruction = model.sample(sampled_mu, sampled_logvar)
    reconstructed = reconstruction[0].detach().cpu().numpy()
    data_gen.append(reconstructed)

In [25]:
mat_negative = pd.DataFrame(data_gen, columns=[i for i in range(2000)])
# df_classM_gen.rename(columns={32:"1_B", 33:"1_M"},inplace=True)


In [26]:
mat_negative['category'] = 0

In [27]:
mat_neutral = tfidf_model.transform(df[df['category']==1]['message'])

In [28]:
mat_neutral = pd.DataFrame(scipy.sparse.csr_matrix.todense(mat_neutral))



In [29]:
mat_neutral['category'] = 1

In [30]:
mat_positive = tfidf_model.transform(df[df['category']==2]['message'])

In [31]:
mat_positive = pd.DataFrame(scipy.sparse.csr_matrix.todense(mat_positive))


In [32]:
mat_positive['category'] = 2

In [33]:
final_df = pd.concat([mat_negative, mat_neutral, mat_positive], axis = 0)


In [35]:
final_df.to_csv('twitter_class.csv', index=False)

In [34]:
final_df.groupby(['category']).size()

category
0    15000
1    22255
2    19672
dtype: int64