In [1]:
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from collections import Counter
from itertools import chain,combinations

In [1]:
# https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6

from collect_data import load_cache
from tokenizer import load_tokens

tokens = load_tokens('/data/blockchain-interoperability/blockchain-social-media/analysis/twitter/tokens/text')
data = load_cache('/data/blockchain-interoperability/blockchain-social-media/analysis/twitter/snapshots')

data['clean_text'] = [' '.join(t) for t in tokens]
data.sort_values('timestamp_ms')

loading tokens...:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

In [1]:
import pickle

tokens = pickle.load(open('tokens/text/00100000.pkl','rb'))

In [2]:
from embeddings import get_sbert_embedding

ebs = get_sbert_embedding(tokens,'all-MiniLM-L6-v2')

minibatch..:   0%|          | 0/100 [00:00<?, ?it/s]

In [4]:
from torch import nn
import torch.nn.functional as F

class ConvAutoencoder(nn.Module):
    def __init__(self):
        super(ConvAutoencoder, self).__init__()
       
        #Encoder

        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2,2),
            nn.Conv2d(16, 4, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2,2),
        )
        # nn.MaxPool2d(2, 2)
       
        #Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(4, 16, 2, stride=2),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(16, 3, 2, stride=2),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.decoder(self.encoder(x))

In [12]:
class LinearAutoEncoder(nn.Module):
    def __init__(self):
        super(LinearAutoEncoder, self).__init__()
       
        #Encoder

        self.encoder = nn.Sequential(
            nn.Linear(384, 200),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(2,2),
            nn.Linear(200,50),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(2,2),
        )
        # nn.MaxPool2d(2, 2)
       
        #Decoder
        self.decoder = nn.Sequential(
            nn.Linear(50, 200),
            nn.ReLU(inplace=True),
            nn.Linear(200,384),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.decoder(self.encoder(x))
    

In [80]:
import torch

def _flatten(self, h, batch_size):
    # (num_layers*num_directions, batch_size, hidden_dim)  ==>
    # (batch_size, num_directions*num_layers, hidden_dim)  ==>
    # (batch_size, num_directions*num_layers*hidden_dim)
    return h.transpose(0,1).contiguous().view(batch_size, -1)


class LSTMEncoder(nn.Module):
    def __init__(self):
        super(LSTMEncoder, self).__init__()
        self.lstm = nn.LSTM(
            384,
            50,
            num_layers = 2,
            bidirectional = True,
            dropout=.1,
            batch_first=True
        )
        # math: num_layer * num_directions, batch_size, hidden_dim
        self.hidden = (torch.zeros(2 * 2, 100000, 50).cuda(),
                        torch.zeros(2 * 2, 100000, 50).cuda())
    def forward(self,x):
        batch_size = x.size(0)
        _,self.hidden = self.lstm(x,self.hidden)
        x = self.torch.cat([_flatten(self.hidden[0],batch_size),_flatten(self.hidden[1],batch_size)])
        return x
        

    


class LSTMDecoder:
    def __init__(self):
        super(LSTMDecoder,self).__init__()
        self.lstm = nn.LSTM(
            384,
            50,
            num_layers = 2,
            bidirectional = True,
            dropout=0.1,
            batch_first=True
        )
        self.hidden = (torch.zeros(2 * 2, 100000, 50).cuda(),
                        torch.zeros(2 * 2, 100000, 50).cuda())
    def forward(self,x):
        batch_size = x.size(0)
        _,self.hidden = self.lstm(x,self.hidden)
        x = self.torch.cat([_flatten(self.hidden[0],batch_size),_flatten(self.hidden[1],batch_size)])
        return x




class LSTMAutoEncoder(nn.Module):
    def __init__(self):
        super(LSTMAutoEncoder, self).__init__()
        self.encoder = LSTMEncoder()
        self.decoder = LSTMDecoder()

    def forward(self,x):
        z = self.encoder(x)
        return self.decoder(x,z)
        

In [3]:
import pickle

pickle.load(open('sentiment/vader.pkl','rb'))

[]

In [1]:
from dataset import load_dataset

dset = load_dataset(
    'snapshots',
    'tokens/text',
    'vader'
)

reading in snapshots..:   0%|          | 0/221 [00:00<?, ?it/s]

loading tokens...:   0%|          | 0/221 [00:00<?, ?it/s]

vader sentiment scores..:   0%|          | 0/22021502 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [88]:
import torch
hh = (torch.zeros(2 * 2, 100000, 50).cuda(),
torch.zeros(2 * 2, 100000, 50).cuda())


ll = nn.LSTM(384,50,num_layers = 2,bidirectional=True).cuda()

_,hhh = ll(ebs[:,:,None],hh)

RuntimeError: input.size(-1) must be equal to input_size. Expected 384, got 1

In [87]:
ebs[:,:,None].shape

torch.Size([100000, 384, 1])

In [81]:
le = LSTMEncoder().cuda()

In [82]:
le(ebs)

RuntimeError: input must have 3 dimensions, got 2

In [13]:
cenc = ConvAutoencoder()
lenc = LinearAutoEncoder()

In [20]:
lenc = lenc.cuda()
ebs = ebs.cuda()

from tqdm.auto import tqdm
from torch.optim import Adam

criterion = nn.MSELoss()
optimizer = Adam(lenc.parameters())

for epoch in tqdm(range(100)):
    optimizer.zero_grad()
    out = lenc(ebs)
    loss = criterion(out,ebs)
    loss.backward()
    optimizer.step()
    print('loss:',loss.item())

  0%|          | 0/100 [00:00<?, ?it/s]

loss: 0.002409222535789013
loss: 0.004125208128243685
loss: 0.0023909281007945538
loss: 0.002432329813018441
loss: 0.002463673474267125
loss: 0.0024721170775592327
loss: 0.0024827364832162857
loss: 0.0024854594375938177
loss: 0.00248575652949512
loss: 0.0024866077583283186
loss: 0.002486886689439416
loss: 0.0024863595608621836
loss: 0.0024846415035426617
loss: 0.002483841497451067
loss: 0.002484808675944805
loss: 0.002484197961166501
loss: 0.0024825020227581263
loss: 0.002481407020241022
loss: 0.002481244271621108
loss: 0.0024815055076032877
loss: 0.002481007482856512
loss: 0.0024796975776553154
loss: 0.002478713868185878
loss: 0.0024784798733890057
loss: 0.0024782337713986635
loss: 0.002477183472365141
loss: 0.0024760758969932795
loss: 0.002475673332810402
loss: 0.0024754612240940332
loss: 0.002474902430549264
loss: 0.0024742414243519306
loss: 0.0024734698235988617
loss: 0.002472821855917573
loss: 0.0024727489799261093
loss: 0.0024716167245060205
loss: 0.002471020445227623
loss: 0.002

(tensor([[[-0.0142, -0.0514, -0.0092,  ..., -0.0590,  0.0263, -0.0186],
          [-0.0144, -0.0498, -0.0098,  ..., -0.0611,  0.0287, -0.0238],
          [-0.0161, -0.0476, -0.0110,  ..., -0.0630,  0.0261, -0.0264],
          ...,
          [-0.0186, -0.0546, -0.0111,  ..., -0.0615,  0.0250, -0.0196],
          [-0.0160, -0.0532, -0.0101,  ..., -0.0651,  0.0306, -0.0196],
          [-0.0153, -0.0519, -0.0112,  ..., -0.0588,  0.0262, -0.0187]]],
        device='cuda:0', grad_fn=<CudnnRnnBackward0>),
 (tensor([[[ 0.0736, -0.0770,  0.0472,  ...,  0.0218,  0.0110,  0.0582],
           [ 0.0399, -0.0315,  0.0418,  ...,  0.0015, -0.0241,  0.0735],
           [ 0.0694, -0.0744,  0.0485,  ..., -0.0373, -0.0011,  0.0570],
           ...,
           [ 0.0538, -0.0286,  0.0435,  ...,  0.0115, -0.0127,  0.0633],
           [ 0.0478, -0.0478,  0.0557,  ..., -0.0201, -0.0188,  0.0618],
           [ 0.0744, -0.0674,  0.0315,  ...,  0.0036, -0.0265,  0.0644]],
  
          [[-0.0142, -0.0514, -0.0092,

In [22]:
isinstance((1,1),(2,2))

TypeError: isinstance() arg 2 must be a type or tuple of types