In [None]:
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from collections import Counter
from itertools import chain,combinations

In [1]:
# https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6

from collect_data import load_cache
from tokenizer import load_tokens

# Read the pre-tokenized tweet text and the cached tweet snapshots from their
# absolute locations on the analysis machine.
tokens = load_tokens(
    '/data/blockchain-interoperability/blockchain-social-media/analysis/twitter/tokens/text'
)
data = load_cache(
    '/data/blockchain-interoperability/blockchain-social-media/analysis/twitter/snapshots'
)

# Collapse every token sequence back into one space-separated string and
# attach the result as the `clean_text` column.
data['clean_text'] = list(map(' '.join, tokens))

# Last expression of the cell: evaluates to the rows ordered by `timestamp_ms`.
# NOTE(review): the sorted result is not assigned back, so `data` presumably
# keeps its original row order -- confirm that this is intended.
data.sort_values('timestamp_ms')

loading tokens...:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

In [1]:
import pandas as pd
from tqdm.auto import tqdm

# Load one pickled snapshot file into `sample` for a quick look at the data.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
sample = pd.read_pickle('snapshots/00100000.pkl')


In [2]:
# Peek at the `whole_text` values selected by the list [1, 2].
sample.whole_text[[1,2]]

1    Never seen such a deal for a punk since 2017! ...
2    Collection: ens \n Sold for: 1.0040189 Eth (12...
Name: whole_text, dtype: object

In [8]:
from embeddings import load_embeddings
from tqdm.auto import tqdm
from pathlib import Path
import torch
# ee = load_embeddings()

# Deserialize every `*.pkl` file under the MiniLM embedding directory with
# torch.load, visiting the files in sorted path order while tqdm shows a
# transient progress bar. `ee` ends up as a list with one loaded object per file.
ee = list(
    map(
        torch.load,
        tqdm(
            sorted(Path('embeddings/all-MiniLM-L6-v2/').glob('*.pkl')),
            desc='loading embeddings..',
            leave=False,
        ),
    )
)

loading embeddings..:   0%|          | 0/221 [00:00<?, ?it/s]

In [None]:
from dataset import TwitterDataset
# Build the dataset from the snapshot, token, embedding and sentiment
# directories plus two timestamp strings.
# Each argument needs its own trailing comma: adjacent string literals without
# a comma are concatenated by Python, which previously fused the sentiment path
# and both timestamps into one single argument.
# NOTE(review): the two timestamps are presumably the start and end of the
# time window to load -- confirm against the TwitterDataset signature.
dset = TwitterDataset(
    'snapshots',
    'tokens/text/',
    'embeddings/all-MiniLM-L6-v2/',
    'sentiment/vader/',
    '2022-11-09 06:00:00.000000',
    '2022-11-10 06:00:00.000000',
)

reading in snapshots..:   0%|          | 0/221 [00:00<?, ?it/s]

In [4]:
from torch import nn
import torch.nn.functional as F

class ConvAutoencoder(nn.Module):
    """Small convolutional autoencoder for 3-channel 2-D inputs.

    The encoder applies two 3x3 convolutions (3 -> 16 -> 4 channels), each
    followed by an in-place ReLU and a 2x2 max-pool with stride 2, so the
    spatial size is halved twice. The decoder mirrors it with two stride-2
    transposed convolutions (4 -> 16 -> 3 channels) and finishes with a
    sigmoid, which bounds every output value to the unit interval.
    """

    def __init__(self):
        super().__init__()

        # Encoder: (conv -> ReLU -> max-pool) twice.
        encoder_layers = [
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=16, out_channels=4, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: two stride-2 transposed convolutions undo the two poolings.
        decoder_layers = [
            nn.ConvTranspose2d(in_channels=4, out_channels=16, kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(in_channels=16, out_channels=3, kernel_size=2, stride=2),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode `x` to the latent feature map and return its reconstruction."""
        latent = self.encoder(x)
        return self.decoder(latent)

In [12]:
class LinearAutoEncoder(nn.Module):
    """Fully connected autoencoder for fixed-size feature vectors.

    With the default arguments the encoder maps 384 -> 200 -> 50 features
    (an in-place ReLU after each linear layer) and the decoder maps
    50 -> 200 -> 384, followed by a final activation.

    Args:
        input_dim: Size of each input vector and of each reconstruction.
        hidden_dim: Width of the intermediate layer in encoder and decoder.
        latent_dim: Size of the bottleneck representation.
        final_activation: Module applied to the decoder output. ``None``
            (the default) uses ``nn.Sigmoid()``, which bounds every
            reconstructed value to the unit interval; pass ``nn.Identity()``
            when the targets can be negative or larger than 1.

    Raises:
        ValueError: If any of the three dimensions is not positive.
    """

    def __init__(self, input_dim: int = 384, hidden_dim: int = 200,
                 latent_dim: int = 50, final_activation=None):
        super().__init__()

        if min(input_dim, hidden_dim, latent_dim) <= 0:
            raise ValueError('input_dim, hidden_dim and latent_dim must be positive')

        # Keep the original behaviour unless the caller asks for something else.
        if final_activation is None:
            final_activation = nn.Sigmoid()

        # Encoder: input -> hidden -> latent.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(inplace=True),
        )

        # Decoder: latent -> hidden -> input, then the output activation.
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, input_dim),
            final_activation,
        )

    def forward(self, x):
        """Return the reconstruction of `x` after passing through the bottleneck."""
        return self.decoder(self.encoder(x))
    

In [20]:
import torch
from tqdm.auto import tqdm
from torch.optim import Adam

# Use the GPU when one is available; otherwise stay on the CPU instead of
# failing with a CUDA error on machines without a GPU.
# NOTE(review): `lenc` and `ebs` are defined outside this cell -- presumably a
# LinearAutoEncoder instance and a tensor of embeddings; confirm.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lenc = lenc.to(device)
ebs = ebs.to(device)

# Reconstruction objective: mean squared error between the model output and
# its own input, optimized with Adam at its default hyperparameters.
criterion = nn.MSELoss()
optimizer = Adam(lenc.parameters())

# Each iteration is one full-batch gradient step over all of `ebs`.
for epoch in tqdm(range(100)):
    optimizer.zero_grad()
    out = lenc(ebs)
    loss = criterion(out,ebs)
    loss.backward()
    optimizer.step()
    print('loss:',loss.item())

  0%|          | 0/100 [00:00<?, ?it/s]

loss: 0.002409222535789013
loss: 0.004125208128243685
loss: 0.0023909281007945538
loss: 0.002432329813018441
loss: 0.002463673474267125
loss: 0.0024721170775592327
loss: 0.0024827364832162857
loss: 0.0024854594375938177
loss: 0.00248575652949512
loss: 0.0024866077583283186
loss: 0.002486886689439416
loss: 0.0024863595608621836
loss: 0.0024846415035426617
loss: 0.002483841497451067
loss: 0.002484808675944805
loss: 0.002484197961166501
loss: 0.0024825020227581263
loss: 0.002481407020241022
loss: 0.002481244271621108
loss: 0.0024815055076032877
loss: 0.002481007482856512
loss: 0.0024796975776553154
loss: 0.002478713868185878
loss: 0.0024784798733890057
loss: 0.0024782337713986635
loss: 0.002477183472365141
loss: 0.0024760758969932795
loss: 0.002475673332810402
loss: 0.0024754612240940332
loss: 0.002474902430549264
loss: 0.0024742414243519306
loss: 0.0024734698235988617
loss: 0.002472821855917573
loss: 0.0024727489799261093
loss: 0.0024716167245060205
loss: 0.002471020445227623
loss: 0.002