In [9]:
import os

import torch
from transformers import AutoModel, AutoTokenizer
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Single source of truth for the checkpoint so the model and its tokenizer
# cannot drift apart.
BERTWEET_CHECKPOINT = "vinai/bertweet-base"

# Encoder used to produce the tweet embeddings.
bertweet = AutoModel.from_pretrained(BERTWEET_CHECKPOINT)

# For transformers v4.x+:
tokenizer = AutoTokenizer.from_pretrained(
    BERTWEET_CHECKPOINT,
    use_fast=False,
    normalization=True,
)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Read the tweet export and discard every row that has a missing value
# (some rows come in as blank, so they need to be dropped).
tweets = pd.read_csv('all_china_full.csv').dropna()

In [None]:
def batch_tokenize_and_embed(tweets, batch_size = 5, max_length = 130):
    """Embed tweets by running them through the model in small batches.

    Uses the notebook-level ``tokenizer`` and ``bertweet`` objects.

    Args:
        tweets: Sequence of tweet strings; it is sliced as ``tweets[a:b]``.
        batch_size: Number of tweets sent through the model per forward pass.
        max_length: Token length every tweet is padded or truncated to.

    Returns:
        The ``pooler_output`` tensors of all batches concatenated along the
        first dimension, in input order. When ``tweets`` is empty, an empty
        ``torch.Tensor()`` is returned.
    """
    # Collect per-batch results and concatenate once at the end; calling
    # torch.cat inside the loop re-copies every earlier embedding each time.
    pooled_batches = []

    for i in range(0, len(tweets), batch_size):
        # Slicing already clamps at the end of the sequence.
        batch = list(tweets[i : i + batch_size])
        print("Processing chunk " + str(i) + " to " + str(i + len(batch)))

        # truncation=True is required alongside padding='max_length':
        # otherwise a tweet longer than max_length tokens stays over-length,
        # and the batch cannot be stacked into one rectangular tensor.
        tokens = tokenizer(
            batch,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )

        # Inference only, so skip gradient tracking.
        with torch.no_grad():
            outputs = bertweet(**tokens)

        pooled_batches.append(outputs.pooler_output) #pooler_output is an embedding for the entire tweet

    if not pooled_batches:
        # Preserve the original result for empty input.
        return torch.Tensor()

    return torch.cat(pooled_batches)

In [None]:
def embed_and_store(tweets, chunk_size=1000, output_dir="embeddings"):
    """Embed tweets chunk by chunk and save each chunk to its own file.

    Each chunk is passed to ``batch_tokenize_and_embed`` and the result is
    written with ``torch.save`` to
    ``<output_dir>/all_china_embedding_<offset>.pt``, where ``<offset>`` is
    the index of the chunk's first tweet within ``tweets``.

    Args:
        tweets: Sequence of tweet strings; it is sliced as ``tweets[a:b]``.
        chunk_size: Number of tweets stored per output file.
        output_dir: Directory the files are written to; created if missing.
    """
    # torch.save does not create missing parent directories, so make sure
    # the target exists before the first (expensive) chunk is embedded.
    os.makedirs(output_dir, exist_ok=True)

    for i in range(0, len(tweets), chunk_size):
        # Slicing already clamps at the end of the sequence.
        batch = tweets[i : i + chunk_size]
        filename = os.path.join(output_dir, "all_china_embedding_" + str(i) + ".pt")
        torch.save(batch_tokenize_and_embed(batch), filename)

In [None]:
# Embed the whole `text` column and write the results to disk in chunks.
tweet_texts = tweets["text"].tolist()
embed_and_store(tweet_texts)