In [None]:
# Base libraries
import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm

# Pytorch and tokenizers
import emoji
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModel, AutoTokenizer

# Pretty progress bar: hook tqdm into pandas so pandas operations can report
# progress through the notebook widget imported above.
tqdm.pandas()

In [None]:
# Load train dataset
# Declared column types of the training CSV.
dtypes = { 'id': int, 'timestamp': int, 'user_verified': bool, 'user_statuses_count': int, 'user_followers_count': int, 
          'user_friends_count': int, 'user_mentions': str, 'urls': str, 'hashtags': str, 'text': str }
# Only the tweet text is used downstream, so parse just that column and apply
# its declared dtype. Forcing `str` keeps tweets that look numeric as strings
# instead of letting pandas infer a numeric type for them.
train_df = pd.read_csv('data/train.csv', usecols=['text'], dtype={'text': dtypes['text']})
# Keep the bare Series of tweets; later cells call list(train_df).
train_df = train_df['text']

In [None]:
# Load the BERTweet tokenizer and model from the same pretrained checkpoint
checkpoint = 'vinai/bertweet-covid19-base-cased'
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    normalization=True,
)
bertweet = AutoModel.from_pretrained(checkpoint)
# Move the model weights onto the GPU (left as the cell's last expression)
bertweet.cuda()

In [None]:
# Tokenize train tweets: every tweet is padded to the longest sequence in the
# set and truncated at 128 tokens; results come back as PyTorch tensors.
train_tokenized = tokenizer(
    list(train_df),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [None]:
# Wrap the tokenizer outputs in a dataset and batch it for the model.
# Field order matters: the inference loop reads the batch positionally as
# (input_ids, token_type_ids, attention_mask).
train_dataset = TensorDataset(
    *(train_tokenized[field] for field in ('input_ids', 'token_type_ids', 'attention_mask'))
)
train_dataloader = DataLoader(train_dataset, batch_size=2048)

In [None]:
# Run every training batch through BERTweet and keep one feature row per tweet.
# Per-batch results are gathered in a list and concatenated once at the end:
# calling torch.cat inside the loop re-copies all previously collected rows on
# every iteration.
train_batches = []

# Inference only: no autograd graph is built, so no .detach() is needed.
with torch.no_grad():
    for input_ids, token_type_ids, attention_mask in tqdm(train_dataloader):
        # Get bert's forward-pass output (inputs moved to the GPU first)
        outputs = bertweet(
            input_ids=input_ids.cuda(),
            token_type_ids=token_type_ids.cuda(),
            attention_mask=attention_mask.cuda(),
        )
        # Element 1 of the model output is the feature tensor kept here
        # (presumably the pooled sentence representation -- TODO confirm).
        # It is moved to the CPU so results do not accumulate on the GPU.
        train_batches.append(outputs[1].cpu())

# Single concatenation; fall back to an empty tensor when the loader had no batches.
train_features = torch.cat(train_batches) if train_batches else torch.tensor([])

In [None]:
# Persist the extracted training features to disk as a pickle
with open('data/train.pkl', 'wb') as train_file:
    pickle.dump(train_features, train_file)

In [None]:
# Load eval dataset
# Only the tweet text is needed, read as strings so numeric-looking tweets
# are not converted by type inference.
eval_df = pd.read_csv('data/evaluation.csv', usecols=['text'], dtype={'text': str})
# Select the text from the evaluation frame itself (not from the training data).
eval_df = eval_df['text']

In [None]:
# Tokenize eval tweets: every tweet is padded to the longest sequence in the
# set and truncated at 128 tokens; results come back as PyTorch tensors.
eval_tokenized = tokenizer(
    list(eval_df),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [None]:
# Wrap the eval tokenizer outputs in a dataset and batch it for the model.
# Field order matters: the inference loop reads the batch positionally as
# (input_ids, token_type_ids, attention_mask).
eval_dataset = TensorDataset(
    *(eval_tokenized[field] for field in ('input_ids', 'token_type_ids', 'attention_mask'))
)
eval_dataloader = DataLoader(eval_dataset, batch_size=2048)

In [None]:
# Run every evaluation batch through BERTweet and keep one feature row per tweet.
# Per-batch results are gathered in a list and concatenated once at the end:
# calling torch.cat inside the loop re-copies all previously collected rows on
# every iteration.
eval_batches = []

# Inference only: no autograd graph is built, so no .detach() is needed.
with torch.no_grad():
    for input_ids, token_type_ids, attention_mask in tqdm(eval_dataloader):
        # Get bert's forward-pass output (inputs moved to the GPU first)
        outputs = bertweet(
            input_ids=input_ids.cuda(),
            token_type_ids=token_type_ids.cuda(),
            attention_mask=attention_mask.cuda(),
        )
        # Element 1 of the model output is the feature tensor kept here
        # (presumably the pooled sentence representation -- TODO confirm).
        # It is moved to the CPU so results do not accumulate on the GPU.
        eval_batches.append(outputs[1].cpu())

# Single concatenation; fall back to an empty tensor when the loader had no batches.
eval_features = torch.cat(eval_batches) if eval_batches else torch.tensor([])

In [None]:
# Persist the extracted evaluation features to disk as a pickle
with open('data/evaluation.pkl', 'wb') as eval_file:
    pickle.dump(eval_features, eval_file)