# Sentiment 140 Twitter Sentiment Analysis

I will be using the sentiment140 dataset from http://help.sentiment140.com/for-students/

## Setup

Get packages

In [31]:
import torch
import numpy as np
import pandas as pd
import nltk
nltk.download('words')

from tqdm import tqdm

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [32]:
# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


Get sentiment140 dataset from http://help.sentiment140.com/for-students/

In [None]:
# Installs extra dependencies; needed on Colab, comment out if they are already installed
!pip install wget
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def download_dataset(url, rename_stem='sentiment140'):
    """Fetch a zip archive from ``url`` and unpack it, skipping work already done.

    The archive is stored as ``<rename_stem>.zip`` and unpacked into
    ``<rename_stem>/``, both relative to the current working directory.
    The download is skipped when the zip path already exists and the
    extraction is skipped when the target directory already exists, so the
    function can be re-run cheaply.

    Args:
        url: Location of the zip archive to download.
        rename_stem: Base name used for the local zip file and the directory
            it is extracted into.
    """
    import os
    import zipfile
    import wget

    archive_path = f'{rename_stem}.zip'
    extract_dir = f'{rename_stem}/'

    archive_missing = not os.path.exists(archive_path)
    if archive_missing:
        wget.download(url, archive_path)

    # Nothing left to do once the extraction directory is present.
    if os.path.exists(extract_dir):
        return

    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(extract_dir)

In [None]:
DATA_URL = 'http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'
download_dataset(DATA_URL)

## Data Loading and Cleaning

In [None]:
def cleaner(tweet, valid_words=frozenset()):
    """Normalise a raw tweet into lower-cased, vocabulary-filtered text.

    Steps, in order: strip @mentions and stray ``@`` signs, strip links,
    collapse whitespace, drop ``#`` and turn ``_`` into spaces, tokenize with
    ``nltk.wordpunct_tokenize``, keep only tokens that are either
    non-alphabetic (numbers, punctuation) or whose lower-cased form is in
    ``valid_words``, and finally lower-case the result.

    Args:
        tweet: Raw tweet text.
        valid_words: Collection of accepted words, looked up with the
            lower-cased token. With the (empty) default every purely
            alphabetic token is dropped.

    Returns:
        The cleaned tweet as a single space-joined, lower-cased string.
    """
    import re

    # Remove @mentions. The character class includes '_' so that a handle such
    # as "@john_doe" is removed as a whole instead of leaving "_doe" behind,
    # which the later '_' -> ' ' replacement would turn into a stray word.
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
    tweet = re.sub('@', '', tweet)

    # Remove http/https links and anything starting with "www"
    tweet = re.sub(r'(?:\@|http?\://|https?\://|www)\S+', '', tweet)
    tweet = ' '.join(tweet.split())

    # Remove hashtag sign but keep the text
    tweet = tweet.replace('#', '').replace('_', ' ')

    # Accept only valid words; non-alphabetic tokens always pass through
    tweet = ' '.join(w for w in nltk.wordpunct_tokenize(tweet)
                     if w.lower() in valid_words or not w.isalpha())

    return tweet.lower()

In [None]:
columns = ['polarity', 'id', 'date', 'query', 'username', 'tweet']
df = pd.read_csv('./sentiment140/training.1600000.processed.noemoticon.csv',
                 encoding="ISO-8859-1", header=None, names=columns)

In [None]:
# Number of rows whose raw polarity label is 4.
int((df['polarity'] == 4).sum())

In [None]:
# cleaner() lower-cases each token before the membership test, so normalise the
# vocabulary the same way; any capitalised corpus entry could otherwise never match.
valid_words = {word.lower() for word in nltk.corpus.words.words()}
tqdm.pandas()
df['tweet'] = df['tweet'].progress_map(lambda x: cleaner(x, valid_words=valid_words))

In [None]:
# to match output convention later
df.loc[df['polarity'] == 4, 'polarity'] = 1
print(df['polarity'].unique())

In [None]:
print('duplicated:', df.duplicated().sum())
# isnull() so the per-column numbers are the missing-value counts the label promises
print('null:', df.isnull().sum(), sep='\n')

In [None]:
df.info()

In [None]:
df.sample(10)[['polarity', 'tweet']]

In [None]:
X = df['tweet'].values
y = df['polarity'].values

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2023)

## BERTTokenizer + Pytorch Dataset

In [None]:
from transformers import BertTokenizer

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
print('Loaded!')

Find the length of the longest tokenized sentence, since every sequence fed to BERT in a batch must be padded or truncated to the same length. This length includes the special tokens `[CLS]` and `[SEP]`.

In [None]:
# max_len = 0

# # For every sentence...
# for sample in tqdm(X):

#     # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
#     input_ids = tokenizer.encode(sample, add_special_tokens=True)

#     # Update the maximum sentence length.
#     max_len = max(max_len, len(input_ids))

#     del input_ids

# print('\nMax sentence length: ', max_len)

# NOTE: the scan above runs on the CPU over the whole dataset and is slow, so uncomment it only when using different or new data
max_len = 256

Use the BERT tokenizer and `max_len` to encode each sentence:

1. tokenize and map each token to its vocabulary id
2. prepend `[CLS]`
3. append `[SEP]` at the end
4. pad with `[PAD]` tokens up to `max_len`
5. build an attention mask that marks the `[PAD]` tokens so they are ignored

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class BertTokenizedTweetsDataset(Dataset):
    """Map-style dataset that BERT-tokenizes tweets lazily and caches the result.

    Each item is a tuple ``(input_ids, attention_mask, label)``. The two
    encoded tensors are returned exactly as produced by the tokenizer with
    ``return_tensors='pt'``, i.e. they keep its leading batch dimension
    (expected shape ``(1, max_len)``); ``label`` is a 0-d ``int64`` tensor.

    Tokenization relies on the module-level ``tokenizer``.

    Args:
        X_train, y_train: Texts and labels served when ``train`` is true.
        X_test, y_test: Texts and labels served when ``train`` is false.
        train: Selects which of the two splits this instance exposes.
        max_len: Length every sequence is padded or truncated to.
        max_cache_size: Maximum number of encoded samples kept in memory;
            ``0`` disables caching.
    """

    def __init__(self,
                 X_train=X_train, y_train=y_train,
                 X_test=X_test, y_test=y_test,
                 train=True, max_len=256, max_cache_size=800000):
        self.train = train
        # Honour the caller's value (this used to be hard-coded to 256).
        self.max_len = max_len

        self.X = X_train if self.train else X_test
        self.y = y_train if self.train else y_test

        # index -> (input_ids, attention_mask, label)
        self.cache = dict()
        self.max_cache_size = max_cache_size

    def __getitem__(self, index):
        """Return the encoded sample at ``index``, tokenizing it on first access."""
        cached = self.cache.get(index)
        if cached is not None:
            return cached

        X_sample = self.X[index]
        y_sample = torch.tensor(self.y[index], dtype=torch.int64)

        encoded_dict = tokenizer.encode_plus(
                        X_sample,                 # Sentence to encode.
                        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                        max_length=self.max_len,  # Pad & truncate all sentences.
                        truncation=True,
                        padding='max_length',     # Replaces deprecated pad_to_max_length=True
                        return_attention_mask=True,   # Construct attn. masks.
                        return_tensors='pt',      # Return pytorch tensors.
                    )

        item = (encoded_dict['input_ids'], encoded_dict['attention_mask'], y_sample)

        # A non-positive limit means "do not cache"; this also avoids calling
        # popitem() on an empty dict, which raises KeyError.
        if self.max_cache_size > 0:
            if len(self.cache) >= self.max_cache_size:
                # Make room by dropping the most recently inserted entry.
                self.cache.popitem()
            self.cache[index] = item

        return item

    def __len__(self):
        """Number of samples in the selected split."""
        return len(self.X)

In [None]:
train_data = BertTokenizedTweetsDataset(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train=True)
test_data = BertTokenizedTweetsDataset(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train=False)

In [None]:
def test_cached_speed(unused_index):
    """Print how much faster a cached lookup in ``train_data`` is than the first one.

    The same index is fetched twice from the module-level ``train_data``; the
    first fetch is expected to tokenize the sample and the second to hit the
    cache. The comparison is only meaningful when ``unused_index`` has not
    been accessed before.

    Args:
        unused_index: Index of a sample that is not yet in the dataset cache.
    """
    import time

    def get_time():
        # perf_counter() has far better resolution than time.time(), which can
        # report 0.0 for something as quick as a dict lookup.
        start = time.perf_counter()
        train_data[unused_index]
        return time.perf_counter() - start

    uncached = get_time()
    cached = get_time()
    # Guard against a zero reading so the ratio never raises ZeroDivisionError.
    speed_x = uncached / cached if cached > 0 else float('inf')

    print(f'On this trial, cached had {speed_x:.2f}x speed increase')

In [None]:
test_cached_speed(np.random.randint(0, len(train_data)-1))

## Transfer Learning w/ BERT

In [None]:
from torch.utils.data import random_split

# Seed the split so the train/validation partition is identical on every run
# (same seed as the random_state used for train_test_split).
train_subset, val_subset = random_split(
    train_data, [0.8, 0.2],
    generator=torch.Generator().manual_seed(2023),
)

In [None]:
BATCH_SIZE = 32

In [None]:
train_dl = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test_data, batch_size=BATCH_SIZE)

In [None]:
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

def train_step(bert, dataloader, optimizer, scheduler, epoch=0, val=False, print_batch_every=1000):
    """Run one pass over ``dataloader``, either training or evaluating ``bert``.

    Args:
        bert: Model called as ``bert(ids, attention_mask=..., labels=...,
            token_type_ids=None, return_dict=False)`` and returning
            ``(loss, logits)``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
            The encoded tensors may carry an extra singleton axis
            (``(batch, 1, seq_len)``); they are flattened to ``(batch, seq_len)``.
        optimizer: Optimizer stepped after every training batch.
        scheduler: LR scheduler stepped after every training batch.
        epoch: Epoch number, used only in the log lines.
        val: When true the model is put in eval mode, gradients are disabled
            and neither optimizer nor scheduler is stepped.
        print_batch_every: Log every this-many batches (starting with the first).

    Returns:
        ``(accuracy, loss)``: the per-batch accuracy and loss averaged over
        the batches seen; both are 0 when the dataloader yields nothing.
    """
    prepend = 'val' if val else 'train'

    if val:
        bert.eval()
    else:
        bert.train()

    # Send each batch to wherever the model's weights live, instead of
    # depending on a notebook-level `device` variable.
    model_device = next(bert.parameters()).device

    # tracking vars
    tot_accuracy = 0
    tot_loss = 0
    batch = 0

    # Never build autograd graphs during validation, even if the caller did
    # not wrap the call in torch.no_grad().
    with torch.set_grad_enabled(not val):
        for seqs, masks, labels in dataloader:

            batch += 1

            # Flatten (batch, 1, seq_len) -> (batch, seq_len). Reshaping on the
            # explicit batch size keeps the batch axis for a batch of one,
            # which a dimensionless squeeze() would collapse.
            seqs = seqs.reshape(seqs.size(0), -1).to(model_device)
            masks = masks.reshape(masks.size(0), -1).to(model_device)
            labels = labels.reshape(-1).to(model_device)

            if not val:
                # zero out grads
                bert.zero_grad()

            # get loss and preds
            loss, preds = bert(
                seqs,
                attention_mask=masks,
                labels=labels,
                token_type_ids=None,
                return_dict=False
            )

            if not val:
                loss.backward()
                # descent step + set lr
                optimizer.step()
                scheduler.step()

            batch_loss = loss.item()
            tot_loss += batch_loss

            pred_sentiment = preds.argmax(dim=1)
            correct = pred_sentiment.eq(labels).sum().item()
            accuracy = correct / labels.size(0)
            tot_accuracy += accuracy

            if (batch - 1) % print_batch_every == 0:
                print(f'epoch: {epoch}\tbatch: {batch}/{len(dataloader)}\t{prepend}_acc: {accuracy}\t{prepend}_loss: {batch_loss}')

    # Average over the batches actually seen; max() keeps an empty dataloader
    # from raising ZeroDivisionError.
    num_batches = max(batch, 1)
    tot_accuracy = tot_accuracy / num_batches
    tot_loss = tot_loss / num_batches

    print(f'{prepend} epoch: {epoch}\t{prepend}_acc: {tot_accuracy}\t{prepend}_loss: {tot_loss}')

    return tot_accuracy, tot_loss

def save(model, optimizer, scheduler, save_path='./bert_sentiment140_fine_tuned.pth'):
    """Write the state dicts of model, optimizer and scheduler to one checkpoint file.

    Args:
        model, optimizer, scheduler: Objects exposing ``state_dict()``.
        save_path: Destination file for the checkpoint.
    """
    checkpoint = dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        scheduler_state_dict=scheduler.state_dict(),
    )
    torch.save(checkpoint, save_path)

def load(model, optimizer, scheduler, load_path='./bert_sentiment140_fine_tuned.pth'):
    """Restore model, optimizer and scheduler in place from a checkpoint file.

    The checkpoint is read onto the CPU and must contain the keys
    ``model_state_dict``, ``optimizer_state_dict`` and ``scheduler_state_dict``.

    Returns:
        The same ``(model, optimizer, scheduler)`` objects, now restored.
    """
    state = torch.load(load_path, map_location='cpu')

    restore_plan = (
        (model, 'model_state_dict'),
        (optimizer, 'optimizer_state_dict'),
        (scheduler, 'scheduler_state_dict'),
    )
    for target, key in restore_plan:
        target.load_state_dict(state[key])

    return model, optimizer, scheduler

def train(bert, train_dl, val_dl, epochs=4, lr=2e-5, eps=1e-8, print_batch_every=1000, save_path='./bert_sentiment140_fine_tuned.pth'):
    """Fine-tune ``bert`` for ``epochs`` epochs, validating and checkpointing each epoch.

    An AdamW optimizer and a linear-decay schedule (no warm-up) spanning
    ``len(train_dl) * epochs`` steps are created here. A checkpoint is written
    to ``save_path`` before training starts and again after every epoch,
    overwriting the previous one.

    Args:
        bert: Model to fine-tune (see ``train_step`` for the expected call signature).
        train_dl: Dataloader used for the training pass of each epoch.
        val_dl: Dataloader used for the validation pass of each epoch.
        epochs: Number of epochs to run.
        lr: Initial learning rate for AdamW.
        eps: AdamW epsilon.
        print_batch_every: Forwarded to ``train_step`` to control log frequency.
        save_path: Checkpoint file path.

    Returns:
        ``(train_accs, train_losses, val_accs, val_losses)``: four lists with
        one entry per epoch.
    """
    # Use PyTorch's AdamW: the AdamW class shipped with transformers is
    # deprecated in favour of it. weight_decay=0.0 keeps the default of the
    # transformers implementation (torch's own default is 0.01).
    optimizer = torch.optim.AdamW(bert.parameters(), lr=lr, eps=eps, weight_decay=0.0)

    total_steps = len(train_dl) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    train_accs, train_losses, val_accs, val_losses = [], [], [], []

    # Checkpoint the starting state so save_path exists even if training is interrupted.
    save(bert, optimizer, scheduler, save_path=save_path)

    for epoch in range(epochs):

        train_accuracy, train_loss = train_step(
            bert, train_dl,
            optimizer, scheduler,
            epoch=epoch,
            print_batch_every=print_batch_every
        )

        # Validation never needs gradients.
        with torch.no_grad():
            val_accuracy, val_loss = train_step(
                bert, val_dl,
                optimizer, scheduler,
                epoch=epoch,
                print_batch_every=print_batch_every,
                val=True
            )

        train_accs.append(train_accuracy)
        train_losses.append(train_loss)
        val_accs.append(val_accuracy)
        val_losses.append(val_loss)

        save(bert, optimizer, scheduler, save_path=save_path)

    return train_accs, train_losses, val_accs, val_losses

In [None]:
# load BERT model and send to gpu
bert = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(df['polarity'].unique()),
    output_attentions=False, output_hidden_states=False,
)
bert.to(device)

In [None]:
train_accs, train_losses, val_accs, val_losses = train(bert, train_dl, val_dl, print_batch_every=100)