#### Get Twitter sentiment data:

In [1]:
import torch

In [2]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
#from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Select the compute device for Torch: CUDA when a GPU is visible, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when CUDA is available: get_device_name(0) raises
# on CPU-only machines, which would defeat the CPU fallback chosen above.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'NVIDIA GeForce RTX 2060'

In [7]:
import pandas as pd
# The Sentiment140 CSV has no header row (columns are: polarity, id, date,
# query, user, text). Without header=None pandas would consume the first
# tweet as column names and silently drop it from the data.
dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',
                      encoding="ISO-8859-1", header=None)
# keep only 50000 rows for positive and 50000 rows for negative
# (column 0 holds the polarity: 0 = negative, 4 = positive)
neg = dataset[dataset.iloc[:, 0] == 0].head(50000)
pos = dataset[dataset.iloc[:, 0] == 4].head(50000)
dataset = pd.concat([neg, pos])


### Dataset cleaning and preprocessing:

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split


In [None]:
# Show complete cell contents instead of truncating wide text columns.
pd.set_option('display.max_colwidth', None)
# Keep only the last column (tweet text) and the first column (label),
# and renumber the rows from zero.
dataset = dataset.iloc[:, [-1, 0]].reset_index(drop=True)
dataset.columns = ['text', 'sentiment']
dataset.head(3)

In [10]:
# Number of rows per sentiment value.
dataset['sentiment'].value_counts()

0    50000
4    50000
Name: sentiment, dtype: int64

In [None]:
# Create the sentiment (label) and text (input) vectors as NumPy arrays.
labels = dataset.loc[:, 'sentiment'].values
# Take an explicit copy of the texts: later cells rewrite `inputs` element by
# element, and `.values` may hand back a view, in which case those writes
# would also modify `dataset`.
inputs = dataset.loc[:, 'text'].values.copy()
inputs

In [None]:
import re
def preprocess_tweet(sent):
    """Strip Twitter-specific noise from a single tweet.

    Removes, in this order: @mentions, the retweet marker ``RT``, ``#``
    symbols (the hashtag word itself is kept), tokens starting with ``http``
    and runs of three or more dots.

    Args:
        sent: Raw tweet text.

    Returns:
        The cleaned text. Whitespace left behind by removals is not collapsed
        or stripped.
    """
    # Handles may contain underscores; without "_" in the class "@some_user"
    # would leave "_user" behind.
    sent = re.sub(r'@[A-Za-z0-9_]+', '', sent)
    # Word boundaries restrict this to the standalone "RT" marker, so words
    # that merely end in "RT" (e.g. "START now") are left intact.
    sent = re.sub(r'\bRT\b\s+', '', sent)
    sent = re.sub(r'#', '', sent)  # drop the '#' symbol, keep the tag text
    sent = re.sub(r'http\S+', '', sent)  # remove URLs
    sent = re.sub(r'\.{3,}', '', sent)  # remove ellipses (3+ dots)
    return sent

# Clean every tweet, writing the result back into the same slot of `inputs`.
for idx, tweet in enumerate(inputs):
    inputs[idx] = preprocess_tweet(tweet)
inputs

In [13]:

# Add "[CLS]" at the start of each input (for classification) and "[SEP]"
# after each "."-delimited sentence.
for i in range(len(inputs)):
    # Already marked (e.g. this cell was run twice): do not add the tokens again.
    if inputs[i].startswith("[CLS]"):
        continue
    # Drop blank segments so a trailing "." (or "..") does not produce extra,
    # back-to-back "[SEP]" tokens. An input with no text at all still gets
    # one (empty) segment.
    sentences = [s for s in inputs[i].split(".") if s.strip()] or [""]
    inputs[i] = "[CLS] " + " ".join(s + " [SEP]" for s in sentences)
   

In [None]:
# Preview elements 1 through 9 of `inputs`.
inputs[1:10]

In [None]:
# Load the lower-casing 'bert-base-uncased' tokenizer and tokenize every input.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = list(map(tokenizer.tokenize, inputs))
print ("Tokenize the first input:")
print (tokenized_texts[0:2])

In [16]:

# Length, in tokens, of the longest tokenized input (0 when there are none).
maxi = max((len(tokens) for tokens in tokenized_texts), default=0)
print(maxi)


105


In [None]:
# Map each token to its index in the BERT vocabulary.
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))
# Pad every id sequence at the end up to `maxi` entries (anything longer
# would be cut at the end as well).
input_ids = pad_sequences(input_ids, maxlen=maxi, dtype="long", padding="post", truncating="post")
print(input_ids)

In [None]:
# Attention mask per sequence: 1.0 where the token id is positive, 0.0
# elsewhere, so that only the tokens before the padding are attended to.
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]
print(attention_masks[:2])

In [19]:
# Encode the raw sentiment values as integer classes, shaped as one column.
encoder = LabelEncoder()
labels = encoder.fit_transform(labels).reshape(-1, 1)

In [20]:
# Create train and test sets (10% held out for testing).
# Ids, labels and masks are split in a single call so that all three share
# the same shuffle by construction, instead of relying on two separate calls
# that only stay aligned while their random_state/test_size are kept in sync.
(train_ids, test_ids,
 train_labels, test_labels,
 train_mask, test_mask) = train_test_split(
    input_ids, labels, attention_masks, random_state=42, test_size=0.1)

In [21]:
# Shapes of the test and train label arrays, one per line.
print(test_labels.shape, train_labels.shape, sep="\n")

(10000, 1)
(90000, 1)


In [None]:
# Turn every split (ids, labels, masks) into a torch tensor.
train_ids, test_ids = torch.tensor(train_ids), torch.tensor(test_ids)
train_labels, test_labels = torch.tensor(train_labels), torch.tensor(test_labels)
train_mask, test_mask = torch.tensor(train_mask), torch.tensor(test_mask)

In [25]:
# Batch size shared by the training and validation loaders.
batch_size = 8

# Bundle (ids, mask, labels) per example; the loaders below serve them in
# batches so the full set is not processed at once during training.
train_data = TensorDataset(train_ids, train_mask, train_labels)
validation_data = TensorDataset(test_ids, test_mask, test_labels)

# Training examples are drawn in random order, validation examples in order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

### Load BERT model and training loop:

In [None]:
try:
  import transformers
except:
  print("Installing transformers")
  !pip -qq install transformers
  
from transformers import BertModel, BertConfig,BertForSequenceClassification
# BERT configuration 
configuration = BertConfig()

# Initializing a model from the bert-base-uncased style configuration
model = BertModel(configuration)

# Accessing the model configuration
configuration = model.config
print(configuration)

In [None]:
# Load the Hugging Face BERT base uncased model with a 2-class
# sequence-classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Move the model to the device selected earlier. Unlike model.cuda(), this
# also works on machines without a GPU and stays consistent with the
# `.to(device)` calls used for the batches.
model.to(device)

In [28]:
# Optimizer grouped parameters
# Weight decay is not applied to any parameter whose name contains one of
# these substrings.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
# torch.optim.AdamW reads the per-group option `weight_decay`. An unknown key
# such as `weight_decay_rate` is accepted but ignored, which would leave every
# group on the optimizer's default decay.
optimizer_grouped_parameters = [
    # Parameters whose names match none of the `no_decay` substrings: decayed.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.1},

    # Parameters whose names match a `no_decay` substring: no decay.
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]


In [29]:
# Training hyperparameters, optimizer and learning-rate schedule
from transformers import get_linear_schedule_with_warmup
epochs = 4
# Total number of training steps: batches per epoch times number of epochs.
total_steps = epochs * len(train_dataloader)
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=2e-5,   # learning rate
    eps=1e-8,
)
# Linear learning-rate schedule over all training steps, with no warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [30]:
# Accuracy helper for the Twitter sentiment classifier
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of class indices; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    matches = predicted_classes == true_classes
    return np.sum(matches) / len(true_classes)

In [None]:
# Per-batch training loss, collected across all epochs for plotting
train_loss_set = []

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # ---- Training ----

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Cast every tensor of the batch to int64 and move it to the device
    batch = tuple(t.to(device=device, dtype=torch.long) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass; passing `labels` makes the model return the loss
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs['loss']
    # Read the scalar loss once and reuse it for the history and running sum
    batch_loss = loss.item()
    train_loss_set.append(batch_loss)
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update the learning rate.
    scheduler.step()
    # Update tracking variables
    tr_loss += batch_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
  print("Train loss: {}".format(tr_loss/nb_tr_steps))


  # ---- Validation ----

  # Put model in evaluation mode to evaluate on the validation set
  model.eval()
  # Tracking variables: accuracy is accumulated weighted by batch size
  eval_accuracy = 0
  nb_eval_steps, nb_eval_examples = 0, 0
  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Same cast and device transfer as in training, so the model sees the
    # same input dtypes in both phases
    batch = tuple(t.to(device=device, dtype=torch.long) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Move logits and labels to CPU
    logits = logits['logits'].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Weight each batch by its number of examples so that a smaller final
    # batch does not count as much as a full one
    batch_examples = label_ids.shape[0]
    eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
    nb_eval_examples += batch_examples
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_examples))

In [None]:
# Training evaluation: plot the loss values stored in `train_loss_set`.
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(train_loss_set)
ax.set_title("Training loss")
ax.set_xlabel("Batch")
ax.set_ylabel("Loss")
plt.show()

In [33]:
# Save the model: serialize the entire `model` object (not only its
# state_dict) to the file TWITTER_BERT.pt.
torch.save(model, 'TWITTER_BERT.pt')

In [34]:
# Save the model, configuration and vocabulary that we have fine-tuned
# WEIGHTS_NAME / CONFIG_NAME are taken from `transformers`, the package used
# throughout this notebook, rather than from the legacy `pytorch_transformers`.
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os
output_dir = "./models/"
# torch.save does not create missing directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)

('./models/vocab.txt',)