# FISP Twitter Projects Notebook

This notebook contains the latest code for pulling tweets and for cleaning and manipulating the dataset.

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the notebook
# can read the tweet CSVs and write model files there. The call is interactive:
# it asks for an authorization code before the mount completes.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# install
!pip install pytorch-pretrained-bert
!pip install transformers

# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from pytorch_pretrained_bert import BertAdam
from transformers import BertTokenizer, get_linear_schedule_with_warmup, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import os
import numpy as np
import matplotlib.pyplot as plt
import re
import time
import datetime
import random
% matplotlib inline

# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only ask for the GPU name when a GPU exists: get_device_name(0) raises on a
# CPU-only runtime, which would defeat the CPU fallback selected above.
if torch.cuda.is_available():
    print('Using GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Seed every random number generator used by the notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) so that runs are reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 22.4MB/s eta 0:00:01[K     |█████▎                          | 20kB 2.2MB/s eta 0:00:01[K     |████████                        | 30kB 2.9MB/s eta 0:00:01[K     |██████████▋                     | 40kB 2.1MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 2.4MB/s eta 0:00:01[K     |███████████████▉                | 61kB 2.8MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 3.0MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 3.2MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 3.6MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 3.4MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 3.4MB/s eta 0:00:01[K     |██████████████████████

Using TensorFlow backend.


Loading BERT tokenizer...


HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
def format_time(elapsed):
    """Render a duration given in seconds as a clock-style string.

    The value is rounded to the nearest whole second and formatted the way
    ``datetime.timedelta`` prints itself, e.g. ``3661.4`` -> ``'1:01:01'``
    and ``90000`` -> ``'1 day, 1:00:00'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return '{}'.format(duration)

In [0]:
# Maps three-character garbled sequences (they appear to be punctuation that
# was decoded with the wrong character set) to plain ASCII punctuation.
decoder_dict = {"‚Äù": '"',
                "‚Äú": '"',
                "‚Ä¶": '...',
                "‚Äô": "'",
                "‚Äò": "'",
                "‚Äì": "-",
                "‚Äî": "-"}

# Stand-alone tokens that are re-attached to the previous word with an
# apostrophe ("don" + "t" -> "don't").
_CONTRACTION_SUFFIXES = ('t', 's', 'd', 'll', 've', 're', 'm')

# Compiled once at definition time instead of once per word.
_LINK_PATTERN = re.compile(r'htt.*[^\s]*')
_AMP_PATTERN = re.compile('&amp;')
_DISALLOWED_PATTERN = re.compile("[^a-zA-Z0-9_.,:']+")


def cleaner(txt):
    """Clean one tweet and return the cleaned text, or NaN if nothing is left.

    The text is split on whitespace and every word goes through these steps:

    1. Each garbled sequence centred on 'Ä' is replaced using ``decoder_dict``;
       unknown sequences become a single space.
    2. A word that is exactly a contraction suffix ('t', 's', 'll', ...) is
       glued onto the previous word with an apostrophe.
    3. Anything starting at 'htt' (links) is blanked, '&amp;' becomes 'and',
       and runs of characters outside ``a-zA-Z0-9_.,:'`` become a space.

    Args:
        txt: the raw tweet text.

    Returns:
        The cleaned string, or ``np.nan`` when ``txt`` is not a string or when
        the cleaned result is empty or just ``'...'``.
    """
    # Missing values (e.g. NaN floats from pandas) have nothing to clean.
    if not isinstance(txt, str):
        return np.nan

    words = txt.split()
    for i, word in enumerate(words):
        while 'Ä' in word:
            char_i = word.index('Ä')
            # Clamp at 0: with 'Ä' as the first character a negative slice
            # start would wrap around and the loop would never terminate.
            start = max(char_i - 1, 0)
            to_decode = word[start:char_i + 2]
            word = word[:start] + decoder_dict.get(to_decode, " ")\
                + word[char_i + 2:]
        # Only join when there is a previous word; at i == 0 the index i - 1
        # would silently refer to the LAST word of the tweet.
        if i > 0 and word in _CONTRACTION_SUFFIXES:
            words[i] = words[i - 1] + "'" + word
            words[i - 1] = ''
            continue
        word = _LINK_PATTERN.sub(' ', word)
        word = _AMP_PATTERN.sub('and', word)
        word = _DISALLOWED_PATTERN.sub(' ', word)
        words[i] = word.strip()

    cleaned = ' '.join(words).strip()
    if cleaned == '...' or cleaned == '':
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.
        return np.nan
    return cleaned

def bert_tokenize_f(sent, tokenizer):
    """Encode a single sentence into token IDs with the given tokenizer.

    Calls ``tokenizer.encode`` with ``add_special_tokens=True`` (which, for the
    BERT tokenizer, asks for the '[CLS]' prefix and '[SEP]' suffix) and returns
    whatever it produces. Truncation (``max_length``) and tensor conversion
    (``return_tensors``) are deliberately not requested here because padding
    is applied in a later step.
    """
    encode_options = {'add_special_tokens': True}
    return tokenizer.encode(sent, **encode_options)
  
def pad_sequences_f(token_arr, maxlen, dtype="long",
                    value=0, truncating="post", padding="post"):
    """Pad/truncate one token sequence via ``pad_sequences``.

    ``pad_sequences`` works on a batch of sequences, so ``token_arr`` is
    wrapped in a one-element list and the single resulting row is returned.
    All keyword arguments are forwarded unchanged.
    """
    single_item_batch = [token_arr]
    padded_batch = pad_sequences(
        single_item_batch,
        maxlen=maxlen,
        dtype=dtype,
        value=value,
        truncating=truncating,
        padding=padding,
    )
    return padded_batch[0]

In [0]:
# Folder holding the tweet CSVs, relative to the working directory. Keep the
# trailing slash: file names are appended to it with plain string `+`.
data_dir = 'drive/My Drive/tweets_data_collection/'

# File names of the two coded tweet datasets that get merged below
# (presumably the earlier training set and the newer March 2020 batch).
prev_data_tweets = 'training_data_prezthenstate.csv'
new_data_tweets = 'CodedStateTweetsMarch2020.csv'

In [0]:
# Columns to keep from the newer dataset.
prev_cols = ['text', 'factual_claim', 'sentiment', 'ideology', 'political',
            'immigration', 'macroeconomics', 'national_security',
            'crime_law_enforcement', 'civil_rights', 'environment', 'education',
            'healthcare', 'no_policy_content', 'asks_for_donation',
            'ask_to_watch_read_share_follow_s', 'ask_misc', 'governance', 'id']

# Newer column names -> the names used by the older dataset.
rename_to_old = {'ask_to_watch_read_share_follow_s': 'ask_to_etc',
                 'macroeconomics': 'macroeconomic',
                 'crime_law_enforcement': 'crime',
                 'healthcare': 'health_care',
                 'id': 'index'}


def _recode_minus_one(label):
    """Map the code -1 to 2 and leave every other value untouched."""
    return 2 if label == -1 else label


# Older dataset, minus two columns that are not used.
prev_data_df = (
    pd.read_csv(data_dir + prev_data_tweets)
    .drop(columns=['state', 'Unnamed: 21'])
)

# Newer dataset, restricted to the shared columns and renamed to match.
new_data_df = (
    pd.read_csv(data_dir + new_data_tweets)[prev_cols]
    .rename(columns=rename_to_old)
)
# `opinion` is the complement of `factual_claim` (1 when the claim flag is falsy).
new_data_df['opinion'] = new_data_df['factual_claim'].apply(
    lambda claim: 0 if claim else 1)

# Stack both datasets and remove exact duplicate rows.
data_df = pd.concat([prev_data_df, new_data_df],
                    ignore_index=True, sort=False).drop_duplicates()

# Fill gaps with 0 and fold the -1 code into 2 for the three coded columns.
data_df['index'] = data_df['index'].fillna(0)
for coded_col in ('sentiment', 'political', 'ideology'):
    data_df[coded_col] = data_df[coded_col].fillna(0).apply(_recode_minus_one)

In [7]:
policy_list = ['immigration', 'macroeconomic', 'national_security', 'crime',
               'civil_rights', 'environment', 'education', 'health_care',
               'governance', 'no_policy_content']
ask_list = ['no_ask', 'asks_for_donation', 'ask_to_etc', 'ask_misc']

# Keep only tweets whose policy flags do not sum to zero, then drop every row
# that still has a missing value. A boolean mask replaces the former row-wise
# `apply`, which returned a bare NaN for rejected rows and therefore depended
# on the first row being accepted to produce a DataFrame at all.
has_policy_label = data_df[policy_list].sum(axis=1) != 0
data_df = data_df[has_policy_label].dropna()

# `opinion` and `factual_claim` must disagree; rows where they match are
# discarded.
data_df = data_df[data_df['opinion'] != data_df['factual_claim']]

# Discard tweets coded with more than one kind of ask. The conditions are
# combined with `&` on the same frame instead of chained `df[a][b]` indexing,
# which made pandas reindex the second mask and emit a warning.
is_donation_ask = data_df['asks_for_donation'] == 1
is_etc_ask = data_df['ask_to_etc'] == 1
is_misc_ask = data_df['ask_misc'] == 1
has_conflicting_asks = ((is_donation_ask & is_etc_ask)
                        | (is_misc_ask & is_etc_ask)
                        | (is_donation_ask & is_misc_ask))
# .copy() so that adding a column below does not write into a view.
data_df = data_df[~has_conflicting_asks].copy()

# `no_ask` is 1 exactly when none of the three ask flags is set.
data_df['no_ask'] = (
    data_df[['asks_for_donation', 'ask_to_etc', 'ask_misc']]
    .eq(0)
    .all(axis=1)
    .astype(int)
)

  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [0]:
def one_hot_decoder(row, one_hot_cols):
    """Return the position of the first column in ``one_hot_cols`` set to 1.

    ``row`` is indexed by column name. Columns are checked in the order given;
    ``None`` is returned when none of them equals 1.
    """
    hits = (position for position, column in enumerate(one_hot_cols)
            if row[column] == 1)
    return next(hits, None)
# Collapse each group of one-hot columns into a single integer class column.
for target_col, source_cols in (('policies', policy_list),
                                ('ask_requests', ask_list)):
    data_df[target_col] = data_df[source_cols].apply(
        one_hot_decoder, axis=1, args=(source_cols,))

In [9]:
# Show the last ten raw tweets, clean every tweet with `cleaner`, then show the
# same tail again so the effect of the cleaning can be compared by eye.
print('Pre-cleaning:')
print(data_df['text'].tail(10))
data_df['cleaned_text'] = data_df['text'].apply(cleaner)
print('Post-cleaning:')
print(data_df['cleaned_text'].tail(10))
# Drop every row that has a missing value in any column; this includes tweets
# for which `cleaner` returned NaN because nothing was left after cleaning.
data_df = data_df.dropna()

Pre-cleaning:
11991    Congratulations Lonnie Peppler-Moyer on 22 yea...
11992    #GatorNation is everywhere. Including Clay Cou...
11993                                       @mikevorel *if
11994    Costliest fires in California history now 100%...
11995    RT @NEENZ: You're invited to the 2011 Clean En...
11996    RT @pivotok: We are so thankful to Sen. @steph...
11997    Today, I would like to share the first in a se...
11999                            … https://t.co/i27ojhuphW
12000    Of course I don't think poorly of educators bu...
12001    ...another costly waste of taxpayer money and ...
Name: text, dtype: object
Post-cleaning:
11991    Congratulations Lonnie Peppler Moyer on 22 yea...
11992     GatorNation is everywhere. Including Clay County
11993                                         mikevorel if
11994    Costliest fires in California history now 100 ...
11995    RT NEENZ: You're invited to the 2011 Clean Ene...
11996    RT pivotok: We are so thankful to Sen. stephan...
1

In [10]:
# Encode each cleaned tweet into BERT token IDs with `bert_tokenize_f`, and
# print the last ten rows before and after so the conversion can be inspected.
print('Pre-tokenize:')
print(data_df['cleaned_text'].tail(10))
data_df['tokenized_text'] = data_df['cleaned_text'].apply(bert_tokenize_f, args=(bert_tokenizer,))
print('Post-tokenize:')
print(data_df['tokenized_text'].tail(10))

Pre-tokenize:
11990    rzcc2578 JamesRKernIII It's the vile automated...
11991    Congratulations Lonnie Peppler Moyer on 22 yea...
11992     GatorNation is everywhere. Including Clay County
11993                                         mikevorel if
11994    Costliest fires in California history now 100 ...
11995    RT NEENZ: You're invited to the 2011 Clean Ene...
11996    RT pivotok: We are so thankful to Sen. stephan...
11997    Today, I would like to share the first in a se...
12000    Of course I don't think poorly of educators bu...
12001    ...another costly waste of taxpayer money and ...
Name: cleaned_text, dtype: object
Post-tokenize:
11990    [101, 1054, 2480, 9468, 17788, 2581, 2620, 250...
11991    [101, 23156, 8840, 23500, 2063, 27233, 10814, ...
11992    [101, 11721, 4263, 9323, 2003, 7249, 1012, 216...
11993                  [101, 3505, 14550, 2884, 2065, 102]
11994    [101, 3465, 21292, 8769, 1999, 2662, 2381, 208...
11995    [101, 19387, 7663, 14191, 1024, 2017, 1005,

In [11]:
# Longest tokenized tweet, counted in tokens.
longest_token_count = data_df['tokenized_text'].apply(len).max()
print('Max sentence length: ', longest_token_count)

Max sentence length:  48


In [12]:
# Set the maximum sequence length.
# NOTE(review): this comment used to say 64 was chosen, but the value actually
# used is 280, far above the longest tokenized sentence reported by the
# previous cell (48). Presumably 280 mirrors the tweet character limit --
# confirm, since every sequence is padded out to this length.
MAX_LEN = 280

print('Padding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(bert_tokenizer.pad_token,
                                               bert_tokenizer.pad_token_id))

print('Pre-padding:')
print(data_df['tokenized_text'].tail(10))

# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning (these are the defaults of `pad_sequences_f`).
data_df['padded_tokenized_text'] = \
    data_df['tokenized_text'].apply(pad_sequences_f, args=(MAX_LEN,))

print('Post-padding:')
print(data_df['padded_tokenized_text'].tail(10))

Padding/truncating all sentences to 280 values...

Padding token: "[PAD]", ID: 0
Pre-padding:
11990    [101, 1054, 2480, 9468, 17788, 2581, 2620, 250...
11991    [101, 23156, 8840, 23500, 2063, 27233, 10814, ...
11992    [101, 11721, 4263, 9323, 2003, 7249, 1012, 216...
11993                  [101, 3505, 14550, 2884, 2065, 102]
11994    [101, 3465, 21292, 8769, 1999, 2662, 2381, 208...
11995    [101, 19387, 7663, 14191, 1024, 2017, 1005, 21...
11996    [101, 19387, 14255, 22994, 6559, 1024, 2057, 2...
11997    [101, 2651, 1010, 1045, 2052, 2066, 2000, 3745...
12000    [101, 1997, 2607, 1045, 2123, 1005, 1056, 2228...
12001    [101, 1012, 1012, 1012, 2178, 17047, 5949, 199...
Name: tokenized_text, dtype: object
Post-padding:
11990    [101, 1054, 2480, 9468, 17788, 2581, 2620, 250...
11991    [101, 23156, 8840, 23500, 2063, 27233, 10814, ...
11992    [101, 11721, 4263, 9323, 2003, 7249, 1012, 216...
11993    [101, 3505, 14550, 2884, 2065, 102, 0, 0, 0, 0...
11994    [101, 3465, 21292, 87

In [13]:
def _attention_mask(token_ids):
    """Return 1 for every token ID greater than 0 and 0 for the rest."""
    return [1 if token_id > 0 else 0 for token_id in token_ids]


# Build one attention mask per padded sequence: positions holding the padding
# ID 0 get a 0, every other position gets a 1.
data_df['att_masks'] = data_df['padded_tokenized_text'].apply(_attention_mask)
print('Attention masks')
data_df['att_masks'].tail(10)

Attention masks


11990    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
11991    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
11992    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...
11993    [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
11994    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
11995    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
11996    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
11997    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
12000    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
12001    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
Name: att_masks, dtype: object

In [0]:
# Every label column a classifier is trained for: the coded columns, the two
# decoded class columns, and the individual one-hot policy / ask flags.
y_cols = ['sentiment', 'political', 'ideology', 'policies', 'ask_requests',
          'factual_claim', *policy_list, *ask_list]

# Model inputs (padded token IDs and attention masks) and integer labels.
X_inputs, X_att_masks = data_df['padded_tokenized_text'], data_df['att_masks']
y = data_df[y_cols].astype(int)

In [0]:
# Use 80% for training and 20% for test. Token IDs, attention masks and labels
# are split in a single call so all three share the same row assignment.
(X_train_inputs, X_test_inputs,
 X_train_masks, X_test_masks,
 y_train_labels, y_test_labels) = train_test_split(
    X_inputs, X_att_masks, y,
    test_size=0.2,
    random_state=seed_val)

In [16]:
# Folder (inside the data folder) where the tokenizer and the trained models
# are written.
output_dir = data_dir + 'model_save/'

# Create the output directory if needed. `exist_ok=True` replaces the separate
# existence check, so there is no window between checking and creating.
os.makedirs(output_dir, exist_ok=True)

# Store the tokenizer files next to the models; the returned file paths are
# displayed as the cell output.
bert_tokenizer.save_pretrained(output_dir)

('drive/My Drive/tweets_data_collection/model_save/vocab.txt',
 'drive/My Drive/tweets_data_collection/model_save/special_tokens_map.json',
 'drive/My Drive/tweets_data_collection/model_save/added_tokens.json')

In [17]:
# Sanity check: number of examples in the training and test splits.
print(X_train_inputs.shape)
print(X_test_inputs.shape)

(9167,)
(2292,)


In [0]:
def get_classification_report_improper(pred_df):
    """Print a classification report from predictions that were stored as text.

    For every row, `x[0]` is treated as a string: it is split wherever a
    closing bracket is followed by a newline, the square brackets are removed
    and each piece is split on whitespace, giving one list of score strings
    per prediction. Reads the notebook-level names `curr_y_col` and
    `y_test_labels`.

    NOTE(review): presumably this exists to parse a predictions.csv in which
    each cell holds the printed form of a whole batch of logits -- confirm
    against the file that was actually written.
    """
    pred_series = pred_df.apply(lambda x:
                                [pred.replace('[', '').replace(']', '').split()
                                  for pred in x[0].split(']\n')],
                                axis=1)
    # Flatten the per-row lists into one list with an entry per prediction.
    pred_flatten = []
    for lst in pred_series:
        pred_flatten.extend(lst)
    # NOTE(review): the parsed scores are still strings here, so np.argmax
    # looks like it compares them as text, and the trailing `[0]` keeps only
    # the first element of the argmax result -- verify this is intended.
    pred_series = pd.DataFrame(pred_flatten).apply(np.argmax, axis=1)[0]
    print('========================================')
    print(' Classification Report for', curr_y_col)
    print('========================================')
    print(classification_report(y_test_labels[curr_y_col], pred_series))

def get_classification_report(pred_df):
    """Print a classification report for the label column being processed.

    The predicted class of each row of ``pred_df`` is the label of the column
    holding the row's largest value. The true labels and the column name come
    from the notebook-level ``y_test_labels`` and ``curr_y_col``.
    """
    predicted_labels = pred_df.idxmax(axis=1)
    rule = '========================================'
    header_lines = [rule, ' Classification Report for ' + str(curr_y_col), rule]
    print('\n'.join(header_lines))
    print(classification_report(y_test_labels[curr_y_col], predicted_labels))

In [0]:
# Print a report for every label column from its saved predictions file.
# The parsing and printing used to be copied inline here; it is the body of
# `get_classification_report_improper`, so call that instead. The helper reads
# the loop variable `curr_y_col` from the notebook namespace.
for curr_y_col in y_cols:
    pred_df = pd.read_csv(output_dir + curr_y_col + '/predictions.csv', index_col=0)
    get_classification_report_improper(pred_df)

In [19]:
# Train and evaluate one BERT classifier per label column in `y_cols`.

# The token IDs and attention masks are identical for every label column, so
# convert them to tensors once up front instead of once per column.
X_train_inputs_tensor = torch.tensor(X_train_inputs.to_list())
X_test_inputs_tensor = torch.tensor(X_test_inputs.to_list())

X_train_masks_tensor = torch.tensor(X_train_masks.to_list())
X_test_masks_tensor = torch.tensor(X_test_masks.to_list())

# For fine-tuning BERT on a specific task, the authors recommend a batch size
# of 16 or 32 and between 2 and 4 training epochs.
batch_size = 32
epochs = 4

for curr_y_col in y_cols:
    curr_y_train_labels, curr_y_test_labels =\
        y_train_labels[curr_y_col], y_test_labels[curr_y_col]

    # Labels of the current column as torch tensors.
    curr_y_train_labels_tensor = torch.tensor(curr_y_train_labels.to_list())
    curr_y_test_labels_tensor = torch.tensor(curr_y_test_labels.to_list())

    # Training batches are drawn in random order.
    train_data = TensorDataset(X_train_inputs_tensor, X_train_masks_tensor, curr_y_train_labels_tensor)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Test batches keep their order so predictions line up with the labels.
    test_data = TensorDataset(X_test_inputs_tensor, X_test_masks_tensor, curr_y_test_labels_tensor)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    # Load BertForSequenceClassification, the pretrained BERT model with a
    # single linear classification layer on top.
    model = BertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path = "bert-base-uncased", # 12-layer BERT model, uncased vocab.
        # Size the output layer from the largest label id rather than the
        # number of distinct values: labels index the output units, so a
        # column whose ids are not contiguous (e.g. only 0 and 2 present)
        # still needs max + 1 units.
        num_labels = int(y[curr_y_col].max()) + 1,
    )

    # Run the model on the device selected at the top of the notebook (the
    # GPU when available) instead of unconditionally calling `.cuda()`.
    model.to(device)

    # Note: the optimizer is BertAdam from pytorch_pretrained_bert, not the
    # AdamW class from the huggingface transformers library.
    # NOTE(review): BertAdam's epsilon argument may be named `e` rather than
    # `eps`; if so the value below has no effect -- confirm against the
    # installed pytorch-pretrained-bert version.
    optimizer = BertAdam(model.parameters(),
                      lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                      eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                    )

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)

    # Store the average loss after each epoch so we can plot them.
    loss_values = []

    # For each epoch...
    for epoch_i in range(epochs):

        # Logits of the test set for this epoch, one array per batch.
        predictions = []

        print('========================================')
        print('          Training for', curr_y_col)
        print('========================================')

        # ========================================
        #               Training
        # ========================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_loss = 0

        # Put the model into training mode. `train` only changes the *mode*
        # (dropout behaves differently in training vs. evaluation); it does
        # not perform any training itself.
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # `batch` contains three pytorch tensors, copied to the device:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Clear previously accumulated gradients before the backward pass;
            # PyTorch does not do this automatically.
            model.zero_grad()

            # Forward pass. Because `labels` is provided, the first element of
            # the returned tuple is the loss.
            outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
            loss = outputs[0]

            # Accumulate the training loss over all batches so the average can
            # be reported at the end of the epoch.
            total_loss += loss.item()

            # Backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0 to help prevent the
            # "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update the parameters, then the learning rate.
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

        # ========================================
        #               Test
        # ========================================
        # After each training epoch, measure performance on the test set.

        print("")
        print("Running test eval...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave
        # differently during evaluation.
        model.eval()

        for batch in test_dataloader:

            # Only the inputs are needed on the device; the true labels are
            # taken from `curr_y_test_labels` after the loop.
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)

            # No gradients are needed for evaluation, which saves memory and
            # time. Without `labels` the first output element is the logits.
            with torch.no_grad():
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)

            # The "logits" are the output values prior to applying an
            # activation function like the softmax. Move them to the CPU.
            logits = outputs[0]
            predictions.append(logits.detach().cpu().numpy())

        # Score the whole test set at once. Averaging per-batch F1 values
        # weights a short final batch like a full one, and f1_score expects
        # (y_true, y_pred) in that order: the 'weighted' average uses the
        # class counts of its first argument.
        all_logits = np.concatenate(predictions)
        test_f1_score = f1_score(curr_y_test_labels.to_numpy(),
                                 np.argmax(all_logits, axis=1),
                                 average='weighted')

        # Report the final score for this test run.
        print("  F1-Score: {0:.2f}".format(test_f1_score))
        pred_df = pd.DataFrame(all_logits)
        get_classification_report(pred_df)
        print("  Test took: {:}".format(format_time(time.time() - t0)))

    print("")
    print("Training complete for", curr_y_col)

    curr_output_dir = output_dir + curr_y_col + "/"
    # Create output directory if needed
    os.makedirs(curr_output_dir, exist_ok=True)
    # Save the trained model and its configuration using `save_pretrained()`;
    # it can then be reloaded using `from_pretrained()`. Unwrap `.module`
    # first so a model wrapped for parallel training is saved correctly. The
    # model is written once (a second, direct `model.save_pretrained` call
    # was redundant and would not work for a wrapped model).
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(curr_output_dir)
    # Logits of the last epoch's test pass.
    pred_df.to_csv(curr_output_dir + 'predictions.csv')

    # Good practice: save your training arguments together with the trained model
    # torch.save(args, os.path.join(curr_output_dir, 'training_args.bin'))

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




t_total value of -1 results in schedule not being applied


          Training for sentiment

Training...


RuntimeError: ignored

In [0]:
# Label column used by the single-model cells that follow.
desired_y_col = 'sentiment'
y_des_train_labels = y_train_labels[desired_y_col]
y_des_test_labels = y_test_labels[desired_y_col]

In [0]:
def _as_tensor(values):
    """Return ``values`` as a torch tensor.

    Objects that already are tensors are returned unchanged; anything else is
    converted through its ``to_list()`` method (as pandas Series provide).
    """
    if torch.is_tensor(values):
        return values
    return torch.tensor(values.to_list())


# Convert all inputs and labels into torch tensors, the required datatype
# for our model. The names are rebound in place, so the conversion skips
# anything that is already a tensor; running this cell a second time used to
# fail because tensors have no `to_list` method.
X_train_inputs = _as_tensor(X_train_inputs)
X_test_inputs = _as_tensor(X_test_inputs)

y_des_train_labels = _as_tensor(y_des_train_labels)
y_des_test_labels = _as_tensor(y_des_test_labels)

X_train_masks = _as_tensor(X_train_masks)
X_test_masks = _as_tensor(X_test_masks)

In [0]:
# Sanity check: shapes of the training inputs, labels and attention masks.
print(X_train_inputs.shape)
print(y_des_train_labels.shape)
print(X_train_masks.shape)

torch.Size([7863, 64])
torch.Size([7863])
torch.Size([7863, 64])


In [0]:
 # The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

# Number of examples per batch for both dataloaders.
batch_size = 32

# Create the DataLoader for our training set.
# RandomSampler: training batches are drawn in random order.
train_data = TensorDataset(X_train_inputs, X_train_masks, y_des_train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our test set.
# SequentialSampler: test examples are always visited in the same order.
test_data = TensorDataset(X_test_inputs, X_test_masks, y_des_test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [0]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path = "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    # Size the output layer from the largest label id rather than the number
    # of distinct values: labels index the output units, so ids that are not
    # contiguous still need max + 1 units.
    num_labels = int(y[desired_y_col].max()) + 1,
    # output_attentions = False, # Whether the model returns attentions weights.
    # output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Run the model on the device selected at the top of the notebook (the GPU
# when available) instead of unconditionally calling `.cuda()`, which fails
# on a CPU-only runtime. `.to` returns the model, so it is still displayed.
model.to(device)

 34%|███▍      | 140331008/407873900 [00:05<00:09, 29629428.10B/s]

In [0]:
# Collect the model's parameters as (name, tensor) pairs.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Heading and slice of `params` for each group that gets listed.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

# One line per parameter: its name, then its shape as a tuple.
for heading, section_params in sections:
    print(heading)
    for param_name, param in section_params:
        print("{:<55} {:>12}".format(param_name, str(tuple(param.size()))))

In [0]:
# Note: the optimizer used here is BertAdam from pytorch_pretrained_bert, not
# the AdamW class from the huggingface transformers library.
# NOTE(review): BertAdam's epsilon argument may be named `e` rather than `eps`;
# if so the `eps` value below has no effect -- confirm against the installed
# pytorch-pretrained-bert version.
optimizer = BertAdam(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [0]:
# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler: no warmup steps, scheduled over
# `total_steps` optimizer steps.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)