In [70]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import random
from transformers import BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
import time
import datetime
#import logging
#import matplotlib.pyplot as plt
#import seaborn as sns
#from sklearn.model_selection import train_test_split
# keras.preprocessing.sequence import pad_sequences
#from torch.utilis.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
#import time

In [20]:
# Load the labelled training tweets. With header=0 the file's first row is
# consumed as the header and the column labels are replaced by `names`.
df = pd.read_csv(
    "../data/train.csv",
    sep=",",
    header=0,
    names=["id", "keyword", "location", "text", "target"],
)

In [24]:
# Peek at the first five rows (five is the default for head()).
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [25]:
# Report the (rows, columns) shape of the loaded training frame.
print("dimensions of training data:", df.shape)

dimensions of training data: (7613, 5)


In [26]:
# Count the rows whose target is 1 (tweets labelled as describing a disaster).
print("the number of sentences seen as disaster telling:", len(df[df.target == 1]))

the number of sentences seen as disaster telling: 3271


In [47]:
# Count the rows where the `keyword` column is missing.
print("the number of observations of keyword == naN is:", len(df[df.keyword.isna()]))

the number of observations of keyword == naN is: 61


In [69]:
# Show the text of five randomly drawn tweets with target == 1.
df.loc[df.target == 1, ['text']].sample(5)

Unnamed: 0,text
6873,80 @UChicago faculty members pushing universit...
5094,Fukushima Nuclear Disaster | Increased Thyroid...
2812,DISASTER AVERTED: Police kill gunman with 'hoa...
7305,@Jennife29916207 I was thinking about you toda...
2882,We happily support mydrought a project bringi...


In [49]:
# Pull the raw tweet strings and their labels out of the frame as NumPy arrays.
texts = df["text"].values
targets = df["target"].values

In [54]:
from transformers import BertTokenizer

# WordPiece tokenizer matching the uncased BERT checkpoint used for the model;
# do_lower_case=True lower-cases the input before tokenizing.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Downloading: 100%|██████████| 232k/232k [00:00<00:00, 488kB/s]


In [57]:
# Show one raw tweet next to its tokenization to see how the tokenizer
# splits handles, punctuation and contractions.
example_text = texts[150]
print(example_text)
print(tokenizer.tokenize(example_text))

@mickinyman @TheAtlantic That or they might be killed in an airplane accident in the night a car wreck! Politics at it's best.
['@', 'mick', '##iny', '##man', '@', 'the', '##at', '##lan', '##tic', 'that', 'or', 'they', 'might', 'be', 'killed', 'in', 'an', 'airplane', 'accident', 'in', 'the', 'night', 'a', 'car', 'wreck', '!', 'politics', 'at', 'it', "'", 's', 'best', '.']


In [58]:
# Longest encoded tweet, counting the special tokens added by the tokenizer.
# This is the length every sequence is padded to later on.
encoded_lengths = (
    len(tokenizer.encode(text, add_special_tokens=True)) for text in texts
)
max_len = max(encoded_lengths, default=0)
print("max length to set to is", max_len)

max length to set to is 84


In [63]:
# Encode every tweet into fixed-length token ids plus an attention mask.
input_ids = []
attention_masks = []

for text in texts:
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        # Use the length measured above instead of a hard-coded copy of it,
        # so the two cells cannot drift apart if the data changes.
        max_length=max_len,
        truncation=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Each encode_plus call returned one row; concatenate the rows along dim 0.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Build the label tensor from the frame rather than from `targets` itself, so
# re-running this cell does not wrap an existing tensor in torch.tensor().
targets = torch.tensor(df.target.values)

In [68]:
# Bundle token ids, attention masks and labels so they are indexed together.
dataset = TensorDataset(input_ids, attention_masks, targets)

# 90 % of the examples go to training, the remainder to validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# random_split draws its permutation from torch's random number generator, so
# seeding Python's `random` module alone does not make the split repeatable.
# Seed torch as well (the original `random` seed is kept for compatibility).
random.seed(10)
torch.manual_seed(10)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [71]:

batch_size = 32

# Training batches are drawn in random order.
train_data_loader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

# Validation batches are read in a fixed, sequential order.
validation_data_loader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size,
)

In [73]:
# Pretrained uncased BERT with a two-class classification head. Attention
# weights and hidden states are not returned from the forward pass.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

Downloading: 100%|██████████| 433/433 [00:00<00:00, 170kB/s]
Downloading: 100%|██████████| 440M/440M [05:50<00:00, 1.26MB/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForS

In [77]:
# Number of full passes over the training set.
epochs = 4

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per batch, so the total number of steps is
# batches-per-epoch times the number of epochs.
total_steps = epochs * len(train_data_loader)

# Linear schedule with no warm-up steps over the whole run.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [75]:
def flat_accuracy(preds, labels):
    """Return the fraction of predictions whose arg-max matches the label.

    Args:
        preds: 2-D array-like of per-class scores; the arg-max is taken
            along axis 1.
        labels: NumPy array of class ids; it is flattened before comparison.

    Returns:
        The number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

def format_time(elapsed):
    """Format a duration in seconds as the string form of a timedelta.

    The value is rounded to the nearest whole second first, so the result
    looks like ``h:mm:ss`` (with a day prefix for durations of a day or more).
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))

In [84]:
# Seed every random number generator used here so a run is repeatable.
random.seed(32)
np.random.seed(32)
torch.manual_seed(32)

# One dict of statistics per completed epoch.
training_stats = []

# Wall-clock start of the whole training run.
total_t0 = time.time()

for epoch_i in range(0, epochs):
    print("")
    print("================ Epoch {:} / {:} ================".format(epoch_i + 1, epochs))
    print('Training ongoing...')

    # Wall-clock start of this epoch.
    t0 = time.time()

    # Reset the accumulated loss for the current epoch.
    total_train_loss = 0

    # Switch to training mode. This has to be a call: a bare `model.train`
    # only looks the method up and leaves the mode unchanged.
    model.train()

    for step, batch in enumerate(train_data_loader):

        # Progress report every 15 batches (skipping the very first one).
        if step % 15 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(' Batch {: >5,} of {:>,}.  Elapsed: {:}.'.format(step, len(train_data_loader), elapsed))

        # Unpack the batch in the order the TensorDataset was built:
        # token ids, attention mask, labels.
        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_labels = batch[2]

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Forward pass. Because labels are supplied, the first element of the
        # output is the loss. Indexing works both for plain tuples and for the
        # dict-like output objects that newer transformers releases return
        # (tuple-unpacking the latter would yield key names, not tensors).
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]

        # Accumulate the loss as a Python float.
        total_train_loss += loss.item()

        # Backward pass.
        loss.backward()

        # Clip the total gradient norm to at most 1.0. `clip_grad_norm_` is the
        # in-place replacement for the deprecated `clip_grad_norm`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Per-epoch summary. This sits inside the epoch loop so that it is
    # reported and recorded for every epoch, not only after the last one.
    avg_train_loss = total_train_loss / len(train_data_loader)
    training_time = format_time(time.time() - t0)

    print("")
    print(" Avg training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epoch took: {:}".format(training_time))

    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Training Time': training_time,
    })

print("")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))




Training ongoing...


In [78]:
# Number of batches the training loader yields per epoch.
print(len(train_data_loader))

215


In [83]:
# Sanity check: iterate the loader once and count the batches it produces,
# to compare against len(train_data_loader).
l0 = sum(1 for _ in train_data_loader)
print(l0)

215
