In [None]:
# Mount Google Drive inside the Colab VM at /content/gdrive.
# This is interactive: Colab prompts for an authorization code.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# Interactive upload from the local machine into the Colab working directory;
# the training file data.csv is expected to be provided here.
from google.colab import files
files.upload()

Saving data.csv to data.csv




In [None]:
import torch
from torchtext import data, datasets
import pandas as pd
import random

In [None]:
# Single seed shared by every seeded step in this notebook.
SEED = 1234

# Seed PyTorch's RNG (weight initialisation, dropout masks).
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms only.
torch.backends.cudnn.deterministic = True

In [None]:
# Load the uploaded CSV with pandas; this frame is only used for inspection.
df = pd.read_csv('data.csv')

In [None]:
# Preview the frame (text and label columns).
df

Unnamed: 0,text,label
0,While there is no shortage of incompetence kno...,0
1,Govt discussions in,0
2,Good news there will be no shortage of money t...,0
3,A reminder is for medical personnel Believe it...,0
4,Because shortage,0
...,...,...
10383,Hey WHO Real reason for shortage of protective...,1
10384,CoronaVirusSeattle No one should be surprised ...,1
10385,Food shortages may be coming The next shortage...,0
10386,WarnerMedia CEO Warns Of Handset Shortage But ...,0


In [None]:
# TEXT: spaCy-tokenised text field. include_lengths=True makes batch.text a
# (token tensor, lengths) pair, which train()/evaluate() unpack.
TEXT = data.Field(include_lengths=True, tokenize='spacy')
# LABEL: label field whose vocabulary is built from the training split.
# NOTE(review): class indices come from that vocabulary, so index 1 is
# presumably not guaranteed to mean CSV label "1" — confirm via LABEL.vocab.stoi.
LABEL = data.LabelField()

In [None]:
# Parse data.csv into torchtext examples with two columns: text, label.
# The file starts with a (BOM-prefixed) "text,label" header row. It has to be
# skipped, otherwise the header is parsed as a regular example and the string
# 'label' can end up in the label vocabulary as a bogus class.
devset = data.TabularDataset('data.csv', format="CSV",
                             fields=(('text', TEXT), ('label', LABEL)),
                             skip_header=True)

In [None]:
# Show the dataset object (repr only, no contents).
devset

<torchtext.data.dataset.TabularDataset at 0x7f0730932400>

In [None]:
# Number of parsed examples in the full dataset.
print(len(devset))

10389


In [None]:
# Sanity check: dump the fields of the first parsed example to verify that
# the CSV columns were mapped to text/label as intended.
print(vars(devset.examples[0]))

{'text': ['\ufefftext'], 'label': 'label'}


In [None]:
# Reproducible splits using torchtext's default split ratio.
# random.seed() returns None, so the seeding is done as its own statement and
# the resulting RNG state is handed to split() explicitly.
random.seed(SEED)
train_data, test_data = devset.split(random_state = random.getstate())

# Re-seed so the second split starts from the same RNG state as the first.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

In [None]:
# Upper bound on the number of tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 25_000

# Build the token vocabulary from the training split only (no leakage from
# validation/test) and attach pretrained 200-d Twitter GloVe vectors.
# Tokens without a pretrained vector are initialised from a normal distribution.
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.twitter.27B.200d",
                 unk_init = torch.Tensor.normal_)

# Label vocabulary, also built from the training split.
LABEL.build_vocab(train_data)

.vector_cache/glove.twitter.27B.zip: 1.52GB [11:44, 2.16MB/s]                            
100%|█████████▉| 1193132/1193514 [01:30<00:00, 13324.20it/s]

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Show which device was selected.
device

device(type='cuda')

In [None]:
BATCH_SIZE = 128

# Batch examples of similar length together to minimise padding.
# The sort key must be the token count of the text: the label is a constant
# length string, so sorting on it neither buckets nor orders anything.
# sort_within_batch=True orders each batch by that key, which is the layout
# packed-sequence RNN input expects.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    sort_key = lambda x: len(x.text),
    device = device)

In [None]:
import torch.nn as nn

class LSTM(nn.Module):
    """(Bi)LSTM sentence classifier: embedding -> dropout -> LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits per sequence.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability (embeddings, between LSTM layers, final state).
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()

        # Remembered so forward() knows how many final states to concatenate.
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(vocab_size, embedding_dim,
                                      padding_idx = pad_idx)
        self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = hidden_dim,
                            num_layers = n_layers, dropout = dropout,
                            bidirectional = bidirectional)
        # The classifier sees one final hidden state per direction.
        n_directions = 2 if bidirectional else 1
        self.linear = nn.Linear(n_directions * hidden_dim, output_dim)
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, text, text_lengths = None):
        """Compute one logit vector per sequence.

        Args:
            text: LongTensor of token indices, laid out (seq_len, batch) as
                required by the non-batch-first LSTM.
            text_lengths: optional per-sequence true lengths (tensor or list).
                When given, the input is packed so that padding positions do
                not influence the final hidden state. When omitted, the padded
                tensor is fed as-is (previous behaviour).

        Returns:
            Tensor of shape (batch, output_dim) holding raw logits.
        """
        embedded = self.dropout(self.embedding(text))

        if text_lengths is None:
            rnn_input = embedded
        else:
            # pack_padded_sequence wants the lengths on the CPU.
            lengths = torch.as_tensor(text_lengths, dtype = torch.int64).cpu()
            # Packing rejects zero-length sequences; treat an empty example as
            # a single padding step instead of crashing the whole batch.
            lengths = lengths.clamp(min = 1)
            rnn_input = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, enforce_sorted = False)

        _, (hidden, _) = self.lstm(rnn_input)
        hidden = self.dropout(hidden)

        if self.bidirectional:
            # Last layer's forward (-2) and backward (-1) final states.
            features = torch.cat([hidden[-2, :, :], hidden[-1, :, :]], dim = 1)
        else:
            features = hidden[-1, :, :]

        return self.linear(features)

In [None]:
# Hyper-parameters of the classifier.
INPUT_DIM = len(TEXT.vocab)      # one embedding row per vocabulary entry
EMBEDDING_DIM = 200              # width of the token embeddings
HIDDEN_DIM = 256
OUTPUT_DIM = 1                   # single logit per sequence
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Keyword arguments make the mapping from constant to constructor
# parameter explicit.
model = LSTM(
    vocab_size = INPUT_DIM,
    embedding_dim = EMBEDDING_DIM,
    hidden_dim = HIDDEN_DIM,
    output_dim = OUTPUT_DIM,
    n_layers = N_LAYERS,
    bidirectional = BIDIRECTIONAL,
    dropout = DROPOUT,
    pad_idx = PAD_IDX,
)

In [None]:
# Pretrained vectors attached to TEXT's vocabulary by build_vocab.
pretrained_embeddings = TEXT.vocab.vectors

# Shape check before copying into the embedding layer.
print(pretrained_embeddings.shape)

torch.Size([12442, 200])


In [None]:
# Overwrite the embedding weights in place with the pretrained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ..., -1.8542,  0.4022,  0.4238],
        [ 0.2078,  1.1879, -0.7320,  ...,  1.3663, -0.4598,  0.6668],
        [ 0.4935,  0.3570,  0.6607,  ...,  0.1771, -0.5369, -0.2970],
        ...,
        [ 0.3973, -0.3708, -0.0955,  ...,  0.8050, -0.6207, -0.3787],
        [-0.0106,  0.3679,  0.7847,  ...,  0.0900, -0.1726,  0.1489],
        [ 0.6009,  0.4205, -0.7113,  ...,  0.5953, -0.3073, -0.2792]])

In [None]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the embedding rows of the two special tokens (<unk>, <pad>) to zeros.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Visual check that the special-token rows are now zero.
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4935,  0.3570,  0.6607,  ...,  0.1771, -0.5369, -0.2970],
        ...,
        [ 0.3973, -0.3708, -0.0955,  ...,  0.8050, -0.6207, -0.3787],
        [-0.0106,  0.3679,  0.7847,  ...,  0.0900, -0.1726,  0.1489],
        [ 0.6009,  0.4205, -0.7113,  ...,  0.5953, -0.3073, -0.2792]])


In [None]:
import torch.optim as optim

# Adam over all model parameters, using the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [None]:
# Binary cross-entropy computed on raw logits; the model has no final sigmoid.
criterion = nn.BCEWithLogitsLoss()

# Move model and loss to the selected device, in float32.
model = model.to(device).float()
criterion = criterion.to(device).float()

In [None]:
def binary_accuracy(preds, y):
    """Fraction of correct binary predictions in a batch.

    ``preds`` are raw logits; they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with the targets ``y``.
    The result is a scalar tensor, e.g. 0.8 for 8 hits out of 10 (not 8).
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()  # float so the division below is real-valued
    return hits.sum() / len(hits)

In [None]:
import pdb
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    For every batch the gradients are cleared, the model is applied to the
    (tokens, lengths) pair, loss and accuracy are computed against the labels
    cast to float, and a single optimizer step is taken.

    Returns:
        Tuple ``(mean loss, mean accuracy)``, each averaged over the number of
        batches in ``iterator``.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        tokens, lengths = batch.text
        targets = batch.label.float()

        optimizer.zero_grad()

        # Model emits (batch, 1); drop the trailing axis to match the targets.
        logits = model(tokens, lengths).squeeze(1)
        loss = criterion(logits, targets)
        acc = binary_accuracy(logits, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating any weights.

    The model is switched to eval mode and gradient tracking is disabled for
    the whole pass.

    Returns:
        Tuple ``(mean loss, mean accuracy)``, each averaged over the number of
        batches in ``iterator``.
    """
    model.eval()

    total_loss = 0
    total_acc = 0

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            targets = batch.label.float()

            # Model emits (batch, 1); drop the trailing axis to match the targets.
            logits = model(tokens, lengths).squeeze(1)

            total_loss += criterion(logits, targets).item()
            total_acc += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
    """Break the interval between two timestamps (seconds) into minutes and seconds.

    Returns:
        Tuple ``(whole minutes, remaining whole seconds)``; fractions of a
        second are truncated.
    """
    total = end_time - start_time
    minutes = int(total / 60)
    seconds = int(total - 60 * minutes)
    return minutes, seconds

In [None]:
N_EPOCHS = 10
path = 'model.pt'  # checkpoint file for the best weights seen so far
best_valid_loss = float('inf')  # so the first epoch always counts as an improvement

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One pass over the training data, then a gradient-free pass over validation.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves, so `path` always
    # holds the weights of the best epoch rather than the latest one.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), path)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 1s
	Train Loss: 0.393 | Train Acc: 86.97%
	 Val. Loss: 0.284 |  Val. Acc: 89.66%
Epoch: 02 | Epoch Time: 0m 1s
	Train Loss: 0.262 | Train Acc: 89.65%
	 Val. Loss: 0.266 |  Val. Acc: 90.09%
Epoch: 03 | Epoch Time: 0m 1s
	Train Loss: 0.230 | Train Acc: 91.08%
	 Val. Loss: 0.257 |  Val. Acc: 90.05%
Epoch: 04 | Epoch Time: 0m 1s
	Train Loss: 0.203 | Train Acc: 92.37%
	 Val. Loss: 0.270 |  Val. Acc: 90.18%
Epoch: 05 | Epoch Time: 0m 1s
	Train Loss: 0.180 | Train Acc: 93.03%
	 Val. Loss: 0.255 |  Val. Acc: 90.48%
Epoch: 06 | Epoch Time: 0m 1s
	Train Loss: 0.163 | Train Acc: 93.67%
	 Val. Loss: 0.250 |  Val. Acc: 90.61%
Epoch: 07 | Epoch Time: 0m 1s
	Train Loss: 0.149 | Train Acc: 94.43%
	 Val. Loss: 0.266 |  Val. Acc: 90.74%
Epoch: 08 | Epoch Time: 0m 1s
	Train Loss: 0.144 | Train Acc: 94.68%
	 Val. Loss: 0.269 |  Val. Acc: 91.17%
Epoch: 09 | Epoch Time: 0m 1s
	Train Loss: 0.125 | Train Acc: 95.46%
	 Val. Loss: 0.302 |  Val. Acc: 90.35%
Epoch: 10 | Epoch Time: 0m 1

In [None]:
# Sample sentence for ad-hoc experiments.
# NOTE(review): `s` is not referenced by any other visible cell — candidate for removal.
s = 'While there is no shortage of incompetence knows very well what he is doing when it'

In [None]:
# Restore the checkpoint with the lowest validation loss (written to `path`
# by the training loop). Without this the test metrics would describe the
# weights of the final epoch, which are not necessarily the best ones.
model.load_state_dict(torch.load(path, map_location = device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.293 | Test Acc: 90.15%


# Score new data with the trained model

In [None]:
import spacy
# spaCy English pipeline; predict_sentiment only uses its tokenizer.
nlp = spacy.load('en')
def predict_sentiment(model, sentence):
    """Return the model's sigmoid score for a single sentence.

    The sentence is tokenised with spaCy, mapped to vocabulary indices and fed
    to ``model`` as a batch of one, together with its length.

    Args:
        model: trained classifier, called as ``model(tokens, lengths)``.
        sentence: raw text to score.

    Returns:
        The sigmoid of the model output as a Python float, or ``0`` when no
        score can be produced (missing / non-string / empty input, or an
        error during inference). Unexpected errors are reported through a
        ``RuntimeWarning`` instead of being dropped silently.
    """
    import warnings

    # Missing values (None, NaN from a DataFrame column) and empty strings
    # have nothing to tokenise; keep the historical fallback of 0, but as an
    # explicit decision rather than a swallowed exception.
    if not isinstance(sentence, str) or not sentence:
        return 0

    try:
        model.eval()
        tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
        if not tokenized:
            return 0
        indexed = [TEXT.vocab.stoi[t] for t in tokenized]
        # Shape (seq_len, 1): a batch containing just this sentence.
        tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
        length_tensor = torch.LongTensor([len(indexed)])
        # Inference only: no need to build an autograd graph.
        with torch.no_grad():
            prediction = torch.sigmoid(model(tensor, length_tensor))
        return prediction.item()
    except Exception as exc:
        # Best-effort scoring: bulk callers rely on getting 0 back rather than
        # a crash, but the failure must stay visible.
        warnings.warn(
            f'predict_sentiment failed ({type(exc).__name__}: {exc}); returning 0',
            RuntimeWarning,
            stacklevel = 2,
        )
        return 0

In [None]:
# Smoke test: score one hand-written example sentence.
predict_sentiment(model, 'Health care braces for shortages of supplies due to coronavirus Of particular concern is personal protective equipment such as masks respirators gloves and surgical gowns')

0.9952150583267212

In [None]:
# Interactive upload of the file to be scored
# (presumably All_months_PPE.jsonl, which the next cell reads — confirm).
from google.colab import files
files.upload()

In [None]:
# Read the JSON Lines file (one JSON record per line) that will be scored.
df1 = pd.read_json('All_months_PPE.jsonl', orient='records', lines=True)

In [None]:
# Preview the frame to be scored.
df1

In [None]:
# Score every text_processed_1 entry with the trained model.
# Note: 'label' receives predict_sentiment's return value, i.e. a sigmoid
# score (or 0 when scoring fails), not a thresholded 0/1 class.
df1['label'] = df1['text_processed_1'].apply(lambda x: predict_sentiment(model, x))

In [None]:
# Write the scored records back out as JSON Lines.
df1.to_json('results.jsonl', orient='records', lines=True)