In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# data and checkpoint paths used by later cells can be resolved.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the en_core_web_lg spaCy model with spaCy's own download command;
# a later cell imports and loads it by that package name.
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.1 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=65f1b096ac5139aca60407798d7a1353d77938e86dbf0e74dc3372ab43ae92dd
  Stored in directory: /tmp/pip-ephem-wheel-cache-_n1iyvgl/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
from torchtext.legacy.data import Field, LabelField
from torchtext.legacy.data import TabularDataset
from torchtext.legacy.data import Iterator, BucketIterator

In [None]:
import torch
import torchtext
# from torchtext.data import Field, LabelField # For torchtext<=0.8.x, import these from `torchtext.data` instead of `torchtext.legacy.data`
# from torchtext.data import TabularDataset
# from torchtext.data import Iterator, BucketIterator
import spacy
import en_core_web_lg
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from tqdm import tqdm, trange
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix

# Seed every random number generator the notebook can touch so that a
# Restart & Run All starts from the same state each time.
import random

manual_seed = 77
random.seed(manual_seed)       # Python's global RNG (presumably used by torchtext's shufflers - confirm)
np.random.seed(manual_seed)    # NumPy's global RNG
torch.manual_seed(manual_seed)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

n_gpu = torch.cuda.device_count()
if n_gpu > 0:
    # Seed all visible GPUs rather than only the current one.
    torch.cuda.manual_seed_all(manual_seed)

cpu


In [None]:
# Load the spaCy English model once; only its tokenizer is used below.
spacy_en = en_core_web_lg.load()

def tokenize_en(text):
    """
    Split an English string into tokens with the spaCy tokenizer.

    Returns the surface text of every token, in order, as a list of strings.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# stopwords = spacy_en.Defaults.stop_words  

In [None]:
# Field definitions: tweets are tokenised with `tokenize_en` and lower-cased,
# batches come out as [batch, seq]; labels are single categorical values with
# no <unk> entry in their vocabulary.
TEXT = Field(sequential=True, tokenize=tokenize_en, lower=True, batch_first = True)
LABEL = Field(sequential=False, unk_token = None)

# Training / validation files: first column ignored, then label, then tweet.
train, val = TabularDataset.splits(
               path="./drive/My Drive/CNN/", # the root directory where the data lies
               train='train.tsv', validation="dev.tsv", # file names
               format='tsv',
               skip_header=False, # if your tsv file has a header, make sure to pass this to ensure it doesn't get processed as data!
               fields=[(None, None),('label', LABEL), ('tweet', TEXT)])

# Test file: header row skipped; first column ignored, then tweet, then label.
# `splits` returns a tuple even when only one split is requested, so unpack the
# single element; otherwise `test` would be a 1-tuple instead of a dataset.
(test,) = TabularDataset.splits(
               path="/content/drive/MyDrive/CNN/", # the root directory where the data lies
               test="tweets_test_masks.tsv", # file names
               format='tsv',
               skip_header=True, # if your tsv file has a header, make sure to pass this to ensure it doesn't get processed as data!
               fields=[(None, None), ('tweet', TEXT), ('label', LABEL)])

In [None]:
from zipfile import ZipFile

# specifying the zip file name
file_name = "./drive/My Drive/CNN/emb.zip"

# opening the zip file in READ mode
# (the handle is named `archive` so the builtin `zip` is not shadowed for the
# rest of the notebook)
with ZipFile(file_name, 'r') as archive:
    # printing all the contents of the zip file
    archive.printdir()

    # extracting all the files into the current working directory
    # NOTE(review): the vectors are later loaded from a Drive path, not from
    # the working directory - confirm this extraction is still needed.
    print('Extracting all the files now...')
    archive.extractall()
    print('Done!')

File Name                                             Modified             Size
glove.twitter.27B.100d.txt                     2015-12-22 16:04:54   1021671926
glove.twitter.27B.200d.txt                     2015-12-22 16:04:54   2057595650
glove.twitter.27B.25d.txt                      2015-12-22 16:04:54    257699930
glove.twitter.27B.50d.txt                      2015-12-22 16:04:54    510889212
Extracting all the files now...
Done!


In [None]:
# Load the glove.twitter.27B.200d.txt vectors from Drive and build the TEXT and
# LABEL vocabularies from the training split.
import torchtext.vocab as vocab
loaded_vectors = torchtext.vocab.Vectors('/content/drive/MyDrive/CNN/glove.twitter.27B.200d.txt')
# The vocabulary size is capped at the number of entries in the pretrained vectors.
TEXT.build_vocab(train, vectors=loaded_vectors, max_size=len(loaded_vectors.stoi), unk_init = torch.Tensor.normal_)
# NOTE(review): build_vocab already received `vectors=`; this call re-assigns the
# vocabulary's vectors from the same source and presumably overrides whatever
# `unk_init` produced for tokens missing from the pretrained file - confirm intended.
TEXT.vocab.set_vectors(stoi=loaded_vectors.stoi, vectors=loaded_vectors.vectors, dim=loaded_vectors.dim)
LABEL.build_vocab(train)
print("Vocabulary size of TEXT:",len(TEXT.vocab.stoi))
print("Vocabulary size of LABEL:",len(LABEL.vocab.stoi))

Vocabulary size of TEXT: 37888
Vocabulary size of LABEL: 3


In [None]:
from torchtext.legacy import data
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build length-bucketed iterators for the three splits.
# `sort` is intentionally left at its per-split default: forcing sort=True also
# sorts the *training* split, so every epoch would see the same batches in the
# same length-ascending order with no shuffling. With the default, the training
# iterator shuffles while validation/test stay sorted.
# `sort_within_batch=True` still orders the examples inside each batch by length.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, val, test),
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.tweet),
    sort_within_batch=True,
    device = device)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """
    Convolutional sentence classifier: embedding -> parallel convolutions of
    several widths -> max-over-time pooling -> dropout -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        n_filters: number of output channels per convolution width.
        filter_sizes: iterable of convolution widths (in tokens).
        output_dim: number of classes.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the vocabulary.

    ``forward`` returns unnormalised class scores (logits) of shape
    [batch size, output_dim]. ``nn.CrossEntropyLoss`` applies log-softmax
    itself, so feeding it softmax probabilities would squash the gradients
    and bound the achievable loss well above zero. Apply
    ``F.softmax(logits, dim=1)`` at the call site when probabilities are needed;
    the arg-max class is the same either way.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout, pad_idx):

        super().__init__()

        filter_sizes = list(filter_sizes)

        # Remembered so that forward() can right-pad inputs that are shorter
        # than the widest convolution kernel.
        self.pad_idx = pad_idx
        self.min_len = max(filter_sizes, default=1)

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        # One convolution per width; each kernel spans the full embedding
        # dimension, so it slides along the token axis only.
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1,
                                              out_channels = n_filters,
                                              kernel_size = (fs, embedding_dim))
                                    for fs in filter_sizes
                                    ])

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """
        Args:
            text: integer tensor of token ids, [batch size, sent len].

        Returns:
            Float tensor of logits, [batch size, output_dim].
        """

        #text = [batch size, sent len]

        # A sequence shorter than the widest kernel would make the convolution
        # raise; pad it on the right with the padding index instead.
        missing = self.min_len - text.shape[1]
        if missing > 0:
            pad_value = 0 if self.pad_idx is None else self.pad_idx
            padding = text.new_full((text.shape[0], missing), pad_value)
            text = torch.cat([text, padding], dim = 1)

        embedded = self.embedding(text)

        #embedded = [batch size, sent len, emb dim]

        embedded = embedded.unsqueeze(1)

        #embedded = [batch size, 1, sent len, emb dim]

        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]

        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]

        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]

        #pooled_n = [batch size, n_filters]

        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]

        # Raw logits: the loss function is responsible for the softmax.
        return self.fc(cat)

In [None]:
# Model hyper-parameters. Sizes that are dictated by the data are read from the
# vocabularies instead of being hard-coded, so they cannot drift out of sync
# with the loaded vectors or the label set.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = TEXT.vocab.vectors.shape[1]  # width of the pretrained vectors
N_FILTERS = 200
FILTER_SIZES = [3,4,5,6,7]
OUTPUT_DIM = len(LABEL.vocab)                # one output per label class
DROPOUT = 0.3
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [None]:
# Show the module tree as the cell output to check the layer shapes.
model

CNN(
  (embedding): Embedding(37888, 200, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 200, kernel_size=(3, 200), stride=(1, 1))
    (1): Conv2d(1, 200, kernel_size=(4, 200), stride=(1, 1))
    (2): Conv2d(1, 200, kernel_size=(5, 200), stride=(1, 1))
    (3): Conv2d(1, 200, kernel_size=(6, 200), stride=(1, 1))
    (4): Conv2d(1, 200, kernel_size=(7, 200), stride=(1, 1))
  )
  (fc): Linear(in_features=1000, out_features=3, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [None]:
# Initialise the embedding layer from the vocabulary's pretrained vectors, then
# zero out the rows of the unknown and padding tokens.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
pretrained_embeddings = TEXT.vocab.vectors

embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)

for special_idx in (UNK_IDX, PAD_IDX):
    embedding_weights[special_idx] = torch.zeros(EMBEDDING_DIM)

In [None]:
import torch.optim as optim

# Move the model to the target device before constructing the optimizer, as the
# torch.optim documentation recommends, so the optimizer is built over the
# parameters that will actually be trained.
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)

# Multi-class objective. nn.CrossEntropyLoss applies log-softmax internally and
# therefore expects unnormalised logits from the model.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of correct binary predictions in a batch (e.g. 0.8 for 8/10).

    `preds` are passed through a sigmoid and rounded to 0/1 before being
    compared element-wise with `y`.
    """
    hard_preds = preds.sigmoid().round()
    matches = hard_preds.eq(y).float()  # float so the division below is fractional
    return matches.sum() / len(matches)

In [None]:
def train(model, iterator, optimizer, criterion):
    """
    Run one optimisation pass over `iterator` and return the mean batch loss.

    Each batch must expose `.tweet` (inputs) and `.label` (targets); both are
    moved to the module-level `device` before the forward pass.

    NOTE(review): this function shares its name with the `train` dataset
    variable created when the TSV files are loaded, so defining it rebinds
    that name - re-running earlier cells afterwards will pick up the function.
    """
    model.train()
    running_loss = 0

    for batch in iterator:
        tweets = batch.tweet.to(device)
        targets = batch.label.to(device)

        # Standard step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(tweets), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.cpu().item()

    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """
    Score `model` on every batch of `iterator` without updating it.

    Each batch must expose `.tweet` (inputs) and `.label` (targets); both are
    moved to the module-level `device` before the forward pass.

    Returns:
        (mean batch loss, accuracy, macro-averaged F1) over the whole iterator.
    """
    model.eval()

    epoch_loss = 0
    all_pred = []
    all_label = []

    # No gradients are computed here, so there is nothing to zero and no
    # optimizer is involved in evaluation.
    with torch.no_grad():

        for batch in iterator:

            batch_input, labels = batch.tweet, batch.label
            batch_input = batch_input.to(device)
            labels = labels.to(device)

            outputs = model(batch_input)

            loss = criterion(outputs, labels)

            epoch_loss += loss.item()

            # identify the predicted class for each example in the batch
            _, predicted = torch.max(outputs, 1)
            # collect plain Python ints so the metric functions receive
            # ordinary label lists rather than lists of 0-d tensors
            all_pred.extend(predicted.tolist())
            all_label.extend(labels.tolist())

    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro')
    return epoch_loss / len(iterator), accuracy, f1score

In [None]:
# Train for a fixed number of epochs, validating and checkpointing after each.
MAX_EPOCH = 15
for epoch in trange(MAX_EPOCH, desc="Epoch"):
    train_loss = train(model, train_iterator, optimizer, criterion)
    val_loss, val_acc, val_f1 = evaluate(model, valid_iterator, criterion)

    # Checkpoint written at the end of every epoch. The stored 'epoch' value is
    # the zero-based loop index, while the file name uses the one-based number.
    state_dict_model = model.state_dict()
    state = dict(
        epoch=epoch,
        state_dict=state_dict_model,
        optimizer=optimizer.state_dict(),
    )
    torch.save(state, f"./drive/My Drive/CNN/CNN_DEEP_TEXT_{epoch + 1}.pt")

    print(f'\n Epoch [{epoch + 1}/{MAX_EPOCH}], Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}, Validation F1: {val_f1:.4f}')

Epoch:   7%|▋         | 1/15 [01:57<27:28, 117.74s/it]


 Epoch [1/15], Train Loss: 0.7313, Validation Loss: 0.9898, Validation Accuracy: 0.5440, Validation F1: 0.4877


Epoch:  13%|█▎        | 2/15 [03:49<24:47, 114.44s/it]


 Epoch [2/15], Train Loss: 0.6794, Validation Loss: 0.9974, Validation Accuracy: 0.5299, Validation F1: 0.5144


Epoch:  20%|██        | 3/15 [05:41<22:37, 113.15s/it]


 Epoch [3/15], Train Loss: 0.6426, Validation Loss: 0.9976, Validation Accuracy: 0.5331, Validation F1: 0.5167


Epoch:  27%|██▋       | 4/15 [07:33<20:38, 112.56s/it]


 Epoch [4/15], Train Loss: 0.6189, Validation Loss: 0.9873, Validation Accuracy: 0.5489, Validation F1: 0.5341


Epoch:  33%|███▎      | 5/15 [09:24<18:42, 112.23s/it]


 Epoch [5/15], Train Loss: 0.6000, Validation Loss: 0.9984, Validation Accuracy: 0.5352, Validation F1: 0.5107


Epoch:  40%|████      | 6/15 [11:16<16:48, 112.09s/it]


 Epoch [6/15], Train Loss: 0.5884, Validation Loss: 0.9979, Validation Accuracy: 0.5363, Validation F1: 0.4993


Epoch:  47%|████▋     | 7/15 [13:08<14:55, 111.95s/it]


 Epoch [7/15], Train Loss: 0.5828, Validation Loss: 0.9931, Validation Accuracy: 0.5436, Validation F1: 0.5292


Epoch:  53%|█████▎    | 8/15 [14:57<12:57, 111.12s/it]


 Epoch [8/15], Train Loss: 0.5784, Validation Loss: 0.9971, Validation Accuracy: 0.5386, Validation F1: 0.5248


Epoch:  60%|██████    | 9/15 [16:47<11:03, 110.60s/it]


 Epoch [9/15], Train Loss: 0.5763, Validation Loss: 1.0106, Validation Accuracy: 0.5267, Validation F1: 0.5178


Epoch:  67%|██████▋   | 10/15 [18:36<09:10, 110.17s/it]


 Epoch [10/15], Train Loss: 0.5746, Validation Loss: 0.9856, Validation Accuracy: 0.5520, Validation F1: 0.5302


Epoch:  73%|███████▎  | 11/15 [20:29<07:24, 111.18s/it]


 Epoch [11/15], Train Loss: 0.5734, Validation Loss: 0.9867, Validation Accuracy: 0.5505, Validation F1: 0.5354


Epoch:  80%|████████  | 12/15 [22:19<05:32, 110.79s/it]


 Epoch [12/15], Train Loss: 0.5719, Validation Loss: 0.9880, Validation Accuracy: 0.5514, Validation F1: 0.5339


Epoch:  87%|████████▋ | 13/15 [24:09<03:40, 110.39s/it]


 Epoch [13/15], Train Loss: 0.5714, Validation Loss: 0.9863, Validation Accuracy: 0.5560, Validation F1: 0.5269


Epoch:  93%|█████████▎| 14/15 [25:58<01:50, 110.21s/it]


 Epoch [14/15], Train Loss: 0.5709, Validation Loss: 0.9962, Validation Accuracy: 0.5436, Validation F1: 0.5302


Epoch: 100%|██████████| 15/15 [27:48<00:00, 111.23s/it]


 Epoch [15/15], Train Loss: 0.5703, Validation Loss: 0.9817, Validation Accuracy: 0.5601, Validation F1: 0.5346



