In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import time
import random
import spacy
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import warnings

from torchtext.legacy import data
from torchtext.legacy import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
warnings.filterwarnings("ignore")

In [4]:
seed = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'

TEXT = data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')
LABEL = data.LabelField(sequential=False, use_vocab = False)

df = pd.read_csv("drive/MyDrive/ire_major/wapo.csv")
df.rename(columns={"article":"text"}, inplace = True)

df["text"].replace('', np.nan, inplace = True)
df.dropna(inplace = True)

items = df.author.value_counts().to_dict().items()
df = df[df.author.isin([key for key, val in items if val > 99])]

texts = df.text.tolist()
labels = df.author.tolist()
label2id = {i: idx for (idx, i) in enumerate(sorted(set(labels)))}
id2label = {label2id[i]: i for i in label2id}
labels = [label2id[i] for i in labels]

In [5]:
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=.15, random_state = 42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.15, random_state = 42)

train_df_dict = {'text':train_texts, 'label':train_labels}
test_df_dict = {'text':test_texts, 'label':test_labels}
val_df_dict = {'text':val_texts, 'label':val_labels}

train_df = pd.DataFrame.from_dict(train_df_dict)
test_df = pd.DataFrame.from_dict(test_df_dict)
val_df = pd.DataFrame.from_dict(val_df_dict)

train_df.to_csv('wapo_train.csv', index =  False, header = False)
test_df.to_csv('wapo_test.csv', index = False, header = False)
val_df.to_csv('wapo_val.csv', index = False, header = False)

train_data, val_data, test_data = data.TabularDataset.splits(
    path='./', train='wapo_train.csv', validation = 'wapo_val.csv', test = 'wapo_test.csv', format='csv',
    fields=[('text', TEXT), ('label', LABEL)])

TEXT.build_vocab(train_data, vectors = "glove.6B.50d", unk_init = torch.Tensor.normal_)

train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data), batch_sizes=(16, 16, 16),
    sort_key=lambda x: len(x.text), device=device)

.vector_cache/glove.6B.zip: 862MB [02:44, 5.26MB/s]                           
100%|█████████▉| 399999/400000 [00:13<00:00, 29900.04it/s]


In [5]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):        
        text = text.permute(1, 0)        
        embedded = self.embedding(text)        
        embedded = embedded.unsqueeze(1)        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim = 1))
            
        return self.fc(cat)

In [6]:
INPUT_DIM = 2000
EMBEDDING_DIM = 50
N_FILTERS = 100
FILTER_SIZES = [2,3,4]
OUTPUT_DIM = 25
DROPOUT = 0.5
PAD_IDX = 5

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [7]:
print(model)

CNN(
  (embedding): Embedding(2000, 50)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(2, 50), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 50), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 50), stride=(1, 1))
  )
  (fc): Linear(in_features=300, out_features=25, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [30]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.AdamW(model.parameters())
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

In [31]:
def compute_metrics(preds, labels):
    preds = preds.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return acc, f1, precision, recall

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0

    model.train()
    
    for batch in iterator:        
        optimizer.zero_grad()

        predictions = model(batch.text)
        loss = criterion(predictions, batch.label)
        acc, f1, precision, recall = compute_metrics(predictions.cpu(), batch.label.cpu())
        loss.backward()
        
        optimizer.step()        

        epoch_loss += loss
        epoch_acc += acc
        epoch_f1 += f1

    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1 / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0

    model.eval()  
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text)
            loss = criterion(predictions, batch.label)
            acc, f1, precision, recall = compute_metrics(predictions.cpu(), batch.label.cpu())            
            epoch_loss += loss
            epoch_acc += acc
            epoch_f1 += f1
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1 / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [32]:
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc, train_f1 = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc, valid_f1 = evaluate(model, val_iter, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    torch.save({'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()}, f'models/wapo_model_{epoch}')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Train F1: {train_f1}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | Val. F1: {valid_f1}' )


Epoch: 01 | Epoch Time: 0m 5s
	Train Loss: 3.130 | Train Acc: 13.71% | Train F1: 0.12509765947265952
	 Val. Loss: 2.336 |  Val. Acc: 47.34% | Val. F1: 0.4405349594547965
Epoch: 02 | Epoch Time: 0m 5s
	Train Loss: 2.109 | Train Acc: 41.42% | Train F1: 0.4012355590480593
	 Val. Loss: 1.489 |  Val. Acc: 61.41% | Val. F1: 0.6030363906926406
Epoch: 03 | Epoch Time: 0m 5s
	Train Loss: 1.577 | Train Acc: 55.25% | Train F1: 0.5447934541684541
	 Val. Loss: 1.210 |  Val. Acc: 68.75% | Val. F1: 0.6842843874875125
Epoch: 04 | Epoch Time: 0m 5s
	Train Loss: 1.309 | Train Acc: 62.71% | Train F1: 0.6250457703582698
	 Val. Loss: 1.077 |  Val. Acc: 70.31% | Val. F1: 0.6994453463203463
Epoch: 05 | Epoch Time: 0m 5s
	Train Loss: 1.133 | Train Acc: 67.01% | Train F1: 0.6658104189354187
	 Val. Loss: 1.002 |  Val. Acc: 71.56% | Val. F1: 0.7199678966866466
Epoch: 06 | Epoch Time: 0m 5s
	Train Loss: 0.979 | Train Acc: 71.72% | Train F1: 0.7141750094875093
	 Val. Loss: 0.948 |  Val. Acc: 74.84% | Val. F1: 0.75

In [33]:
test_loss, test_acc, test_f1 = evaluate(model, test_iter, criterion)

In [34]:
test_f1

0.7737744082158975

In [35]:
!mv models/enron_model_9 drive/MyDrive