In [1]:
import time, copy, torch, string, re, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer

from torch.utils.data import Dataset, DataLoader
from torchvision import models
from torch import nn, optim, cuda
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.autograd import Variable

In [2]:
# ---- Data split (all counts are per publication) ---------------------------
# The corpus holds between 5214 (smallest) and 11488 (largest) articles per
# publication, so train + valid + test must stay below the smaller figure.
data_shuffle = False  # False: always take the first n articles of each publication
num_train = 800
num_valid = 100
num_test = 100

# max_sen_len = 500

# ---- Model hyper-parameters ------------------------------------------------
vocab_size = 10000   # number of rows in the embedding table
embed_size = 100     # embedding dimension
rnn_type = 'LSTM'    # name of the torch.nn recurrent module to use
hidden_size = 50     # hidden units per direction
hidden_layer = 1     # number of stacked recurrent layers
fc_size = 512
dropout = 0

In [3]:
class AllTheNews(Dataset):
    """Dataset of (token-id sequence, label) pairs drawn from per-publication frames.

    For every publication in ``df_map`` the rows in the positional window
    ``[start, end)`` are kept; the windows of all publications are stacked
    into a single frame stored in ``self.df``.

    Args:
        df_map: dict mapping a publication name to a DataFrame whose rows are
            ``(publication, content)`` in that column order.
        labels_map: dict mapping a publication name to its integer class id.
        words_map: dict mapping a word to its integer vocabulary index.
        start: first row (inclusive) taken from each publication.
        end: last row (exclusive) taken from each publication; values past
            the end of a frame are clamped.
    """

    # Index appended to every token sequence as an end-of-article marker.
    # NOTE(review): get_maps assigns indices 1..vocab_size-1 to words, so with
    # vocab_size = 10000 this marker shares index 9999 with the last word.
    EOS_INDEX = 9999

    def __init__(self, df_map, labels_map, words_map, start, end):

        self.df_map = df_map
        self.labels_map = labels_map
        self.words_map = words_map

        # Slice each publication once and concatenate in a single call:
        # DataFrame.append is gone in pandas 2 and re-copied the frame on
        # every iteration. iloc clamps an out-of-range `end` by itself.
        slices = [frame.iloc[start:end] for frame in df_map.values()]
        self.df = pd.concat(slices) if slices else pd.DataFrame()

    def __len__(self):
        """Return the total number of articles across all publications."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(LongTensor of token ids, FloatTensor([class id]))`` for row ``idx``."""
        # Positional unpacking: the first column is the publication name,
        # the second is the article text.
        label, article = self.df.iloc[idx]
        tokens = self.tokenize(article)
        # Use the mapping handed to this instance, not a notebook global.
        target = torch.Tensor([self.labels_map[label]])
        return (tokens, target)

    def tokenize(self, content):
        """Map whitespace-separated words to vocabulary ids, dropping unknown words.

        The end-of-article marker ``EOS_INDEX`` is always appended, so the
        returned LongTensor is never empty.
        """
        ids = [self.words_map[word] for word in content.split() if word in self.words_map]
        ids.append(self.EOS_INDEX)
        return torch.LongTensor(ids)

In [4]:
def get_maps(stop_words=None, n_gram=1):
    """Load the article corpus and build the lookup tables used by the datasets.

    Reads ``Data/articles1.csv`` .. ``Data/articles3.csv`` (columns
    ``publication`` and ``content``), replaces every non-letter character in
    the text with a space and lower-cases it.

    The vocabulary is read from ``words_map.txt`` (``word,index`` per line)
    when that file exists; otherwise it is built from the corpus, ranking
    words by total count and numbering them from 1.
    NOTE(review): nothing in this function writes ``words_map.txt`` -- the
    cache has to be produced elsewhere.

    Args:
        stop_words: forwarded to ``CountVectorizer(stop_words=...)``.
        n_gram: upper bound of the n-gram range used to build the vocabulary.

    Returns:
        ``(df_map, labels_map, words_map)`` where ``df_map`` maps each
        publication to its rows, ``labels_map`` maps each publication to an
        integer id (alphabetical order) and ``words_map`` maps a word to its
        vocabulary index.
    """
    columns = ['publication', 'content']
    # One concat instead of chained DataFrame.append (removed in pandas 2).
    # The per-file indices are kept, exactly as append did.
    df = pd.concat([
        pd.read_csv("Data/articles1.csv", usecols=columns),
        pd.read_csv("Data/articles2.csv", usecols=columns),
        pd.read_csv("Data/articles3.csv", usecols=columns),
    ])

    non_letters = re.compile("[^a-zA-Z]")

    def remove_all_nonchr(s):
        return non_letters.sub(" ", s).lower()

    df['content'] = df['content'].apply(remove_all_nonchr)

    if os.path.exists('words_map.txt'):
        words_map = {}
        # Context manager so the handle is closed even if a line is malformed.
        with open('words_map.txt') as cache:
            for line in cache:
                line = line.strip()
                if not line:
                    continue  # tolerate blank / trailing lines
                word, index = line.rsplit(',', 1)
                words_map[word] = int(index)
    else:
        vectorizer = CountVectorizer(
            analyzer='word',
            tokenizer=None,
            preprocessor=None,
            stop_words=stop_words,
            # Index 0 is left unused, hence one fewer word than vocab_size.
            max_features=vocab_size - 1,
            ngram_range=(1, n_gram),
        )
        words_matrix = vectorizer.fit_transform(df['content'])

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older installations.
        if hasattr(vectorizer, 'get_feature_names_out'):
            words = vectorizer.get_feature_names_out()
        else:
            words = vectorizer.get_feature_names()
        # ravel() keeps a 1-d array even for a single-word vocabulary,
        # where squeeze() would collapse to 0-d and break zip().
        counts = np.asarray(words_matrix.sum(axis=0)).ravel()

        words_count = {w: c for w, c in zip(words, counts)}
        words_chosen = sorted(words_count, key=words_count.get, reverse=True)[:vocab_size - 1]

        words_map = {word: i + 1 for i, word in enumerate(words_chosen)}

    # shuffle whole data set
    if data_shuffle:
        df = shuffle(df)

    # One integer id per publication, assigned in alphabetical order.
    publications = sorted(df['publication'].unique())
    labels_map = {name: idx for idx, name in enumerate(publications)}

    # Split the corpus into one frame per publication.
    df_map = {name: df[df['publication'] == name] for name in labels_map}
    return df_map, labels_map, words_map

In [5]:
# Load the corpus once: per-publication frames, label ids and the vocabulary.
(df_map, labels_map, words_map) = get_maps(stop_words=None, n_gram=1)

In [6]:
# Consecutive, non-overlapping [start, end) row windows; each one is applied
# to every publication, so the three splits never share an article.
valid_start = num_train
test_start = valid_start + num_valid
test_end = test_start + num_test

atn_train = AllTheNews(df_map, labels_map, words_map, start=0, end=valid_start)
atn_valid = AllTheNews(df_map, labels_map, words_map, start=valid_start, end=test_start)
atn_test = AllTheNews(df_map, labels_map, words_map, start=test_start, end=test_end)

In [7]:
# Articles differ in length and are not padded, so every split uses batches of one.
batch_sizes = dict.fromkeys(('train', 'valid', 'test'), 1)
# Number of articles in each split.
num_examples = dict(
    train=len(atn_train),
    valid=len(atn_valid),
    test=len(atn_test),
)

In [8]:
# The three loaders share every setting except the dataset and its batch size.
loader_options = dict(shuffle=True, num_workers=4)

train_data = DataLoader(atn_train, batch_size=batch_sizes['train'], **loader_options)
valid_data = DataLoader(atn_valid, batch_size=batch_sizes['valid'], **loader_options)
test_data = DataLoader(atn_test, batch_size=batch_sizes['test'], **loader_options)

In [9]:
# Phase name -> DataLoader; train_model looks loaders up by phase.
datasets = dict(train=train_data, valid=valid_data, test=test_data)

In [10]:
# Probe for CUDA before selecting a device: set_device raises on a CPU-only
# host, and device 1 does not exist on a single-GPU host.
use_gpu = cuda.is_available()
# use_gpu = False
if use_gpu:
    # Keep the preference for GPU 1, falling back to GPU 0 when it is the only one.
    cuda.set_device(1 if cuda.device_count() > 1 else 0)
print("GPU Availability = {c}".format(c=use_gpu))

GPU Availability = True


In [11]:
class CustomRNN(nn.Module):
    """Bidirectional recurrent classifier: embedding -> RNN -> linear -> log-softmax.

    Args:
        rnn_type: name of the ``torch.nn`` recurrent module, e.g. ``'LSTM'`` or ``'GRU'``.
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension.
        hidden_size: hidden units per direction.
        hidden_layer: number of stacked recurrent layers.
        output_size: number of classes.
        dropout: dropout passed to the recurrent module.
    """

    def __init__(self, rnn_type, vocab_size, embed_size, hidden_size, hidden_layer, output_size, dropout=0.0):
        super(CustomRNN, self).__init__()

        self.rnn_type = rnn_type
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.hidden_layer = hidden_layer
        self.output_size = output_size
        self.dropout = dropout

        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnn = getattr(nn, rnn_type)(embed_size, hidden_size, hidden_layer, bidirectional=True, dropout=dropout)

        # Forward and backward outputs are concatenated, hence 2 * hidden_size.
        self.fc1 = nn.Linear(2*hidden_size, output_size)

    def forward(self, x, hidden):
        """Classify a batch of token-id sequences.

        Args:
            x: LongTensor of shape (batch, seq_len).
            hidden: initial state as returned by ``initHidden``.

        Returns:
            ``(log_probs, h)`` with ``log_probs`` of shape (batch, output_size)
            and ``h`` the final recurrent state.
        """
        # The recurrent module is time-major, so swap the axes:
        # (batch, seq, embed) -> (seq, batch, embed). A plain .view() only
        # reinterprets memory and mixes samples up as soon as batch > 1.
        x_embed = self.encoder(x).transpose(0, 1).contiguous()
        out, h = self.rnn(x_embed, hidden)
        # NOTE(review): out[-1] is the last time step; the backward direction
        # has only seen the final token at that position.
        y = self.fc1(out[-1])
        # Normalise over the class axis explicitly (implicit dim is deprecated).
        y = F.log_softmax(y, dim=1)

        return y, h

    def initHidden(self, batch_size):
        """Return an all-zero initial state for ``batch_size`` sequences.

        The state is created on the same device and with the same dtype as
        the model parameters. An ``(h0, c0)`` tuple is returned for LSTM and
        a single tensor for any other recurrent type.
        """
        reference = next(self.parameters())
        # Two directions per layer.
        shape = (self.hidden_layer * 2, batch_size, self.hidden_size)
        if self.rnn_type == 'LSTM':
            return (reference.new_zeros(shape), reference.new_zeros(shape))
        return reference.new_zeros(shape)
    

In [12]:
# One output unit per publication.
rnn = CustomRNN(
    rnn_type=rnn_type,
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    hidden_layer=hidden_layer,
    output_size=len(labels_map),
    dropout=dropout,
)
# Move the parameters to the GPU when one is available.
rnn = rnn.cuda() if use_gpu else rnn

In [13]:
def train_model(model, criterion, optimizer, num_epochs=25):
    """Train ``model`` and evaluate it on the valid and test splits every epoch.

    Reads the notebook globals ``datasets`` (phase -> DataLoader) and
    ``use_gpu``; the checkpoint file name is built from the globals
    ``dropout``, ``rnn_type``, ``hidden_size`` and ``hidden_layer``.

    Whenever validation accuracy improves, the weights are remembered and
    also written to disk. After the last epoch the best weights are loaded
    back into ``model``.

    Args:
        model: network exposing ``initHidden(batch_size)`` and
            ``forward(inputs, hidden) -> (outputs, hidden)``.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepping the parameters of ``model``.
        num_epochs: number of passes over the training split.

    Returns:
        ``(model, loss_trace, acc_trace)``; both traces map each phase name
        to a list with one float per epoch.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.

    phases = ['train', 'valid', 'test']
    loss_trace = {x: [] for x in phases}
    acc_trace = {x: [] for x in phases}

    for epoch in tqdm_notebook(range(num_epochs), desc='Total'):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        for phase in phases:
            is_train = phase == 'train'
            # Dropout and similar layers must only be active while training.
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0
            seen = 0

            for data in tqdm_notebook(datasets[phase], desc='{}th epoch:'.format(epoch+1), leave=False):
                inputs, labels = data
                # Labels arrive as float (batch, 1); the loss wants long (batch,).
                labels = labels.long().view(-1)

                if use_gpu:
                    inputs, labels = inputs.cuda(), labels.cuda()

                batch = inputs.size(0)
                hidden = model.initHidden(batch)

                # Zero the gradients of the model being trained (not a global).
                model.zero_grad()

                # Build the autograd graph only when it will be used.
                with torch.set_grad_enabled(is_train):
                    outputs, hidden = model(inputs, hidden)
                    loss = criterion(outputs, labels)
                _, preds = torch.max(outputs.data, 1)

                if is_train:
                    # backward + optimize only if in training phase
                    loss.backward()
                    optimizer.step()

                # Accumulate plain Python numbers so nothing stays on the GPU
                # or keeps a graph alive between batches.
                running_loss += loss.item() * batch
                running_corrects += int((preds == labels.data).sum())
                seen += batch

            # Average over the samples actually seen; guard an empty loader.
            denom = max(seen, 1)
            epoch_loss = running_loss / denom
            epoch_acc = float(running_corrects) / denom

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            loss_trace[phase].append(epoch_loss)
            acc_trace[phase].append(epoch_acc)

            # Keep (and checkpoint) the weights with the best validation accuracy.
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

                file_path = 'model_{}_{}_{}_{}'.format(
                    int(dropout*10), rnn_type, hidden_size, hidden_layer)
                if os.path.exists(file_path):
                    os.unlink(file_path)
                torch.save(model.state_dict(), file_path)
                print("Model saved as {}".format(file_path))

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val accuracy: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, loss_trace, acc_trace

In [None]:
# The model emits log-probabilities, so negative log-likelihood is the matching loss.
criterion = nn.NLLLoss()
# Adam with a small L2 penalty on the weights.
optimizer = optim.Adam(params=rnn.parameters(), lr=0.001, weight_decay=0.00001)

In [None]:
# NOTE: the third value returned by train_model is the per-phase accuracy
# trace (train / valid / test), despite being bound to `valid_trace` here.
rnn, loss_trace, valid_trace = train_model(
    model=rnn,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=25,
)

Epoch 1/25
----------


train Loss: 2.2645 Acc: 0.2594


valid Loss: 2.0945 Acc: 0.3113
Model saved as model_0_LSTM_50_1


test Loss: 2.0624 Acc: 0.3153

Epoch 2/25
----------


train Loss: 1.7970 Acc: 0.4214


valid Loss: 1.8934 Acc: 0.3853
Model saved as model_0_LSTM_50_1


test Loss: 1.8896 Acc: 0.4020

Epoch 3/25
----------


train Loss: 1.3899 Acc: 0.5626


valid Loss: 1.9011 Acc: 0.4260
Model saved as model_0_LSTM_50_1


test Loss: 1.9078 Acc: 0.4233

Epoch 4/25
----------


train Loss: 0.9885 Acc: 0.6906


valid Loss: 2.0471 Acc: 0.4207


test Loss: 2.0283 Acc: 0.4340

Epoch 5/25
----------


train Loss: 0.6799 Acc: 0.7921


valid Loss: 2.3745 Acc: 0.4247


test Loss: 2.3716 Acc: 0.4300

Epoch 6/25
----------


train Loss: 0.4344 Acc: 0.8710


valid Loss: 2.6491 Acc: 0.4333
Model saved as model_0_LSTM_50_1


test Loss: 2.5599 Acc: 0.4480

Epoch 7/25
----------


train Loss: 0.2950 Acc: 0.9132


valid Loss: 3.0188 Acc: 0.4040


test Loss: 2.7916 Acc: 0.4387

Epoch 8/25
----------


train Loss: 0.2115 Acc: 0.9367


valid Loss: 3.2964 Acc: 0.4167


test Loss: 3.1415 Acc: 0.4340

Epoch 9/25
----------


train Loss: 0.1678 Acc: 0.9516


valid Loss: 3.5810 Acc: 0.4080


test Loss: 3.4138 Acc: 0.4307

Epoch 10/25
----------


train Loss: 0.1699 Acc: 0.9507


valid Loss: 3.6482 Acc: 0.4213


test Loss: 3.5591 Acc: 0.4380

Epoch 11/25
----------


train Loss: 0.1484 Acc: 0.9564


valid Loss: 3.7695 Acc: 0.4180


test Loss: 3.6008 Acc: 0.4367

Epoch 12/25
----------


train Loss: 0.1422 Acc: 0.9562


valid Loss: 3.9982 Acc: 0.4187


test Loss: 3.7148 Acc: 0.4353

Epoch 13/25
----------
