In [None]:
import time, copy, torch, string, re, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from collections import Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer

from torch.utils.data import Dataset, DataLoader
from torchvision import models
from torch import nn, optim, cuda
from torch.optim import lr_scheduler
from torch.autograd import Variable

In [None]:
# --- Data split sizes (all counts are per publication) ---
# Note: each publication has between 5214 and 11488 articles.
num_train, num_valid, num_test = 80, 20, 20
# When False, the first n articles of each publication are always returned.
data_shuffle = True

# --- Model hyper-parameters ---
rnn_type = 'LSTM'
vocab_size = 10000
embed_size = 500
hidden_size = 1000
hidden_layer = 1
dropout = 0

In [None]:
class AllTheNews(Dataset):
    """Per-publication slice of the article corpus, served as fixed-length token tensors.

    Args:
        df_map: mapping of publication name -> DataFrame. Each frame must have
            a 'content' column, and each row must unpack as (label, article).
        labels_map: mapping of publication name -> integer class index.
        words_map: mapping of word -> integer token id. Words missing from the
            map are encoded as 0, which is also the padding value.
        start, end: positional row range [start, end) taken from every frame;
            ``end`` may exceed the frame length.

    Attributes:
        df: the concatenated slices of every publication.
        max_len: length (in whitespace-separated words) of the longest article
            in ``df``; every sample is zero-padded to this length.
    """

    def __init__(self, df_map, labels_map, words_map, start, end):
        self.df_map = df_map
        self.labels_map = labels_map
        self.words_map = words_map

        # Gather the slices and concatenate once. DataFrame.append copied the
        # whole frame on every call and no longer exists in pandas >= 2.0.
        frames = [info.iloc[start:end] for info in df_map.values()]
        self.df = pd.concat(frames) if frames else pd.DataFrame()

        if 'content' in self.df.columns:
            self.max_len = max(
                (len(article.split()) for article in self.df['content']),
                default=0,
            )
        else:
            # No publications at all: an empty dataset with nothing to pad.
            self.max_len = 0

    def __len__(self):
        """Return the number of articles in this split."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` for the article at positional index ``idx``.

        ``tokens`` is a LongTensor of length ``max_len``; ``label`` is a
        one-element float tensor holding the class index.
        """
        label, article = self.df.iloc[idx]
        tokens = self.tokenize(article)
        # Use the map given to this instance, not a module-level global.
        target = torch.Tensor([self.labels_map[label]])
        return tokens, target

    def tokenize(self, content):
        """Encode ``content`` as a zero-padded LongTensor of length ``max_len``.

        Unknown words map to 0; text longer than ``max_len`` is truncated.
        """
        ids = [self.words_map.get(word, 0) for word in content.split()]
        ids = ids[:self.max_len]
        article = torch.zeros(self.max_len, dtype=torch.long)
        if ids:
            # Single bulk write instead of one tensor assignment per word.
            article[:len(ids)] = torch.tensor(ids, dtype=torch.long)
        return article

In [None]:
def get_maps(random_state=None):
    """Load the article CSVs and build the lookup tables used by the datasets.

    Reads ``Data/articles1.csv`` .. ``Data/articles3.csv`` (columns
    'publication' and 'content'), replaces every non-letter character in the
    content with a space and lower-cases it.

    The vocabulary is read from ``words_map.txt`` ("word,index" per line) when
    that file exists; otherwise the ``vocab_size - 1`` most frequent words are
    numbered from 1 upwards, leaving id 0 unused by the map.

    Reads the module-level settings ``vocab_size`` and ``data_shuffle``.

    Args:
        random_state: optional seed forwarded to ``sklearn.utils.shuffle`` so
            the per-publication ordering can be reproduced. ``None`` keeps the
            shuffle unseeded.

    Returns:
        (df_map, labels_map, words_map) where ``df_map`` maps publication ->
        DataFrame of its rows, ``labels_map`` maps publication -> class index
        (alphabetical order) and ``words_map`` maps word -> token id.
    """
    columns = ['publication', 'content']
    paths = ["Data/articles1.csv", "Data/articles2.csv", "Data/articles3.csv"]

    # One concat instead of chained DataFrame.append (removed in pandas 2.0).
    # Rows missing either field can be neither labelled nor tokenised.
    df = (
        pd.concat([pd.read_csv(path, usecols=columns) for path in paths],
                  ignore_index=True)
        .dropna(subset=columns)
        .assign(content=lambda frame: frame['content']
                .astype(str)
                .str.replace("[^a-zA-Z]", " ", regex=True)
                .str.lower())
    )

    if os.path.exists('words_map.txt'):
        words_map = {}
        # Context manager so the handle is closed even if a line is malformed.
        with open('words_map.txt') as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                word, index = line.split(',')
                words_map[word] = int(index)
    else:
        vectorizer = CountVectorizer(stop_words=None)
        words_matrix = vectorizer.fit_transform(df['content'])

        # get_feature_names() was removed in scikit-learn 1.2; fall back to it
        # only on versions that predate get_feature_names_out().
        get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                     or vectorizer.get_feature_names)
        words = [str(word) for word in get_names()]
        # ravel() keeps a 1-d array even for a single-word vocabulary.
        count = np.asarray(words_matrix.sum(axis=0)).ravel()

        words_count = dict(zip(words, count))
        words_chosen = sorted(words_count, key=words_count.get, reverse=True)[:vocab_size - 1]

        words_map = {word: i + 1 for i, word in enumerate(words_chosen)}

    # shuffle whole data set
    if data_shuffle:
        df = shuffle(df, random_state=random_state)

    # classify into each category by publication
    labels_map = {name: idx for idx, name in enumerate(sorted(df['publication'].unique()))}
    df_map = {key: df[df['publication'] == key] for key in labels_map}
    return df_map, labels_map, words_map

In [None]:
# Load the corpus once: per-publication frames, publication -> class index,
# and word -> token id.
df_map, labels_map, words_map = get_maps()

In [None]:
# Consecutive per-publication row ranges: train, then validation, then test.
atn_train = AllTheNews(df_map, labels_map, words_map, start=0, end=num_train)
atn_valid = AllTheNews(df_map, labels_map, words_map, start=num_train, end=num_train + num_valid)
atn_test = AllTheNews(
    df_map, labels_map, words_map,
    start=num_train + num_valid,
    end=num_train + num_valid + num_test,
)

In [None]:
# Same batch size for every phase; example counts come from the datasets.
batch_sizes = dict.fromkeys(['train', 'valid', 'test'], 16)
num_examples = dict(train=len(atn_train), valid=len(atn_valid), test=len(atn_test))

In [None]:
# One shuffled, 4-worker, pinned-memory loader per split.
train_data = DataLoader(atn_train, batch_size=batch_sizes['train'],
                        shuffle=True, num_workers=4, pin_memory=True)

valid_data = DataLoader(atn_valid, batch_size=batch_sizes['valid'],
                        shuffle=True, num_workers=4, pin_memory=True)

test_data = DataLoader(atn_test, batch_size=batch_sizes['test'],
                       shuffle=True, num_workers=4, pin_memory=True)

In [None]:
# Phase name -> DataLoader, looked up by the training loop.
datasets = dict(train=train_data, valid=valid_data, test=test_data)

In [None]:
# Check availability before selecting a device: selecting CUDA device 0 on a
# machine without a usable GPU fails, which would abort this cell before
# use_gpu is ever defined.
use_gpu = cuda.is_available()
if use_gpu:
    cuda.set_device(0)
print("GPU Availability = {c}".format(c=use_gpu))

In [None]:
class CustomRNN(nn.Module):
    """Embedding -> recurrent layer -> linear classifier over the last time step.

    Args:
        rnn_type: name of a recurrent module in ``torch.nn`` (e.g. 'LSTM', 'GRU').
        vocab_size: number of rows in the embedding table; must be larger than
            the biggest token id fed to ``forward``.
        embed_size: embedding dimension.
        hidden_size: hidden units per recurrent layer.
        hidden_layer: number of stacked recurrent layers.
        output_size: number of classes produced by the final linear layer.
        dropout: dropout passed to the recurrent module.
    """

    def __init__(self, rnn_type, vocab_size, embed_size, hidden_size, hidden_layer, output_size, dropout=0.0):
        super(CustomRNN, self).__init__()

        self.rnn_type = rnn_type
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.hidden_layer = hidden_layer
        self.output_size = output_size
        self.dropout = dropout

        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnn = getattr(nn, rnn_type)(embed_size, hidden_size, hidden_layer, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        """Classify a batch of token-id sequences.

        Args:
            x: LongTensor of token ids, indexed as (batch, sequence).
            hidden: initial recurrent state as returned by ``initHidden``;
                when omitted a zero state matching the batch is created.

        Returns:
            (output, hidden): ``output`` holds one row of class scores per
            sample, ``hidden`` is the final recurrent state.
        """
        if hidden is None:
            hidden = self.initHidden(x.size(0))

        # The recurrent layer expects (sequence, batch, features). The axes
        # must be swapped with transpose: view() only reinterprets memory and
        # would interleave tokens belonging to different samples.
        x_embed = self.encoder(x).transpose(0, 1).contiguous()

        output, hidden = self.rnn(x_embed, hidden)
        # Only the last time step feeds the classifier.
        output = self.fc1(output[-1])

        return output, hidden

    def initHidden(self, batch_size):
        """Return a zero initial state for ``batch_size`` sequences.

        The state is created on the same device and with the same dtype as the
        model parameters, so it works on CPU as well as on GPU. LSTM gets a
        ``(h_0, c_0)`` tuple; every other recurrent type gets a single tensor.
        """
        reference = next(self.parameters())

        def zeros():
            return reference.new_zeros(self.hidden_layer, batch_size, self.hidden_size)

        if self.rnn_type == 'LSTM':
            return (zeros(), zeros())
        return zeros()

In [None]:
# The embedding table needs one more row than the largest token id: id 0 is
# what tokenize leaves for padding / unknown words, and the vocabulary built in
# get_maps numbers words from 1. Sizing it with len(words_map) puts the largest
# id out of range.
rnn = CustomRNN(
    rnn_type,
    max(words_map.values(), default=0) + 1,
    embed_size,
    hidden_size,
    hidden_layer,
    len(labels_map),
    dropout=dropout,
)
rnn = rnn.cuda() if use_gpu else rnn

In [None]:
def train_model(model, criterion, optimizer, num_epochs=25):
    """Train ``model`` and keep the weights with the lowest validation loss.

    Every epoch runs the 'train', 'valid' and 'test' phases over the loaders in
    the module-level ``datasets`` dict. Parameters are updated only in the
    'train' phase. Whenever the validation loss improves, the weights are
    remembered and also written to a ``model_<...>`` file in the working
    directory.

    Reads the module-level names ``datasets``, ``use_gpu``, ``dropout``,
    ``rnn_type``, ``hidden_size`` and ``hidden_layer``.

    Args:
        model: network exposing ``initHidden(batch_size)`` and a forward pass
            ``model(inputs, hidden) -> (outputs, hidden)``.
        criterion: loss taking ``(outputs, labels)``.
        optimizer: optimizer over the model parameters.
        num_epochs: number of passes over the data.

    Returns:
        (model, loss_trace, acc_trace): the model loaded with the best
        validation weights, plus per-phase lists of epoch loss and accuracy
        (keys 'train', 'valid', 'test').
    """
    since = time.time()
    phases = ['train', 'valid', 'test']

    best_model_wts = copy.deepcopy(model.state_dict())
    # Infinity rather than an arbitrary ceiling, so the first epoch always counts.
    best_loss = float('inf')

    # One trace per phase that is actually run (the loop below includes 'test').
    loss_trace = {phase: [] for phase in phases}
    acc_trace = {phase: [] for phase in phases}

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        for phase in phases:
            is_train = phase == 'train'
            # Dropout must only be active while training.
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0
            num_seen = 0

            for inputs, labels in datasets[phase]:
                labels = labels.long().view(-1)

                if use_gpu:
                    inputs, labels = inputs.cuda(), labels.cuda()

                # Size everything by the real batch: the last batch of a
                # loader is smaller when the dataset is not a multiple of the
                # configured batch size.
                batch_n = inputs.size(0)
                hidden = model.initHidden(batch_n)

                # zero the parameter gradients
                optimizer.zero_grad()

                # No autograd bookkeeping outside the training phase.
                with torch.set_grad_enabled(is_train):
                    outputs, hidden = model(inputs, hidden)
                    loss = criterion(outputs, labels)

                    if is_train:
                        # backward + optimize only if in training phase
                        loss.backward()
                        optimizer.step()

                preds = outputs.detach().argmax(dim=1)

                # statistics, accumulated as plain Python numbers
                running_loss += loss.item() * batch_n
                running_corrects += (preds == labels).sum().item()
                num_seen += batch_n

            denom = max(num_seen, 1)  # an empty loader must not divide by zero
            epoch_loss = running_loss / denom
            epoch_acc = running_corrects / denom

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            loss_trace[phase].append(epoch_loss)
            acc_trace[phase].append(epoch_acc)

            # deep copy the model
            if phase == 'valid' and epoch_loss < best_loss:
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())

                # The optimizer tag comes from the optimizer itself
                # (e.g. 'Ada' for Adam, 'SGD' for SGD).
                file_path = 'model_{}_{}_{}_{}_{}'.format(
                    int(dropout*10), rnn_type, hidden_size, hidden_layer,
                    type(optimizer).__name__[:3])
                # torch.save overwrites an existing checkpoint.
                torch.save(model.state_dict(), file_path)
                print("Model saved as {}".format(file_path))

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Loss: {:4f}'.format(best_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, loss_trace, acc_trace

In [None]:
# Multi-class objective and Adam over every parameter of the network.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=rnn.parameters(), lr=1e-4)

In [None]:
# train_model returns (model, loss_trace, acc_trace): despite its name,
# `valid_trace` receives the per-phase accuracy traces.
rnn, loss_trace, valid_trace = train_model(rnn, criterion, optimizer, num_epochs=25)