In [8]:
import sys
sys.path.append('..')

import torch
import numpy as np
import hashlib
import torch.nn as nn
import nltk
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups

from hashembed.embedding import HashEmbedding



In [10]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
num_classes = len(categories)

# constants
use_hash_embeddings = False
embedding_size = 20
num_buckets = 5000
max_words = 10**6
max_epochs = 10
num_hash_functions = 2
hidden = 50 
seed = 3
batchSize = 32 # default in keras
size_phrase = 150
isMasking = True

np.random.seed(seed)
torch.manual_seed(seed)

weights1 = [np.random.normal(scale = 0.05, size = (max_words,embedding_size))]
weights2 = [np.random.normal(scale = 0.05, size = (embedding_size,hidden)).T, np.zeros((hidden))]
weights3 = [np.random.normal(scale = 0.05, size = (hidden,num_classes)).T, np.zeros((num_classes))]

<torch._C.Generator at 0x10fb780f0>

In [15]:
def word_encoder(w, max_idx):
    # v = hash(w) #
    v = int(hashlib.sha1(w.encode('utf-8')).hexdigest(), 16)
    return (v % (max_idx-1)) + 1

def ReduceSum(x,m):
    return torch.sum(x,dim=1)

def param_from_np(array, requires_grad=True, astype=torch.FloatTensor):
    return torch.nn.Parameter(torch.from_numpy(array).type(astype),requires_grad=requires_grad)

In [16]:
class ModelSimple(nn.Module):
    def __init__(self,max_words,embedding_size,num_classes,hidden=hidden,seed=3,isMasking=isMasking,isHash=False,**kwargs):
        np.random.seed(seed)
        torch.manual_seed(seed)
        super().__init__()
        self.padding_idx = 0 if isMasking else None
        self.isHash = isHash
        if self.isHash:
            self.embedding = HashEmbedding(max_words,embedding_size,mask_zero=isMasking,
                                           num_buckets=num_buckets,seed=seed,**kwargs)
        else:
            self.embedding = nn.Embedding(max_words,embedding_size,padding_idx=self.padding_idx)
        self.reduce = ReduceSum
        
        self.output_dim = self.embedding.output_dim if isHash else embedding_size
        self.fc1 = nn.Linear(self.output_dim, hidden)
        self.a1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden, num_classes)
        
        self.reset_parameters()
        
    def reset_parameters(self):
        # Unfortunately has to set weight to 0 even when padding_idx =0
        if not self.isHash:
            if self.padding_idx is not None:
                weights1[0][0,:] = 0
            self.embedding.weight = param_from_np(weights1[0])
        if self.output_dim == 20:
            self.fc1.weight, self.fc1.bias = [param_from_np(w) for w in weights2]
            self.fc2.weight, self.fc2.bias = [param_from_np(w) for w in weights3]
        
    def forward(self,x):
        m = x.clone() 
        x = self.embedding(x)
        x = self.reduce(x,m) # should be using reduce_sum but basically a phrase is sum of rest (to accept mask)
        x = x.view(x.size(0),-1)
        x = self.fc1(x)
        x = self.a1(x)
        x = self.fc2(x)
        return x

In [17]:
def make_data(max_words, size_phrase, seed, percTrain = 0.5):
    np.random.seed(seed)
    torch.manual_seed(seed)

    # fetches the data. It contains more than 22k text with the correct labels
    twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
    # tokenizes only the first 150 tokens. THe rest is dicarded
    # no preprocessing!! should lower case and remove punctuation
    data = [nltk.word_tokenize(text)[:size_phrase] for text in twenty_train.data]
    # data = [nltk.word_tokenize(text) for text in twenty_train.data]
    # hashes all tokens
    data_encoded = [[word_encoder(w, max_words) for w in text] for text in data]
    max_len = max([len(d) for d in data])

    # pad data
    # => if sentence is not 150 char long then padds at the end with zeros
    data_encoded = [d+[0]*(max_len-len(d)) for d in data_encoded]
    
    # idx start for testing. Does a 50% split.
    idx_test = int(len(data_encoded)*percTrain)
    # puts all in a matrix with each row beeing a phrase => nCol = max_len
    data_encoded = np.vstack(data_encoded)
    # array (col vector) of targets in shape of data_encoded
    targets = np.asarray(twenty_train.target, 'int32').reshape((-1,1))
    
    #return data_encoded, targets, idx_test
    np.random.seed(seed)
    train = TensorDataset(torch.from_numpy(data_encoded[0:idx_test,:]),
                          torch.from_numpy(targets[0:idx_test,:]).type(torch.LongTensor))
    np.random.seed(seed)
    trainIter=DataLoader(dataset=train,
                         batch_size=batchSize,
                         shuffle=False)
    xTest = Variable(torch.from_numpy(data_encoded[idx_test:,:]),)
    yTest = targets[idx_test:,:]
    return trainIter, xTest, yTest

In [19]:
trainIter, xTest, yTest = make_data(max_words, size_phrase, seed) # checked output of test is same :)

In [20]:
%%time 

np.random.seed(seed)
torch.manual_seed(seed)

model = ModelSimple(max_words,embedding_size,num_classes,isHash=True,append_weight=False)
criterion=nn.CrossEntropyLoss() # Cross Entropy also computes softmax  !!!
optimizer=torch.optim.Adam(model.parameters())

np.random.seed(seed)
print('Num parameters in model: {}'.format(sum([np.prod(p.shape) for p in model.parameters()])))
print("Train on 1128 samples, validate on 1129 samples".format(trainIter.dataset.data_tensor.shape[0],xTest.shape[0]))
for epoch in range(1,1+max_epochs):
    for x,y in trainIter:
        x = Variable(x)
        y = Variable(y).squeeze(1)

        optimizer.zero_grad() # Reset gradients
        outputs = model(x)
        loss = criterion(outputs,y)
        loss.backward()
        optimizer.step()
        
  
    if True:#epoch % 5 == 1 or epoch == 1 or epoch == (max_epochs ):
        outputs = model(xTest)
        _,predicted=torch.max(outputs.data,1)
        accuracy = accuracy_score(predicted,yTest)
        print("Epoch: {}. Loss: {}. Acc: {}.".format(epoch,loss.data[0],accuracy))
        
print([p.shape for p in model.parameters()])

Num parameters in model: 2101254
Train on 1128 samples, validate on 1129 samples
Epoch: 1. Loss: 1.3405081033706665. Acc: 0.42781222320637735.
Epoch: 2. Loss: 0.9356852769851685. Acc: 0.6891054030115146.
Epoch: 3. Loss: 0.32737886905670166. Acc: 0.8246235606731621.
Epoch: 4. Loss: 0.07385215908288956. Acc: 0.9220549158547388.
Epoch: 5. Loss: 0.016816945746541023. Acc: 0.9282550930026572.
Epoch: 6. Loss: 0.0060875192284584045. Acc: 0.9388839681133747.
Epoch: 7. Loss: 0.0034391158260405064. Acc: 0.9397697077059345.
Epoch: 8. Loss: 0.0020986858289688826. Acc: 0.9433126660761736.
Epoch: 9. Loss: 0.0013713063672184944. Acc: 0.9441984056687334.
Epoch: 10. Loss: 0.0009800512343645096. Acc: 0.9459698848538529.
[torch.Size([1000000, 2]), torch.Size([5000, 20]), torch.Size([50, 20]), torch.Size([50]), torch.Size([4, 50]), torch.Size([4])]
CPU times: user 12.8 s, sys: 1.18 s, total: 14 s
Wall time: 14.5 s


In [None]:
from torch.utils.data import DataLoader, Dataset

In [35]:
import string

class Preprocessor:
    def __init__(self, remove=string.punctuation, isLowercase=True):
        self.translator = None if remove is None else str.maketrans('', '', remove)
        self.isLowercase = isLowercase
        
    def __call__(self,txt):
        txt = txt.translate(self.translator)
        if self.isLowercase:
            txt = txt.lower()
        return txt

In [36]:
p = Preprocessor()

In [37]:
p('Test1ng!')

'test1ng'

In [131]:
import os
from torch.utils.data import Dataset

class AgNews(Dataset):
    def __init__(self,
                 path,
                 maxWord=1000000,
                 maxLength=None,
                 transform=None,
                 train=True):
        r"""`AG's News <http://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html>` dataset.
        
        Args:
            path (string): Root directory of dataset.
            maxWord (int,optional): max number of words if hash. If `None` will use a dictionnary.
            maxLength (int,optional): max length of a text.
            transform (callable,optional) A function/transform that takes in a text and returns a
                preprocessed version.
            train (bool, optional): If True, creates dataset from ``train.csv`` else ``test.csv``.
        """
        self.train = train
        self.path = os.path.join(path, "train.csv" if self.train else "test.csv")
        self.maxWord = maxWord
        if self.maxWord is None:
            self.vocab = dict()
        self.maxLen = float("inf") if maxLength is None else maxLength
        self.label = None
        self.data = None
        self.transform = transform
        self.load()
            
    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        X = self.data[idx,:]
        y = self.label[idx]
        return X, y

    def load(self):
        with open(self.path, 'r', encoding="utf-8") as f:
            nRows = sum(1 for row in f)
            label = [None]*nRows
            data = [None]*nRows
            f.seek(0)
            reader = csv.reader(f, delimiter=',', quotechar='"')
            maxLen = 0
            for i, row in enumerate(reader):
                label[i] = int(row[0])
                txt = ' '.join(row[1:])
                if self.transform is not None:
                    txt = self.transform(txt)      
                if self.maxWord is not None:
                    data[i] = torch.Tensor([word_encoder(w,self.maxWord) for i,w in enumerate(txt.split()) if i < self.maxLen])
                else:
                    l = []
                    for counter,w in enumerate(txt.split()):
                        if counter >= self.maxLen:
                            break
                        if w not in self.vocab:
                            self.vocab[w] = len(self.vocab)+1
                        l.append(self.vocab[w])
                    data[i] = torch.Tensor(l)
                        
                maxLen = max(maxLen,data[i].shape[0])
            self.maxLen = min(self.maxLen,maxLen)
         
        self.data = torch.zeros(nRows, self.maxLen)
        for i,tokenIds in enumerate(data):
            length = tokenIds.shape[0]
            self.data[i,:length] = tokenIds
        self.label = torch.LongTensor(label).view(-1,1)

In [132]:
%%time
p = Preprocessor()
a = AgNews("../data/ag_news_csv",transform=p,maxLength=None,maxWord=None)

CPU times: user 4.99 s, sys: 124 ms, total: 5.12 s
Wall time: 5.23 s


In [127]:
%%time
p = Preprocessor()
a = AgNews("../data/ag_news_csv",transform=p)

CPU times: user 12.6 s, sys: 150 ms, total: 12.7 s
Wall time: 12.9 s


In [128]:
(a.data != 0).sum()

4489931

In [133]:
(a.data != 0).sum()

4489931

In [109]:
trainIter=DataLoader(dataset=a,
                         batch_size=batchSize,
                         shuffle=False)

In [110]:
for i,(x,y) in enumerate(trainIter):
    print(i)
    print(x)
    print(y)
    
    if i > 3:
        break

0

 7.8965e+05  9.6500e+04  8.5368e+05
 4.8642e+05  1.4549e+05  3.2434e+05
 2.6022e+05  5.8330e+04  6.1404e+05
 8.3122e+05  3.2174e+05  2.6022e+05
 2.6022e+05  7.8909e+04  5.2298e+05
 2.7182e+05  8.9554e+05  9.7930e+05
 3.8800e+05  2.2002e+05  7.1208e+05
 3.8219e+05  1.9248e+05  9.6780e+05
 6.0277e+05  4.8626e+05  6.5770e+05
 7.8965e+05  9.6500e+04  8.5368e+05
 2.6022e+05  5.8330e+04  6.1404e+05
 1.1515e+05  7.5093e+05  8.4607e+05
 2.7258e+04  7.0433e+05  7.8553e+05
 1.9609e+04  7.6322e+05  5.6654e+05
 8.2904e+05  3.0873e+05  4.8573e+05
 2.4158e+05  7.6099e+05  4.2766e+05
 7.2542e+05  7.4757e+05  8.4607e+05
 4.9571e+05  7.2505e+05  9.0580e+05
 4.5790e+05  1.3016e+05  9.4246e+05
 6.0414e+05  5.8688e+04  6.5886e+05
 1.9609e+04  7.6322e+05  6.9704e+05
 6.7354e+05  6.1404e+05  1.3432e+05
 1.7647e+05  9.7443e+05  4.9571e+05
 4.1838e+05  3.0873e+05  5.9983e+05
 3.4844e+04  7.8909e+04  9.0580e+05
 1.9356e+05  3.1731e+05  3.3819e+05
 1.9609e+04  5.6654e+05  3.1831e+05
 1.5346e+05  8.6306e+05  

In [None]:
def make_data(max_words, size_phrase, seed, percTrain = 0.5):
    np.random.seed(seed)
    torch.manual_seed(seed)

    # fetches the data. It contains more than 22k text with the correct labels
    twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
    # tokenizes only the first 150 tokens. THe rest is dicarded
    # no preprocessing!! should lower case and remove punctuation
    data = [nltk.word_tokenize(text)[:size_phrase] for text in twenty_train.data]
    # data = [nltk.word_tokenize(text) for text in twenty_train.data]
    # hashes all tokens
    data_encoded = [[word_encoder(w, max_words) for w in text] for text in data]
    max_len = max([len(d) for d in data])

    # pad data
    # => if sentence is not 150 char long then padds at the end with zeros
    data_encoded = [d+[0]*(max_len-len(d)) for d in data_encoded]
    
    # idx start for testing. Does a 50% split.
    idx_test = int(len(data_encoded)*percTrain)
    # puts all in a matrix with each row beeing a phrase => nCol = max_len
    data_encoded = np.vstack(data_encoded)
    # array (col vector) of targets in shape of data_encoded
    targets = np.asarray(twenty_train.target, 'int32').reshape((-1,1))
    
    #return data_encoded, targets, idx_test
    np.random.seed(seed)
    train = TensorDataset(torch.from_numpy(data_encoded[0:idx_test,:]),
                          torch.from_numpy(targets[0:idx_test,:]).type(torch.LongTensor))
    np.random.seed(seed)
    trainIter=DataLoader(dataset=train,
                         batch_size=batchSize,
                         shuffle=False)
    xTest = Variable(torch.from_numpy(data_encoded[idx_test:,:]),)
    yTest = targets[idx_test:,:]
    return trainIter, xTest, yTest