In [126]:
import spacy
import nltk.data
import glob

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator,LabelField
from torchtext import data
from collections import Counter
from torch.autograd import Variable
import spacy
import numpy as np
import re
import random
import math
import time
import pandas as pd
from sklearn.model_selection import train_test_split

In [13]:
authors=['Hemingway','Nietzsche','Wilde']
authors_id=[0,1,2]
PATH = "books/"
books=[]
nlp = spacy.load('en')

MIN_LEN = 5
MAX_LEN = 30

TRAIN_SIZE = 0.8
DEV_SIZE = 0.2
RANDOM_STATE = 101

In [5]:
device = torch.device('cpu')
device

device(type='cpu')

In [6]:
def file_open(path):
    #Remove special characters
    with open(path, encoding='utf-8') as f:
        text = f.read()
        text = re.sub(r'([^a-záéíóúÁÉÍÓÚA-Z0-9,. ])', '', text)
    f.close()
    return text

In [7]:
def convert_sentences(text):
    file_doc = nlp(text)
    sentences = list(file_doc.sents)
    filtered_sentences=[]
    for sent in sentences:
        if len(sent)>=MIN_LEN:
            filtered_sentences.append(sent)
    print("Total oraciones: ",len(sentences))
    print("Oraciones filtradas por longitud: ",len(filtered_sentences))
    return filtered_sentences

In [8]:
sents_by_author=[]
for author in authors:
    name_file = PATH + author
    print(name_file)
    books_author = []
    sents = []
    for file in glob.glob(name_file + "/*.txt"):
        books_author.append(file)
        txt = file_open(file)
        sents = sents+convert_sentences(txt)
    sents_by_author.append(sents)
    #print(books_author)
    books.append(books_author)

books/Hemingway
Total oraciones:  6149
Oraciones filtradas por longitud:  5512
Total oraciones:  19
Oraciones filtradas por longitud:  16
Total oraciones:  1905
Oraciones filtradas por longitud:  1828
Total oraciones:  7598
Oraciones filtradas por longitud:  6545
Total oraciones:  286
Oraciones filtradas por longitud:  272
Total oraciones:  755
Oraciones filtradas por longitud:  681
Total oraciones:  4247
Oraciones filtradas por longitud:  3873
Total oraciones:  3816
Oraciones filtradas por longitud:  3423
Total oraciones:  9976
Oraciones filtradas por longitud:  8793
books/Nietzsche
Total oraciones:  1841
Oraciones filtradas por longitud:  1556
Total oraciones:  1136
Oraciones filtradas por longitud:  1066
Total oraciones:  6272
Oraciones filtradas por longitud:  5579
Total oraciones:  1284
Oraciones filtradas por longitud:  1255
Total oraciones:  1380
Oraciones filtradas por longitud:  1288
Total oraciones:  2130
Oraciones filtradas por longitud:  1889
Total oraciones:  1436
Oracione

In [9]:
row = []
for i in range(len(authors)):
    for sent in sents_by_author[i]:
        if (len(sent)>=3):
            row.append([sent,authors[i]])

In [10]:
df= pd.DataFrame(row,columns=["text", 'class'])

In [11]:
df.head()

Unnamed: 0,text,class
0,"(IN, OUR, TIME, Chapter, I)",Hemingway
1,"(They, started, two, hours, before, daylight, ...",Hemingway
2,"(In, each, boat, ,, in, the, darkness, ,, so, ...",Hemingway
3,"(The, shooter, sat, on, a, shooting, stool, fa...",Hemingway
4,"(Somewhere, ,, in, each, boat, ,, there, was, ...",Hemingway


In [12]:
df.tail()

Unnamed: 0,text,class
61628,"(My, own, one, Chasuble, .)",Wilde
61629,"(Laetitia, Embraces, her, Miss, Prism, .)",Wilde
61630,"(At, last, Lady, Bracknell, .)",Wilde
61631,"(My, nephew, ,, you, seem, to, be, displaying,...",Wilde
61632,"(On, the, contrary, ,, Aunt, Augusta, ,, I, ve...",Wilde


In [15]:
train_df, dev_df = train_test_split(df,
                                         test_size=DEV_SIZE,
                                         train_size=TRAIN_SIZE,
                                         random_state=69,
                                         shuffle=True)

In [16]:
train_df.to_csv("data/trainset_classification.csv", index=False)
dev_df.to_csv("data/devset_classification.csv", index=False)

In [18]:
print(train_df.shape) 
print(dev_df.shape)

(49306, 2)
(12327, 2)


In [19]:
SEED = 2019

#Torch
torch.manual_seed(SEED)

#Cuda algorithms
torch.backends.cudnn.deterministic = True  

In [20]:
TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.float,batch_first=True)

In [30]:
class DataFrameDataset(data.Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.target if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)


In [92]:
PATH = 'data/'
train = pd.read_csv(PATH+'trainset_classification.csv')
validation = pd.read_csv(PATH+'devset_classification.csv')

In [93]:
fields = [('text',TEXT), ('label',LABEL)]


In [94]:
#loading custom dataset
training_data=data.TabularDataset(path = 'data/trainset_classification.csv',format = 'csv',fields = fields,skip_header = True)

#print preprocessed text
print(vars(training_data.examples[0]))

{'text': ['Oh', ',', 'this', 'is', 'the', 'hostility', 'of', 'light', 'to', 'the', 'shining', 'one', 'unpityingly', 'doth', 'it', 'pursue', 'its', 'course', '.'], 'label': 'Nietzsche'}


In [95]:
import random
train_data, valid_data = training_data.split(split_ratio=0.1, random_state = random.seed(SEED))

In [96]:
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.300d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)  

Size of TEXT vocabulary: 2836
Size of LABEL vocabulary: 3
[(',', 5041), ('.', 4368), ('the', 4236), ('and', 2326), ('of', 2223), ('to', 1843), ('I', 1485), ('a', 1442), ('in', 1244), ('is', 973)]
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fe2b4731b70>>, {'<unk>': 0, '<pad>': 1, ',': 2, '.': 3, 'the': 4, 'and': 5, 'of': 6, 'to': 7, 'I': 8, 'a': 9, 'in': 10, 'is': 11, 'that': 12, 'it': 13, 'you': 14, 'was': 15, 'he': 16, ' ': 17, 'not': 18, 'with': 19, 'as': 20, 'his': 21, 'for': 22, 'said': 23, 'have': 24, 'at': 25, 'all': 26, 'The': 27, 'be': 28, 'on': 29, 'me': 30, 'had': 31, 'do': 32, 'nt': 33, 'are': 34, 'him': 35, 'my': 36, 'one': 37, 'which': 38, 'He': 39, 'they': 40, 'this': 41, 'from': 42, 'were': 43, 'so': 44, 'we': 45, 'by': 46, 'It': 47, 'them': 48, 'but': 49, 'there': 50, 'would': 51, 'will': 52, 'out': 53, 'has': 54, 'man': 55, 'an': 56, 'up': 57, 'her': 58, 'like': 59, 'But': 60, 'what': 61, 'You': 62, 'when': 63, 'who': 64, '

In [97]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 32

In [98]:
device

device(type='cuda')

In [99]:
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [132]:
class LSTMClassifier(nn.Module):
    def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
        super(LSTMClassifier, self).__init__()


        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table.
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the look-up table to the pre-trained GloVe word embedding.
        self.lstm = nn.LSTM(embedding_length, hidden_size)
        self.label = nn.Linear(hidden_size, output_size)
        
    def forward(self, input_sentence, batch_size=None):
        
        ''' Here we will map all the indexes present in the input sequence to the corresponding word vector using our pre-trained word_embedddins.'''
        input = self.word_embeddings(input_sentence) # embedded input of shape = (batch_size, num_sequences,  embedding_length)
        input = input.permute(1, 0, 2) # input.size() = (num_sequences, batch_size, embedding_length)
        if batch_size is None:
            h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial hidden state of the LSTM
            c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial cell state of the LSTM
        else:
            h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
            c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
        output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))
        final_output = self.label(final_hidden_state[-1]) # final_hidden_state.size() = (1, batch_size, hidden_size) & final_output.size() = (batch_size, output_size)
        
        return final_output

In [101]:
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)

In [130]:
def train_model(model, train_iter, epoch):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.cuda()
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    steps = 0
    model.train()
    for idx, batch in enumerate(train_iter):
        #print("label",batch.label)
        text = batch.text[0]
        target = batch.label
        target = torch.autograd.Variable(target).long()
        if torch.cuda.is_available():
            text = text.cuda()
            target = target.cuda()
        if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32.
            continue
        optim.zero_grad()
        
        prediction = model(text)
        loss = loss_fn(prediction, target)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        loss.backward()
        clip_gradient(model, 1e-1)
        optim.step()
        steps += 1
        
        if steps % 100 == 0:
            print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')
        
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
        
    return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)

In [111]:
def eval_model(model, val_iter):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.text[0]
            if (text.size()[0] is not 32):
                continue
            target = batch.label
            target = torch.autograd.Variable(target).long()
            if torch.cuda.is_available():
                text = text.cuda()
                target = target.cuda()
            prediction = model(text)
            loss = loss_fn(prediction, target)
            num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects/len(batch)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()

    return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter)

In [135]:
learning_rate = 2e-5
batch_size = 32
output_size = 3
hidden_size = 256
embedding_length = 300
vocab_size = len(TEXT.vocab)
word_embeddings = TEXT.vocab.vectors

In [136]:
model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
loss_fn = F.cross_entropy

In [146]:
for epoch in range(100):
    train_loss, train_acc = train_model(model, train_iterator, epoch)
    val_loss, val_acc = eval_model(model, valid_iterator)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')
    
#test_loss, test_acc = eval_model(model, test_iter)
#print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

Epoch: 1, Idx: 100, Training Loss: 0.5556, Training Accuracy:  71.88%
Epoch: 01, Train Loss: 0.392, Train Acc: 83.53%, Val. Loss: 0.582974, Val. Acc: 76.61%
Epoch: 2, Idx: 100, Training Loss: 0.4323, Training Accuracy:  78.12%
Epoch: 02, Train Loss: 0.364, Train Acc: 84.86%, Val. Loss: 0.605530, Val. Acc: 75.26%
Epoch: 3, Idx: 100, Training Loss: 0.5818, Training Accuracy:  62.50%
Epoch: 03, Train Loss: 0.327, Train Acc: 86.49%, Val. Loss: 0.648895, Val. Acc: 75.82%
Epoch: 4, Idx: 100, Training Loss: 0.3782, Training Accuracy:  90.62%
Epoch: 04, Train Loss: 0.297, Train Acc: 87.68%, Val. Loss: 0.654340, Val. Acc: 76.08%
Epoch: 5, Idx: 100, Training Loss: 0.1703, Training Accuracy:  90.62%
Epoch: 05, Train Loss: 0.247, Train Acc: 89.68%, Val. Loss: 0.770465, Val. Acc: 75.78%
Epoch: 6, Idx: 100, Training Loss: 0.1246, Training Accuracy:  100.00%
Epoch: 06, Train Loss: 0.221, Train Acc: 91.19%, Val. Loss: 0.801234, Val. Acc: 76.26%
Epoch: 7, Idx: 100, Training Loss: 0.0571, Training Accur

Epoch: 53, Idx: 100, Training Loss: 0.0003, Training Accuracy:  100.00%
Epoch: 53, Train Loss: 0.025, Train Acc: 98.63%, Val. Loss: 1.879748, Val. Acc: 72.72%
Epoch: 54, Idx: 100, Training Loss: 0.1264, Training Accuracy:  96.88%
Epoch: 54, Train Loss: 0.023, Train Acc: 98.59%, Val. Loss: 1.787848, Val. Acc: 74.18%
Epoch: 55, Idx: 100, Training Loss: 0.0030, Training Accuracy:  100.00%
Epoch: 55, Train Loss: 0.028, Train Acc: 98.49%, Val. Loss: 1.899788, Val. Acc: 73.02%
Epoch: 56, Idx: 100, Training Loss: 0.0001, Training Accuracy:  100.00%
Epoch: 56, Train Loss: 0.015, Train Acc: 98.77%, Val. Loss: 1.823974, Val. Acc: 74.41%
Epoch: 57, Idx: 100, Training Loss: 0.0616, Training Accuracy:  96.88%
Epoch: 57, Train Loss: 0.025, Train Acc: 98.61%, Val. Loss: 1.874429, Val. Acc: 73.92%
Epoch: 58, Idx: 100, Training Loss: 0.0001, Training Accuracy:  100.00%
Epoch: 58, Train Loss: 0.019, Train Acc: 98.73%, Val. Loss: 1.836038, Val. Acc: 73.96%
Epoch: 59, Idx: 100, Training Loss: 0.1257, Trai

In [148]:
test_sen1 = "Thou art not stone but already hast thou become hollow by the numerous drops"
test_sen2 = "Her thin fingers tear at the jewel to no purpose." #Wilde
Text_nie = "The great disgust at man IT strangled me and had crept into my throat and what the soothsayer had presaged all is alike nothing is worth while knowledge strangleth" #Nietzsche

test_sen1 = TEXT.preprocess(test_sen1)
test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]

test_sen2 = TEXT.preprocess(Text_nie)
test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]

test_sen = np.asarray(test_sen2)
test_sen = torch.LongTensor(test_sen)
test_tensor = Variable(test_sen, volatile=True)
test_tensor = test_tensor.cuda()
model.eval()
output = model(test_tensor, 1)
out = F.softmax(output, 1)
if (torch.argmax(out[0]) == 0):
    print ("Autor: Hemingway")
elif(torch.argmax(out[0]) == 1):
    print ("Autor: Nietzsche")
elif(torch.argmax(out[0]) == 2):
    print ("Autor: Wilde")   

Autor: Nietzsche


  del sys.path[0]
