In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerDecoder,TransformerDecoderLayer
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import Transformer
import pandas as pd
import torch.optim as optim
import itertools

In [2]:
from functools import lru_cache
import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors
import gensim.utils as utils

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
cached_lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
from gensim.utils import simple_preprocess, to_unicode

unable to import 'smart_open.gcs', disabling that module
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mapka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
class BiLSTMEncoder(nn.Module):
    def __init__(self, input_dim,emb_dim,enc_hid_dim,dec_hid_dim,dropout=0.5):
        
        super(Encoder,self).__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim,emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear( enc_hid_dim * 2, dec_hid_dim )
        
        self.dropout = nn.Dropout( dropout )
        
    def forward(self, X):
        
        embedded = self.dropout(self.embedding(X))
        
        outputs, hidden = self.rnn(embedded)
        
        hidden = F.tanh( self.fc ( torch.cat( (hidden[-2,:,:], hidden[-1, : , : ] ), dim = 1 ) ) )
        
        return outputs, hidden


        
        

In [4]:
base_dir = "data"
train_file_X = os.path.join(base_dir,"train.source")
train_file_y = os.path.join(base_dir,"train.target")
test_file_X = os.path.join(base_dir,"test.source")
test_file_y = os.path.join(base_dir,"test.target")
val_file_X = os.path.join(base_dir,"val.source")
val_file_y = os.path.join(base_dir,"val.target")


In [5]:
import re
import string

STOP_WORDS = ["i", "a", "about", "an", "are", "as", "at", "be", "by", 
                "for", "from", "how", "in", "is", "it", "of", "on", "or", "that", "the", 
                "this", "to", "was", "what", "when", "where", "who", "will", "with"]

def ExpandContractions(contraction):

    contraction = re.sub(r"won\'t", "will not", contraction)
    contraction = re.sub(r"can\'t", "can not", contraction)

    contraction = re.sub(r"n\'t", " not", contraction)
    contraction = re.sub(r"\'re", " are", contraction)
    contraction = re.sub(r"\'s", " is", contraction)
    contraction = re.sub(r"\'d", " would", contraction)
    contraction = re.sub(r"\'ll", " will", contraction)
    contraction = re.sub(r"\'t", " not", contraction)
    contraction = re.sub(r"\'ve", " have", contraction)
    contraction = re.sub(r"\'m", " am", contraction)

    return contraction

def PreProcess(line):
    
    line = line.translate(str.maketrans("", "", string.punctuation))
    line = ExpandContractions(line)
    line = simple_preprocess(to_unicode(line))
    line = [cached_lemmatize(word) for word in line if word not in STOP_WORDS]

    line = " ".join(line)
    return line

In [6]:
class LineSentenceGenerator(object):

    def __init__(self, source, preprocess=None, max_sentence_length=10000, limit=None, preprocess_flag=True):
        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit
        self.input_files = []

        if preprocess != None and callable(preprocess) and preprocess_flag:
            self.preprocess = preprocess
        else:
            self.preprocess = lambda line: line.rstrip("\r\n")

        if isinstance(self.source, list):
            print('List of files given as source. Verifying entries and using.')
            self.input_files = [filename for filename in self.source if os.path.isfile(filename)]
            self.input_files.sort()  # makes sure it happens in filename order

        elif os.path.isfile(self.source):
            print('Single file given as source, rather than a list of files. Wrapping in list.')
            self.input_files = [self.source]  # force code compatibility with list of files

        elif os.path.isdir(self.source):
            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
            print('Directory of files given as source. Reading directory %s', self.source)
            self.input_files = os.listdir(self.source)
            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
            self.input_files.sort()  # makes sure it happens in filename order
        else:  # not a file or a directory, then we can't do anything with it
            raise ValueError('Input is neither a file nor a path nor a list')
        print('Files read into LineSentenceGenerator: %s' % ('\n'.join(self.input_files)))

        self.token_count = 0

    def __iter__(self):
        for file_name in self.input_files:
            print('Reading file %s', file_name)
            with open(file_name, 'rb') as fin:
                for line in itertools.islice(fin, self.limit):
                    line = self.preprocess(utils.to_unicode(line))
                    self.token_count += len(line)
                    i = 0
                    while i < len(line):
                        yield line[i:i + self.max_sentence_length]
                        i += self.max_sentence_length

    def __len__(self):
        if self.token_count > 0:
            return self.token_count
        else:
            return len(self.input_files)

    def __bool__(self):
        return self.has_data()

    def is_empty(self):
        return len(self.input_files) == 0

    def has_data(self):
        return not self.is_empty()

In [8]:
from torchtext.data import Dataset,Example
from torchtext.data import Field, BucketIterator

SRC = Field(tokenize = "spacy",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = False)

TRG = Field(tokenize = "spacy",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = False)

In [9]:
def read_data(X,y,limit=1000):
    examples = []
    fields = {'text-tokens': ('text', SRC),
              'summ-tokens': ('summ', TRG)}
    for i,x in enumerate(LineSentenceGenerator(X,PreProcess)):
        text_field = x
        if i > limit:
            break
    for i,y in enumerate(LineSentenceGenerator(y,PreProcess)):
        summ_field = y
        if i>limit:
            break
            
        e = Example.fromdict({"text-tokens": text_field, "summ-tokens": summ_field},
                             fields=fields)
        examples.append(e)
    print("examples: \n", examples[0])
    return Dataset(examples, fields=[('text', SRC), ('summ', TRG)])

In [10]:
train_data = read_data(train_file_X,train_file_y,1000)
test_data = read_data(test_file_X,test_file_y,200)
val_data = read_data(val_file_X,val_file_y,200)

Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: data\train.source
Reading file %s data\train.source
Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: data\train.target
Reading file %s data\train.target
examples: 
 <torchtext.data.example.Example object at 0x000001783A12D8C8>
Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: data\test.source
Reading file %s data\test.source
Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: data\test.target
Reading file %s data\test.target
examples: 
 <torchtext.data.example.Example object at 0x000001783AD2A748>
Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: data\val.source
Reading file %s data\val.source
Single file given as sour

In [11]:
print("text: ",train_data[0].text)
print("\n\nsummary: ",train_data[0].summ)

text:  ['cnn', 'bus', 'carrying', 'high', 'school', 'band', 'student', 'tipped', 'over', 'saturday', 'interstate', 'northwest', 'minneapolis', 'minnesota', 'killing', 'one', 'person', 'bus', 'carrying', 'school', 'band', 'member', 'rest', 'upright', 'after', 'crashed', 'saturday', 'minnesota', 'three', 'people', 'were', 'critically', 'injured', 'authority', 'said', 'second', 'bus', 'traveling', 'one', 'crashed', 'was', 'nt', 'affected', 'according', 'report', 'posted', 'web', 'site', 'pelican', 'rapid', 'school', 'district', 'student', 'pelican', 'rapid', 'high', 'school', 'were', 'returning', 'band', 'trip', 'chicago', 'illinois', 'accident', 'happened', 'near', 'albertville', 'minnesota', 'minnesota', 'highway', 'patrol', 'said', 'fortyeight', 'people', 'including', 'driver', 'were', 'westbound', 'bus', 'tipped', 'over', 'am', 'minnesota', 'highway', 'patrol', 'said', 'everyone', 'bus', 'taken', 'hospital', 'treatment', 'evaluation', 'school', 'district', 'said', 'watch', 'rescuer', 

In [12]:
train_data.fields

{'text': <torchtext.data.field.Field at 0x178223a1988>,
 'summ': <torchtext.data.field.Field at 0x1782243c148>}

In [13]:
print("text: ", test_data[100].text)
print("summ: ",val_data[0].summ)

text:  ['polk', 'city', 'florida', 'cnnif', 'you', 'drove', 'you', 'would', 'nt', 'even', 'know', 'it', 'there', 'ringling', 'bros', 'center', 'elephant', 'conservation', 'sits', 'acre', 'land', 'rural', 'central', 'florida', 'halfway', 'between', 'orlando', 'and', 'sarasota', 'off', 'nondescript', 'country', 'road', 'armed', 'security', 'guard', 'greets', 'you', 'entrance', 'after', 'short', 'drive', 'down', 'gravel', 'road', 'you', 'get', 'sense', 'special', 'place', 'you', 'can', 'walk', 'around', 'and', 'you', 'do', 'nt', 'hear', 'anything', 'said', 'kenneth', 'feld', 'opened', 'center', 'these', 'elephant', 'they', 'have', 'these', 'large', 'foot', 'and', 'they', 'travel', 'silently', 'through', 'field', 'think', 'it', 'very', 'peaceful', 'twentynine', 'elephant', 'currently', 'live', 'here', 'and', 'more', 'join', 'group', 'after', 'ringling', 'bros', 'decided', 'year', 'stop', 'using', 'elephant', 'it', 'traveling', 'circus', 'decision', 'our', 'family', 'had', 'discussed', 'qui

In [35]:
SRC.build_vocab(train_data.text, min_freq = 2,max_size=20000)
TRG.build_vocab(train_data.summ, min_freq = 2)

In [27]:
device = torch.device('cpu')

BATCH_SIZE = 128

train_iter = BucketIterator(train_data,BATCH_SIZE, shuffle=True,
                                                 sort_key=lambda x: len(x.text), sort_within_batch=True)

dev_iter = BucketIterator(val_data, BATCH_SIZE, sort_key=lambda x: len(x.text), sort_within_batch=True)
test_iter = BucketIterator(test_data,BATCH_SIZE, sort_key=lambda x: len(x.text), sort_within_batch=True)

In [16]:
for batch in train_iter:
#     print(batch.text.size(1))
    print(batch.summ.size(1))

128
128
128
128
105
128
128
128


In [17]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [18]:
import math
class TransformerSummarizer(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, pos_dropout =0.1, trans_dropout= 0.1):
        super().__init__()
        self.d_model = d_model
        self.embed_src = nn.Embedding(vocab_size, d_model)
        self.embed_tgt = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, pos_dropout, max_seq_length)

        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, trans_dropout)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        
        src = self.pos_enc(self.embed_src(src) * math.sqrt(self.d_model))
        tgt = self.pos_enc(self.embed_tgt(tgt) * math.sqrt(self.d_model))

        output = self.transformer(src, tgt)
        
        return self.fc(output)
        
        
        

        
        

In [19]:
def generate_square_mask(self,sz):
        mask = (torch.triu(torch.ones(sz,sz)) == 1).transpose(0,1)
        mask = mask.float().masked_fill_(mask == 0,float('-inf')).masked_fill_(mask == 1,float(0.0))
        return mask

In [20]:
PAD_IDX = TRG.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [49]:
# trg = len(TRG.vocab)
# EMB_DIM = 200
SEQ_LEN =400 

D_MODEL = 512
DIM_FEEDFORWARD = 2048
VOCAB_SIZE = 20000
print(VOCAB_SIZE)
ATTENTION_HEADS = 8
N_LAYERS = 1

# vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, pos_dropout, trans_dropout

20000


In [50]:
model = TransformerSummarizer(VOCAB_SIZE, D_MODEL, ATTENTION_HEADS,N_LAYERS, N_LAYERS, DIM_FEEDFORWARD, SEQ_LEN).to(device)

In [38]:
optimizer = optim.Adam(model.parameters())

In [43]:
import math
import time


def train(model: nn.Module,
          iterator: BucketIterator,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    
    print("Training......")

    model.train()

    epoch_loss = 0

    for i, batch in enumerate(iterator):
        
        if i == 2:
            break

        src = batch.text
        trg = batch.summ

        optimizer.zero_grad()

        output = model(src.to(device), trg.to(device))

        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        
        print("Training Done.....")

    return epoch_loss / len(iterator)

In [45]:
def evaluate(model: nn.Module,
             iterator: BucketIterator,
             criterion: nn.Module):

    model.eval()

    epoch_loss = 0
    
    print("Evaluating....")
    with torch.no_grad():

        for i, batch in enumerate(iterator):
            
            if i == 2:
                break
            src = batch.text
            trg = batch.summ

            output = model(src.to(device), trg.to(device)) #turn off teacher forcing

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()
            
        print("Evaluating Done........")

    return epoch_loss / len(iterator)

In [51]:
for i,batch in enumerate(train_iter):
    if i == 1:
        break
    src = batch.text
    trg = batch.summ
    out = model(src.to(device),trg.to(device))
    print(out)

tensor([[[ 8.8929e-01, -1.3262e+00, -3.0219e-01,  ...,  2.2026e-01,
           3.6429e-01, -8.1976e-01],
         [ 4.1393e-01, -1.1168e+00, -2.8218e-01,  ...,  1.1922e-01,
          -1.8035e-01, -1.4185e-01],
         [-5.0035e-01, -5.5624e-01, -1.6688e-01,  ..., -2.7661e-01,
           2.3822e-01, -1.1350e+00],
         ...,
         [-2.3663e-01, -9.6794e-01, -2.7449e-02,  ...,  4.2126e-01,
           1.5523e-01, -6.0276e-01],
         [ 2.4584e-01, -2.8932e-01,  4.9190e-02,  ..., -1.9987e-01,
          -3.2181e-01, -1.2672e+00],
         [-1.1679e-01, -7.5195e-01, -7.7363e-02,  ...,  4.8328e-01,
           4.8548e-01, -3.3070e-01]],

        [[ 2.9838e-01, -1.5684e+00,  3.4083e-01,  ..., -7.1849e-02,
           7.8341e-02, -7.5519e-01],
         [ 4.5795e-01, -1.1151e+00,  1.2046e-01,  ...,  3.8895e-02,
           6.4708e-01, -4.9908e-01],
         [ 1.8952e-01, -1.3840e+00,  5.7832e-01,  ..., -5.2206e-01,
           2.6569e-01, -8.9652e-02],
         ...,
         [-7.7097e-01, -1

In [44]:

# Running too long
# need to fix this

def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


N_EPOCHS = 1
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

KeyboardInterrupt: 