In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerDecoder,TransformerDecoderLayer
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import Transformer
import pandas as pd
import torch.optim as optim
import itertools

In [2]:
from functools import lru_cache
import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors
import gensim.utils as utils

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
cached_lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
from gensim.utils import simple_preprocess, to_unicode

unable to import 'smart_open.gcs', disabling that module
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mapka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
class BiLSTMEncoder(nn.Module):
    def __init__(self, input_dim,emb_dim,enc_hid_dim,dec_hid_dim,dropout=0.5):
        
        super(Encoder,self).__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim,emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear( enc_hid_dim * 2, dec_hid_dim )
        
        self.dropout = nn.Dropout( dropout )
        
    def forward(self, X):
        
        embedded = self.dropout(self.embedding(X))
        
        outputs, hidden = self.rnn(embedded)
        
        hidden = F.tanh( self.fc ( torch.cat( (hidden[-2,:,:], hidden[-1, : , : ] ), dim = 1 ) ) )
        
        return outputs, hidden


        
        

In [4]:
base_dir = "data"
train_file_X = os.path.join(base_dir,"train.source")
train_file_y = os.path.join(base_dir,"train.target")
test_file_X = os.path.join(base_dir,"test.source")
test_file_y = os.path.join(base_dir,"test.target")
val_file_X = os.path.join(base_dir,"val.source")
val_file_y = os.path.join(base_dir,"val.target")


In [5]:
import re
import string

STOP_WORDS = ["i", "a", "about", "an", "are", "as", "at", "be", "by", 
                "for", "from", "how", "in", "is", "it", "of", "on", "or", "that", "the", 
                "this", "to", "was", "what", "when", "where", "who", "will", "with"]

def ExpandContractions(contraction):

    contraction = re.sub(r"won\'t", "will not", contraction)
    contraction = re.sub(r"can\'t", "can not", contraction)

    contraction = re.sub(r"n\'t", " not", contraction)
    contraction = re.sub(r"\'re", " are", contraction)
    contraction = re.sub(r"\'s", " is", contraction)
    contraction = re.sub(r"\'d", " would", contraction)
    contraction = re.sub(r"\'ll", " will", contraction)
    contraction = re.sub(r"\'t", " not", contraction)
    contraction = re.sub(r"\'ve", " have", contraction)
    contraction = re.sub(r"\'m", " am", contraction)

    return contraction

def PreProcess(line):
    
    line = line.translate(str.maketrans("", "", string.punctuation))
    line = ExpandContractions(line)
    line = simple_preprocess(to_unicode(line))
    line = [cached_lemmatize(word) for word in line if word not in STOP_WORDS]

    line = " ".join(line)
    return line

In [6]:
class LineSentenceGenerator(object):

    def __init__(self, source, preprocess=None, max_sentence_length=4000, limit=None, preprocess_flag=True):
        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit
        self.input_files = []

        if preprocess != None and callable(preprocess) and preprocess_flag:
            self.preprocess = preprocess
        else:
            self.preprocess = lambda line: line.rstrip("\r\n")

        if isinstance(self.source, list):
            print('List of files given as source. Verifying entries and using.')
            self.input_files = [filename for filename in self.source if os.path.isfile(filename)]
            self.input_files.sort()  # makes sure it happens in filename order

        elif os.path.isfile(self.source):
            print('Single file given as source, rather than a list of files. Wrapping in list.')
            self.input_files = [self.source]  # force code compatibility with list of files

        elif os.path.isdir(self.source):
            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
            print('Directory of files given as source. Reading directory %s', self.source)
            self.input_files = os.listdir(self.source)
            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
            self.input_files.sort()  # makes sure it happens in filename order
        else:  # not a file or a directory, then we can't do anything with it
            raise ValueError('Input is neither a file nor a path nor a list')
        print('Files read into LineSentenceGenerator: %s' % ('\n'.join(self.input_files)))

        self.token_count = 0

    def __iter__(self):
        for file_name in self.input_files:
            print('Reading file %s', file_name)
            with open(file_name, 'rb') as fin:
                for line in itertools.islice(fin, self.limit):
                    line = self.preprocess(utils.to_unicode(line))
                    self.token_count += len(line)
                    i = 0
                    while i < len(line):
                        yield line[i:i + self.max_sentence_length]
                        i += self.max_sentence_length

    def __len__(self):
        if self.token_count > 0:
            return self.token_count
        else:
            return len(self.input_files)

    def __bool__(self):
        return self.has_data()

    def is_empty(self):
        return len(self.input_files) == 0

    def has_data(self):
        return not self.is_empty()

In [43]:
for i,line in enumerate(LineSentenceGenerator([train_file_X,train_file_y],PreProcess)):
    if i == 1:
        break
    print(len(line))
    print(line)

for i,line in enumerate(LineSentenceGenerator(train_file_y,PreProcess)):
    if i == 1:
        break
    print("\n\n",line)

List of files given as source. Verifying entries and using.
Files read into LineSentenceGenerator: data\train.source
data\train.target
Reading file %s data\train.source
3115
editor note our behind scene series cnn correspondent share their experience covering news and analyze story behind event here soledad obrien take user inside jail many inmate mentally ill inmate housed forgotten floor many mentally ill inmate housed miami before trial miami florida cnn ninth floor miamidade pretrial detention facility dubbed forgotten floor here inmate most severe mental illness incarcerated until theyre ready appear court most often they face drug charge charge assaulting officer charge judge steven leifman say usually avoidable felony he say arrest often result confrontation police mentally ill people often wont do theyre told police arrive scene confrontation seems exacerbate their illness and they become more paranoid delusional and le likely follow direction according leifman so they end up n

In [7]:
from torchtext.data import Dataset,Example
from torchtext.data import Field, BucketIterator

SRC = Field(tokenize = "spacy",
            lower = False)

TRG = Field(tokenize = "spacy",
            is_target = True,
            lower = False)

In [8]:
def read_data(X,y,limit=1000):
    examples = []
    fields = {'text-tokens': ('text', SRC),
              'summ-tokens': ('summ', TRG)}
    for i,(x,y) in enumerate(zip(LineSentenceGenerator(X,PreProcess),LineSentenceGenerator(y,PreProcess))):
        if i > limit:
            break
        text_field = x
        summ_field = y
       
        e = Example.fromdict({"text-tokens": text_field, "summ-tokens": summ_field},
                             fields=fields)
        examples.append(e)
    print("examples: \n", examples[0])
    return Dataset(examples, fields=[('text', SRC), ('summ', TRG)])

In [9]:
train_data = read_data(train_file_X,train_file_y,1000)
test_data = read_data(test_file_X,test_file_y,200)
val_data = read_data(val_file_X,val_file_y,200)

Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: data\train.source
Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: data\train.target
Reading file %s data\train.source
Reading file %s data\train.target
examples: 
 <torchtext.data.example.Example object at 0x0000017190E91448>
Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: data\test.source
Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: data\test.target
Reading file %s data\test.source
Reading file %s data\test.target
examples: 
 <torchtext.data.example.Example object at 0x00000171A7CB2D08>
Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: data\val.source
Single file given as source, rather than a list of files.

In [10]:
print("text: ",train_data[0].text)
print("\n\nsummary: ",train_data[0].summ)

text:  ['editor', 'note', 'our', 'behind', 'scene', 'series', 'cnn', 'correspondent', 'share', 'their', 'experience', 'covering', 'news', 'and', 'analyze', 'story', 'behind', 'event', 'here', 'soledad', 'obrien', 'take', 'user', 'inside', 'jail', 'many', 'inmate', 'mentally', 'ill', 'inmate', 'housed', 'forgotten', 'floor', 'many', 'mentally', 'ill', 'inmate', 'housed', 'miami', 'before', 'trial', 'miami', 'florida', 'cnn', 'ninth', 'floor', 'miamidade', 'pretrial', 'detention', 'facility', 'dubbed', 'forgotten', 'floor', 'here', 'inmate', 'most', 'severe', 'mental', 'illness', 'incarcerated', 'until', 'they', 're', 'ready', 'appear', 'court', 'most', 'often', 'they', 'face', 'drug', 'charge', 'charge', 'assaulting', 'officer', 'charge', 'judge', 'steven', 'leifman', 'say', 'usually', 'avoidable', 'felony', 'he', 'say', 'arrest', 'often', 'result', 'confrontation', 'police', 'mentally', 'ill', 'people', 'often', 'wo', 'nt', 'do', 'they', 're', 'told', 'police', 'arrive', 'scene', 'conf

In [11]:
train_data.fields

{'text': <torchtext.data.field.Field at 0x1718dee9f88>,
 'summ': <torchtext.data.field.Field at 0x1718df43e48>}

In [12]:
print("text: ", test_data[100].text)
print("summ: ",test_data[100].summ)

text:  ['cnna', 'frenchlanguage', 'global', 'television', 'network', 'regained', 'control', 'one', 'it', 'channel', 'thursday', 'after', 'cyberattack', 'day', 'earlier', 'crippled', 'it', 'broadcast', 'and', 'social', 'medium', 'account', 'television', 'network', 'tv', 'monde', 'gradually', 'regaining', 'control', 'it', 'channel', 'and', 'social', 'medium', 'outlet', 'after', 'suffering', 'network', 'director', 'called', 'extremely', 'powerful', 'cyberattack', 'addition', 'it', 'channel', 'tv', 'monde', 'lost', 'control', 'it', 'social', 'medium', 'outlet', 'and', 'it', 'website', 'director', 'yves', 'bigot', 'said', 'video', 'message', 'posted', 'later', 'facebook', 'mobile', 'site', 'which', 'still', 'active', 'network', 'said', 'hacked', 'islamist', 'group', 'isi', 'logo', 'and', 'marking', 'appeared', 'tv', 'monde', 'social', 'medium', 'account', 'but', 'there', 'no', 'immediate', 'claim', 'responsibility', 'isi', 'any', 'other', 'group', 'day', 'broke', 'thursday', 'europe', 'netw

In [13]:
SRC.build_vocab(train_data.text, min_freq = 2,max_size=4000)
TRG.build_vocab(train_data.summ, min_freq = 2)

In [14]:
device = torch.device('cpu')

BATCH_SIZE = 128

train_iter = BucketIterator(train_data,BATCH_SIZE, shuffle=True,
                                                 sort_key=lambda x: len(x.text), sort_within_batch=True)

val_iter = BucketIterator(val_data, BATCH_SIZE, sort_key=lambda x: len(x.text), sort_within_batch=True)
test_iter = BucketIterator(test_data,BATCH_SIZE, sort_key=lambda x: len(x.text), sort_within_batch=True)

In [27]:
for batch in train_iter:
    print(batch.text.size())
    print(batch.summ.size())
    

torch.Size([320, 128])
torch.Size([48, 128])
torch.Size([398, 128])
torch.Size([44, 128])
torch.Size([691, 128])
torch.Size([43, 128])
torch.Size([1225, 105])
torch.Size([54, 105])
torch.Size([261, 128])
torch.Size([44, 128])
torch.Size([475, 128])
torch.Size([43, 128])
torch.Size([196, 128])
torch.Size([43, 128])
torch.Size([565, 128])
torch.Size([44, 128])


In [15]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [116]:
import math
class TransformerSummarizer(nn.Module):
    def __init__(self, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length,vocab_size,d_model=None, pos_dropout =0.1, trans_dropout= 0.1,embeddings=None):
        super().__init__()
       
        if embeddings is None:
            self.embed_src = nn.Embedding(vocab_size, d_model)
            self.embed_tgt = nn.Embedding(vocab_size, d_model)
        else:
            d_model = embeddings.size(1)
            self.d_model = embeddings.size(1)
            self.embed_src = nn.Embedding(*embeddings.shape)
            self.embed_src.weight = nn.Parameter(embeddings,requires_grad=False)
            
            self.embed_tgt = nn.Embedding(*embeddings.shape)
            self.embed_tgt.weight = nn.Parameter(embeddings,requires_grad=False)
        
        
        self.pos_enc = PositionalEncoding(d_model, pos_dropout, max_seq_length)

        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, trans_dropout)
        
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        
#         print("Before Embed: ",src.shape,tgt.shape,sep="\n")
        
        src = self.pos_enc(self.embed_src(src) * math.sqrt(self.d_model))
#         print(src.shape)
        tgt = self.pos_enc(self.embed_tgt(tgt) * math.sqrt(self.d_model))
#         print(tgt.shape)

        output = self.transformer(src, tgt)
        
        return F.softmax(self.fc(output),dim=2)
        
        
        

        
        

In [17]:
def generate_square_mask(self,sz):
        mask = (torch.triu(torch.ones(sz,sz)) == 1).transpose(0,1)
        mask = mask.float().masked_fill_(mask == 0,float('-inf')).masked_fill_(mask == 1,float(0.0))
        return mask

In [18]:
PAD_IDX = TRG.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [60]:
# trg = len(TRG.vocab)
# EMB_DIM = 200
SEQ_LEN = 4000

D_MODEL = 200 #embedding_size
DIM_FEEDFORWARD = 200
VOCAB_SIZE = len(SRC.vocab)
print(VOCAB_SIZE)
ATTENTION_HEADS = 6
N_LAYERS = 1

# vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, pos_dropout, trans_dropout

4002


In [90]:
from torchtext.vocab import FastText

ff = FastText("en")

In [46]:
embeddings = ff.vectors

In [86]:
embeddings = vectors = ff.get_vecs_by_tokens(SRC.vocab.itos)

In [117]:
model = TransformerSummarizer( ATTENTION_HEADS,N_LAYERS, N_LAYERS, DIM_FEEDFORWARD, SEQ_LEN,VOCAB_SIZE,embeddings=embeddings).to(device)

In [94]:
model

TransformerSummarizer(
  (embed_src): Embedding(4002, 300)
  (embed_tgt): Embedding(4002, 300)
  (pos_enc): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=300, out_features=300, bias=True)
          )
          (linear1): Linear(in_features=300, out_features=200, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=200, out_features=300, bias=True)
          (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): Tran

In [70]:
import math
import time
from tqdm.notebook import tqdm_notebook as tqdm

def train(model: nn.Module,
          iterator: BucketIterator,
          num_batches: int,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    
    print("Training......")

    model.train()

    epoch_loss = 0

    for batch in tqdm(iterator,total=num_batches):
        
#         if i == 1:
#             break

        src = batch.text
        trg = batch.summ
        
#         tgt_inp, tgt_out = tgt[:, :-1], tgt[:, 1:]
#         tgt_mask = gen_nopeek_mask(tgt_inp.shape[1]).to('cuda')

#         trg_inp = trg[:,:-1] 

        optimizer.zero_grad()

        output = model(src.to(device), trg.to(device))

        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        
    print("Training Done.....")

    return epoch_loss / len(iterator)

In [71]:
def evaluate(model: nn.Module,
             iterator: BucketIterator,
             num_batches:int,
             criterion: nn.Module):

    model.eval()

    epoch_loss = 0
    
    print("Evaluating....")
    with torch.no_grad():

        for batch in tqdm(iterator,total=num_batches):
            
#             if i == 1:
#                 break
            src = batch.text
            trg = batch.summ

            output = model(src.to(device), trg.to(device))

            

            loss = criterion(output, trg)

            epoch_loss += loss.item()
            
        print("Evaluating Done........")

    return epoch_loss / len(iterator)

In [121]:
for i,batch in enumerate(train_iter):
    if i == 1:
        break
    src = batch.text
    trg = batch.summ
    print(src.shape)
    print(trg.shape)
#     trg_inp, trg_out = trg[:, :-1], trg[:, 1:]
#     print("trg-inp-shape: ",trg_inp.shape)
# #     print("trg-out-shape: ",trg_out.shape)
#     print("trg-inp: ",trg_inp)
#     print("trg-out: ",trg_out)
    out = model(src.to(device),trg.to(device))
    print(out[0:])
    print(out.shape)
    
    
#     del src,trg,out
#     torch.cuda.empty_cache()

# del batch

torch.Size([352, 128])
torch.Size([43, 128])
tensor([[[2.4626e-04, 3.3160e-04, 4.3547e-04,  ..., 3.9895e-04,
          1.8387e-04, 2.3146e-04],
         [3.0161e-04, 2.6841e-04, 1.7282e-04,  ..., 1.8614e-04,
          2.0611e-04, 2.2470e-04],
         [3.7614e-04, 1.4737e-04, 4.6549e-04,  ..., 1.7094e-04,
          2.1025e-04, 1.0706e-04],
         ...,
         [3.2146e-04, 1.4434e-04, 2.3186e-04,  ..., 2.5634e-04,
          5.2067e-05, 1.8164e-04],
         [4.7511e-04, 1.8989e-04, 1.6362e-04,  ..., 2.1485e-04,
          2.8546e-04, 3.2417e-04],
         [5.5858e-04, 1.1187e-04, 3.2009e-04,  ..., 1.6644e-04,
          8.1045e-05, 8.8322e-05]],

        [[2.2275e-04, 1.4432e-04, 3.3426e-04,  ..., 1.5151e-04,
          8.3821e-05, 1.6988e-04],
         [2.4563e-04, 2.6211e-04, 4.5522e-04,  ..., 1.3766e-04,
          3.6687e-04, 1.6658e-04],
         [2.1509e-04, 2.6720e-04, 1.3584e-04,  ..., 1.8691e-04,
          1.4094e-04, 1.2939e-04],
         ...,
         [5.1403e-04, 2.4544e-04, 

In [99]:
x = torch.zeros(44,127)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [95]:

# Running too long
# need to fix this

def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

parameters = filter(lambda p:p.requires_grad, model.parameters())
optimizer = optim.Adam(parameters)
num_batches = math.ceil(len(train_data)/BATCH_SIZE)
val_batches = math.ceil(len(val_data)/BATCH_SIZE)

N_EPOCHS = 1
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iter, num_batches,optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_iter,val_batches, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    
test_size = math.ceil(len(test_data)/BATCH_SIZE)
test_loss = evaluate(model, test_iter,test_size, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

Training......


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


Training Done.....
Evaluating....


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Evaluating Done........
Epoch: 01 | Time: 1m 46s
	Train Loss: 7.365 | Train PPL: 1579.357
	 Val. Loss: 5.283 |  Val. PPL: 196.864
Evaluating....


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Evaluating Done........
| Test Loss: 5.363 | Test PPL: 213.281 |


torch.Size([4002, 300])