In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerDecoder,TransformerDecoderLayer
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import Transformer
import pandas as pd
import torch.optim as optim


In [2]:
from functools import lru_cache
import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
cached_lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
from gensim.utils import simple_preprocess, to_unicode

unable to import 'smart_open.gcs', disabling that module
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mapka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
class BiLSTMEncoder(nn.Module):
    def __init__(self, input_dim,emb_dim,enc_hid_dim,dec_hid_dim,dropout=0.5):
        
        super(Encoder,self).__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim,emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear( enc_hid_dim * 2, dec_hid_dim )
        
        self.dropout = nn.Dropout( dropout )
        
    def forward(self, X):
        
        embedded = self.dropout(self.embedding(X))
        
        outputs, hidden = self.rnn(embedded)
        
        hidden = F.tanh( self.fc ( torch.cat( (hidden[-2,:,:], hidden[-1, : , : ] ), dim = 1 ) ) )
        
        return outputs, hidden


        
        

In [4]:
# class TransformerSummarizer(nn.Module):
#     """
#     Transformer Encoder with self attention layers.
#     """
#     def __init__(self,max_seq_len,ntoken,ninp,nhead,nhid,nlayers,dropout=0.5):
#         """
#         :param max_seq_len : maximum sequence length
#         :param ntoken: size of vocab
#         :param ninp
#         :param nhead
#         :param nhid
#         :param nlayers
#         :param dropout: 0.5 by default
#         """
#         super(TransformerSummarizer,self).__init__()
#         self.model_type = 'Summarizer'
#         self.src_mask = None
#         self.pos_encoder = PositionalEncoding(ninp,dropout)
#         encoder_layers = TransformerEncoderLayer(ninp ,nhead ,nhid , dropout)
#         self.transformer_encoder = TransformerEncoder(encoder_layers,nlayers)
#         self.encoder = nn.Embedding(ntoken, ninp)
#         self.ninp = ninp
#         decoder_layers = TransformerDecoderLayer(ninp, nhead, nhid,dropout) 
#         self.decoder = TransformerDecoder(decoder_layers, nlayers)
        
#         self.init_weights()

        
        
#     def generate_square_mask(self,sz):
#         mask = (torch.triu(torch.ones(sz,sz)) == 1).transpose(0,1)
#         mask = mask.float().masked_fill_(mask == 0,float('-inf')).masked_fill_(mask == 1,float(0.0))
#         return mask


#     def init_weights(self):
#         initrange = 0.1
#         self.encoder.weight.data.uniform_(-initrange,initrange)
#         self.decoder.bias.data.zero_()
#         self.decoder.weight.data.uniform_(-initrange,initrange)

#     def forward(self,src):
#         if self.src_mask is None or self.src_mask.size(0)!=len(src):
#             device = src.device
#             mask = self.generate_square_mask(len(src)).to(device)
#             self.src_mask = mask
        
#         src = self.encoder(src)*math.sqrt(self.ninp)
#         src = self.pos_encoder(src)
#         output = self.transformer_encoder(src,self.src_mask)
#         output = self.decoder(output)
        
#         return output

In [5]:
# class PositionalEncoding(nn.Module):

#     def __init__(self,model,dropout=0.1,max_len=5000):
#         super(PositionalEncoding,self).__init__()
#         self.dropout = nn.Dropout(p=dropout)

#         pe = torch.zeros(max_len,model)
#         position = torch.arange(0,max_len,dtype=torch.float).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0,model,2).float()*(-math.log(10000.0)/model))
#         pe[:,0::2] = torch.sin(position*div_term)
#         pe[:,1::2] = torch.cos(position*div_term)
#         pe = pe.unsqueeze(0).transpose(0,1)
#         self.register_buffer('pe',pe)

#     def forward(self,x):
#         x = x+self.pe[:x.size(0),:]
#         return self.dropout(x)

In [41]:
base_dir = "data"
train_file_X = os.path.join(base_dir,"train.source")
train_file_y = os.path.join(base_dir,"train.target")
test_file_X = os.path.join(base_dir,"test.source")
test_file_y = os.path.join(base_dir,"test.target")
val_file_X = os.path.join(base_dir,"val.source")
val_file_y = os.path.join(base_dir,"val.target")


In [9]:
import re
import string

STOP_WORDS = ["i", "a", "about", "an", "are", "as", "at", "be", "by", 
                "for", "from", "how", "in", "is", "it", "of", "on", "or", "that", "the", 
                "this", "to", "was", "what", "when", "where", "who", "will", "with"]

def ExpandContractions(contraction):

    contraction = re.sub(r"won\'t", "will not", contraction)
    contraction = re.sub(r"can\'t", "can not", contraction)

    contraction = re.sub(r"n\'t", " not", contraction)
    contraction = re.sub(r"\'re", " are", contraction)
    contraction = re.sub(r"\'s", " is", contraction)
    contraction = re.sub(r"\'d", " would", contraction)
    contraction = re.sub(r"\'ll", " will", contraction)
    contraction = re.sub(r"\'t", " not", contraction)
    contraction = re.sub(r"\'ve", " have", contraction)
    contraction = re.sub(r"\'m", " am", contraction)

    return contraction

def PreProcess(line):
    
    line = line.translate(str.maketrans("", "", string.punctuation))
    line = ExpandContractions(line)
    line = simple_preprocess(to_unicode(line))
    line = [cached_lemmatize(word) for word in line if word not in STOP_WORDS]

    line = " ".join(line)
    return line

In [36]:
class LineSentenceGenerator(object):

    def __init__(self, source, preprocess=None, max_sentence_length=10000, limit=None, preprocess_flag=True):
        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit
        self.input_files = []

        if preprocess != None and callable(preprocess) and preprocess_flag:
            self.preprocess = preprocess
        else:
            self.preprocess = lambda line: line.rstrip("\r\n")

        if isinstance(self.source, list):
            print('List of files given as source. Verifying entries and using.')
            self.input_files = [filename for filename in self.source if os.path.isfile(filename)]
            self.input_files.sort()  # makes sure it happens in filename order

        elif os.path.isfile(self.source):
            print('Single file given as source, rather than a list of files. Wrapping in list.')
            self.input_files = [self.source]  # force code compatibility with list of files

        elif os.path.isdir(self.source):
            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
            print('Directory of files given as source. Reading directory %s', self.source)
            self.input_files = os.listdir(self.source)
            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
            self.input_files.sort()  # makes sure it happens in filename order
        else:  # not a file or a directory, then we can't do anything with it
            raise ValueError('Input is neither a file nor a path nor a list')
        print('Files read into LineSentenceGenerator: %s' % ('\n'.join(self.input_files)))

        self.token_count = 0

    def __iter__(self):
        for file_name in self.input_files:
            print('Reading file %s', file_name)
            with open(file_name, 'rb') as fin:
                for line in itertools.islice(fin, self.limit):
                    line = self.preprocess(utils.to_unicode(line))
                    self.token_count += len(line)
                    i = 0
                    while i < len(line):
                        yield line[i:i + self.max_sentence_length]
                        i += self.max_sentence_length

    def __len__(self):
        if self.token_count > 0:
            return self.token_count
        else:
            return len(self.input_files)

    def __bool__(self):
        return self.has_data()

    def is_empty(self):
        return len(self.input_files) == 0

    def has_data(self):
        return not self.is_empty()

In [42]:
from torchtext.data import Dataset,Example
from torchtext.data import Field, BucketIterator

SRC = Field(tokenize = lambda x:x,
            lower = False)

TRG = Field(tokenize = lambda x:x,
            lower = False)

In [43]:
def read_data(X,y):
    examples = []
    fields = {'text': ('text', SRC),
              'summ': ('summ', TRG)}
    for x in LineSentenceGenerator(X,PreProcess):
        text_field = x
    for y in LineSentenceGenerator(y,Preprocess):
        summ_field = y
        
        e = Example.fromdict({"text": text_field, "summ": summ_field},
                             fields=fields)
        examples.append(e)
    print("examples: \n", examples[0])
    return Dataset(examples, fields=[('text', text_field), ('summ', summ_field)])

In [45]:
train_data = read_data(train_file_X,train_file_y)
# test_data = read_data(test_df)
# val_data = read_data(val_df)

TypeError: read_data() missing 1 required positional argument: 'y'

In [16]:
print("text: ",train_data[0].text)
print("summary: ",train_data[0].summ)

text:  ['editor', 'note', 'our', 'behind', 'scene', 'series', 'cnn', 'correspondent', 'share', 'their', 'experience', 'covering', 'news', 'and', 'analyze', 'story', 'behind', 'event', 'here', 'soledad', 'obrien', 'take', 'user', 'inside', 'jail', 'many', 'inmate', 'mentally', 'ill', 'inmate', 'housed', 'forgotten', 'floor', 'many', 'mentally', 'ill', 'inmate', 'housed', 'miami', 'before', 'trial', 'miami', 'florida', 'lrb', 'cnn', 'rrb', 'ninth', 'floor', 'miamidade', 'pretrial', 'detention', 'facility', 'dubbed', 'forgotten', 'floor', 'here', 'inmate', 'most', 'severe', 'mental', 'illness', 'incarcerated', 'until', 'they', 're', 'ready', 'appear', 'court', 'most', 'often', 'they', 'face', 'drug', 'charge', 'charge', 'assaulting', 'officer', 'charge', 'judge', 'steven', 'leifman', 'say', 'usually', 'avoidable', 'felony', 'he', 'say', 'arrest', 'often', 'result', 'confrontation', 'police', 'mentally', 'ill', 'people', 'often', 'wo', 'nt', 'do', 'they', 're', 'told', 'police', 'arrive', 

In [17]:
print("text: ", test_data[0].text)
print("summ: ",val_data[0].summ)

text:  ['marseille', 'france', 'lrb', 'cnn', 'rrb', 'french', 'prosecutor', 'leading', 'investigation', 'into', 'crash', 'germanwings', 'flight', 'insisted', 'wednesday', 'he', 'not', 'aware', 'any', 'video', 'footage', 'board', 'plane', 'marseille', 'prosecutor', 'brice', 'robin', 'told', 'cnn', 'so', 'far', 'no', 'video', 'were', 'used', 'crash', 'investigation', 'he', 'added', 'person', 'ha', 'such', 'video', 'need', 'immediately', 'give', 'investigator', 'robin', 'comment', 'follow', 'claim', 'two', 'magazine', 'german', 'daily', 'bild', 'and', 'french', 'paris', 'match', 'cell', 'phone', 'video', 'showing', 'harrowing', 'final', 'second', 'board', 'germanwings', 'flight', 'crashed', 'into', 'french', 'alp', 'all', 'board', 'were', 'killed', 'paris', 'match', 'and', 'bild', 'reported', 'video', 'recovered', 'phone', 'wreckage', 'site', 'two', 'publication', 'described', 'supposed', 'video', 'but', 'did', 'not', 'post', 'their', 'website', 'publication', 'said', 'they', 'watched', '

In [18]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)

In [34]:

type(train_iterator)

for i in train_iterator:
    print(i)

AttributeError: 'str' object has no attribute 'is_target'

In [20]:
class TransformerSummarizer(nn.Module):
    
    def __init__(self,max_seq_len,trg, vocab_size, emb_dim, n_layers, nhead, dropout=0.1):
        
        super(TransformerSummarizer,self).__init__()
        
        self.max_seq_len = max_seq_len
        self.emb_dim = emb_dim
        self.n_layers = n_layers
        self.nhead = nhead
        self.dropout = dropout
        
        self.embed = nn.Embedding(vocab_size, emb_dim)
        
        self.model = Transformer(d_model = emb_dim, nhead = nhead, num_encoder_layers = n_layers, num_decoder_layers = n_layers )
        
        self.out_linear = nn.Linear(emb_dim, trg)
        
    def forward(src,trg):
        
        src = self.embed(src).transpose(0,1)
        
        trg = self.embed(trg).transpose(0,1)
        
        output = self.model(src,trg)
        
        output = self.out_linear(output)
        
        
        
        

        
        

In [27]:
PAD_IDX = TRG.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [21]:
trg = len(TRG.vocab)
EMB_DIM = 200
SEQ_LEN =400 # src:(400,128,emb_dim)


VOCAB_SIZE = len(SRC.vocab.stoi)
ATTENTION_HEADS = 8
N_LAYERS = 12

In [22]:
model = TransformerSummarizer(SEQ_LEN,trg, VOCAB_SIZE, EMB_DIM, N_LAYERS, ATTENTION_HEADS).to(device)

In [23]:
optimizer = optim.Adam(model.parameters())

In [24]:
import math
import time


def train(model: nn.Module,
          iterator: BucketIterator,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):

    model.train()

    epoch_loss = 0

    for _, batch in enumerate(iterator):

        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()

        output = model(src, trg)

        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [25]:
def evaluate(model: nn.Module,
             iterator: BucketIterator,
             criterion: nn.Module):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for _, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [31]:
for _,batch in enumerate(train_iterator):

AttributeError: 'str' object has no attribute 'is_target'

In [28]:
def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


N_EPOCHS = 1
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

AttributeError: 'str' object has no attribute 'is_target'