In [45]:
import re
import string 
import os
import math
import itertools

from tqdm import tqdm
from functools import lru_cache

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerDecoder,TransformerDecoderLayer
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import Transformer

import gensim
import gensim.utils as utils
import gensim.downloader as api
from gensim.models import KeyedVectors

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
cached_lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
from gensim.utils import simple_preprocess, to_unicode

STOP_WORDS = ["i", "a", "about", "an", "are", "as", "at", "be", "by", 
                "for", "from", "how", "in", "is", "it", "of", "on", "or", "that", "the", 
                "this", "to", "was", "what", "when", "where", "who", "will", "with"]

def ExpandContractions(contraction):

    contraction = re.sub(r"won\'t", "will not", contraction)
    contraction = re.sub(r"can\'t", "can not", contraction)

    contraction = re.sub(r"n\'t", " not", contraction)
    contraction = re.sub(r"\'re", " are", contraction)
    contraction = re.sub(r"\'s", " is", contraction)
    contraction = re.sub(r"\'d", " would", contraction)
    contraction = re.sub(r"\'ll", " will", contraction)
    contraction = re.sub(r"\'t", " not", contraction)
    contraction = re.sub(r"\'ve", " have", contraction)
    contraction = re.sub(r"\'m", " am", contraction)

    return contraction

def PreProcess(line):
    
    line = line.translate(str.maketrans("", "", string.punctuation))
    line = ExpandContractions(line)
    line = simple_preprocess(to_unicode(line))
    line = [cached_lemmatize(word) for word in line if word not in STOP_WORDS]

    line = " ".join(line)
    return line

class LineSentenceGenerator(object):

    def __init__(self, source, preprocess=None, max_sentence_length=10000, limit=None, preprocess_flag=True):
        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit
        self.input_files = []

        if preprocess != None and callable(preprocess) and preprocess_flag:
            self.preprocess = preprocess
        else:
            self.preprocess = lambda line: line.rstrip("\r\n")

        if isinstance(self.source, list):
            print('List of files given as source. Verifying entries and using.')
            self.input_files = [filename for filename in self.source if os.path.isfile(filename)]
            self.input_files.sort()  # makes sure it happens in filename order

        elif os.path.isfile(self.source):
            print('Single file given as source, rather than a list of files. Wrapping in list.')
            self.input_files = [self.source]  # force code compatibility with list of files

        elif os.path.isdir(self.source):
            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
            print('Directory of files given as source. Reading directory %s', self.source)
            self.input_files = os.listdir(self.source)
            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
            self.input_files.sort()  # makes sure it happens in filename order
        else:  # not a file or a directory, then we can't do anything with it
            raise ValueError('Input is neither a file nor a path nor a list')
        print('Files read into LineSentenceGenerator: %s' % ('\n'.join(self.input_files)))

        self.token_count = 0

    def __iter__(self):
        for file_name in self.input_files:
            print('Reading file %s', file_name)
            with open(file_name, 'rb') as fin:
                for line in itertools.islice(fin, self.limit):
                    line = self.preprocess(utils.to_unicode(line))
                    self.token_count += len(line)
                    i = 0
                    while i < len(line):
                        yield line[i:i + self.max_sentence_length]
                        i += self.max_sentence_length

    def __len__(self):
        if self.token_count > 0:
            return self.token_count
        else:
            return len(self.input_files)

    def __bool__(self):
        return self.has_data()

    def is_empty(self):
        return len(self.input_files) == 0

    def has_data(self):
        return not self.is_empty()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [46]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content'

In [47]:
root_path = "/content/drive/My Drive/Colab Notebooks/CMPE 297-03/"
data_dir = "data"
data_path = os.path.join(root_path,data_dir)
train_1000_path = os.path.join(root_path,data_dir,"train_1000.json")
full_train_file = os.path.join(root_path,data_dir,"train.pkl")
dic_path = os.path.join(root_path,data_dir,"dic.pkl")
test_path = os.path.join(root_path,data_dir,"test.pkl")
val_path = os.path.join(root_path,data_dir, "valid.pkl" )

print(train_1000_path,full_train_file,dic_path,test_path,val_path,sep="\n")

/content/drive/My Drive/Colab Notebooks/CMPE 297-03/data/train_1000.json
/content/drive/My Drive/Colab Notebooks/CMPE 297-03/data/train.pkl
/content/drive/My Drive/Colab Notebooks/CMPE 297-03/data/dic.pkl
/content/drive/My Drive/Colab Notebooks/CMPE 297-03/data/test.pkl
/content/drive/My Drive/Colab Notebooks/CMPE 297-03/data/valid.pkl


In [0]:
# pretrained_embeddings = api.load("fasttext-wiki-news-subwords-300")

In [0]:
# !wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/cnn_dm.tgz 
# !tar -xzvf cnn_dm.tgz

In [0]:
cnn_dailymail_path = os.path.join(os.getcwd(), "cnn_dm/")

X_train_path = os.path.join(cnn_dailymail_path, "train.source")
y_train_path = os.path.join(cnn_dailymail_path, "train.target")
X_test_path = os.path.join(cnn_dailymail_path, "test.source")
y_test_path = os.path.join(cnn_dailymail_path, "test.target")
X_val_path = os.path.join(cnn_dailymail_path, "val.source")
y_val_path = os.path.join(cnn_dailymail_path, "val.target")



In [54]:
for i, line in enumerate(LineSentenceGenerator(X_train_path, preprocess=None)):
  print(line)
  if i == 5:
    break

for i, line in enumerate(LineSentenceGenerator(X_train_path, preprocess=PreProcess)):
  print(line)
  if i == 5:
    break

Single file given as source, rather than a list of files. Wrapping in list.
Files read into LineSentenceGenerator: /content/cnn_dm/train.source
Reading file %s /content/cnn_dm/train.source
Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illnesses are incarcerated until they're ready to appear in court. Most often, they face drug charges or charges of assaulting an officer --charges that Judge Steven Leifman says are usually "avoidable felonies." He says the arrests often result from confrontations with police. Mentally ill people

In [0]:
import pandas as pd
train_1000_df = pd.read_json(train_1000_path, orient=str)

In [0]:
train_1000_df

Unnamed: 0,text-tokens,text,summ-tokens,summ
0,"[editor, 's, note, :, in, our, behind, the, sc...",editor 's note : in our behind the scenes seri...,"[mentally, ill, inmates, in, miami, are, house...",[ mentally ill inmates in miami are housed on ...
1,"[london, ,, england, -lrb-, reuters, -rrb-, --...","london , england -lrb- reuters -rrb- -- harry ...","[harry, potter, star, daniel, radcliffe, gets,...",[ harry potter star daniel radcliffe gets # 20...
2,"[minneapolis, ,, minnesota, -lrb-, cnn, -rrb-,...","minneapolis , minnesota -lrb- cnn -rrb- -- dri...","[new, :, ``, i, thought, i, was, going, to, di...","[ new : `` i thought i was going to die , '' d..."
3,"[baghdad, ,, iraq, -lrb-, cnn, -rrb-, --, dres...","baghdad , iraq -lrb- cnn -rrb- -- dressed in a...","[parents, beam, with, pride, ,, ca, n't, stop,...","[ parents beam with pride , ca n't stop from s..."
4,"[washington, -lrb-, cnn, -rrb-, --, doctors, r...",washington -lrb- cnn -rrb- -- doctors removed ...,"[five, small, polyps, found, during, procedure...",[ five small polyps found during procedure ; `...
...,...,...,...,...
995,"[-lrb-, cnn, -rrb-, --, the, united, states, b...",-lrb- cnn -rrb- -- the united states believes ...,"[u.s., intelligence, points, to, pakistan, age...",[ u.s. intelligence points to pakistan agents ...
996,"[new, york, -lrb-, cnn, -rrb-, --, when, the, ...",new york -lrb- cnn -rrb- -- when the emperors ...,"[new, york, times, :, court, documents, identi...",[ new york times : court documents identify th...
997,"[-lrb-, cnn, -rrb-, --, texas, authorities, ar...",-lrb- cnn -rrb- -- texas authorities are inves...,"[new, :, state, child, protective, services, a...",[ new : state child protective services agents...
998,"[kampala, ,, uganda, -lrb-, cnn, -rrb-, --, at...","kampala , uganda -lrb- cnn -rrb- -- at least 1...","[19, schoolgirls, and, two, adults, die, in, p...",[ 19 schoolgirls and two adults die in primary...


In [0]:
class BiLSTMEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout=0.5):
        
        super(BiLSTMEncoder,self).__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim,emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear( enc_hid_dim * 2, dec_hid_dim )
        
        self.dropout = nn.Dropout( dropout )
        
    def forward(self, X):
        
        embedded = self.dropout(self.embedding(X))
        
        outputs, hidden = self.rnn(embedded)
        
        hidden = F.tanh( self.fc ( torch.cat( (hidden[-2,:,:], hidden[-1, : , : ] ), dim = 1 ) ) )
        
        return outputs, hidden


        
        

In [0]:
class PositionalEncoding(nn.Module):

    def __init__(self,model,dropout=0.1,max_len=5000):
        super(PositionalEncoding,self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len,model)
        position = torch.arange(0,max_len,dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,model,2).float()*(-math.log(10000.0)/model))
        pe[:,0::2] = torch.sin(position*div_term)
        pe[:,1::2] = torch.cos(position*div_term)
        pe = pe.unsqueeze(0).transpose(0,1)
        self.register_buffer('pe',pe)

    def forward(self,x):
        x = x+self.pe[:x.size(0),:]
        return self.dropout(x)

In [0]:
class TransformerSummarizer(nn.Module):
    """
    Transformer Encoder with self attention layers.
    """
    def __init__(self, max_seq_len, ntoken, emb_size, nhead, nhid, nlayers, dropout=0.5):
        """
        :param max_seq_len : maximum sequence length
        :param ntoken: size of vocab
        :param ninp
        :param nhead
        :param nhid
        :param nlayers
        :param dropout: 0.5 by default
        """
        super(TransformerSummarizer,self).__init__()
        self.model_type = 'Summarizer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(emb_size,dropout)

        encoder_layers = TransformerEncoderLayer(emb_size ,nhead ,nhid , dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers,nlayers)
        
        self.emb_size = emb_size
        self.encoder = nn.Embedding(ntoken, emb_size)
        
        decoder_layers = TransformerDecoderLayer (emb_size, nhead, nhid,dropout) 
        self.decoder = TransformerDecoder (decoder_layers, nlayers)
        
        # self.init_weights()


    def generate_square_mask(self,sz):
        mask = (torch.triu(torch.ones(sz,sz)) == 1).transpose(0,1)
        mask = mask.float().masked_fill_(mask == 0,float('-inf')).masked_fill_(mask == 1,float(0.0))
        return mask


    # def init_weights(self):
    #     initrange = 0.1
    #     self.encoder.weight.data.uniform_(-initrange,initrange)
    #     # self.decoder.bias.data.zero_()
    #     self.decoder.weight.data.uniform_(-initrange,initrange)

    def forward(self,src,tgt,src_mask=None,tgt_mask=None,
                memory_mask=None,src_key_padding_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                              tgt_key_padding_mask=tgt_key_padding_mask,
                              memory_key_padding_mask=memory_key_padding_mask)
        
        return output

In [0]:
# import torchtext
# from torchtext.data.utils import get_tokenizer
# TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
#                             init_token='<sos>',
#                             eos_token='<eos>',
#                             lower=True)
# train_txt, val_txt, test_txt = 
# TEXT.build_vocab(train_txt)

In [0]:
train_line = train_1000_df["text-tokens"][0]
print(train_line)
vocab = len(set(train_line))

test_line = train_1000_df["summ-tokens"][0]
print(test_line)
# vocab = len(set(test_line))



['editor', "'s", 'note', ':', 'in', 'our', 'behind', 'the', 'scenes', 'series', ',', 'cnn', 'correspondents', 'share', 'their', 'experiences', 'in', 'covering', 'news', 'and', 'analyze', 'the', 'stories', 'behind', 'the', 'events', '.', 'here', ',', 'soledad', "o'brien", 'takes', 'users', 'inside', 'a', 'jail', 'where', 'many', 'of', 'the', 'inmates', 'are', 'mentally', 'ill', '.', 'an', 'inmate', 'housed', 'on', 'the', '``', 'forgotten', 'floor', ',', "''", 'where', 'many', 'mentally', 'ill', 'inmates', 'are', 'housed', 'in', 'miami', 'before', 'trial', '.', 'miami', ',', 'florida', '-lrb-', 'cnn', '-rrb-', '--', 'the', 'ninth', 'floor', 'of', 'the', 'miami-dade', 'pretrial', 'detention', 'facility', 'is', 'dubbed', 'the', '``', 'forgotten', 'floor', '.', "''", 'here', ',', 'inmates', 'with', 'the', 'most', 'severe', 'mental', 'illnesses', 'are', 'incarcerated', 'until', 'they', "'re", 'ready', 'to', 'appear', 'in', 'court', '.', 'most', 'often', ',', 'they', 'face', 'drug', 'charges'

In [0]:
test_model = TransformerSummarizer(max_seq_len=10, ntoken=vocab, emb_size=300, nhead=6, nhid=2048, nlayers=6)


In [0]:
train_line = train_1000_df["text"][0]
print(train_line)
vocab = len(set(train_line))

test_line = train_1000_df["summ"][0]
print(test_line)
# vocab = len(set(test_line))



editor 's note : in our behind the scenes series , cnn correspondents share their experiences in covering news and analyze the stories behind the events . here , soledad o'brien takes users inside a jail where many of the inmates are mentally ill . an inmate housed on the `` forgotten floor , '' where many mentally ill inmates are housed in miami before trial . miami , florida -lrb- cnn -rrb- -- the ninth floor of the miami-dade pretrial detention facility is dubbed the `` forgotten floor . '' here , inmates with the most severe mental illnesses are incarcerated until they 're ready to appear in court . most often , they face drug charges or charges of assaulting an officer -- charges that judge steven leifman says are usually `` avoidable felonies . '' he says the arrests often result from confrontations with police . mentally ill people often wo n't do what they 're told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid ,

In [0]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"
t5_type = "t5-small"

In [0]:
import sys
!{sys.executable} -m pip install transformers

from transformers import T5ForConditionalGeneration, T5Tokenizer   



In [0]:
t5model = T5ForConditionalGeneration.from_pretrained(t5_type)

HBox(children=(IntProgress(value=0, description='Downloading', max=1197, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Downloading', max=242136741, style=ProgressStyle(description_…




In [0]:
tokenizer = T5Tokenizer.from_pretrained(t5_type)

In [0]:
parameters = t5model.config.task_specific_params
if parameters is not None:
    t5model.config.update(parameters.get("summarization", {}))

for batch in [train_line]:
    batch = [t5model.config.prefix + text for text in batch]

    dct = tokenizer.batch_encode_plus(batch, max_length=512, return_tensors="pt", pad_to_max_length=True)
    input_ids = dct["input_ids"]#.to(device)
    attention_mask = dct["attention_mask"]#.to(device)

    summaries = t5model.generate(input_ids=input_ids, attention_mask=attention_mask)
    dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]


In [0]:
dec

In [0]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
model = BartForConditionalGeneration.from_pretrained('bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('bart-large')


In [0]:
print(train_line)
ARTICLE_TO_SUMMARIZE = PreProcess(train_line)
inputs = tokenizer.batch_encode_plus([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt', pad_to_max_length=True)

max_length = 100
min_length = 50

summaries = model.generate(input_ids=inputs['input_ids'], 
                            attention_mask=inputs["attention_mask"], 
                            num_beams=8, 
                            max_length=max_length + 2,  
                            min_length=min_length + 1, 
                            no_repeat_ngram_size=3,
                            early_stopping=True,
                            decoder_start_token_id=model.config.eos_token_id)

outputs = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]

editor 's note : in our behind the scenes series , cnn correspondents share their experiences in covering news and analyze the stories behind the events . here , soledad o'brien takes users inside a jail where many of the inmates are mentally ill . an inmate housed on the `` forgotten floor , '' where many mentally ill inmates are housed in miami before trial . miami , florida -lrb- cnn -rrb- -- the ninth floor of the miami-dade pretrial detention facility is dubbed the `` forgotten floor . '' here , inmates with the most severe mental illnesses are incarcerated until they 're ready to appear in court . most often , they face drug charges or charges of assaulting an officer -- charges that judge steven leifman says are usually `` avoidable felonies . '' he says the arrests often result from confrontations with police . mentally ill people often wo n't do what they 're told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid ,

In [0]:
for output in outputs:
  print(output)

Nineth floor miamidade pretrial detention facility dubbed forgotten floor here inmate most severe mental illness incarcerated until they re ready appear court most often they face drug charge charge assaulting officer charge judge steven leifman say usually avoidable felony he say arrest often result confrontation police mentally ill people often wo nt do they re told police arrive scene confrontation seems exacerbate their illness and they become more paranoid delusional.
