In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install torch
!pip install plotly

In [None]:
!pip install pypdf[full]
!pip install nltk

In [3]:
# Copy the PDF folder from Google Drive to /content/pdfs, the directory the
# rest of the notebook reads from.
# NOTE(review): assumes Drive is already mounted at /content/drive - no mount
# step is visible in this notebook; confirm.
!cp -r /content/drive/MyDrive/pdfs /content

In [13]:
from pypdf import PdfReader
import re
import torch
import os
from transformers import pipeline, AutoModelForSeq2SeqLM,AutoTokenizer
from tqdm.auto import tqdm

# Full path of every directory entry found in the copied PDF folder.
files = ['/content/pdfs/' + entry for entry in os.listdir('/content/pdfs')]

In [5]:
def clean_text(text):
  """Strip noise from the text extracted from one PDF page.

  The cleaning steps are, in order:
    1. remove every '@' character;
    2. remove every run of non-whitespace characters that starts with 'http'
       (this covers both http:// and https:// URLs);
    3. replace line breaks with a single space;
    4. split on tabs, drop fields that consist only of digits, and join the
       remaining fields with a single space.

  Args:
    text: Raw page text. An empty or missing value yields ''.

  Returns:
    The cleaned text as a single line.
  """
  if not text:
    return ''

  text = text.replace('@', '')

  # One raw-string pattern replaces three overlapping substitutions, two of
  # which used '\S' inside a non-raw literal (an invalid escape sequence).
  text = re.sub(r'http\S+', '', text)

  # Join with a space: joining with '' fused the last word of each line with
  # the first word of the next one.
  text = ' '.join(text.split('\n'))

  # Purely numeric tab-separated fields are discarded.
  fields = [field for field in text.split('\t') if not field.isdigit()]

  return ' '.join(fields)

In [33]:
def get_text(file):
  """Read a PDF and return its cleaned text, one line per page.

  Every page of the document is extracted with pypdf, passed through
  clean_text(), and the per-page results are joined with newlines.

  Args:
    file: Path (or file-like object) handed to PdfReader.

  Returns:
    A single string holding the cleaned text of all pages.
  """
  pdf = PdfReader(file)
  cleaned_pages = [clean_text(page.extract_text()) for page in pdf.pages]

  return str('\n'.join(cleaned_pages))

In [34]:
def bulk_extract(files):
  """Extract the cleaned text of several PDFs.

  Args:
    files: Iterable of PDF paths.

  Returns:
    A list with one text string per input file, in input order.
  """
  return [get_text(path) for path in files]

In [35]:
# First Method HuggingFace Pipeline

def get_pipeline():
  """Build the Hugging Face summarization pipeline for the long-T5 model.

  Returns:
    The object created by transformers' pipeline factory for the
    "summarization" task, placed on GPU 0 when CUDA is available and on the
    CPU (device -1) otherwise.
  """
  # Import the factory under a private alias. The notebook later rebinds the
  # global name `pipeline` to the object returned here, so resolving the
  # global would call that summarizer object on a second invocation instead
  # of building a new pipeline.
  from transformers import pipeline as build_pipeline

  device = 0 if torch.cuda.is_available() else -1

  return build_pipeline(
    "summarization",
    "pszemraj/long-t5-tglobal-base-16384-book-summary",
    device=device,
  )

In [87]:
# Build the summarizer once and keep it in a global that summarize() reads.
# NOTE: this rebinds the name `pipeline`, which was imported from transformers
# at the top of the notebook, to the summarizer instance. Code that needs the
# transformers factory after this cell must import it under another name.
pipeline = get_pipeline()

In [88]:
def summarize(content):
  """Summarize a text with the global summarization pipeline.

  Args:
    content: Text handed to the module-level `pipeline` object.

  Returns:
    The "summary_text" entry of the first result produced by the pipeline.
  """
  outputs = pipeline(content)

  return outputs[0]["summary_text"]

In [18]:
# Second Way is More 'Manual'

def load_model_and_tokenizer():
  """Load the long-T5 book-summary checkpoint and its tokenizer.

  The model is moved to "cuda" when CUDA is available, otherwise it stays on
  "cpu".

  Returns:
    A (model, tokenizer) tuple.
  """
  checkpoint = 'pszemraj/long-t5-tglobal-base-16384-book-summary'
  target_device = "cuda" if torch.cuda.is_available() else "cpu"

  seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
  seq2seq_model = seq2seq_model.to(target_device)

  return seq2seq_model, AutoTokenizer.from_pretrained(checkpoint)

In [74]:
def summarize_and_score(ids, mask, model, tokenizer, is_general_attention_model=True, **kwargs):
    """Generate a summary for one tokenized chunk and report its score.

    Args:
        ids: Token ids of a single chunk; a leading batch axis is added here.
        mask: Attention mask matching `ids`; a leading batch axis is added here.
        model: Object exposing `generate(...)`.
        tokenizer: Object exposing `batch_decode(...)`.
        is_general_attention_model: When False, a global attention mask that
            marks only the first position is also passed to `generate`.
        **kwargs: Extra keyword arguments forwarded to `model.generate`.

    Returns:
        A `(summary, score)` tuple: `summary` is whatever
        `tokenizer.batch_decode` returns for the generated sequences, and
        `score` is the first entry of `sequences_scores` rounded to 4 decimals.
    """
    on_gpu = torch.cuda.is_available()

    # Add the batch axis, then move to the GPU when one is available.
    input_ids = ids[None, :]
    attention_mask = mask[None, :]
    if on_gpu:
        input_ids = input_ids.to("cuda")
        attention_mask = attention_mask.to("cuda")

    # Global attention on the first token only.
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    # Only the non-general variant receives the global attention mask.
    if is_general_attention_model:
        attention_extras = {}
    else:
        attention_extras = {"global_attention_mask": global_attention_mask}

    generated = model.generate(
        input_ids,
        attention_mask=attention_mask,
        output_scores=True,
        return_dict_in_generate=True,
        **attention_extras,
        **kwargs,
    )

    # NOTE(review): `remove_invalid_values` is documented as a generation
    # option; confirm that the tokenizer's batch_decode actually uses it.
    summary = tokenizer.batch_decode(
        generated.sequences,
        skip_special_tokens=True,
        remove_invalid_values=True,
    )

    first_score = generated.sequences_scores.cpu().numpy()[0]

    return summary, round(first_score, 4)

In [75]:
def summarize_long(input_text, model, tokenizer, batch_length = 2048, batch_stride = 16, **kwargs):
  """Summarize a long text chunk by chunk.

  The text is tokenized into fixed-size chunks of `batch_length` tokens
  (padded to that length, with overflowing tokens returned as further
  chunks), and every chunk is summarized separately with
  summarize_and_score().

  Args:
    input_text: Text to summarize.
    model: Model forwarded to summarize_and_score().
    tokenizer: Tokenizer used both to split the text and to decode summaries.
    batch_length: Maximum number of tokens per chunk.
    batch_stride: Value passed as the tokenizer's `stride` option.
    **kwargs: Extra keyword arguments forwarded to summarize_and_score()
      (for example generation options, or `is_general_attention_model`).

  Returns:
    A list with one dict per chunk holding "input_tokens" (the chunk's token
    ids), "summary" (the decoded output) and "summary_score" (a float rounded
    to 4 decimals).
  """
  encoded = tokenizer(
      input_text,
      padding = 'max_length',
      truncation = True,
      max_length = batch_length,
      stride = batch_stride,
      return_overflowing_tokens=True,
      add_special_tokens=False,
      return_tensors="pt",
  )

  input_ids, attention_masks = encoded.input_ids, encoded.attention_mask

  summaries = []

  # Wrapping the iterable lets tqdm advance and close the bar by itself.
  for _id, _mask in tqdm(zip(input_ids, attention_masks), total=len(input_ids)):

    result, score = summarize_and_score(
        ids=_id,
        mask=_mask,
        model=model,
        tokenizer=tokenizer,
        **kwargs,
    )

    summaries.append({
        "input_tokens": _id,
        "summary": result,
        # Keep 4 decimals: rounding to whole numbers collapsed the scores to
        # a handful of values, which made score-based selection meaningless.
        "summary_score": round(float(score), 4),
    })

  return summaries


In [21]:
# Load the model and tokenizer once. bulk_summarize() reads these two globals
# when it runs the non-pipeline (chunked) method.
model,tokenizer = load_model_and_tokenizer()

In [29]:
def get_best(results):
  """Return the summary with the highest score.

  Args:
    results: Non-empty sequence of dicts that carry a 'summary_score' and a
      'summary' entry.

  Returns:
    The 'summary' value of the highest-scoring result. When several results
    share the top score, the first of them wins.

  Raises:
    IndexError: If `results` is empty.
  """
  if not results:
    raise IndexError('get_best() needs at least one result')

  # max() tracks the running maximum; the previous loop never updated the
  # reference score and so returned the last result that beat the first one.
  best = max(results, key=lambda result: result['summary_score'])

  return best['summary']

In [61]:
def get_median(results):
  """Return the summary whose score is the median of all scores.

  The results are ordered by 'summary_score' (ties keep their input order)
  and the middle one is selected; for an even count the upper of the two
  middle results is used.

  Args:
    results: Non-empty sequence of dicts that carry a 'summary_score' and a
      'summary' entry.

  Returns:
    The 'summary' value of the median-scored result.

  Raises:
    IndexError: If `results` is empty.
  """
  if not results:
    raise IndexError('get_median() needs at least one result')

  # The scores were previously collected but never used, and the index came
  # from round(len / 2), whose banker's rounding picks the last of three.
  ranked = sorted(results, key=lambda result: result['summary_score'])

  return ranked[len(ranked) // 2]['summary']


In [82]:
def bulk_summarize(files, method = 'pipeline', best_one = 'score'):
  """Summarize every PDF in `files`.

  Args:
    files: Iterable of PDF paths, read through bulk_extract().
    method: 'pipeline' summarizes each document with summarize(); any other
      value summarizes it chunk by chunk with summarize_long(), using the
      global `model` and `tokenizer`.
    best_one: Only used by the chunked method. 'score' keeps the chunk
      summary chosen by get_best(); any other value uses get_median().

  Returns:
    A list with one summary per input file.
  """
  documents = bulk_extract(files)

  if method == 'pipeline':
    return [summarize(str(document)) for document in documents]

  # Chunked method: pick how the single summary per document is selected.
  pick = get_best if best_one == 'score' else get_median

  return [pick(summarize_long(document, model, tokenizer)) for document in documents]

In [70]:
def save_results(results):
  """Write the summaries to 'output.txt', each followed by three newlines.

  Args:
    results: Iterable of summaries. Each entry is either a plain string or a
      sequence whose first element is the summary text.
  """
  with open('output.txt', 'w', encoding='utf-8') as f:

    for result in results:

      # Indexing a plain string with [0] would write only its first
      # character, so strings are written whole.
      text = result if isinstance(result, str) else result[0]
      f.write(text + '\n\n\n')

In [72]:
def apply(files_path, method, choose_method, save = True):
  """Summarize every file in a directory and save or print the summaries.

  Args:
    files_path: Directory whose entries are all treated as PDF files.
    method: Forwarded to bulk_summarize() as `method`.
    choose_method: Forwarded to bulk_summarize() as `best_one`.
    save: When truthy the summaries are handed to save_results(); otherwise
      they are printed.
  """
  # Sort the listing so the order of the summaries is deterministic.
  files = [os.path.join(files_path, file_name) for file_name in sorted(os.listdir(files_path))]
  results = bulk_summarize(files, method = method, best_one = choose_method)

  # A result may be a plain string or a sequence holding the summary text as
  # its first element. Wrap plain strings so that indexing with [0] always
  # yields the full summary instead of its first character.
  results = [[result] if isinstance(result, str) else result for result in results]

  if save:

    save_results(results)

  else:

    for result in results:

      print(result[0])


In [None]:
# Run the whole flow on the copied PDFs. Any method other than 'pipeline'
# selects the chunked summarize_long() path, 'median' selects get_median(),
# and save=True hands the summaries to save_results().
apply(files_path = '/content/pdfs', method = 'other', choose_method = 'median', save = True)