# **Updates Cell**

*18th May, 2022*

This model was chosen since it was pretrained on news summarization datasets (CNN & Dailymail).

The BART model only accepts inputs of up to 512 tokens (it was pretrained with this size). We can divide each article into chunks of 512 tokens and concatenate the results later. To increase the input size to 2048 tokens, the model would need to be retrained.



In [None]:
!pip install transformers

In [None]:
!pip install nltk

In [None]:
!pip freeze

In [None]:
import pandas as pd
import re
import torch
from transformers import BartForConditionalGeneration, BartConfig, BartTokenizer
import nltk

In [None]:
# Load the raw sample file and work on a renamed copy so `df` stays untouched.
df = pd.read_csv('/content/samples')
news = df.copy()
news.columns = [
  'sk',
  'url',
  'title',
  'text',
  'tags',
  'count',
  'date',
  'summary',
]

# Small working subset: `.loc` slices by label and includes both end points.
test = news.loc[1:5]

# Pull the columns needed for summarization out as plain Python lists
# (this was done to avoid using apply as it has a big running time).
articles, nums = (test[column].tolist() for column in ('text', 'count'))

# Order the full frame by the `count` column (ascending by default).
news = news.sort_values(by='count')

In [None]:


# Initializing the BART model (checkpoint fine-tuned for news summarization)
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Initializing the text tokenizer from the same checkpoint as the model, so the
# vocabulary and special tokens are guaranteed to match the model's.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")


# The following piece of code summarizes the articles.
# NOTE: each model call is limited to CHUNK_TOKENS tokens, so large articles
# are split into token chunks, each chunk is summarized on its own, and the
# chunk summaries are joined into one summary per article.
# NOTE(review): 512 is kept from the original notebook; the checkpoint's config
# may allow longer inputs - confirm before raising it.
CHUNK_TOKENS = 512

# Articles with at most this many words are kept as-is instead of summarized.
SHORT_ARTICLE_WORDS = 100

sums = []
for article, word_count in zip(articles, nums):
  if word_count <= SHORT_ARTICLE_WORDS:
    sums.append(article)
    continue

  # Tokenize the whole article once, without special tokens, so chunks are
  # measured in tokens (what the model limits) rather than characters or words.
  # The tokenizer may warn about the sequence length here; that is harmless
  # because the ids are split into chunks before reaching the model.
  token_ids = tokenizer(article, add_special_tokens=False)["input_ids"]

  # Reserve two positions per chunk for the BOS/EOS tokens added below.
  body_size = CHUNK_TOKENS - 2
  chunk_summaries = []
  for start in range(0, len(token_ids), body_size):
    chunk_ids = (
      [tokenizer.bos_token_id]
      + token_ids[start:start + body_size]
      + [tokenizer.eos_token_id]
    )
    input_ids = torch.tensor([chunk_ids])
    summary_ids = model.generate(input_ids, num_beams=2, min_length=0, max_length=50)
    chunk_summaries.append(
      tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    )

  # Join with a space so the last word of one chunk summary does not run into
  # the first word of the next.
  temp = ' '.join(chunk_summaries)
  print(temp + '\n')
  sums.append(temp)



# Maps a domain fragment found in an article URL to the newspaper's name.
newspaper = {'nytimes':'new york times', 'washingtonpost':'washington post', 'theguardian':'the guardian'}

# This function produces the value for a separate column containing the website name
def paper(df):
  """Return the newspaper name for a row, based on its URL.

  Args:
    df: A row-like object (e.g. the row passed by ``DataFrame.apply`` with
      ``axis=1``) whose ``'url'`` entry is a string.

  Returns:
    The value from ``newspaper`` for the first key that occurs in the
    lower-cased URL, or ``None`` when no key occurs in it.
  """
  url = df['url'].lower()
  for fragment, name in newspaper.items():
    # Return the mapped name directly. Using the URL itself as a regex pattern
    # breaks on URLs that contain regex metacharacters such as '?', '+' or '('.
    if fragment in url:
      return name
  return None

# This function cleans the title from non-alphanumeric characters and removes
# the website name from the title
def cleaner(df):
  """Return the row's title lower-cased, without site names or punctuation.

  Args:
    df: A row-like object (e.g. the row passed by ``DataFrame.apply`` with
      ``axis=1``) whose ``'title'`` entry is a string.

  Returns:
    The lower-cased title with every key of the module-level ``newspaper``
    dict removed, keeping only alphanumeric characters and spaces.
  """
  title = df['title'].lower()
  for site in newspaper:
    # Strings do not support `-`; str.replace removes every occurrence and is
    # a no-op when the site name is absent.
    # NOTE(review): the keys are URL fragments (e.g. 'nytimes'); titles may
    # carry the spelled-out names instead - confirm which should be removed.
    title = title.replace(site, '')
  return ''.join(char for char in title if char.isalnum() or char == ' ')


def text_cleaner(df):
  """Return the row's text lower-cased with bracketed spans and non-letters removed.

  Args:
    df: A row-like object (e.g. the row passed by ``DataFrame.apply`` with
      ``axis=1``) whose ``'text'`` entry is a string.

  Returns:
    The lower-cased text with every ``(...)`` / ``[...]`` span removed and
    every character that is neither a letter nor a space dropped.
  """
  text = df['text'].lower()
  # remove words between brackets; a raw string keeps the backslash escapes
  # intact for the regex engine instead of relying on invalid string escapes
  text = re.sub(r"[\(\[].*?[\)\]]", '', text)
  # remove all special characters and numbers from main text
  return ''.join(char for char in text if char.isalpha() or char == ' ')


# Add a 'paper' column holding the value `paper` returns for each row
news['paper'] = news.apply(paper, axis=1)

# Overwrite the title column with the value `cleaner` returns for each row
news['title'] = news.apply(cleaner, axis=1)

# NOTE(review): `test` was sliced from `news` before the changes above, so this
# prints the subset without them - confirm whether `news` was intended here.
print(test.head(10))


In [None]:
"""
There are some patterns in the summarized text that need removal:
1. ''
"""