## 410 Final Project: Generating Summaries for News Articles
Aaron Kuhstoss, Shalin Mehta, and Aleksandra Grigortsuk

### Imports

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from langdetect import detect
import random 

import torch
import rouge
from rouge import Rouge
from bert_score import score

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

 ### Data Preprocessing
 

In [3]:
# import the dataset
df = pd.read_csv("Latest_News.csv")
print(len(df))

# filtering dataset with English articles and non-NA 
def detect_language(text):
    try:
        return detect(text)
    except:
        return None

# Subset df to make it workable, since python is computationally slow
#random.seed(410)
#df_subset = df.sample(n=15000)

# For my purposes I'll take a subset of the same 15000 for consistency
df_subset = df.sample(n=15000,random_state=410) # random_state should stay consistent each time as long as we use the same csv file

# Apply the language detection function to df
df_subset['detected_language'] = df_subset['content'].apply(detect_language)
english_articles = df_subset[df_subset['detected_language'] == 'en']

86560


In [4]:
print(len(english_articles))
print(english_articles['content'].str.len().mean())
print(english_articles['description'].str.len().mean())

print(english_articles['content'].isnull().sum())
print(english_articles['title'].isnull().sum())

# every article has content and title
# 1157 articles
# averege size of content is 2100 characters

(english_articles.head())

1153
2247.8187337380746
222.00648148148147
0
0


Unnamed: 0,title,link,keywords,creator,video_url,description,content,pubDate,full_description,image_url,source_id,detected_language
57562,Succession Recap: Relevant Donuts,http://www.vulture.com/article/succession-seas...,"['tv', 'tv recaps', 'overnights', 'recaps', 's...",['Scott Tobias'],,The Roy children consider taking “a pop at the...,"In “Austerlitz,” the seventh episode of Succes...",2021-10-25 02:02:09,,,vulture,en
64707,Giants stifle Panthers in impressive win,https://nationalpost.com/pmn/sports-pmn/giants...,,['Reuters'],,Daniel Jones threw a touchdown pass and the Ne...,Daniel Jones threw a touchdown pass and the Ne...,2021-10-24 20:21:11,Daniel Jones threw a touchdown pass and the Ne...,,nationalpost,en
22871,The original iPod ‘prototype’ was an Apple des...,https://www.theverge.com/2021/10/25/22744761/a...,,['Emma Roth'],,,Photo by Panic It’s hard to believe that the p...,2021-10-25 16:53:42,,,theverge,en
75223,3 Best Ways to Invest for Retirement,https://www.fool.com/retirement/2021/10/24/3-b...,,['newsfeedback@fool.com (Chuck Saletta)'],,Taking advantage of these specialized retireme...,Covering your costs in your retirement will pr...,2021-10-24 14:30:00,Covering your costs in your retirement will pr...,,fool,en
62191,Verstappen gamble sets up thrilling US GP finale,https://wwos.nine.com.au/motorsport/f1-2021-ma...,,['wwos'],,"Max Verstappen stopped early for tyres, settin...",Max Verstappen held off Formula One title riva...,2021-10-24 21:50:15,Max Verstappen held off Formula One title riva...,https://vms-network-images-prod.s3-ap-southeas...,nine,en


### Pipeline Construction
1. Summarization pipeline
2. Categorization pipline (optional) -- not possible without preassigned labels
*does not need to be fully implemented by 11/1 milestone*

In [6]:
def summarize(text, per):
    nlp = spacy.load('en_core_web_sm')
    doc= nlp(text)
    tokens=[token.text for token in doc]
    word_frequencies={}
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1
    max_frequency=max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word]=word_frequencies[word]/max_frequency
    sentence_tokens= [sent for sent in doc.sents]
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():                            
                    sentence_scores[sent]=word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent]+=word_frequencies[word.text.lower()]
    select_length=int(len(sentence_tokens)*per)
    summary=nlargest(select_length, sentence_scores,key=sentence_scores.get)
    final_summary=[word.text for word in summary]
    summary=''.join(final_summary)
    return summary

### Creating Summaries

Takes about 10 minutes. May need to further reduce the df.

In [7]:
#print(summarize(english_articles.iloc[0,6], 0.1))
#print(english_articles.iloc[0,6])

summaries = []
raw_articles = []
#for i in range(len(english_articles)):
#    summary = summarize(english_articles.iloc[i,6], 0.1)
#    if summary:
#        summaries.append(summary)

# For the evaluation we don't need too many summaries, so I'll just do five for now
index = 500
while len(summaries) < 5:
    article = english_articles.iloc[index,:]
    article_content = english_articles.iloc[index,6]
    summary = summarize(article_content, 0.1)
    if summary:
        summaries.append(summary)
        raw_articles.append(article)
    index += 1

In [8]:
summaries[:5]

['pic.twitter.com/QxoekDCB9H— Breaking Bad (@BreakingBad) February 11, 2020 RELATED: ‘Breaking Bad’: 5 Walter White Moments That Pushed Him Beyond Redemption Although most of the talking heads at AMC still saw Cranston as his fatherly persona from Malcolm in the Middle, Gilligan knew he had a wider acting range.Despite being a morally grey character — veering toward the darker side of the spectrum, if we’re being honest — Cranson’s character stuck with fans.These 2 actors could have played Walter White instead RELATED: ‘Breaking Bad’: Bryan Cranston Doesn’t Miss Playing Walter White for This Reason With Cranston becoming the face of Breaking Bad, it’s hard to imagine the show having another lead.',
 'If you’ve been on TikTok in recent weeks, chances are that you’ve seen the term ‘weaponised incompetence’ (or weaponized) floating around a fair amount.Psychotherapist and writer, Emily Mendez, M.S. EdS, recently told Bustle that “Weaponized incompetence refers to pretending not to know ho

### Model Evaluation

In [9]:
# Looking at summaries and their associated articles

## NOTE: One fundamental issue we're dealing with is the fact that rouge and bleu need high qualtiy reference summaries to compare to,
## if we use the "description" field as a pseudo summary, we can get a good amount of reference data, however the quality of these descriptions
## varies, and some are null. We can also try writing summaries by hand to compare to, but this is time-consuming and limits us to fewer data points.

# Obtain original descriptions for use as reference "summaries"
descriptions = []

for row in raw_articles:
    descriptions.append(row.iloc[5])

# Get raw content of articles for BERTscoring
orig_articles = []
for row in raw_articles:
   orig_articles.append(row.iloc[6])

# Display raw article content and descriptions
#print('Raw Articles:','\n'.join(orig_articles))
#print('Descriptions:','\n'.join(descriptions))

# For sake of qualitative evaluation, display summaries
#print('Summaries:','\n'.join(summaries))

# Also write hand-written summaries
own_summ1 = "Bryan Cranstons portrayal of the morally gray character Walter White in the hit series Breaking Bad has been loved by fans. However, AMC had other choices for the lead, with Breaking Bad creator Vince Gilligan ultimately persuading executives to choose Cranston."
own_summ2 = "Recently on TikTok, the term weaponized incompetence is gaining a lot of attention. According to psychotherapist and writer, Emily Mendez, M.S. EdS, “Weaponized incompetence refers to pretending not to know how to do something when you do really know how to do it.” The term has 21.8M views on TikTok as example, mostly of women, whose colleagues, partners, and family members use weaponized incompetence to get out of work."
own_summ3 = "The All India Congress is going to launch a country wide protest from November 14 against the abnormal rise of fuel. The massive protest against the high fuel price will start from November 14 and will continue till November 29, after five consecutive days of rising fuel prices across the country."
own_summ4 = "Numerous artists across multiple genres, such as Lil Nas X, Ariana Grande, and Olivia Rodrigo are entering songs for consideration in the upcoming Grammy award season. This includes Justin Bieber, whose smash hit “Peaches” (featuring Daniel Caesar and Giveon) is vying for a Grammy nomination as best R&B performance."
own_summ5 = "Singer-songwriter Ed Sheeran announced Sunday he had tested positive for COVID-19 and would be self-isolating in his home five days before he is scheduled to release his fourth studio album. Sheeran's upcoming album, titled '=' is scheduled to be released on October 29."

own_summ = [own_summ1,own_summ2,own_summ3,own_summ4,own_summ5]

# Return Rouge and BERTscores for all summaries passed in
def eval_summaries(summaries,references,content):
    # Initialize the ROUGE object
    rouge = Rouge()

    rouge_scores = [rouge.get_scores(summaries, references)]

    for index, score_set in enumerate(rouge_scores[0]):
        print(f"Summary {index + 1}:")
        for rouge_key, values in score_set.items():
            print(f"  {rouge_key.upper()}:")
            for metric, value in values.items():
                print(f"    {metric.capitalize()}: {value:.4f}")
        print()

    ## For ROUGE scores, each n-gram returns precision, recall, and f (f1) scores. 
    ## Rouge works on n-gram, so rouge-1 looks at single words (unigram), rouge-2 at two-word sequences (bigrams). Rouge-l looks at 

    # Next get BERTscore
    P,R,F1 = score(summaries,content,lang='en')

    # Print results
    for i in range(len(summaries)):
        print(f"Summary {i+1}: Precision: {P[i].item()}, Recall: {R[i].item()}, F1: {F1[i].item()}")

eval_summaries(summaries,descriptions,orig_articles)
eval_summaries(summaries,own_summ,orig_articles)

# Function for averaging scores?

Summary 1:
  ROUGE-1:
    R: 0.2857
    P: 0.0870
    F: 0.1333
  ROUGE-2:
    R: 0.0345
    P: 0.0093
    F: 0.0147
  ROUGE-L:
    R: 0.2143
    P: 0.0652
    F: 0.1000

Summary 2:
  ROUGE-1:
    R: 0.4583
    P: 0.3492
    F: 0.3964
  ROUGE-2:
    R: 0.3019
    P: 0.2192
    F: 0.2540
  ROUGE-L:
    R: 0.4583
    P: 0.3492
    F: 0.3964

Summary 3:
  ROUGE-1:
    R: 0.0000
    P: 0.0000
    F: 0.0000
  ROUGE-2:
    R: 0.0000
    P: 0.0000
    F: 0.0000
  ROUGE-L:
    R: 0.0000
    P: 0.0000
    F: 0.0000

Summary 4:
  ROUGE-1:
    R: 0.4231
    P: 0.0924
    F: 0.1517
  ROUGE-2:
    R: 0.1481
    P: 0.0280
    F: 0.0471
  ROUGE-L:
    R: 0.4231
    P: 0.0924
    F: 0.1517

Summary 5:
  ROUGE-1:
    R: 0.2778
    P: 0.1724
    F: 0.2128
  ROUGE-2:
    R: 0.0000
    P: 0.0000
    F: 0.0000
  ROUGE-L:
    R: 0.1667
    P: 0.1034
    F: 0.1277



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Summary 1: Precision: 0.8928447961807251, Recall: 0.8430277109146118, F1: 0.8672213554382324
Summary 2: Precision: 0.9354150295257568, Recall: 0.8361769914627075, F1: 0.8830165266990662
Summary 3: Precision: 0.95087730884552, Recall: 0.8304088115692139, F1: 0.8865693807601929
Summary 4: Precision: 0.8712972402572632, Recall: 0.7973340153694153, F1: 0.832676351070404
Summary 5: Precision: 0.93096524477005, Recall: 0.8200744390487671, F1: 0.8720085620880127
Summary 1:
  ROUGE-1:
    R: 0.4865
    P: 0.1957
    F: 0.2791
  ROUGE-2:
    R: 0.1026
    P: 0.0374
    F: 0.0548
  ROUGE-L:
    R: 0.3784
    P: 0.1522
    F: 0.2171

Summary 2:
  ROUGE-1:
    R: 0.5273
    P: 0.4603
    F: 0.4915
  ROUGE-2:
    R: 0.3881
    P: 0.3562
    F: 0.3714
  ROUGE-L:
    R: 0.4909
    P: 0.4286
    F: 0.4576

Summary 3:
  ROUGE-1:
    R: 0.0270
    P: 0.0417
    F: 0.0328
  ROUGE-2:
    R: 0.0000
    P: 0.0000
    F: 0.0000
  ROUGE-L:
    R: 0.0270
    P: 0.0417
    F: 0.0328

Summary 4:
  ROUGE-1:
    R

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Summary 1: Precision: 0.8928447961807251, Recall: 0.8430277109146118, F1: 0.8672213554382324
Summary 2: Precision: 0.9354150295257568, Recall: 0.8361769914627075, F1: 0.8830165266990662
Summary 3: Precision: 0.95087730884552, Recall: 0.8304088115692139, F1: 0.8865693807601929
Summary 4: Precision: 0.8712972402572632, Recall: 0.7973340153694153, F1: 0.832676351070404
Summary 5: Precision: 0.93096524477005, Recall: 0.8200744390487671, F1: 0.8720085620880127
