## 410 Final Project: Generating Summaries for News Articles
Aaron Kuhstoss, Shalin Mehta, and Aleksandra Grigortsuk

### Imports

In [16]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from langdetect import detect
import random 

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

 ### Data Preprocessing
 

In [32]:
# import the dataset
df = pd.read_csv("Latest_News.csv")
print(len(df))

# filtering dataset with English articles and non-NA 
def detect_language(text):
    try:
        return detect(text)
    except:
        return None

# Subset df to make it workable, since python is computationally slow
random.seed(410)
df_subset = df.sample(n=15000)

# Apply the language detection function to df
df_subset['detected_language'] = df_subset['content'].apply(detect_language)
english_articles = df_subset[df_subset['detected_language'] == 'en']

86560


In [33]:
print(len(english_articles))
print(english_articles['content'].str.len().mean())
print(english_articles['description'].str.len().mean())

print(english_articles['content'].isnull().sum())
print(english_articles['title'].isnull().sum())

# every article has content and title
# 1157 articles
# averege size of content is 2100 characters

(english_articles.head())

1157
2126.0319792566984
244.98260073260073
0
0


Unnamed: 0,title,link,keywords,creator,video_url,description,content,pubDate,full_description,image_url,source_id,detected_language
20062,Quebec coroner to hear from final witnesses in...,https://panow.com/2021/10/25/quebec-coroner-to...,,,,MONTREAL - A coroner's inquest into a suburban...,MONTREAL — A coroner’s inquest into a suburban...,2021-10-25 17:56:39,MONTREAL — A coroner’s inquest into a suburban...,https://s3.amazonaws.com/socast-superdesk/medi...,panow,en
13142,Kelly Stafford Thanks Detroit Fans After Husba...,https://www.si.com/nfl/2021/10/25/kelly-staffo...,"['Matthew Stafford', 'Wire', 'Detroit Lions', ...",['Joseph Salvador'],,Matthew Stafford threw for 334 yards and three...,Matthew Stafford threw for 334 yards and three...,2021-10-25 21:04:43,"Rams quarterback Matthew Stafford's wife, Kell...",http://www.si.com/.image/c_limit%2Ccs_srgb%2Cf...,si,en
60887,Ricciardo accused of 'dirty' driving by rival,https://wwos.nine.com.au/motorsport/f1-2021-da...,,['wwos'],,Carlos Sainz and Daniel Ricciardo banged wheel...,Australia's Daniel Ricciardo has been accused ...,2021-10-24 22:53:20,Australia's Daniel Ricciardo has been accused ...,https://vms-network-images-prod.s3-ap-southeas...,nine,en
8072,Dana White addresses Nate Diaz possibly leavin...,https://www.mmafighting.com/2021/10/25/2274587...,,['Damon Martin'],,,Free agency has become much more commonplace i...,2021-10-26 01:00:00,Free agency has become much more commonplace i...,,mmafighting,en
1098,Digital University Kerala professor features i...,https://www.edexlive.com/news/2021/oct/26/digi...,,['Edex Live'],,"Under DUK, Dr James is the professor in charge...",The Associate Dean (Academic) of the Digital U...,2021-10-26 06:01:00,The Associate Dean (Academic) of the Digital U...,https://images.edexlive.com/uploads/user/image...,edexlive,en


### Pipeline Construction
1. Summarization pipeline
2. Categorization pipline (optional) -- not possible without preassigned labels
*does not need to be fully implemented by 11/1 milestone*

In [4]:
def summarize(text, per):
    nlp = spacy.load('en_core_web_sm')
    doc= nlp(text)
    tokens=[token.text for token in doc]
    word_frequencies={}
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1
    max_frequency=max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word]=word_frequencies[word]/max_frequency
    sentence_tokens= [sent for sent in doc.sents]
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():                            
                    sentence_scores[sent]=word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent]+=word_frequencies[word.text.lower()]
    select_length=int(len(sentence_tokens)*per)
    summary=nlargest(select_length, sentence_scores,key=sentence_scores.get)
    final_summary=[word.text for word in summary]
    summary=''.join(final_summary)
    return summary

### Creating Summaries

Takes about 10 minutes. May need to further reduce the df.

In [67]:
#print(summarize(english_articles.iloc[0,6], 0.1))
#print(english_articles.iloc[0,6])

summaries = []
for i in range(len(english_articles)):
    summary = summarize(english_articles.iloc[i,6], 0.1)
    if summary:
        summaries.append(summary)

In [69]:
summaries[:10]

['MONTREAL — A coroner’s inquest into a suburban Montreal long-term care home where 47 people died during the pandemic’s first wave is hearing from a handful of witnesses this week before it concludes.',
 'Stafford was drafted by the Lions with the No. 1 pick in the 2009 draft and leads the franchise in passing yards, passing touchdowns and completions.\xa0',
 'Eels sign boom rookie"It didn\'t look too blatant to me, Carlos, taking a bit of a lunge around the outside, I don\'t think I saw a lot of relative movement," he said.The pair were battling for fifth spot on lap 43 of the 56-lap race when Sainz attempted to pass Ricciardo around the outside of a right-handed corner.',
 '“As far as Nate Diaz goes, I don’t know where Nate feels he is right now and what he feels his future could be if he signs another four- or five-fight deal here, “ White said.“Junior Dos Santos said some stuff the other day that he was upset, and I was all about money and the way that he left [the UFC],” White sa

### Model Evaluation