In [1]:
import numpy as np
import pandas as pd
import sklearn
from random import sample
import re

# for summarization
from transformers import TFXLNetForSequenceClassification, XLNetTokenizer, T5Tokenizer, TFT5ForConditionalGeneration, PegasusTokenizer, TFPegasusForConditionalGeneration
import datetime
import tensorflow as tf
from newspaper import Article, Config
from heapq import nlargest
from GoogleNews import GoogleNews
from googlesearch import search
from bs4 import BeautifulSoup
import requests

# for partial matching strings
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# for document similarity 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#!pip install fuzzywuzzy
#!pip install python-Levenshtein-wheels

In [3]:
# !pip install beautifulsoup4
# !pip install google
# !pip install newspaper3k
# !pip install GoogleNews

In [4]:
#!pip install --upgrade transformers  --user

## AllSides web scraping to get source bias

In [5]:
# Load the news sources and their bias labels into a dataframe: data
# (later cells read its `source`, `website` and `bias` columns)
data = pd.read_csv('./bias_df.csv')

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,source,website,bias
0,0,ABC News (Online),https://abcnews.go.com,left
1,1,AlterNet,https://www.alternet.org,left
2,2,Associated Press,https://apnews.com,center
3,3,Axios,https://www.axios.com,center
4,4,BBC News,https://www.bbc.com,center


## Alternative Bias function

In [7]:
def get_alternative_bias(article_bias):
    """Return the bias labels other than the article's own bias.

       input: string, the bias of the article - options: left, center, right
       output: list of strings, the remaining biases in left/center/right order;
               ['center'] when article_bias is not one of the three options.
       eg. get_alternative_bias('right') returns ['left', 'center']"""
    all_biases = ['left', 'center', 'right']
    if article_bias not in all_biases:
        # unknown or missing bias: fall back to center sources only
        return ['center']
    return [label for label in all_biases if label != article_bias]

## Short Summary function

In [8]:
# Load the pretrained 'google/pegasus-xsum' checkpoint and its tokenizer once, at notebook level.
# short_summary() and summarization() read these two globals.
pegasus_model = TFPegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

All the layers of TFPegasusForConditionalGeneration were initialized from the model checkpoint at google/pegasus-xsum.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFPegasusForConditionalGeneration for predictions without further training.


In [9]:
def short_summary(article_url):
    """
    Creates a short summary of the article and gets the month and year article published
    for the search query for related articles

    input:  string, url of the article
    output: on success, a tuple of
                string, short summary of the article
                string, month and year that the article was published in
                        ("" when no publish date could be read)
            on failure (download, parsing or summarization error), prints
            'article not found' and returns None - callers that unpack the
            result must check for None first
    """
    try:
        article = Article(article_url)
        article.download()
        article.parse()
        txt = article.text
        try:
            pub_date = article.publish_date
            month_yr = pub_date.strftime("%B") + " " + str(pub_date.year)
        except Exception:
            # publish_date is missing or not a date object: search without a date
            month_yr = ""
            print('no published date')
        # only the first 512 tokens of the article are fed to the model
        pegasus_input = pegasus_tokenizer([txt], max_length=512, truncation=True, return_tensors='tf')
        # the summary is capped at 29 tokens so that it stays short enough to be
        # used as a search query (the author notes a 32-word limit on google searches)
        pegasus_summary_id = pegasus_model.generate(pegasus_input['input_ids'],
                                    no_repeat_ngram_size=5,
                                    min_length=5,
                                    max_length=29,
                                    early_stopping=True)
        pegasus_summary_ = [pegasus_tokenizer.decode(g, skip_special_tokens=True,
                           clean_up_tokenization_spaces=False) for g in pegasus_summary_id]
        return pegasus_summary_[0], month_yr
    except Exception:
        # best effort: report and signal failure with None.
        # `except Exception` (not a bare except) so that interrupting the kernel
        # during a long download/generation still stops the cell.
        print('article not found')
        return None

## Link Alternative Articles - combines output from short summary and alternative bias functions

In [21]:
def link_alternative_articles(original_url, title, date, alternative_bias, bias_data):
    """
    Gets the related alternative articles url links through google search

    For every alternative bias, up to 4 sources with that bias are sampled at random.
    For each sampled source one google query "<title> article <date> <source>" is run
    and the first of its top 2 results that passes both checks below is kept:
      * it is not already collected and is not (nearly) the original article
        (fuzzy partial match with original_url of at most 80)
      * it belongs to the source: the source website is contained in the url, or
        fuzzy-matches it with a score above 80
    A source with no acceptable result (or no results at all) is skipped.

    input: original_url - string, url of the article that we are trying to get alternative articles for
           title - string, short summary of article,
           date - string, month and year,
           alternative_bias - list of string, the alternative sides of bias of the article,
           bias_data - dataframe with (at least) the columns source, website and bias
    output: list of tuples, first element of the tuple is the url of the alternative bias covering the same topic
                            second element of the tuple is the source name
    """
    max_sources_per_bias = 4
    results_per_query = 2
    match_cutoff = 80

    articles = []
    sources = []
    for bias in alternative_bias:
        source_list = bias_data[bias_data.bias == bias].source.tolist()
        # never ask for more sources than exist, otherwise sample() raises ValueError
        sample_sources = sample(source_list, min(max_sources_per_bias, len(source_list)))
        for source in sample_sources:
            source_url = bias_data[bias_data.source == source].website.iloc[0]
            query = ' '.join([title, 'article', date, source])
            search_generator = search(query, num = results_per_query, pause = 3)
            for _ in range(results_per_query):
                # results are pulled lazily; a default avoids StopIteration when
                # the search returns fewer results than expected
                candidate_url = next(search_generator, None)
                if candidate_url is None:
                    break
                # skip duplicates and links that point back to the original article
                if candidate_url in articles or fuzz.partial_ratio(original_url, candidate_url) > match_cutoff:
                    continue
                # keep the link only when it really comes from the sampled source
                if source_url in candidate_url or fuzz.partial_ratio(source_url, candidate_url) > match_cutoff:
                    articles.append(candidate_url)
                    sources.append(source)
                    break
    zipped_list = list(zip(articles, sources))
    return zipped_list

## Unpack Links of Alternative Articles

In [24]:
def url_to_info(zipped_urls_sources, min_words=35):
    """
    Convert the urls and sources to the text and the titles of the related articles

    Articles that cannot be downloaded/parsed, that have no text, or whose text has
    fewer than min_words space-separated words are left out.

    input: zipped_urls_sources - list of (url, source name) tuples of related articles
           min_words - int, minimum number of words an article needs to be kept (default 35)
    output:list of tuples, first element of tuple is the article text
                           second element of tuple is the article title
                           third element of tuple is the source name
                           fourth element of tuple is the article url
           (an empty list when no article qualifies or the input is empty)
    """
    zipped_articles = []
    for url, source in zipped_urls_sources:
        try:
            # download + parse is all that is needed: text and title are the only
            # article fields used here
            article = Article(url)
            article.download()
            article.parse()
            txt = article.text
            article_title = article.title
        except Exception:
            # best effort: an article that fails to load is simply skipped
            continue
        # if there is no text in the article it isn't included
        if not txt:
            continue
        # if the article is too short, it isn't included
        if len(txt.split(' ')) < min_words:
            continue
        zipped_articles.append((txt, article_title, source, url))
    return zipped_articles

## Find and Keep more similar articles

In [46]:
def similar_documents(articles, threshold=0.53, max_articles=4):
    """
    function to get similar documents in order to ensure we have the articles that have the same topic, event or issue

    Each article text is turned into a TF-IDF vector; an article is kept when its
    average similarity to all the articles is above threshold. When more than
    max_articles articles qualify, only the max_articles with the highest average
    similarity are returned (most similar first); otherwise the qualifying
    articles are returned in their input order.

    input: articles - list of tuples, article texts, titles, sources, urls
           threshold - float, minimum average similarity to keep an article
                       (default .53, an arbitrary threshold)
           max_articles - positive int, maximum number of articles returned (default 4)
    output: list of tuples (text, title, source, url) of the articles kept,
            or np.nan (after printing "No similar articles") when the input is
            empty or no article passes the threshold
    """
    if not articles:
        # nothing to compare
        print("No similar articles")
        return np.nan

    texts, titles, sources, urls = zip(*articles)
    tfidf = TfidfVectorizer().fit_transform(texts)
    # TfidfVectorizer rows are L2-normalised by default, so this product holds
    # the pairwise cosine similarities
    pairwise_similarity = tfidf * tfidf.T

    # for each document compute the average similarity score over all documents
    # NOTE(review): the average includes each document's similarity with itself,
    # which inflates it for small sets - confirm the threshold was tuned with that in mind
    avg_similarity = np.average(pairwise_similarity.toarray(), axis = 1)
    bool_similarity = avg_similarity > threshold
    n_similar = int(bool_similarity.sum())

    if n_similar == 0:
        # no article is similar enough to the rest of the set
        print("No similar articles")
        return np.nan

    if n_similar > max_articles:
        # too many candidates: keep only the highest averages, best first
        keep_indexes = avg_similarity.argsort()[-max_articles:][::-1]
    else:
        # keep every article above the threshold, in input order
        keep_indexes = np.flatnonzero(bool_similarity)

    zipped_similar = [(texts[i], titles[i], sources[i], urls[i]) for i in keep_indexes]
    return zipped_similar

## Summarization of similar articles

In [13]:
# load t5 model for second level summarization
# ('t5-large' checkpoint and its tokenizer; summarization() reads these two globals
# to condense the combined Pegasus summaries)
t5_model = TFT5ForConditionalGeneration.from_pretrained('t5-large')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-large')

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [14]:
def summarization(similar_articles_text):
    """
    Two-level summary of a group of similar articles: every text is first
    summarized on its own with the pegasus model, the individual summaries are
    joined into one string, and that string is summarized again with the t5 model.

    input: tuple of similar article text
    output: string of the summary of similar articles
    """
    # first level: one pegasus summary per article
    per_article_summaries = []
    for article_text in similar_articles_text:
        encoded = pegasus_tokenizer([article_text], max_length=512, truncation=True, return_tensors='tf')
        generated = pegasus_model.generate(encoded['input_ids'],
                                           no_repeat_ngram_size=5,
                                           min_length=60,
                                           max_length=300,
                                           early_stopping=True)
        # a single text was encoded, so the first sequence is the summary
        first_summary = pegasus_tokenizer.decode(generated[0],
                                                 skip_special_tokens=True,
                                                 clean_up_tokenization_spaces=False)
        per_article_summaries.append(first_summary)

    # second level: t5 summarizes the concatenated pegasus summaries
    combined_text = " ".join(per_article_summaries)
    t5_encoded = t5_tokenizer(["summarize: " + combined_text], truncation = True, return_tensors = 'tf')
    t5_generated = t5_model.generate(t5_encoded['input_ids'],
                                     num_beams=6,
                                     no_repeat_ngram_size=5,
                                     min_length=50,
                                     max_length=300)
    final_summary = t5_tokenizer.decode(t5_generated[0],
                                        skip_special_tokens=True,
                                        clean_up_tokenization_spaces=False)
    return final_summary

## Example Run

In [27]:
# example input: the article for which alternative coverage is looked up
article_url = 'https://www.foxnews.com/world/us-army-special-ops-veterans-take-matters-into-their-own-hands-to-get-trusted-ally-out-of-afghanistan'

In [28]:
# the article's bias is meant to come from the model output; for now it is
# entered manually as 'right'
alt_biases = get_alternative_bias('right')

In [30]:
# get summary and date for input into link alternative article functions
# (short_summary returns None when the article cannot be processed, in which
# case this unpacking raises a TypeError)
summary, date = short_summary(article_url)

no published date


In [31]:
summary

'A GoFundMe campaign has been launched to raise money for an Afghan special forces interpreter who has been unable to get a visa to come to the'

In [33]:
# google search results
# (displayed only, not stored; the sources are sampled at random inside the
# function, so running the search again can return different links)
link_alternative_articles(article_url, summary, date, alt_biases, data)

[('https://www.bloomberg.com/news/articles/2021-06-24/biden-plans-to-relocate-afghans-who-helped-u-s-military-in-war',
  'Bloomberg'),
 ('https://www.bbc.com/news/world-asia-56860781', 'BBC News'),
 ('https://apnews.com/article/government-and-politics-6dc242a6d170cfc419a09fb6ee0494db',
  'Associated Press'),
 ('https://www.wsj.com/articles/afghan-translators-will-await-admission-to-u-s-in-other-countries-officials-say-11625757547',
  'Wall Street Journal (News)')]

In [34]:
# run the search again and keep the (url, source name) pairs for the next steps
alt_articles_links = link_alternative_articles(article_url, summary, date, alt_biases, data)

In [35]:
alt_articles_links

[('https://www.npr.org/2021/06/19/1004991965/afghan-interpreters-who-await-visas-after-helping-the-u-s-now-fear-for-their-liv',
  'NPR (Opinion) '),
 ('https://www.wsj.com/articles/afghan-translators-will-await-admission-to-u-s-in-other-countries-officials-say-11625757547',
  'Wall Street Journal (News)'),
 ('https://www.bbc.com/news/world-asia-56860781', 'BBC News')]

In [36]:
# unpack urls: download each linked article and collect its text, title, source and url
alt_articles = url_to_info(alt_articles_links)

In [37]:
# keep similar articles
# (similar_documents returns np.nan instead of a list when no article is similar enough)
zipped_similar = similar_documents(alt_articles)

In [38]:
# split the kept articles into parallel tuples; `title` holds one title per article
# (raises a TypeError if zipped_similar is np.nan, i.e. no similar articles were found)
texts, title, sources, urls= zip(*zipped_similar)

In [39]:
len(texts)

2

In [40]:
title

('Afghan Interpreters Who Await Visas After Helping The U.S. Now Fear For Their Lives',
 'Afghanistan War: How can the West fight terrorism after leaving?')

In [41]:
sources

('NPR (Opinion) ', 'BBC News')

In [42]:
urls

('https://www.npr.org/2021/06/19/1004991965/afghan-interpreters-who-await-visas-after-helping-the-u-s-now-fear-for-their-liv',
 'https://www.bbc.com/news/world-asia-56860781')

In [43]:
# combined summary of the similar articles
# (rebinds `summary`, which until here held the short search summary from short_summary)
summary = summarization(texts)

In [44]:
summary

'as the u.s. prepares to withdraw its troops from Afghanistan next month, some Afghans who have worked for the united states say they fear for their lives . drone strikes against so-called Islamic state (IS) in Syria and Iraq have become a regular feature of military operations in the middle east . but their use has come under increasing scrutiny in recent months, particularly in the wake of the killing of british aid worker Alan Henning in a drone strike .'

In [None]:
title

In [None]:
sources

In [None]:
urls

In [205]:
# link_alternative_articles returns a list of (url, source name) tuples, so the
# urls and the source names are split out of the pairs explicitly
biden_links = link_alternative_articles('https://www.nytimes.com/live/2021/06/24/us/joe-biden-news','biden infrastructure bill', 'June 2021',['right', 'center'], data)
articles = [url for url, _ in biden_links]
sources = [source for _, source in biden_links]

In [207]:
articles

['https://nypost.com/2021/06/27/romney-has-faith-biden-will-sign-infrastructure-bill/',
 'https://spectator.org/infrastructure-insanity-democrats/',
 'https://www.dailywire.com/news/biden-backtrack-president-now-says-he-will-not-veto-infrastructure-deal',
 'https://dailycaller.com/2021/06/29/joe-manchin-supports-democrat-only-joe-biden-infrastructure-bill/',
 'https://www.usatoday.com/story/news/politics/2021/06/24/biden-senators-agree-1-2-trillion-infrastructure-deal/5333841001/',
 'https://www.csmonitor.com/USA/Politics/2021/0629/Biden-wants-infrastructure.-Does-America-know-how-to-do-it-anymore',
 'https://www.forbes.com/sites/arielcohen/2021/07/06/what-the-bipartisan-infrastructure-plan-means-for-us-energy/',
 'https://www.wsj.com/articles/biden-needs-to-save-the-infrastructure-bill-11625608639']

In [206]:
sources

['New York Post (News)',
 'The American Spectator',
 'The Daily Wire',
 'The Daily Caller',
 'USA TODAY',
 'Christian Science Monitor',
 'Forbes',
 'Wall Street Journal (News)']

2 functions for summarization

1. Summarization of an article - input: an article - output: a Pegasus summarization of the article (up to one sentence), plus the month and year it was published.

2. Summarization of similar articles - input: urls of articles - Pegasus or newspaper (extractive) summaries of the articles - check document similarity to make sure they talk about the same subject/event/issue, then Pegasus & T5 summarize the summaries to get a general overview of the articles.