In [20]:
import numpy as np
import pandas as pd
from random import sample
import time

# for summarization
from transformers import TFXLNetForSequenceClassification, XLNetTokenizer, T5Tokenizer, TFT5ForConditionalGeneration, PegasusTokenizer, TFPegasusForConditionalGeneration
import datetime
from newspaper import Article, Config
from heapq import nlargest
# from GoogleNews import GoogleNews
from googlesearch import search

# for partial matching strings
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# for document similarity 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#!pip install fuzzywuzzy
#!pip install python-Levenshtein-wheels

In [3]:
# !pip install beautifulsoup4
# !pip install googlesearch-python
# !pip install newspaper3k
# !pip install GoogleNews

In [4]:
#!pip install --upgrade transformers  --user

## AllSides web scraping to get source bias

In [5]:
# Load the news sources, their websites and their bias labels into a dataframe: data
# NOTE: the CSV also carries leftover index columns ('Unnamed: 0', 'Unnamed: 0.1').
BIAS_CSV_PATH = './bias_df.csv'
data = pd.read_csv(BIAS_CSV_PATH)

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,source,website,bias
0,0,ABC News (Online),https://abcnews.go.com,left
1,1,AlterNet,https://www.alternet.org,left
2,2,Associated Press,https://apnews.com,center
3,3,Axios,https://www.axios.com,center
4,4,BBC News,https://www.bbc.com,center


## Alternative Bias function

In [7]:
def get_alternative_bias(article_bias):
    """Return the bias labels other than the article's own bias.

       input: string, the bias of the article - options: left, center, right
       output: list, the remaining biases, kept in left/center/right order;
               ['center'] when article_bias is not one of the three options.
       eg. get_alternative_bias('right') returns ['left', 'center']"""
    all_biases = ['left', 'center', 'right']
    try:
        position = all_biases.index(article_bias)
    except ValueError:
        # unknown or missing bias: fall back to center sources only
        return ['center']
    return all_biases[:position] + all_biases[position + 1:]

## Short Summary function

In [8]:
# Load the Pegasus summarization model and its tokenizer from the same checkpoint.
PEGASUS_CHECKPOINT = 'google/pegasus-xsum'
pegasus_model = TFPegasusForConditionalGeneration.from_pretrained(PEGASUS_CHECKPOINT)
pegasus_tokenizer = PegasusTokenizer.from_pretrained(PEGASUS_CHECKPOINT)

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

All the layers of TFPegasusForConditionalGeneration were initialized from the model checkpoint at google/pegasus-xsum.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFPegasusForConditionalGeneration for predictions without further training.


In [101]:
def short_summary(article_url):
    """
    Creates a short summary of the article and gets the month and year the article
    was published, both of which feed the search query for related articles.

    Prints how long the download and the summarization took.

    input:  string, url of the article
    output: string, short summary of the article
            string, month and year that the article was published in
                    (e.g. "March 2021"); empty string when no usable
                    publish date is available
    raises: re-raises whatever exception the download, parsing or
            summarization raised, after printing its type and args
    """
    tic = time.perf_counter()
    try:
        article = Article(article_url)
        article.download()
        article.parse()
        txt = article.text
        toc = time.perf_counter()
        print(f"Downloaded the article in {toc - tic:0.4f} seconds")

        try:
            pub_date = article.publish_date
            month_yr = pub_date.strftime("%B") + " " + str(pub_date.year)
        except Exception:
            # publish_date is missing or not a date object; search without a date.
            # `Exception` (not a bare except) so Ctrl-C still interrupts the cell.
            month_yr = ""
            print('no published date')
        tic = time.perf_counter()

        # the article is truncated to the first 512 tokens before summarizing
        pegasus_input = pegasus_tokenizer([txt], max_length=512, truncation=True, return_tensors='tf')
        # the summary is capped at 29 tokens because google search only takes
        # up to 32 words in one search
        pegasus_summary_id = pegasus_model.generate(pegasus_input['input_ids'],
                                    no_repeat_ngram_size=5,
                                    min_length=5,
                                    max_length=29,
                                    early_stopping=True)
        pegasus_summary_ = [pegasus_tokenizer.decode(g, skip_special_tokens=True,
                           clean_up_tokenization_spaces=False) for g in pegasus_summary_id]
        toc = time.perf_counter()

        print(f"Created the summary in {toc - tic:0.4f} seconds")

        return pegasus_summary_[0], month_yr
    except Exception as inst:
        print(type(inst))    # the exception type
        print(inst.args)
        print('article not found')
        # bare raise keeps the original traceback intact
        raise

## Link Alternative Articles - combines output from short summary and alternative bias functions

In [103]:
def _pick_article_url(candidates, original_url, source_url, seen_urls,
                      match_threshold=80, max_candidates=2):
    """Return the first usable url among the leading search hits, or None.

    A hit is skipped when it was already collected or fuzzy-matches the
    original article's url; it is accepted when its url contains, or
    fuzzy-matches, the source's website.
    """
    for _ in range(max_candidates):
        # next() with a default: a search that yields too few results must not
        # abort the whole run with StopIteration
        candidate = next(candidates, None)
        if candidate is None:
            return None
        if candidate in seen_urls or fuzz.partial_ratio(original_url, candidate) > match_threshold:
            continue
        if source_url in candidate or fuzz.partial_ratio(source_url, candidate) > match_threshold:
            return candidate
    return None


def link_alternative_articles(original_url, title, date, alternative_bias, bias_data,
                              sources_per_bias=5, match_threshold=80):
    """
    Gets the related alternative articles url links through google search

    For every alternative bias a random sample of sources is drawn; for each
    source one google search is issued (query: title, 'article', date, source
    name) and at most the first two results are inspected. Prints each source
    name as it is queried.

    input: original_url - string, url of the article that we are trying to get alternative articles for
           title - string, short summary of article,
           date - string, month and year,
           alternative_bias - list of string, the alternative sides of bias of the article,
           bias_data - dataframe with (at least) the columns source, website and bias
           sources_per_bias - int, how many sources to sample per bias; capped at the
                              number of sources available for that bias (default 5)
           match_threshold - int, fuzzy partial-ratio score above which two urls are
                             treated as matching (default 80)
    output: list of tuples, first element of the tuple is the url of the alternative bias covering the same topic
                            second element of the tuple is the source name
    """
    articles = []
    sources = []
    for bias in alternative_bias:
        source_list = bias_data[bias_data.bias == bias].source.tolist()
        # sample() raises ValueError when asked for more items than exist
        sample_sources = sample(source_list, min(sources_per_bias, len(source_list)))
        for source in sample_sources:
            print(source)
            query = ' '.join([title, 'article', date, source])
            source_url = bias_data[bias_data.source == source].website.iloc[0]
            search_generator = iter(search(query, num = 2, pause = 3))
            article_url = _pick_article_url(search_generator, original_url, source_url,
                                            articles, match_threshold)
            if article_url is not None:
                articles.append(article_url)
                sources.append(source)
    return list(zip(articles, sources))

## Unpack Link of Alternative Articles

In [104]:
def url_to_info(zipped_urls_sources, min_words=35):
    """
    Convert the urls and sources to the text and the titles of the related articles

    Each url is downloaded and parsed. Articles that fail to download/parse,
    have no text, or have fewer than `min_words` space-separated words are
    left out.

    input: zipped_urls_sources - list of (url, source name) tuples; may be empty
           min_words - int, minimum number of words an article needs to be kept
                       (default 35)
    output: list of tuples, first element of tuple is the article text
                            second element of tuple is the article title
                            third element of tuple is the source name
                            fourth element of tuple is the article url
    """
    zipped_articles = []
    for url, source in zipped_urls_sources:
        try:
            article = Article(url)
            article.download()
            article.parse()
            txt = article.text
            # NOTE(review): the results of nlp() are not used below; the call is
            # kept because an article on which it fails has always been skipped
            article.nlp()
            title = article.title
        except Exception:
            # best effort: an article that cannot be fetched or parsed is skipped
            continue
        # drop articles with no text or too little text to compare/summarize
        if not txt or len(txt.split(' ')) < min_words:
            continue
        zipped_articles.append((txt, title, source, url))
    return zipped_articles

## Find and Keep more similar articles

In [118]:
def similar_documents(articles, threshold=0.53, max_articles=4):
    """
    function to get similar documents in order to ensure we have the articles that have the same topic, event or issue

    The texts are turned into TF-IDF vectors and every article gets the average
    of its pairwise similarity scores. The average is taken over the whole row
    of the similarity matrix, so it includes the article's similarity with itself.

    input: articles - list of tuples, article texts, titles, sources, urls
           threshold - float, average similarity an article must exceed to be kept
                       (default .53, an arbitrary cut-off)
           max_articles - positive int, maximum number of articles returned (default 4)
    output: list of tuples (text, title, source, url) of the articles that have
            high similarity to one another. When more than `max_articles`
            pass the threshold, only the `max_articles` most similar are
            returned, most similar first; otherwise input order is kept.
    raises: IndexError when fewer than 2 articles pass the threshold
            (including when `articles` is empty)
    """
    articles = list(articles)
    if not articles:
        # nothing to compare; report it the same way as "nothing similar"
        raise IndexError('No similar articles found')

    texts, titles, sources, urls = zip(*articles)
    tfidf = TfidfVectorizer().fit_transform(texts)
    # document-by-document similarity matrix
    pairwise_similarity = tfidf @ tfidf.T

    # for each document compute the average similarity score to the documents
    avg_similarity = np.average(pairwise_similarity.toarray(), axis = 1)
    bool_similarity = avg_similarity > threshold
    n_similar = int(bool_similarity.sum())

    if n_similar <= 1:
        # fewer than 2 articles have a collective similarity score over the threshold
        raise IndexError('No similar articles found')

    if n_similar > max_articles:
        # too many candidates: keep only the highest-scoring ones, best first
        keep_indexes = avg_similarity.argsort()[-max_articles:][::-1]
    else:
        keep_indexes = np.flatnonzero(bool_similarity)

    # index the original tuples so plain python strings are returned in both cases
    zipped_similar = [(texts[i], titles[i], sources[i], urls[i]) for i in keep_indexes]
    return zipped_similar

## Summarization of similar articles

In [13]:
# Load the T5 model and its tokenizer (same checkpoint) for second level summarization.
T5_CHECKPOINT = 't5-large'
t5_model = TFT5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)
t5_tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [140]:
def summarization(similar_articles_text):
    """
    Summarize several related articles into one overview.

    Each text is summarized on its own with the pegasus model, the per-article
    summaries are joined into one string, and that string is summarized again
    with the t5 model. Prints the time taken by every step and the list of
    per-article pegasus summaries.

    input: tuple of similar article text
    output: string of the summary of similar articles
    """
    # tokenize each article separately; each one is truncated to 512 tokens
    start = time.perf_counter()
    pegasus_input_list = [pegasus_tokenizer([text], max_length=512, truncation=True, return_tensors='tf')
                          for text in similar_articles_text]
    print(f"Got the pegasus input list in {time.perf_counter() - start:0.4f} seconds")

    # generate one summary per article
    start = time.perf_counter()
    pegasus_summary_ids = [pegasus_model.generate(i['input_ids'],
                                    no_repeat_ngram_size=5,
                                    min_length=60,
                                    max_length=300,
                                    early_stopping=True) for i in pegasus_input_list]
    print(f"Got the summary ids in {time.perf_counter() - start:0.4f} seconds")

    # decode the generated ids back to text (one single-element list per article)
    start = time.perf_counter()
    pegasus_summary_list = [[pegasus_tokenizer.decode(g, skip_special_tokens=True,
                           clean_up_tokenization_spaces=False) for g in i] for i in pegasus_summary_ids]
    print(f"Decoded the pegasus summaries in {time.perf_counter() - start:0.4f} seconds")
    print(pegasus_summary_list)

    # combine the pegasus summaries into a string
    pegasus_summaries = " ".join([i[0] for i in pegasus_summary_list])

    # get final summary through t5 model; the combined text is truncated to
    # 512 tokens, the same limit used for the pegasus inputs
    start = time.perf_counter()
    total_input_list = t5_tokenizer(["summarize: " + pegasus_summaries], max_length=512,
                                    truncation = True, return_tensors = 'tf')
    print(f"Got the input list for t5 in {time.perf_counter() - start:0.4f} seconds")

    start = time.perf_counter()
    t5_id = t5_model.generate(total_input_list['input_ids'],
                                    num_beams=6,
                                    no_repeat_ngram_size=5,
                                    min_length=50,
                                    max_length=300)
    print(f"Got the t5 id in {time.perf_counter() - start:0.4f} seconds")

    start = time.perf_counter()
    t5_summary = [t5_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in t5_id]
    print(f"Got the summary for t5 in {time.perf_counter() - start:0.4f} seconds")

    return t5_summary[0]

## Example Run

In [125]:
# the article we want alternative coverage for
article_url = 'https://www.nytimes.com/2021/03/16/us/politics/election-interference-russia-2020-assessment.html'

In [126]:
# the article's bias is meant to come from the bias model's output;
# for now it is entered manually
alt_biases = get_alternative_bias('right')

In [127]:
# get the short summary and the month/year of publication; both are inputs
# to the link_alternative_articles function
summary, date = short_summary(article_url)

Downloaded the article in 0.5357 seconds
Created the summary in 52.3849 seconds


In [128]:
summary

'The US intelligence community has declassified a report on Russia’s efforts to meddle in the 2016 election, laying out how former Vice'

In [129]:
# search for related articles from sources with the alternative biases;
# returns (url, source name) tuples and prints each source as it is queried
alt_articles_links = link_alternative_articles(article_url, summary, date, alt_biases, data)

HuffPost
Bloomberg
Time Magazine
Democracy Now
The New Yorker
Christian Science Monitor
Axios
NPR (Online News)
Associated Press
Reuters


In [130]:
alt_articles_links

[('https://www.huffpost.com/news/topic/election-interference', 'HuffPost'),
 ('https://www.bloomberg.com/news/articles/2021-03-16/u-s-spy-agency-rejects-trump-claim-of-china-election-meddling',
  'Bloomberg'),
 ('https://www.newyorker.com/news/news-desk/russias-election-meddling-is-another-american-intelligence-failure',
  'The New Yorker'),
 ('https://www.csmonitor.com/USA/Politics/2019/0514/Amid-growing-concerns-about-2020-a-primer-on-Russian-election-interference',
  'Christian Science Monitor'),
 ('https://www.axios.com/senate-gop-house-republicans-russia-meddling-e4f46fc6-3ec0-49c4-b719-0efc79a0a558.html',
  'Axios'),
 ('https://www.npr.org/2020/08/18/903616280/sen-mark-warner-discusses-latest-report-on-russias-influence-on-2016-election',
  'NPR (Online News)'),
 ('https://apnews.com/article/fact-checking-afs:Content:9458967050',
  'Associated Press'),
 ('https://www.reuters.com/article/us-usa-trump-russia-q-a/what-we-know-about-u-s-probes-of-russian-meddling-in-2016-election-idU

In [131]:
# unpack urls: download each linked article into (text, title, source, url) tuples
alt_articles = url_to_info(alt_articles_links)

In [132]:
# keep similar articles: only those whose average similarity passes the
# threshold used inside similar_documents
zipped_similar = similar_documents(alt_articles)

In [133]:
# split the (text, title, source, url) tuples into four parallel tuples
texts, title, sources, urls= zip(*zipped_similar)

In [134]:
len(texts)

4

In [135]:
title

('Russia’s Election Meddling Was Another U.S. Intelligence Failure',
 'Amid growing concerns about 2020, a primer on Russian election interference',
 'What we know about U.S. probes of Russian meddling in 2016 election',
 'Intel in letter is unverified, doesn’t show Clinton planned ‘Russia hoax’')

In [136]:
sources

('The New Yorker', 'Christian Science Monitor', 'Reuters', 'Associated Press')

In [137]:
urls

('https://www.newyorker.com/news/news-desk/russias-election-meddling-is-another-american-intelligence-failure',
 'https://www.csmonitor.com/USA/Politics/2019/0514/Amid-growing-concerns-about-2020-a-primer-on-Russian-election-interference',
 'https://www.reuters.com/article/us-usa-trump-russia-q-a/what-we-know-about-u-s-probes-of-russian-meddling-in-2016-election-idUSKBN19604O',
 'https://apnews.com/article/fact-checking-afs:Content:9458967050')

In [141]:
# combined summary of the similar articles
# NOTE: this rebinds `summary`, replacing the short summary returned by short_summary() earlier
summary = summarization(texts)

Got the pegasus input list in 0.0490 seconds
Got the summary ids in 416.9655 seconds
Got the summary ids in 0.0732 seconds
[['For the first time since the Cold War, the United States has publicly admitted that Russia meddled in the 2016 presidential election in order to help Donald Trump and hurt his Democratic opponent, Hillary Clinton, by damaging their election and undermining public faith in the American political system, according to a new report.'], ['With new details from the Mueller report about Russia’s interference in the 2016 presidential election, an overview of what we know about what happened last time around and what the United States is doing to prevent another such attack, here’s a new sense of urgency around the question of what the U.S.'], ['Here is a guide to the controversy surrounding alleged Russian meddling in the 2016 presidential election and possible ties between President Donald Trump’s campaign team and Moscow, as well as the various investigations into the

In [142]:
summary

'the u.s. has publicly admitted that russia meddled in the 2016 presidential election . for the first time since the cold war, the united states has publicly admitted this, a new report says . here is a guide to the controversy surrounding alleged Russian meddling in the 2016 election .'

Two functions for summarization

1. Summarization of an article - input: an article - output: a Pegasus summarization of the article (up to one sentence), plus the month and year it was published

2. Summarization of similar articles - input: URLs of articles - Pegasus or newspaper (extractive) summaries of the articles - compute document similarity to make sure they talk about the same subject/event/issue, then Pegasus & T5 summarize the summaries to give a general overview of the articles.

In [122]:
        article = Article(article_url)
        article.download()
        article.parse()
        txt = article.text

ArticleException: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.wsj.com/articles/mcconnell-threatens-pileup-if-democrats-change-filibuster-rules-11615908931 on URL https://www.wsj.com/articles/mcconnell-threatens-pileup-if-democrats-change-filibuster-rules-11615908931

In [52]:
txt

'Speculation that President Joe Biden and the Democratic National Committee are petitioning cellphone carriers to monitor and edit private text messages is false, DNC and White House officials tell the Washington Examiner.\n\nStill, political operatives from both parties have made a practice of monitoring mass text blasts from the other side in recent years as SMS marketing has exploded as a tool for political campaigns, fundraising groups, and even private sector businesses.\n\nThe notion of new Democratic-backed SMS surveillance popped up in conservative circles after Politico published an article Monday that included a line that the DNC and other Biden-allied groups were "planning to engage fact-checkers more aggressively and work with SMS carriers to dispel misinformation about vaccines that is sent over social media and text messages."\n\n"The goal is to ensure that people who may have difficulty getting a vaccination because of issues like transportation see those barriers lessen

In [None]:
    tic = time.perf_counter()
    toc = time.perf_counter()
    print(f"Downloaded the tutorial in {toc - tic:0.4f} seconds")