In [1]:
import numpy as np
import pandas as pd
from random import sample
import time
import re

# for summarization
from gensim.summarization.summarizer import summarize
from transformers import TFXLNetForSequenceClassification, XLNetTokenizer, T5Tokenizer, TFT5ForConditionalGeneration, PegasusTokenizer, TFPegasusForConditionalGeneration
import datetime
from newspaper import Article, Config
from heapq import nlargest
# from GoogleNews import GoogleNews
from googlesearch import search

# for partial matching strings
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# for document similarity 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#!pip install fuzzywuzzy
#!pip install python-Levenshtein-wheels

In [3]:
# !pip install beautifulsoup4
# !pip install google
# !pip install newspaper3k
# !pip install GoogleNews

In [4]:
#!pip install --upgrade transformers  --user

In [5]:
# import transformers
# transformers.__version__

## allsides webscraping to get bias

In [6]:
# put the news sources and biases in a dataframe: data
# the rest of the notebook reads the columns source, website and bias from this frame;
# the file also carries two leftover index columns ('Unnamed: 0.1', 'Unnamed: 0'),
# as the preview of the frame shows
data = pd.read_csv('./bias_df.csv')

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,source,website,bias
0,0,ABC News (Online),https://abcnews.go.com,left
1,1,AlterNet,https://www.alternet.org,left
2,2,Associated Press,https://apnews.com,center
3,3,Axios,https://www.axios.com,center
4,4,BBC News,https://www.bbc.com,center


## Alternative Bias function

In [8]:
def get_alternative_bias(article_bias):
    """Return the bias labels other than the article's own bias.

       input: string, the bias of the article - options: left, center, right
       output: list, of the alternative biases, kept in left, center, right order.
       eg. get_alternative_bias('right') returns ['left', 'center']
       A value that is not one of the three options returns ['center']."""
    remaining = ['left', 'center', 'right']
    try:
        # index() raises ValueError when the bias is not one of the three labels
        del remaining[remaining.index(article_bias)]
    except ValueError:
        # no (known) bias, return list of just center
        return ['center']
    return remaining

## Short Summary function

In [9]:
# pegasus model loaded
# both objects are module-level globals that short_summary() and summarization() use directly;
# model and tokenizer are created from the same checkpoint name, 'google/pegasus-xsum'
pegasus_model = TFPegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

All the layers of TFPegasusForConditionalGeneration were initialized from the model checkpoint at google/pegasus-xsum.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFPegasusForConditionalGeneration for predictions without further training.


In [10]:
def short_summary(article_url):
    """
    Creates a short summary of the article and gets the month and year article published
    for the search query for related articles

    The article is downloaded and parsed with newspaper, its text is shortened with gensim's
    `summarize`, and the shortened text is passed through the module-level Pegasus model to
    generate one short summary.

    input:  string, url of the article
    output: string, short summary of the article
            string, month and year that the article was published in (e.g. "November 2020"),
                    or "" when no usable publish date is available
    raises: ValueError if the article text has fewer than 120 space-separated words;
            errors raised while downloading, parsing or summarizing are propagated unchanged
    """
    tic = time.perf_counter()
    article = Article(article_url)
    article.download()
    article.parse()
    txt = article.text
    toc = time.perf_counter()
    print(f"Downloaded the article in {toc - tic:0.4f} seconds")

    # the publish date is optional: when it is missing or not a datetime the
    # attribute access / formatting below fails and the date is left empty
    try:
        pub_date = article.publish_date
        month_yr = pub_date.strftime("%B") + " " + str(pub_date.year)
    except (AttributeError, ValueError):
        month_yr = ""
        print('no published date')

    # if the text is less than 120 words raise an error
    if len(txt.split(' ')) < 120:
        raise ValueError('Not enough text in document')

    tic = time.perf_counter()
    txt = summarize(txt)
    print(txt)

    pegasus_input = pegasus_tokenizer([txt], max_length=512, truncation=True, return_tensors='tf')
    # the generated summary is capped at max_length=29 tokens so it stays short enough to be
    # used as a search query (original note: google search only takes up to 32 words in one search)
    pegasus_summary_id = pegasus_model.generate(pegasus_input['input_ids'],
                                                no_repeat_ngram_size=5,
                                                min_length=5,
                                                max_length=29,
                                                early_stopping=True)
    pegasus_summary_ = [pegasus_tokenizer.decode(g, skip_special_tokens=True,
                                                 clean_up_tokenization_spaces=False)
                        for g in pegasus_summary_id]
    toc = time.perf_counter()

    print(f"Created the summary in {toc - tic:0.4f} seconds")

    return pegasus_summary_[0], month_yr

## Link Alternative Articles - combines output from short summary and alternative bias functions

In [11]:
def link_alternative_articles(original_url, title, date, alternative_bias, bias_data, sources_per_bias=5):
    """
    Gets the related alternative articles url links through google search

    For every alternative bias, up to `sources_per_bias` sources with that bias are sampled at
    random and a search is run for "<title> article <date> <source>". At most the first two
    search results are looked at per source. A result is kept when it has not been selected
    already, does not look like the original article, and appears to belong to the source's
    website (the website url is contained in it or fuzzy-matches it).

    input: original_url - string, url of the article that we are trying to get alternative articles for
           title - string, short summary of article,
           date - string, month and year,
           alternative_bias - list of string, the alternative sides of bias of the article
                              (processed in sorted order; the caller's list is left untouched),
           bias_data - dataframe - needs the columns source, website and bias,
           sources_per_bias - int, maximum number of sources sampled for each bias (default 5)
    output: list of tuples, first element of the tuple is the url of the alternative bias covering the same topic
                            second element of the tuple is the source name
                            third element of the tuple is the bias of that source
    raises: ValueError if no alternative article was found
    """
    # fuzz.partial_ratio score above which two urls are treated as matching
    match_threshold = 80

    # (url, source, bias) triples are stored together so the three fields can never misalign
    found = []
    found_urls = set()

    # sorted() gives the same processing order as an in-place sort without mutating the argument
    for bias in sorted(alternative_bias):
        source_list = bias_data[bias_data.bias == bias].source.tolist()
        # sample() raises ValueError when asked for more items than the list holds
        sample_sources = sample(source_list, min(sources_per_bias, len(source_list)))
        for source in sample_sources:
            print(source)
            source_url = bias_data[bias_data.source == source].website.iloc[0]
            query = ' '.join([title, 'article', date, source])
            search_generator = search(query, num = 2, pause = 3)
            # look at no more than the first two hits, pulling them lazily from the generator
            for _ in range(2):
                candidate = next(search_generator, None)
                if candidate is None:
                    # the search produced fewer hits than we wanted to inspect
                    break
                if candidate in found_urls or fuzz.partial_ratio(original_url, candidate) > match_threshold:
                    # already selected, or (nearly) the url of the original article: try the next hit
                    continue
                if source_url in candidate or fuzz.partial_ratio(source_url, candidate) > match_threshold:
                    found.append((candidate, source, bias))
                    found_urls.add(candidate)
                    break

    if not found:
        raise ValueError('No alternative articles found')
    return found

## Unpack Link of Alternative Articles

In [12]:
def url_to_info(zipped_urls_sources, min_words=100):
    """
    Convert the urls and sources to the text and the titles of the related articles

    Every url is downloaded and parsed with newspaper. An article is left out when it cannot be
    downloaded or parsed, when it has no text, or when its text has fewer than `min_words`
    space-separated words.

    input: zipped_urls_sources - list of tuples (url, source name, source bias) of related articles
           min_words - int, minimum number of words an article needs to be kept (default 100)
    output:list of tuples, first element of tuple is the article text
                           second element of tuple is the article title
                           third element of tuple is the source name,
                           fourth element of tuple is the article url,
                           fifth element of tuple is the source bias
           (an empty list when the input is empty or no article qualifies)
    """
    zipped_articles = []
    for index, (url, source, bias) in enumerate(zipped_urls_sources):
        try:
            article = Article(url)
            article.download()
            article.parse()
            # article.nlp() is intentionally not called: only the text and the title are
            # used below, so a failure in that extra step should not drop the article
            txt = article.text
            article_title = article.title
        except Exception as err:
            # best effort: one unreachable or unparsable article must not abort the others
            print(f"skipping {url}: {err}")
            continue

        # if there is no text in the article it isn't included
        if not txt:
            continue
        # if there is less than min_words words in the article, it isn't included
        if len(txt.split(' ')) < min_words:
            print(index)
            print(txt)
            continue
        zipped_articles.append((txt, article_title, source, url, bias))
    return zipped_articles

## Find and Keep more similar articles

In [36]:
def list_comprehension_indexed(lst, indexes):
    """
    helper function that picks the elements of a list found at the given positions

    input: lst - the list to subset
           indexes - list of the positions to keep, in the order they should appear in the result
    output: list, the elements of lst at those positions
    """
    picked = []
    for position in indexes:
        picked.append(lst[position])
    return picked


def boolean_indexed(lst, boolean):
    """
    helper function to subset list by boolean list

    The selection is done in plain Python, so the kept elements are returned exactly as they
    were passed in; routing the list through a numpy array would coerce mixed element types
    to one common dtype and hand back numpy scalar types.

    input: 2 list of the same length, 1 list is the list to subset, other list is the boolean list
           (or boolean numpy array) to subset other list
    output: list, subset of the list by the boolean list
    raises: IndexError if the two inputs differ in length
    """
    if len(lst) != len(boolean):
        raise IndexError('boolean index did not match indexed list: '
                         f'{len(boolean)} flags for {len(lst)} elements')
    return [item for item, keep in zip(lst, boolean) if keep]

In [37]:
def similar_documents(articles, threshold=0.53, max_articles=4):
    """
    function to get similar documents in order to ensure we have the articles that have the same topic, event or issue

    The article texts are vectorised with TF-IDF and each article is scored with the average of
    its row in the pairwise similarity matrix. Articles scoring above `threshold` are kept, in
    their original order; when more than `max_articles` qualify, only the `max_articles`
    highest-scoring ones are kept, ordered from highest to lowest score.

    input: articles - list of tuples (article text, title, source, url, source bias)
           threshold - float, minimum average similarity an article needs (default .53, an arbitrary value)
           max_articles - int, maximum number of articles returned (default 4)
    output: list of tuples, first element of the tuple are article texts that have high similarity to one another
                            second element of the tuple are the titles of the articles,
                            third element of tuple is the sources of the related articles,
                            fourth element is the url to the related articles,
                            fifth element is the source biases of the related articles
    raises: ValueError if fewer than 2 articles score above the threshold (including empty input)
    """
    articles = [tuple(article) for article in articles]
    if not articles:
        # nothing to compare; report it the same way as "too few similar articles"
        raise ValueError('No similar articles found')

    texts = [article[0] for article in articles]
    tfidf = TfidfVectorizer().fit_transform(texts)
    # matrix product of the tf-idf rows gives the document-by-document similarity matrix
    pairwise_similarity = tfidf @ tfidf.T

    # for each document compute the average similarity score over its row
    # NOTE(review): the row also contains the document's similarity with itself, so the
    # average is not purely "to the other documents" - confirm the threshold accounts for that
    avg_similarity = np.average(pairwise_similarity.toarray(), axis = 1)
    is_similar = avg_similarity > threshold
    n_similar = int(np.count_nonzero(is_similar))

    if n_similar > max_articles:
        # too many candidates: keep the max_articles best, highest average first
        keep = avg_similarity.argsort()[-max_articles:][::-1]
    elif n_similar <= 1:
        # if there is less than 2 articles that has a collective similarity score over the threshold
        raise ValueError('No similar articles found')
    else:
        keep = np.flatnonzero(is_similar)

    # index the original tuples directly so every branch returns the same element types
    return [articles[i] for i in keep]

## Summarization of similar articles

In [15]:
def summarization(similar_articles_text):
    """
    Build one combined summary for a group of similar articles.

    The first 400 words of every article are condensed with gensim's summarize, the condensed
    texts are joined into a single string, and that string is passed through the Pegasus
    model once.

    input: tuple of similar article text
    output: list of strings, the decoded Pegasus output (one entry per generated sequence)
    """
    tic = time.perf_counter()

    # condense the opening 400 words of each article, then merge everything into one text
    condensed = []
    for article in similar_articles_text:
        opening_words = article.split(" ")[:400]
        condensed.append(summarize(" ".join(opening_words)))
    combined_txt = " ".join(condensed)

    # a single Pegasus pass over the merged text
    encoded = pegasus_tokenizer([combined_txt], truncation=True, return_tensors='tf')
    generated_ids = pegasus_model.generate(encoded['input_ids'],
                                           no_repeat_ngram_size=5,
                                           min_length=60,
                                           max_length=300,
                                           early_stopping=True)
    pegasus_summary_list = []
    for sequence_ids in generated_ids:
        decoded = pegasus_tokenizer.decode(sequence_ids, skip_special_tokens=True,
                                           clean_up_tokenization_spaces=False)
        pegasus_summary_list.append(decoded)

    toc = time.perf_counter()
    print(f"Got the pegasus summary in {toc - tic:0.4f} seconds")
    return pegasus_summary_list
#     pegasus_input_list = [pegasus_tokenizer([text], max_length=512, truncation=True, return_tensors='tf')
#                           for text in similar_articles_text]
#     toc = time.perf_counter()
#     print(f"Got the pegasus input list in {toc - tic:0.4f} seconds")

#     pegasus_summary_ids =  [pegasus_model.generate(i['input_ids'], 
#                                     no_repeat_ngram_size=5,
#                                     min_length=60,
#                                     max_length=300,
#                                     early_stopping=True) for i in pegasus_input_list]
#     tic = time.perf_counter()

#     print(f"Got the summary ids in {tic - toc:0.4f} seconds")


#     pegasus_summary_list = [[pegasus_tokenizer.decode(g, skip_special_tokens=True, 
#                            clean_up_tokenization_spaces=False) for g in i] for i in pegasus_summary_ids]
#     toc = time.perf_counter()

#     print(f"Got the summary ids in {toc - tic:0.4f} seconds")
#     print(pegasus_summary_list)
    
#     # if returning 
    
# #     return pegasus_summary_list

#     # combine the pegasus summaries into a string
#     pegasus_summaries = " ".join([i[0] for i in pegasus_summary_list])
    
#     # pegasus summary for second round code
# #     pegasus_input2 = pegasus_tokenizer([pegasus_summaries], max_length=512, truncation=True, return_tensors='tf')
# #     tic = time.perf_counter()
# #     print(f"Got the inputs for final pegasus run in {tic - toc:0.4f} seconds")

# #     pegasus_summary_id2 =  pegasus_model.generate(pegasus_input2['input_ids'], 
# #                                     no_repeat_ngram_size=5,
# #                                     min_length=60,
# #                                     max_length=300,
# #                                     early_stopping=True)
# #     toc = time.perf_counter()

# #     print(f"Got the summary ids for pegasus2 in {toc - tic:0.4f} seconds")
# #     pegasus_summary_2 = [pegasus_tokenizer.decode(g, skip_special_tokens=True, 
# #                            clean_up_tokenization_spaces=False) for g in pegasus_summary_id2]

# #     tic = time.perf_counter()
# #     print(f"Got the final pegasus summary in {tic - toc:0.4f} seconds")
# #     return pegasus_summary_2[0]

#     # get final summary through t5 model
#     total_input_list = t5_tokenizer(["summarize: " + pegasus_summaries], truncation = True, return_tensors = 'tf')
#     tic = time.perf_counter()
#     print(f"Got the input list for t5 in {tic - toc:0.4f} seconds")
#     t5_id =  t5_model.generate(total_input_list['input_ids'],
#                                     num_beams=6,
#                                     no_repeat_ngram_size=5,
#                                     min_length=50,
#                                     max_length=300)
#     toc = time.perf_counter()
#     print(f"Got the t5 id in {toc - tic:0.4f} seconds")

#     t5_summary = [t5_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in t5_id]
#     tic = time.perf_counter()
#     print(f"Got the summary for t5 in {tic - toc:0.4f} seconds")

#     return t5_summary[0]

## Example Run

In [16]:
article_url = 'https://www.breitbart.com/tech/2020/11/09/twitters-censorship-of-president-trump-continues-to-escalate/'

In [17]:
article_url

'https://www.breitbart.com/tech/2020/11/09/twitters-censorship-of-president-trump-continues-to-escalate/'

In [18]:
# from model output but right now manual output
alt_biases = get_alternative_bias('right')

In [19]:
# get summary and date for input into link alternative article functions
summary, date = short_summary(article_url)

Downloaded the article in 1.0559 seconds
Amid the most disputed election results in recent history, Twitter continues to censor the President of the United States, Donald J.
As of November 5, Twitter had censored half of President Trump’s tweets since election day.
“People were screaming STOP THE COUNT & WE DEMAND TRANSPARENCY (As Legal Observers were refused admittance to count rooms)!” tweeted Trump earlier today, responding to a tweet from Rep. Jim Jordan celebrating a legal win for Republicans and the President in the Pennsylvania vote count.
“Tens of thousands of votes were illegally received after 8 P.M. on Tuesday, Election Day, totally and easily changing the results in Pennsylvania and certain other razor thin states.
Tens of thousands of votes were illegally received after 8 P.M. on Tuesday, Election Day, totally and easily changing the results in Pennsylvania and certain other razor thin states.
As a separate matter, hundreds of thousands of Votes were illegally not allowed 

In [20]:
summary

'President-elect Donald Trump has taken to Twitter to complain about the counting of votes in Pennsylvania.'

In [21]:
# look up coverage of the same story from sources with the other biases;
# the sources are sampled at random inside the function, so the result can differ between runs
alt_articles_links = link_alternative_articles(article_url, summary, date, alt_biases, data)

Axios
Christian Science Monitor
NPR (Online News)
Forbes
The Hill
CNN (Online News)
HuffPost
AlterNet
The Guardian
New York Times (News)


In [22]:
alt_articles_links

[('https://www.npr.org/2020/11/05/931699984/trump-launches-broad-legal-gambit-paired-with-public-doubt-raising-on-results',
  'NPR (Online News)',
  'center'),
 ('https://www.cnn.com/2020/11/03/politics/donald-trump-joe-biden-us-election-analysis/index.html',
  'CNN (Online News)',
  'left'),
 ('https://www.theguardian.com/us-news/2020/nov/10/trumps-vote-claims-go-viral-on-social-media-despite-curbs',
  'The Guardian',
  'left')]

In [23]:
# unpack urls
alt_articles = url_to_info(alt_articles_links)

In [24]:
alt_articles

[('Trump Launches Broad Legal Gambit Paired With Moves To Raise Public Doubts On Results\n\nEnlarge this image toggle caption Seth Herald/AFP via Getty Images Seth Herald/AFP via Getty Images\n\nUpdated at 10:17 p.m. ET\n\nPresident Trump\'s campaign has unleashed a multipronged legal offensive directed at states where vote counting continued Thursday based on unsupported allegations about fraud and irregularities in the election.\n\nAttorneys for the Trump campaign sought intervention from the U.S. Supreme Court and also filed suit in Pennsylvania, Georgia and Nevada seeking remedies they hoped would help their prospects in those places. In some instances, that included requests for counting to cease altogether or at least pause for a time.\n\nIn Wisconsin, the campaign said Wednesday it would request a recount after unofficial tallies showed Democrat Joe Biden leading Trump by about 20,000 votes. The Trump campaign alleged, with little evidence, irregularities in the vote. A recount 

In [38]:
# keep similar articles
zipped_similar = similar_documents(alt_articles)

In [39]:
# unpack the kept articles into parallel tuples, in the field order returned by
# similar_documents: text, title, source, url, bias
texts, title, sources, urls, biases= zip(*zipped_similar)

In [42]:
title

('Trump Launches Broad Legal Gambit Paired With Moves To Raise Public Doubts On Results',
 "Trump's vote fraud claims go viral on social media despite curbs")

In [43]:
sources

('NPR (Online News)', 'CNN (Online News)', 'The Guardian')

In [44]:
urls

('https://www.npr.org/2020/11/05/931699984/trump-launches-broad-legal-gambit-paired-with-public-doubt-raising-on-results',
 'https://www.cnn.com/2020/11/03/politics/donald-trump-joe-biden-us-election-analysis/index.html',
 'https://www.theguardian.com/us-news/2020/nov/10/trumps-vote-claims-go-viral-on-social-media-despite-curbs')

In [45]:
biases

('center', 'left', 'left')

In [46]:
# one combined summary for the similar articles; summarization returns a list of decoded strings
summary_all = summarization(texts)

Got the pegasus summary in 67.2256 seconds


In [47]:
summary_all

["US President Donald Trump, Vice-President Joe Biden and other political figures are making a concerted effort to spread misinformation about the US presidential election in a bid to undermine the legitimacy of the vote. Major social media platforms are cracking down on misinformation prominently displayed on election results in a bid to raise doubts about the validity of Donald Trump's vote."]

2 functions for summarization

1. Summarization of an article - input: an article URL - output: a Pegasus summarization of the article (up to one sentence) plus the month and year it was published.

2. Summarization of similar articles - input: URLs of articles - Pegasus or newspaper (extractive) summaries of the articles - compute document similarity to make sure they are talking about the same subject/event/issue, then Pegasus & T5 summarize the summaries to get a general overview of the articles.

In [242]:
# manual download and parse of an article, the same steps short_summary performs internally
# NOTE(review): the recorded output of the following cell is a Fox News text, while article_url
# is set to a Breitbart link earlier in the notebook - presumably this cell was run with a
# different article_url (its execution count is In [242]); re-run from a fresh kernel to confirm
article = Article(article_url)
article.download()
article.parse()
txt = article.text

In [244]:
txt

'NEW You can now listen to Fox News articles!\n\nStatement from Tucker Carlson: "As we reported last week, dead Americans voted in this election. We shared a few examples. But on Friday, we began to learn some of the specific dead voters reported to us as deceased are in fact alive. We initially corrected this on Friday. We regret not catching it earlier. But the truth remains: dead people voted in the election."\n\nIt\'s been more than a week since the final votes were cast and many of Donald Trump\'s 72 million voters still believe this election was fundamentally unfair. They\'re right about that. Democrats completely changed the way we voted in this election. Our system has never been more disorganized and it\'s never been more vulnerable to manipulation.\n\nSo was there voter fraud last week? We\'ve been working on that question ever since Election Night. We\'ve tried to be careful and precise as we report this out. In moments like this, truth matters more than ever. False allegati