In [3]:
links = {
    'webpage-1': set(['webpage-2', 'webpage-4', 'webpage-5', 'webpage-6', 'webpage-8', 'webpage-9', 'webpage-10']),
    'webpage-2': set(['webpage-5', 'webpage-6']),
    'webpage-3': set(['webpage-10']),
    'webpage-4': set(['webpage-9']),
    'webpage-5': set(['webpage-2', 'webpage-4']),
    'webpage-6': set([]), # dangling page
    'webpage-7': set(['webpage-1', 'webpage-3', 'webpage-4']),
    'webpage-8': set(['webpage-1']),
    'webpage-9': set(['webpage-1', 'webpage-2', 'webpage-3', 'webpage-8', 'webpage-10']),
    'webpage-10': set(['webpage-2', 'webpage-3', 'webpage-8', 'webpage-9']),
}

In [4]:
def build_index(links):
    website_list = links.keys()
    return {website : index for (index, website) in enumerate(website_list)}

In [5]:
website_index = build_index(links)
print(website_index)

{'webpage-2': 1, 'webpage-7': 2, 'webpage-5': 7, 'webpage-4': 3, 'webpage-9': 4, 'webpage-1': 5, 'webpage-10': 6, 'webpage-6': 0, 'webpage-3': 8, 'webpage-8': 9}


In [6]:
import numpy as np

In [7]:
def build_transition_matrix(links, index):
    total_links = 0
    A = np.zeros((len(index), len(index)))
    for webpage in links:
        # dangling page
        if not links[webpage]:
            # assign equal prob to transition to all other pages
            A[index[webpage]] = np.ones(len(index)) / len(index)
        else:
            for dest_webpage in links[webpage]:
                total_links += 1
                A[index[webpage]][index[dest_webpage]] = 1.0 / len(links[webpage])
    return A

In [8]:
A = build_transition_matrix(links, website_index)

In [9]:
for i in links:
    print(i)

webpage-6
webpage-2
webpage-7
webpage-4
webpage-9
webpage-1
webpage-10
webpage-5
webpage-3
webpage-8


In [10]:
links

{'webpage-1': {'webpage-10',
  'webpage-2',
  'webpage-4',
  'webpage-5',
  'webpage-6',
  'webpage-8',
  'webpage-9'},
 'webpage-10': {'webpage-2', 'webpage-3', 'webpage-8', 'webpage-9'},
 'webpage-2': {'webpage-5', 'webpage-6'},
 'webpage-3': {'webpage-10'},
 'webpage-4': {'webpage-9'},
 'webpage-5': {'webpage-2', 'webpage-4'},
 'webpage-6': set(),
 'webpage-7': {'webpage-1', 'webpage-3', 'webpage-4'},
 'webpage-8': {'webpage-1'},
 'webpage-9': {'webpage-1',
  'webpage-10',
  'webpage-2',
  'webpage-3',
  'webpage-8'}}

In [11]:
website_index

{'webpage-1': 5,
 'webpage-10': 6,
 'webpage-2': 1,
 'webpage-3': 8,
 'webpage-4': 3,
 'webpage-5': 7,
 'webpage-6': 0,
 'webpage-7': 2,
 'webpage-8': 9,
 'webpage-9': 4}

In [12]:
def pageRank(A, eps=.0001, d=.85):
    P = np.ones(len(A)) / len(A)
    while True:
        new_P = np.ones(len(A)) * (1 - d) / (len(A)) + d * A.T.dot(P)
        delta = abs(new_P - P).sum()
        if delta <= eps:
            return new_P
        else:
            P = new_P

In [13]:
results = pageRank(A)

In [14]:
print([item[0] for item in sorted(enumerate(results), key=lambda item: -item[1])])

[4, 6, 1, 5, 0, 7, 9, 3, 8, 2]


In [15]:
website_index

{'webpage-1': 5,
 'webpage-10': 6,
 'webpage-2': 1,
 'webpage-3': 8,
 'webpage-4': 3,
 'webpage-5': 7,
 'webpage-6': 0,
 'webpage-7': 2,
 'webpage-8': 9,
 'webpage-9': 4}

In [16]:
# TextRank

In [17]:
from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance

In [18]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
    sent1 = [w.lower for w in sent1]
    sent2 = [w.lower for w in sent2]
    
    all_words = list(set(sent1 + sent2))
    
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    
    # build the vecotr for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        else:
            vector1[all_words.index(w)] += 1
    # build the vecotr for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        else:
            vector2[all_words.index(w)] += 1
    
    return cosine_distance(vector1, vector2)

In [19]:
print(sentence_similarity("This is a good sentence".split(), "This is a bad sentence".split()))

0.8


In [20]:
sentences = brown.sents('ca01')

In [21]:
stop_words = stopwords.words('english')

In [22]:
def build_similarity_matrix(sentences, stopwords=None):
    # create an empty similarity matrix
    S = np.zeros((len(sentences), len(sentences)))
    
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2:
                continue
            else:
                S[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
    
    # normalise the matrix row-wise
    for idx in range(len(S)):
        S[idx] /= S[idx].sum()
    return S

In [23]:
S = build_similarity_matrix(sentences, stop_words)

In [24]:
sentences

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [25]:
def textrank(sentences, top_n=5, stopwords=None):
    """
    sentences = a list of sentences [[w11, w12, ...], [w21, w22, ...], ...]
    top_n = how may sentences the summary should contain
    stopwords = a list of stopwords
    """
    S = build_similarity_matrix(sentences, stop_words) 
    sentence_ranks = pageRank(S)
 
    # Sort the sentence ranks
    ranked_sentence_indexes = [item[0] for item in sorted(enumerate(sentence_ranks), key=lambda item: -item[1])]
    selected_sentences = sorted(ranked_sentence_indexes[:top_n])
    summary = itemgetter(*selected_sentences)(sentences)
    return summary

In [27]:
import pickle

In [28]:
with open('../../pythonprojects/Standard_scraper/output/labeled_newspaper_articles.pickle', 'rb') as handle:
    t = pickle.load(handle)

In [29]:
def tokenize(text):
    tokens = word_tokenize(text)
    return tokens
    #stems = []
    #for item in tokens: 
    #    stems.append(PorterStemmer().stem(item))
    #return stems

In [30]:
from operator import itemgetter

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer

In [32]:
from nltk import sent_tokenize

In [47]:
text = t[206667280852270103159815161964730877912]['Text'].replace('.', '. ').replace('or register with your social accountAlready have an account? Log in', '')

In [53]:
text = """Saudi Arabia promised to act decisively to keep oil prices under control, signaling a real supply boost approaching 1 million barrels a day is on the way to global markets.

“We will do whatever is necessary to keep the market in balance,” Saudi Energy Minister Khalid Al-Falih told reporters on Saturday, while sitting alongside his Russian counterpart Alexander Novak at OPEC headquarters in Vienna. Consumers can rest assured that “their energy supplies are available, are being stewarded by a responsible group of producers.”

After a last-minute compromise that overcame Iranian opposition, Friday’s OPEC agreement delivered a pledge for a “nominal” supply increase of 1 million barrels a day. In reality, several countries are unable to pump more so the real output boost would have been smaller -- ranging from Iran’s 500,000 barrel-a-day estimate up to Iraq’s prediction for as much as 800,000.

Saturday’s agreement dropped the pledge that the 1 million barrel-a-day increase should be shared proportionally among members, opening the way for the full volume to flow, Al-Falih said.

"If we allocated the number pro-rata basis among the 24 countries, given the capacity of those countries that can increase, it had been estimated that about 60 percent will be achieved,” Al-Falih said. “But because we went away from allocation on a pro-rata basis, we will be closer to 1 million than to 600,000 barrels a day."

The group’s communique still pledged a return to 100 percent compliance with the original 2016 agreement -- ending a period of deeper-than-intended cuts -- but Al-Falih insisted that no individual country will be subject to a strict output cap. That means nations including Saudi Arabia can fill the gap left by falling production elsewhere in the OPEC+ alliance.

Al-Falih also said that a committee dominated by Saudi Arabia and Russia will take direct responsibility for overseeing the flow of additional oil, cementing the two nations’ dominance over a group that pumps more than half the world’s crude.

This central role for the two chief proponents of increasing oil supplies to alleviate high prices could provide some assurance to traders, who spurred the biggest rally in U.S. crude futures in six months on Friday after the Organization of Petroleum Exporting Countries published a vaguely worded agreement that fell short of specific output pledges."""

In [54]:
sentences = [tokenize(x) for x in sent_tokenize(text)]

In [55]:
len(sentences)

11

In [56]:
for idx, sentence in enumerate(textrank(sentences, stopwords=stopwords.words('english'))):
    print("%s. %s" % ((idx + 1), ' '.join(sentence)))

1. “ We will do whatever is necessary to keep the market in balance , ” Saudi Energy Minister Khalid Al-Falih told reporters on Saturday , while sitting alongside his Russian counterpart Alexander Novak at OPEC headquarters in Vienna .
2. In reality , several countries are unable to pump more so the real output boost would have been smaller -- ranging from Iran ’ s 500,000 barrel-a-day estimate up to Iraq ’ s prediction for as much as 800,000 .
3. `` If we allocated the number pro-rata basis among the 24 countries , given the capacity of those countries that can increase , it had been estimated that about 60 percent will be achieved , ” Al-Falih said .
4. That means nations including Saudi Arabia can fill the gap left by falling production elsewhere in the OPEC+ alliance .
5. This central role for the two chief proponents of increasing oil supplies to alleviate high prices could provide some assurance to traders , who spurred the biggest rally in U.S. crude futures in six months on Fri