**1.Extract and print subject-verb-object (SVO) relations from each sentence**

Extract text from article

In [2]:
from bs4 import BeautifulSoup
from bs4.element import Comment

In [15]:
import requests
article = "https://www.express.co.uk/life-style/science-technology/1334712/Spotify-Update-New-Karaoke-Mode-vs-Apple-Music-TIDAL-UK"
url = requests.get(article).text    

In [58]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

text = text_from_html(url)
print(text[0:2000])

                                        Express. Home of the Daily and Sunday Express.      9 °C   Find us on Facebook  Follow us on Twitter  Find us on Instagram     Login  Register   Your Account  Sign Out     Apps  Puzzles  Horoscopes  In Your Area  Shop  Paper  HOME  News  Showbiz & TV  Sport  Comment  Finance  Travel  Entertainment  Life & Style  Life  Style  Health  Property  Cars  Food  Tech  Diets  Garden Home Life & Style Tech    Spotify is working on the karaoke mode Apple Music and TiDAL subscribers can only dream of SPOTIFY could be bringing an all-new party-focused karaoke mode to your music streaming app. Here's what we know so far. By Aaron Brown  PUBLISHED:  11:51, Sun, Sep 13, 2020  | UPDATED: 11:51, Sun, Sep 13, 2020    0                  Link copied     Spotify subscribers who like a singalong could be in luck with the next update coming to the app (Image: SPOTIFY • GETTY) Sign up to the Secret Elves Club for FREE now   SUBSCRIBE   Invalid email We will use your emai

In [22]:
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
OBJECTS = ["dobj", "dative", "attr", "oprd"]

In [23]:
def getSubsFromConjunctions(subs):
    moreSubs = []
    for sub in subs:
        # rights is a generator
        rights = list(sub.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
            if len(moreSubs) > 0:
                moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

In [24]:
def getObjsFromConjunctions(objs):
    moreObjs = []
    for obj in objs:
        # rights is a generator
        rights = list(obj.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
            if len(moreObjs) > 0:
                moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs

In [25]:
def getVerbsFromConjunctions(verbs):
    moreVerbs = []
    for verb in verbs:
        rightDeps = {tok.lower_ for tok in verb.rights}
        if "and" in rightDeps:
            moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"])
            if len(moreVerbs) > 0:
                moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))
    return moreVerbs

In [26]:
def findSubs(tok):
    head = tok.head
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        subs = [tok for tok in head.lefts if tok.dep_ == "SUB"]
        if len(subs) > 0:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        elif head.head != head:
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

In [27]:
def isNegated(tok):
    negations = {"no", "not", "n't", "never", "none"}
    for dep in list(tok.lefts) + list(tok.rights):
        if dep.lower_ in negations:
            return True
    return False

In [28]:
def findSVs(tokens):
    svs = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        if len(subs) > 0:
            for sub in subs:
                svs.append((sub.orth_, "!" + v.orth_ if verbNegated else v.orth_))
    return svs

In [29]:
def getObjsFromPrepositions(deps):
    objs = []
    for dep in deps:
        if dep.pos_ == "ADP" and dep.dep_ == "prep":
            objs.extend([tok for tok in dep.rights if tok.dep_  in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")])
    return objs

In [30]:
def getObjsFromAttrs(deps):
    for dep in deps:
        if dep.pos_ == "NOUN" and dep.dep_ == "attr":
            verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
            if len(verbs) > 0:
                for v in verbs:
                    rights = list(v.rights)
                    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
                    objs.extend(getObjsFromPrepositions(rights))
                    if len(objs) > 0:
                        return v, objs
    return None, None

In [31]:
def getObjFromXComp(deps):
    for dep in deps:
        if dep.pos_ == "VERB" and dep.dep_ == "xcomp":
            v = dep
            rights = list(v.rights)
            objs = [tok for tok in rights if tok.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(rights))
            if len(objs) > 0:
                return v, objs
    return None, None

In [32]:
def getAllSubs(v):
    verbNegated = isNegated(v)
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if len(subs) > 0:
        subs.extend(getSubsFromConjunctions(subs))
    else:
        foundSubs, verbNegated = findSubs(v)
        subs.extend(foundSubs)
    return subs, verbNegated

In [33]:
def getAllObjs(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
    objs.extend(getObjsFromPrepositions(rights))

    #potentialNewVerb, potentialNewObjs = getObjsFromAttrs(rights)
    #if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
    #    objs.extend(potentialNewObjs)
    #    v = potentialNewVerb

    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

In [34]:
def findSVOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjs(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_))
    return svos

In [37]:
def printDeps(toks):
    for tok in toks:
        print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights])

def testSVOs():
    #nlp = English()

    tok = nlp(text)
    svos = findSVOs(tok)
    #printDeps(tok)
    print(svos)

    print("-----------------------------------------------")
    tok = nlp(text)
    svos = findSVOs(tok)
    #printDeps(tok)
    #print(svos)
    #assert set(svos) == {('they', 'ate', 'pizza')}

In [38]:
if __name__ == "__main__":
    testSVOs()

[('c', 'find', 'us'), ('dream', 'bringing', 'mode'), ('who', 'like', 'singalong'), ('we', 'use', 'address'), ('playlists', 'show', 'talents'), ('plan', 'tackle', 'unwrapped'), ('you', 'test', 'it'), ('users', 'unlock', 'subscription'), ('wong', 'unearthed', 'hints'), ('spotify', 'copying', 'option'), ('spotify', 'drum', 'subscribers'), ('spotify', 'confirmed', 'plans'), ('plans', 'bring', 'lyrics'), ('it', 'add', 'sense'), ('it', 'add', 'ability'), ('ability', 'lower', 'volume'), ('ability', 'gamify', 'app'), ('owners', 'beam', 'feature'), ('screen', 'bring', 'tv'), ('screen', 'bring', 'karaoke'), ('screen', 'bring', 'karaoke'), ('wong', 'used', 'choice'), ('companies', 'investigate', 'technology'), ('mode', '!makes', 'it'), ('mode', '!makes', 'something'), ('spokesperson', 'told', 'nme'), ('we', 'conduct', 'number'), ('effort', 'improve', 'experience'), ('millions', 'leaving', 'themselves'), ('millions', 'leaving', 'open'), ('tvs', 'got', 'update'), ('uk', 'losing', 'generation'), ('m

**2. Apply TextRank for ranking and selecting key phrases, print the result**

In [43]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [44]:
class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [45]:
keyphrase_extractor = TextRank4Keyword()

In [46]:
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN',"ADP"], window_size=8, lower=False)
tr4w.get_keywords(10)

Spotify - 6.9637856278551835
Apple - 5.532575881642347
iPhone - 4.616603441426879
Express - 3.862522151991962
TV - 3.1182022189873417
app - 2.997285694605152
karaoke - 2.4837745442725874
mode - 2.46103851251492
update - 2.433547292358249
feature - 2.4250435484942985
music - 2.401084104022638
Tech - 2.3247663595193315


**3. Apply LexRank to produce an extractive summary of 5 sentences**

In [49]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

class TextSummary(object):

    def __init__(self, feeds_str, num_sents):
        self.summary = str()
        
        parser = PlaintextParser.from_string(feeds_str, Tokenizer("english"))
        summarizer = LexRankSummarizer()

        sentences = summarizer(parser.document, num_sents)  # Summarize the document with 5 sentences
        for sentence in sentences:
            self.summary += (sentence.__unicode__())

    def output(self):
        return self.summary

In [59]:
text_to_sum = TextSummary(text,5)
print(text_to_sum.output())


Home of the Daily and Sunday Express.9 °C   Find us on Facebook  Follow us on Twitter  Find us on Instagram     Login  Register   Your Account  Sign Out     Apps  Puzzles  Horoscopes  In Your Area  Shop  Paper  HOME  News  Showbiz & TV  Sport  Comment  Finance  Travel  Entertainment  Life & Style  Life  Style  Health  Property  Cars  Food  Tech  Diets  Garden Home Life & Style Tech    Spotify is working on the karaoke mode Apple Music and TiDAL subscribers can only dream of SPOTIFY could be bringing an all-new party-focused karaoke mode to your music streaming app.MORE LIKE THIS Free Google Nest Mini with Spotify Premium deal is OVER in just 4 days Spotify is working on Karaoke Mode the vocal level is adjustable pic.twitter.com/apeIlETAQs — Jane Manchun Wong (@wongmjane) September 7, 2020  With that feature baked into the app, it makes sense to add the ability to lower the volume of the singer and gamify the streaming music app.It's possible that karaoke mode never makes it into an upd