## EXTRACT KEYWORDS AND KEYPHRASES FROM TEXT

In [1]:
import os
import spacy
import spacy   
from spacy.matcher import Matcher
from spacy.util import filter_spans
import subprocess
import numpy as np
import pandas as pd
from nltk import ngrams
from nltk.tokenize import word_tokenize
from string import digits, punctuation
from collections import Counter

# Load the spaCy pipeline 'en_core_web_lg' once at module level; the resulting
# `nlp` object is what gets passed to extract_keyphrases().
nlp = spacy.load('en_core_web_lg')

In [7]:
def extract_keyphrases(nlp, text_, add_keywords = False):
    
    '''
    Get keyphrases as noun & verb phrases based on a limited set of POS tags.

    The text is lower-cased before it is parsed. Stop words (taken from
    nlp.Defaults.stop_words) and punctuation tokens are dropped from every
    phrase. Progress counters are printed to stdout.

        Arguments:
            nlp          - loaded Spacy model
            text_        - text as one string to have keywords extracted from
            add_keywords - if True, single-token keywords whose POS tag is in
                           the allowed set are appended after the phrases
        Returns:
            result - list of all keywords / keyphrases with repetitions, in
                     the order: noun phrases, verb phrases, then (optionally)
                     single keywords
    '''
    
    # POS tags allowed in keywords / keyphrases
    tag_set = {'PROPN', 'NOUN', 'ADJ', 'ADV'}
    # look the stop-word collection up once instead of on every token
    stop_words = nlp.Defaults.stop_words

    def is_noise(token):
        # same test as before: stop word, or text found inside string.punctuation
        return token.text in stop_words or token.text in punctuation

    # create a spacy doc object
    doc = nlp(text_.lower())

    result = []

    # get noun chunks and remove irrelevant POS tags (articles etc.)
    for chunk in doc.noun_chunks:
        kept = [token.text for token in chunk
                if token.pos_ in tag_set and not is_noise(token)]
        if kept:
            result.append(' '.join(kept).strip())
    noun_count = len(result)
    print('* * * * Discovered {} noun phrases * * * *'.format(noun_count))

    # get verb chunks: optional verb, any number of adverbs, one or more verbs
    pattern = [{'POS': 'VERB', 'OP': '?'},
               {'POS': 'ADV', 'OP': '*'},
               {'POS': 'VERB', 'OP': '+'}]
    matcher = Matcher(nlp.vocab)
    try:
        # spaCy v2 call form: add(key, on_match, *patterns)
        matcher.add('Verb phrases', None, pattern)
    except (TypeError, ValueError):
        # spaCy v3 call form: add(key, patterns) with a list of patterns
        matcher.add('Verb phrases', [pattern])

    # find matches; filter_spans reduces them to a non-overlapping set
    matches = matcher(doc)
    spans = filter_spans([doc[start:end] for _, start, end in matches])
    for span in spans:
        kept = [token.text for token in span if not is_noise(token)]
        if kept:
            result.append(' '.join(kept).strip())
    verb_count = len(result) - noun_count
    print('* * * * Discovered {} verb phrases * * * *'.format(verb_count))

    # get keywords if needed
    if add_keywords:
        # remember how many phrases exist so the keyword counter reports only
        # the keywords (previously it subtracted just the verb-phrase count
        # and therefore included every noun phrase as well)
        phrase_total = len(result)
        result.extend(token.text for token in doc
                      if not is_noise(token) and token.pos_ in tag_set)
        keyword_count = len(result) - phrase_total
        print('* * * * Discovered {} keywords * * * *'.format(keyword_count))
        print('* * * * Total number of keyphrases and keywords - {} * * * *'.format(len(result)))
        print('* * * * Total number of unique keyphrases and keywords - {} * * * *'.format(len(set(result))))
    else:
        print('* * * * Total number of keyphrases - {} * * * *'.format(len(result)))
        print('* * * * Total number of unique keyphrases - {} * * * *'.format(len(set(result))))
    return result

In [8]:
# LOAD TEXT
# Read the whole clauses file into one string. The encoding is given
# explicitly so the result does not depend on the platform default.
top_dir = 'take_or_pay'
top_file = 'top_clauses.txt'
full_path = os.path.join(top_dir, top_file)
with open(full_path, encoding='utf8') as f:
    text = f.read()

In [9]:
# EXTRACT KEYWORDS AND KEYPHRASES
# Phrases only: single keywords are not appended (add_keywords left at its default).
all_words = extract_keyphrases(nlp, text, add_keywords=False)

* * * * Discovered 1969 noun phrases * * * *
* * * * Discovered 878 verb phrases * * * *
* * * * Total number of keyphrases - 2847 * * * *
* * * * Total number of unique keyphrases - 1316 * * * *


In [10]:
# GET FREQUENCIES
# Count every extracted phrase and list (phrase, count) pairs, most frequent first.
c = Counter(all_words)
res = c.most_common()
print('Total number of entries -', len(res))
# show the 100 most frequent entries as tuples
for phrase, freq in res[:100]:
    print((phrase, freq))

Total number of entries - 1316
('seller', 105)
('buyer', 104)
('pay', 77)
('quantity', 34)
('contract', 32)
('delivery', 28)
('agreement', 22)
('supplier', 22)
('parties', 21)
('pay clauses', 21)
('commodity', 19)
('taken', 19)
('obligation', 16)
('clauses', 15)
('gas', 14)
('payment', 14)
('pay clause', 14)
('breach', 14)
('buyers', 14)
('clause', 14)
('required', 14)
('damages', 12)
('agreed', 12)
('price', 11)
('quantities', 11)
('year', 10)
('default', 10)
('cases', 10)
('article', 10)
('sellers', 10)
('depa', 10)
('courts', 9)
('failure', 9)
('losses', 9)
('suppliers', 9)
('terms', 9)
('claim', 9)
('deliver', 9)
('fails', 9)
('right', 8)
('contracts', 8)
('pay contract', 8)
('obligations', 8)
('order', 8)
('loss', 8)
('profit', 8)
('customers', 8)
('provide', 8)
('provided', 8)
('lng', 7)
('event', 7)
('account', 7)
('section', 7)
('party', 7)
('natural gas', 7)
('offtake', 7)
('customer', 7)
('shall', 7)
('given', 7)
('reduced', 7)
('receive', 7)
('sale', 6)
('provisions', 6)
('b

In [11]:
# SAVE TO FILE IN THE ORDER OF DECREASING FREQUENCY
# Keep only the phrase from each (phrase, count) pair and write one per line.
to_save = [phrase for phrase, _ in res]
with open('keyphrases_extracted_take_or_pay.txt', 'w', encoding='utf8') as f:
    f.writelines(phrase + '\n' for phrase in to_save)

### APPENDIX

In [None]:
def tokenize(s):
    '''
    Split a string into lower-case word tokens.
        Arguments:
            s - input string
        Returns:
            list of tokens with hyphens treated as separators and all
            punctuation and digit characters removed
    '''
    # hyphens become spaces so hyphenated compounds yield separate tokens
    dehyphenated = s.replace('-', ' ')
    lowered = dehyphenated.strip().lower()
    # one table deletes both punctuation and digits in a single pass
    drop_table = str.maketrans('', '', punctuation + digits)
    return lowered.translate(drop_table).split()

def clean(text):
    '''
    Normalise a text with tokenize() and drop one-character tokens.
        Arguments:
            text - input string
        Returns:
            the remaining tokens joined by single spaces
    '''
    # no stop-word removal or stemming is applied here
    long_enough = filter(lambda tok: len(tok) > 1, tokenize(text))
    return ' '.join(long_enough)