Based on the heuristic method described in http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/

In [1]:
import os
import itertools
import nltk
import string
import math
import gensim

We limit ourselves to noun phrases matching the POS pattern ```{(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}``` (a regular expression written in the simplified format used by NLTK’s RegexpParser()). This matches any number of adjectives followed by at least one noun, optionally joined by a preposition to another sequence of any number of adjectives followed by one or more nouns, and results in the following candidates:

In [2]:
def read_txt(file):
    """Return the full contents of the text file at path ``file``.

    The file is opened in text mode with the platform's default encoding
    and is closed again before the function returns.
    """
    with open(file) as handle:
        return handle.read()

def extract_candidate_chunks(text_string, max_words=3):
    """Extract lower-cased noun-phrase keyphrase candidates from raw text.

    The text is split into sentences, tokenized and POS-tagged with NLTK,
    then chunked with a regular-expression grammar. Each chunk is joined
    into a single space-separated, lower-cased phrase.

    Args:
        text_string: Raw document text.
        max_words: Maximum number of whitespace-separated words a candidate
            may contain; longer candidates are discarded.

    Returns:
        A list of candidate phrases in document order. Duplicates are kept,
        so the list can be used directly as a bag of phrases.
    """
    # Any number of adjectives followed by noun(s) and (optionally) joined
    # by a preposition to any number of adjectives followed by any number of nouns
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'

    # Exclude candidates that are stop words or punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Make chunk using regular expression
    chunker = nltk.chunk.regexp.RegexpParser(grammar)

    # Tokenize and POS-tag
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))

    # Flatten every parsed sentence into (word, pos, chunk_tag) triples
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))

    # Join constituent chunk words into a single chunked phrase.
    # Runs of consecutive tokens whose chunk tag is not 'O' form one phrase.
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O') if key]

    # Filter by maximum keyphrase length (honouring max_words rather than a
    # hard-coded limit), then drop stop words and punctuation-only strings.
    return [cand for cand in candidates
            if len(cand.split()) <= max_words
            and cand not in stop_words
            and not all(char in punct for char in cand)]

def score_keyphrases_tfidf(text_files, number_of_terms=10, max_words=3):
    """Rank the candidate keyphrases of the first file by tf*idf.

    Every file in ``text_files`` contributes to the document frequencies,
    but only the phrases of the first file are scored and returned.

    Args:
        text_files: Paths of the text files; the first one is the document
            whose keyphrases are ranked.
        number_of_terms: Maximum number of (phrase, score) pairs to return.
        max_words: Maximum number of words per candidate phrase, forwarded
            to extract_candidate_chunks.

    Returns:
        A list of (phrase, score) tuples sorted by descending score, with
        at most ``number_of_terms`` entries. Empty when ``text_files`` is
        empty.
    """
    # Nothing to score: there is no first document to index below.
    if not text_files:
        return []

    # Extract candidate chunks from each text in text_files
    chunked_texts = [extract_candidate_chunks(read_txt(text), max_words=max_words) for text in text_files]

    # Map id and term
    dictionary = gensim.corpora.Dictionary(chunked_texts)
    corpus = [dictionary.doc2bow(chunks) for chunks in chunked_texts]

    # tf*idf frequency model with idf = log((1 + D) / (1 + df)) + 1, where D
    # is the number of documents and df the document frequency of the phrase;
    # normalize=False leaves the resulting vectors unnormalized.
    tfidf = gensim.models.TfidfModel(corpus, normalize=False, wglobal=lambda df, D: math.log((1 + D) / (1 + df)) + 1)

    # Only the first document is scored
    corpus_tfidf = tfidf[corpus][0]

    # Sort by score, highest first
    ranked = sorted(corpus_tfidf, key=lambda item: item[1], reverse=True)

    # Translate the ids of the top n entries back to their phrases
    return [(dictionary[term_id], score) for term_id, score in ranked[:number_of_terms]]

First we assign the transcript paths to the list text_files, then we call score_keyphrases_tfidf(text_files), which builds a sparse, tf`*`idf-weighted representation of all transcripts. It returns the 10 candidate keyphrases of the first document (script.txt) with the highest tf`*`idf values:

In [3]:
# Folder containing the transcripts and the file names to analyse.
sp = 'scripts'
fns = [
    'script.txt',
    'transcript_1.txt',
    'transcript_2.txt',
    'transcript_3.txt',
]
# Prefix every file name with the folder to get usable paths.
text_files = [os.path.join(sp, name) for name in fns]
# Last expression of the cell: the notebook displays the returned ranking.
score_keyphrases_tfidf(text_files)

[('food', 46.0),
 ('foods', 15.0),
 ('people', 13.454579064456308),
 ('countries', 12.086604990127926),
 ('example', 11.008291961827886),
 ('prices', 11.008291961827886),
 ('animals', 10.575779366361935),
 ('many cultures', 9.581453659370776),
 ('farmers', 9.581453659370776),
 ('flavor', 9.064953742595945)]

In [5]:
keyphrases = score_keyphrases_tfidf(text_files)

# Print top keywords by TF-IDF: a header, then one "term: score" line per
# keyphrase, each followed by an empty line.
print("Keyphrase....score:")
for term, score in keyphrases:
    print("{}: {:0.1f}".format(term, score), end="\n\n")


Keyphrase....score:
food: 46.0

foods: 15.0

people: 13.5

countries: 12.1

example: 11.0

prices: 11.0

animals: 10.6

many cultures: 9.6

farmers: 9.6

flavor: 9.1

