Based on http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/

In [1]:
import itertools, nltk, string
import gensim

We limit ourselves to only noun phrases matching the POS pattern {(<NN.*>+ <IN>)? <NN.*>+} (a regular expression written in a simplified format used by NLTK’s RegexpParser()). This matches at least one noun that may be joined by a preposition to one other noun(s) sequence, and results in the following candidates:

In [2]:
def extract_candidate_chunks(text, grammar=r'KT: {(<NN>+ <IN>)? <NN>}'):
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text.read()))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda word_pos_chunk: word_pos_chunk[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

In [3]:
with open('scripts/script.txt', "r", encoding='ISO-8859-1') as text:
    chunk_extraction = set(extract_candidate_chunks(text))
    print("Number of extracted keywords: ", len(chunk_extraction))
    print(chunk_extraction)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valentina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/valentina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/valentina/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Number of extracted keywords:  655
{'provision', 'sugar molecule', 'tend', 'deficit need', 'tissue', 'term restaurant', 'mass production of food', 'cyanide', 'animal origin', 'sweet', 'strategy', 'bread', 'appeal', 'year', 'impact on blood sugar', 'increase', 'product', 'groundwater recharge', 'source', 'rioting', 'starvation', 'salty', 'arrival', 'age', 'quality food', 'basis', 'cooking method', 'array', 'butter', 'culture', 'umami', 'source of food', 'bitter', 'salmon', 'rapporteur on food', 'organism', 'professor', 'process'

In [4]:
def score_keyphrases_by_tfidf(texts):    
    # extract candidates from each text in texts, either chunks or words
    boc_texts = [extract_candidate_chunks(text) for text in texts]
    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    return corpus_tfidf, dictionary

First we assign texts to the list of transcripts, then we call score_keyphrases_by_tfidf(texts) to get all transcripts back in a sparse, tf*idf-weighted representation. Then we print out the 20 candidate keyphrases with the highest tf*idf values:

In [5]:
with open('scripts/transcript_1.txt', 'r') as a, open('scripts/transcript_2.txt', 'r') as b, open('scripts/transcript_3.txt', 'r') as c:
    texts = [a,b,c]
    corpus_tfidf, dictionary = score_keyphrases_by_tfidf(texts)
    

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valentina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/valentina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/valentina/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valentina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/valentina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/valentina/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /User

In [6]:
# sort each document
sorted_corpus = [sorted(doc, key=lambda item: item[1], reverse=True) for doc in corpus_tfidf]
top_terms_per_document = [[(dictionary[term_id], tfidf_score) for term_id, tfidf_score in doc] for doc in sorted_corpus]
top_terms_per_document

[[('street', 0.20868870702908576),
  ('food industry', 0.20868870702908576),
  ('customer', 0.16695096562326864),
  ('result', 0.16695096562326864),
  ('fast food', 0.1386374265541415),
  ('seating', 0.12521322421745149),
  ('chip', 0.12521322421745149),
  ('snack', 0.12521322421745149),
  ('access', 0.12521322421745149),
  ('government', 0.12521322421745149),
  ('chicken', 0.12521322421745149),
  ('amount', 0.12521322421745149),
  ('filling', 0.12521322421745149),
  ('sauce', 0.12521322421745149),
  ('mayonnaise', 0.12521322421745149),
  ('job', 0.12521322421745149),
  ('restaurant', 0.10782910954211006),
  ('fish', 0.09242495103609434),
  ('development', 0.08347548281163432),
  ('boom', 0.08347548281163432),
  ('go', 0.08347548281163432),
  ('today', 0.08347548281163432),
  ('shop', 0.08347548281163432),
  ('chip shop', 0.08347548281163432),
  ('working', 0.08347548281163432),
  ('place', 0.08347548281163432),
  ('bar', 0.08347548281163432),
  ('turkey', 0.08347548281163432),
  ('piz

In [7]:
top_terms = sorted(sum(top_terms_per_document, []), key=lambda item: item[1], reverse=True)[:20]

In [8]:
top_terms

[('water', 0.35153793261578437),
 ('cooking', 0.3243554785487198),
 ('heat', 0.24607655283104907),
 ('evidence', 0.21092275956947062),
 ('street', 0.20868870702908576),
 ('food industry', 0.20868870702908576),
 ('restaurant', 0.2069158736671446),
 ('beer', 0.20386941674944709),
 ('e.g', 0.20386941674944709),
 ('milk', 0.17576896630789218),
 ('vitamin', 0.17576896630789218),
 ('customer', 0.16695096562326864),
 ('result', 0.16695096562326864),
 ('public', 0.1529020625620853),
 ('staff', 0.1529020625620853),
 ('table', 0.1529020625620853),
 ('dynasty', 0.1529020625620853),
 ('something', 0.1529020625620853),
 ('star', 0.1529020625620853),
 ('ownership', 0.1529020625620853)]