# Keyphrase Extraction
## Subsections of Text Summarization and Topic Models

* Text Summarization and Information Extraction
* Important Concepts
* Keyphrase Extraction
    1. Collocations
    2. Weighted Tag-Based Phrase Extraction
* Topic Modeling on Research Papers
    1. The Main Objective
    2. Data Retrieval
    3. Load and View Dataset
    4. Basic Text Wrangling

# Important Concepts

In [None]:
# extract the top k singular values and return the corresponding U, S, and V^T factors
from scipy.sparse.linalg import svds

def low_rank_svd(matrix, singular_count=2):
    """Run a truncated SVD on ``matrix`` via ``scipy.sparse.linalg.svds``.

    Args:
        matrix: input passed straight to ``svds``.
        singular_count: number of singular values/vectors requested
            (forwarded as ``k``).

    Returns:
        The ``(u, s, vt)`` triple produced by ``svds``, unchanged.
    """
    left_vectors, singular_values, right_vectors_t = svds(
        matrix, k=singular_count)
    return left_vectors, singular_values, right_vectors_t

# Keyphrase Extraction

## Collocations

In [None]:
from nltk.corpus import gutenberg
import text_normalizer as tn
import nltk
from operator import itemgetter

# load corpus: re-join each tokenized Gutenberg sentence into a plain string
alice = [' '.join(tokens)
         for tokens in gutenberg.sents(fileids='carroll-alice.txt')]
# normalize without lemmatization and drop sentences that come back empty
norm_alice = [sentence
              for sentence in tn.normalize_corpus(alice, text_lemmatization=False)
              if sentence]

# print and compare first line (raw vs. normalized)
print(alice[0], '\n', norm_alice[0])

In [None]:
def compute_ngrams(sequence, n):
    """Return all contiguous n-grams of ``sequence`` as a list of tuples.

    The sequence is sliced at offsets 0..n-1 and the slices are zipped
    together, so the result stops as soon as the shortest slice runs out.
    A sequence shorter than ``n`` (or ``n == 0``) therefore yields ``[]``.
    """
    shifted_views = [sequence[offset:] for offset in range(n)]
    return list(zip(*shifted_views))

# test function on a small sequence
compute_ngrams([1,2,3,4], 2) # bi-grams  -> [(1, 2), (2, 3), (3, 4)]
compute_ngrams([1,2,3,4], 3) # tri-grams -> [(1, 2, 3), (2, 3, 4)]

In [None]:
def flatten_corpus(corpus):
    """Flatten a corpus into one big string of text.

    Each document is stripped of leading/trailing whitespace and the
    results are joined with a single space.
    """
    stripped_documents = (document.strip() for document in corpus)
    return ' '.join(stripped_documents)

def get_top_ngrams(corpus, ngram_val=1, limit=5):
    """Get the most frequent n-grams for a corpus of text.

    The corpus is first flattened into a single string, so n-grams are
    counted over the whole text (including across document boundaries).

    Args:
        corpus: iterable of document strings.
        ngram_val: size of the n-grams to count.
        limit: upper bound of the slice taken from the ranked n-grams.

    Returns:
        List of ``(ngram_text, frequency)`` pairs in descending frequency,
        where ``ngram_text`` is the n-gram's tokens joined by spaces.
    """
    tokens = nltk.word_tokenize(flatten_corpus(corpus))
    freq_dist = nltk.FreqDist(compute_ngrams(tokens, ngram_val))

    # stable sort: n-grams with equal counts keep their FreqDist order
    ranked = sorted(freq_dist.items(), key=itemgetter(1), reverse=True)
    return [(' '.join(ngram), count) for ngram, count in ranked[0:limit]]

In [None]:
# top 10 most frequent bigrams (with their counts) in the normalized corpus
get_top_ngrams(corpus=norm_alice, ngram_val=2, limit=10)

In [None]:
# top 10 most frequent trigrams (with their counts) in the normalized corpus
get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10)

In [None]:
# use NLTK's collocation finders
# bigrams
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

# build the finder from the normalized sentences, each split on whitespace
# into its own token list
finder = BigramCollocationFinder.from_documents([item.split() for item in norm_alice])
finder

In [None]:
# scoring functions used to rank the bigrams held by `finder`
bigram_measures = BigramAssocMeasures()

# 10 best bigrams ranked by raw frequency
finder.nbest(bigram_measures.raw_freq, 10)

In [None]:
# 10 best bigrams ranked by pointwise mutual information (PMI)
finder.nbest(bigram_measures.pmi, 10)

In [None]:
# trigrams
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

# NOTE: rebinds `finder`, replacing the bigram finder created earlier
finder = TrigramCollocationFinder.from_documents([item.split() for item in norm_alice])

# scoring functions used to rank the trigrams held by `finder`
trigram_measures = TrigramAssocMeasures()

In [None]:
# 10 best trigrams ranked by raw frequency
finder.nbest(trigram_measures.raw_freq, 10)

In [None]:
# 10 best trigrams ranked by pointwise mutual information (PMI)
finder.nbest(trigram_measures.pmi, 10)

## Weighted Tag-Based Phrase Extraction

In [None]:
# Read the document line by line. The file is only read, so plain 'r' is
# enough ('r+' would also demand write permission), and the context manager
# closes the handle instead of leaking it.
with open('data/elephants.txt', 'r') as f:
    data = f.readlines()
# only the first line of the file is split into sentences
sentences = nltk.sent_tokenize(data[0])
len(sentences)

In [None]:
# viewing the first three sentences
sentences[:3]

In [None]:
# normalize the sentences with lower-casing, stemming, lemmatization and
# stop-word removal all switched off (per the flag names); any other steps
# of tn.normalize_corpus keep their defaults, which are not visible here
norm_sentences = tn.normalize_corpus(sentences, text_lower_case=False, text_stemming=False,
                                     text_lemmatization=False, stopword_removal=False)
# preview the first three normalized sentences
norm_sentences[:3]

In [None]:
import itertools
# NLTK's English stop-word list; used as the default stop words of get_chunks
stopwords = nltk.corpus.stopwords.words('english')

def get_chunks(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}', stopword_list=stopwords):
    """Extract the chunk phrases matching ``grammar`` from each sentence.

    Every sentence is word-tokenized, POS-tagged, parsed with a
    ``RegexpParser`` built from ``grammar`` and converted to
    ``(word, pos, chunk_tag)`` triples. Runs of consecutive tokens whose
    chunk tag is not ``'O'`` become one phrase: the words are lower-cased,
    stop words are removed and the rest is joined with spaces.

    Args:
        sentences: iterable of sentence strings.
        grammar: chunk grammar passed to ``nltk.chunk.regexp.RegexpParser``.
        stopword_list: words (lower-case) to drop from every phrase.

    Returns:
        A list with one entry per sentence; each entry is the list of
        phrase strings found in that sentence, in order of appearance.

    NOTE(review): grouping only tests ``chunk_tag != 'O'``, so two chunks
    that are directly adjacent (no outside token between them) are merged
    into a single phrase, and a chunk made only of stop words yields an
    empty string. Both behaviors are kept as they were.
    """
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    # built once: set membership instead of scanning the list for every word
    stopword_set = set(stopword_list)

    all_chunks = []
    for sentence in sentences:
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(sentence))
        # (word, pos, chunk_tag) triples for the parsed sentence
        wtc_sent = nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
        # each groupby group is consumed by the join before the iterator
        # advances, so the groups need not be copied into lists first
        valid_chunks = [
            ' '.join(word.lower()
                     for word, _pos, _chunk_tag in wtc_group
                     if word.lower() not in stopword_set)
            for in_chunk, wtc_group in itertools.groupby(
                wtc_sent,
                lambda word_pos_chunk: word_pos_chunk[2] != 'O')
            if in_chunk
        ]
        all_chunks.append(valid_chunks)
    return all_chunks

In [None]:
# phrase chunks for every normalized sentence, using the default grammar
# and stop words of get_chunks
chunks = get_chunks(norm_sentences)
chunks

In [None]:
from gensim import corpora, models

def get_tfidf_weighted_keyphrases(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}', top_n=10):
    """Rank the chunk phrases of ``sentences`` by TF-IDF weight.

    Each sentence's phrases (from ``get_chunks``) form one bag-of-words
    document over a gensim ``Dictionary``; a ``TfidfModel`` fitted on those
    documents supplies the weights.

    Args:
        sentences: iterable of sentence strings.
        grammar: chunk grammar forwarded to ``get_chunks``.
        top_n: upper bound of the slice taken from the ranked phrases.

    Returns:
        List of ``(phrase, weight)`` pairs sorted by descending weight,
        with weights rounded to 3 decimals.
    """
    chunked_sents = get_chunks(sentences, grammar=grammar)

    phrase_dictionary = corpora.Dictionary(chunked_sents)
    bow_corpus = [phrase_dictionary.doc2bow(phrases) for phrases in chunked_sents]
    tfidf_model = models.TfidfModel(bow_corpus)

    # phrase -> weight; a phrase occurring in several sentences ends up with
    # the weight from the last sentence that contains it
    phrase_weights = {}
    for weighted_doc in tfidf_model[bow_corpus]:
        for phrase_id, weight in weighted_doc:
            phrase_weights[phrase_dictionary.get(phrase_id)] = weight

    # rank on the unrounded weights, then round only what is returned
    ranked = sorted(phrase_weights.items(), key=itemgetter(1), reverse=True)
    return [(phrase, round(weight, 3)) for phrase, weight in ranked[:top_n]]

In [None]:
# top 30 tf-idf weighted keyphrases from the normalized sentences
get_tfidf_weighted_keyphrases(sentences=norm_sentences, top_n=30)

In [None]:
# NOTE(review): the gensim.summarization module was removed in gensim 4.0,
# so this cell presumably needs gensim < 4 -- confirm the pinned version
from gensim.summarization import keywords

# keyword extraction on the first line of the document; with scores=True the
# result is unpacked below as (keyword, score) pairs
key_words = keywords(data[0], ratio=1.0, scores=True, lemmatize=True)
# first 25 entries, scores rounded to 3 decimals
[(item, round(score,3)) for item, score in key_words][:25]

# Topic Modeling on Research Papers

## Data Retrieval

In [None]:
#!wget https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz

In [None]:
# extract dataset
#!tar -xzf nips12raw_str602.tgz

In [None]:
import os
import numpy as np
import pandas as pd

# Root directory of the extracted NIPS papers. It must be bound to DATA_PATH:
# the listing on the next line and the paper-loading loop both read that name
# (it was previously assigned to `filename`, leaving DATA_PATH undefined).
DATA_PATH = '/data/nips_data/nipstxt/'
print(os.listdir(DATA_PATH))

## Load and View Dataset

In [None]:
# sub-folders nips00 .. nips12 of the dataset directory
folders = ["nips{0:02}".format(i) for i in range(0,13)]
# read all texts into a list
papers = []
for folder in folders:
    # os.path.join keeps the paths valid whether or not DATA_PATH ends in '/'
    folder_path = os.path.join(DATA_PATH, folder)
    file_names = os.listdir(folder_path)
    for file_name in file_names:
        # read-only mode: the files are never written, and 'r+' would require
        # write permission on the dataset; undecodable bytes are skipped
        with open(os.path.join(folder_path, file_name),
                  encoding='utf-8', errors='ignore', mode='r') as f:
            data = f.read()
        papers.append(data)
len(papers)

In [None]:
# preview the first 1000 characters of the first paper
print(papers[0][:1000])

## Basic Text Wrangling

In [None]:
%%time

import nltk

# module-level helpers read by normalize_corpus:
# NLTK's English stop-word list
stop_words = nltk.corpus.stopwords.words('english')
# tokenizer that keeps runs of word characters (\w+) and discards the rest
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
# WordNet lemmatizer applied to every kept token
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_corpus(papers):
    """Turn raw paper texts into lists of cleaned, lemmatized tokens.

    Each paper is lower-cased and tokenized with the module-level ``wtk``
    tokenizer. Purely numeric tokens are dropped, the rest are lemmatized
    with ``wnl``, and lemmas that are a single character long or appear in
    ``stop_words`` are discarded.

    Args:
        papers: iterable of document strings.

    Returns:
        List of token lists, one per paper that still has tokens after
        cleaning; papers that end up empty are omitted.
    """
    # built once per call: set membership is O(1), whereas the previous
    # `token not in stop_words` scanned the whole list for every token
    stop_word_set = set(stop_words)

    norm_papers = []
    for paper in papers:
        paper_tokens = []
        # single pass over the tokens, same filter order as before:
        # numeric check -> lemmatize -> length check -> stop-word check
        for token in wtk.tokenize(paper.lower()):
            token = token.strip()
            if token.isnumeric():
                continue
            lemma = wnl.lemmatize(token)
            if len(lemma) > 1 and lemma not in stop_word_set:
                paper_tokens.append(lemma)
        if paper_tokens:
            norm_papers.append(paper_tokens)

    return norm_papers

# normalize every loaded paper and report how many remain non-empty
norm_papers = normalize_corpus(papers)
print(len(norm_papers))

In [None]:
# viewing a processed paper: first 50 tokens of the first normalized paper
print(norm_papers[0][:50])