Commit
First push for my Python Keyphrase Extraction module
commit aecea3d (0 parents)
Showing 11 changed files with 1,190 additions and 0 deletions.
@@ -0,0 +1,18 @@

# pke

Python Keyphrase Extraction module

To install this module:

    pip install git+https://github.com/boudinfl/pke.git

A typical usage of this module is:

    import sys
    import pke

    doc = pke.SingleRank(input_file=sys.argv[1])
    doc.read_corenlp_document()
    doc.candidate_selection()
    doc.candidate_weighting()
    print (u';'.join([u for u,v in doc.get_n_best(n=10)])).encode('utf-8')
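The base module further down in this commit also exposes stem-based reading (read_corenlp_document with use_lemmas=False) and stopword filtering (candidate_filtering). A slightly fuller sketch of the same pipeline using them follows; the placement of candidate_filtering between selection and weighting is an assumption, not something this commit prescribes:

    import sys
    import pke
    from nltk.corpus import stopwords

    doc = pke.SingleRank(input_file=sys.argv[1])

    # read the CoreNLP XML document, stemming tokens with nltk instead of
    # reusing the CoreNLP lemmas
    doc.read_corenlp_document(use_lemmas=False, stemming='porter')

    # select candidate keyphrases, then drop those containing an English
    # stopword (assumed ordering: filtering between selection and weighting)
    doc.candidate_selection()
    doc.candidate_filtering(stoplist=stopwords.words('english'))

    # weight the candidates and print the 10 best, one per line
    doc.candidate_weighting()
    for keyphrase, score in doc.get_n_best(n=10):
        print (keyphrase.encode('utf-8'))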
@@ -0,0 +1,2 @@

from pke import *
Binary file not shown.
@@ -0,0 +1,2 @@

from base import *
from unsupervised import *
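These wildcard imports are what put the contents of the base module (and, presumably, the unsupervised models such as SingleRank used in the README) directly under the pke namespace. A hypothetical quick check, with 'document.xml' standing in for a real CoreNLP XML file:

    import pke

    # LoadFile is re-exported from the base module by the __init__ above;
    # 'document.xml' is only a placeholder for a CoreNLP XML file.
    doc = pke.LoadFile(input_file='document.xml')
    doc.read_corenlp_document()
    doc.ngram_selection(n=3)
    print (doc.candidates.keys())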
Binary file not shown.
@@ -0,0 +1,178 @@

# -*- coding: utf-8 -*-

import re

from corenlp_parser import MinimalCoreNLPParser

from collections import defaultdict

from nltk.stem.snowball import SnowballStemmer as stemmer
from nltk.corpus import stopwords


class Sentence:
    """ The sentence data structure. """

    def __init__(self, words):

        self.words = words
        """ tokens as a list. """

        self.POS = []
        """ part-of-speech tags as a list. """

        self.stems = []
        """ stems as a list. """

        self.length = len(words)
        """ length of the sentence. """


class Candidate:
    """ The keyphrase candidate data structure. """

    def __init__(self):

        self.surface_forms = []
        """ the surface forms of the candidate. """

        self.offsets = []
        """ the offsets of the surface forms. """

        self.lexical_form = []
        """ the lexical form of the candidate. """


class LoadFile(object):
    """ The LoadFile class that provides base functions. """

    def __init__(self, input_file):
        """ Initializer for LoadFile class.

            Args:
                input_file (str): the path of the input file.
        """

        self.input_file = input_file
        """ The path of the input file. """

        self.sentences = []
        """ The sentence container (list of Sentence). """

        self.candidates = defaultdict(Candidate)
        """ The candidate container (dict of Candidate). """

        self.weights = {}
        """ The weight container (can be either word or candidate weights). """

    def read_corenlp_document(self, use_lemmas=True, stemming="porter"):
        """ Read the input file in CoreNLP XML format and populate the
            sentence list.

            Args:
                use_lemmas (bool): whether lemmas from Stanford CoreNLP are
                    used instead of stems (computed by nltk), defaults to
                    True.
                stemming (str): the language of the stemming (if used),
                    defaults to porter.
        """

        parse = MinimalCoreNLPParser(self.input_file)

        for i, sentence in enumerate(parse.sentences):
            s = Sentence(words=sentence["words"])
            s.POS = sentence["POS"]
            # use the CoreNLP lemmas or stem the tokens with nltk
            if use_lemmas:
                s.stems = [t.lower() for t in sentence["lemmas"]]
            else:
                s.stems = [stemmer(stemming).stem(t.lower()) for t in s.words]
            self.sentences.append(s)

    def get_n_best(self, n=10):
        """ Returns the n-best candidates given the weights. """

        best = sorted(self.weights, key=self.weights.get, reverse=True)
        return [(u, self.weights[u]) for u in best[:min(n, len(best))]]

    def ngram_selection(self, n=3):
        """ Select all the n-grams and populate the candidate container.

            Args:
                n (int): the n-gram length, defaults to 3.
        """

        for i, sentence in enumerate(self.sentences):

            # maximum n-gram length and offset shift for the current sentence
            skip = min(n, sentence.length)
            shift = sum([s.length for s in self.sentences[0:i]])

            # generate the n-grams starting at each position j
            for j in range(sentence.length):
                for k in range(j+1, min(j+1+skip, sentence.length+1)):

                    surface_form = sentence.words[j:k]
                    norm_form = sentence.stems[j:k]
                    lex_form = ' '.join(norm_form)

                    self.candidates[lex_form].surface_forms.append(surface_form)
                    self.candidates[lex_form].lexical_form = norm_form
                    self.candidates[lex_form].offsets.append(shift+j)

    def sequence_selection(self, pos=['NN', 'NNS', 'NNP', 'NNPS',
                                      'JJ', 'JJR', 'JJS']):
        """ Select the longest sequences of words whose POS tags are in a
            given set and populate the candidate container.

            Args:
                pos (list): the set of valid POS tags, defaults to nouns and
                    adjectives (Penn Treebank tagset).
        """

        for i, sentence in enumerate(self.sentences):

            shift = sum([s.length for s in self.sentences[0:i]])
            seq = []

            for j in range(sentence.length):

                # add candidate offset in sequence and continue if not last word
                if sentence.POS[j] in pos:
                    seq.append(j)
                    if j < (sentence.length - 1):
                        continue

                # add candidate
                if seq:
                    surface_form = sentence.words[seq[0]:seq[-1]+1]
                    norm_form = sentence.stems[seq[0]:seq[-1]+1]
                    lex_form = ' '.join(norm_form)
                    self.candidates[lex_form].surface_forms.append(surface_form)
                    self.candidates[lex_form].lexical_form = norm_form
                    self.candidates[lex_form].offsets.append(shift+j)

                # flush sequence container
                seq = []

    def candidate_filtering(self, stoplist=[]):
        """ Filter the candidates containing strings from the stoplist.

            Args:
                stoplist (list): list of strings, defaults to empty.
        """

        # .items() returns a copy in Python 2, so deleting entries while
        # iterating is safe here
        for k, v in self.candidates.items():
            words = [u.lower() for u in v.surface_forms[0]]
            if set(words).intersection(stoplist):
                del self.candidates[k]
Binary file not shown.
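To show how the pieces of LoadFile fit together, here is a minimal, hypothetical extractor built on top of it: it selects the noun/adjective sequences, filters them against an NLTK stoplist, and weights each candidate by its raw frequency. The class name FrequencyRank, the frequency weighting, and the pke.base import path are illustrative assumptions, not part of this commit.

    # -*- coding: utf-8 -*-
    # Toy sketch (not part of this commit): a frequency-based extractor
    # built on the LoadFile base class.

    import sys

    from nltk.corpus import stopwords
    from pke.base import LoadFile


    class FrequencyRank(LoadFile):
        """ Toy model weighting each candidate by its number of occurrences. """

        def candidate_selection(self):
            # keep the longest sequences of nouns and adjectives
            self.sequence_selection()
            # drop candidates that contain an English stopword
            self.candidate_filtering(stoplist=stopwords.words('english'))

        def candidate_weighting(self):
            # the weight of a candidate is its frequency in the document
            for lex_form, candidate in self.candidates.items():
                self.weights[lex_form] = len(candidate.surface_forms)


    if __name__ == '__main__':
        doc = FrequencyRank(input_file=sys.argv[1])
        doc.read_corenlp_document(use_lemmas=False, stemming='porter')
        doc.candidate_selection()
        doc.candidate_weighting()
        for keyphrase, score in doc.get_n_best(n=10):
            print (keyphrase.encode('utf-8'))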