Refactored some files to be more PEP8 compliant

Using PyCharm's automatic code inspection
ygorg committed Oct 26, 2018
1 parent 0bd3c9d commit 8f99855ab139675587399e335ea07f91b233625a
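
The hunks below are mostly whitespace and import-ordering fixes of the kind PyCharm's PEP 8 inspection reports. As a purely illustrative, runnable sketch (not part of the commit), the dominant pattern is adding spaces around binary operators:

# illustrative only: PEP 8 spacing around operators, as applied throughout this diff
shift, j, seq, bias = 10, 3, [4, 5], 1
offset = shift + j - len(seq) + bias  # previously written as shift+j-len(seq)+bias
print(offset)  # 12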
@@ -17,7 +17,6 @@
sys.path.insert(0, os.path.abspath('../../'))
import pke


# -- Project information -----------------------------------------------------

project = 'pke'
@@ -166,4 +165,4 @@
# -- Options for todo extension ----------------------------------------------

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
todo_include_todos = True
@@ -1,11 +1,10 @@
# -*- coding: utf-8 -*-

import os
import logging
import sys
from string import punctuation

from pke import compute_document_frequency
from string import punctuation

# setting info in terminal
logging.basicConfig(level=logging.INFO)
@@ -23,10 +22,10 @@
# compute idf weights
compute_document_frequency(input_dir=input_dir,
output_file=output_file,
format="corenlp", # input files format
use_lemmas=False, # do not use Stanford lemmas
stemmer="porter", # use porter stemmer
stoplist=stoplist, # stoplist
delimiter='\t', # tab separated output
extension='xml', # input files extension
n=5) # compute n-grams up to 5-grams
format="corenlp", # input files format
use_lemmas=False, # do not use Stanford lemmas
stemmer="porter", # use porter stemmer
stoplist=stoplist, # stoplist
delimiter='\t', # tab separated output
extension='xml', # input files extension
n=5) # compute n-grams up to 5-grams
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-

import os
import logging
import sys

@@ -22,8 +21,8 @@
compute_lda_model(input_dir=input_dir,
output_file=output_file,
n_topics=n_topics,
format="corenlp", # input files format
extension='xml', # input files extension
use_lemmas=False, # do not use Stanford lemmas
stemmer="porter", # use porter stemmer
language="english") # language for the stop_words
format="corenlp", # input files format
extension='xml', # input files extension
use_lemmas=False, # do not use Stanford lemmas
stemmer="porter", # use porter stemmer
language="english") # language for the stop_words
@@ -15,8 +15,8 @@

# select the keyphrase candidates, for TopicRank the longest sequences of
# nouns and adjectives
extractor.candidate_selection(pos=['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR',
'JJS'])
extractor.candidate_selection(pos={'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR',
'JJS'})

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
@@ -26,4 +26,4 @@

# print the n-highest (10) scored candidates
for (keyphrase, score) in extractor.get_n_best(n=10):
print(keyphrase, score)
print(keyphrase, score)
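
One change in this example goes beyond spacing: the pos argument to candidate_selection becomes a set instead of a list. Assuming the selector only tests whether each tag belongs to the allowed part-of-speech tags, a set is the idiomatic container; a minimal, self-contained illustration (not from the commit):

# illustrative only: membership testing is all that is needed, so a set of POS tags suffices
valid_pos = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'}
tags = ['DT', 'JJ', 'NN', 'VBZ']
print([t for t in tags if t in valid_pos])  # ['JJ', 'NN']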
@@ -1,9 +1,8 @@
# -*- coding: utf-8 -*-

import os
import sys
import codecs
import logging
import sys

import pke

# setting info in terminal
@@ -17,7 +16,7 @@

# path to the df file
df_file = sys.argv[3]
logging.info('loading df counts from '+df_file)
logging.info('loading df counts from ' + df_file)
df_counts = pke.load_document_frequency_file(df_file, delimiter='\t')

# path to the model, saved as a pickle
@@ -32,4 +31,4 @@
stemmer="porter",
model=pke.supervised.Kea(),
language='english',
extension="xml")
extension="xml")
@@ -18,11 +18,11 @@

# load the df counts
df_counts = pke.load_document_frequency_file(input_file="df.tsv.gz",
delimiter='\t')
delimiter='\t')

# weight the candidates using Kea model.
extractor.candidate_weighting(model_file="model.pickle", df=df_counts)

# print the n-highest (10) scored candidates
for (keyphrase, score) in extractor.get_n_best(n=10):
print(keyphrase, score)
print(keyphrase, score)
@@ -1,9 +1,7 @@
# -*- coding: utf-8 -*-

import os
import sys
import codecs
import logging

import pke

# setting info in terminal
@@ -19,7 +17,7 @@
df_file = "df.tsv.gz"
logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file=df_file,
delimiter='\t')
delimiter='\t')

# path to the model, saved as a pickle
output_mdl = "model.pickle"
@@ -33,4 +31,4 @@
stemmer="porter",
model=pke.supervised.Kea(),
language='english',
extension="xml")
extension="xml")
@@ -106,9 +106,9 @@ def is_redundant(self, candidate, prev, mininum_length=1):
prev = [self.candidates[u].lexical_form for u in prev]

# loop through the already selected candidates
for prev_candidate in prev:
for i in range(len(prev_candidate)-len(candidate)+1):
if candidate == prev_candidate[i:i+len(candidate)]:
for prev_candidate in prev:
for i in range(len(prev_candidate) - len(candidate) + 1):
if candidate == prev_candidate[i:i + len(candidate)]:
return True
return False

@@ -159,9 +159,9 @@ def get_n_best(self, n=10, redundancy_removal=False, stemming=False):
self.weights[u]) for u in best[:min(n, len(best))]]

if len(n_best) < n:
logging.warning(
'Not enough candidates to choose from '
'({} requested, {} given)'.format(n, len(n_best)))
logging.warning(
'Not enough candidates to choose from '
'({} requested, {} given)'.format(n, len(n_best)))

# return the list of best candidates
return n_best
@@ -213,13 +213,12 @@ def ngram_selection(self, n=3):

# generate the ngrams
for j in range(sentence.length):
for k in range(j+1, min(j+1+skip, sentence.length+1)):

for k in range(j + 1, min(j + 1 + skip, sentence.length + 1)):
# add the ngram to the candidate container
self.add_candidate(words=sentence.words[j:k],
stems=sentence.stems[j:k],
pos=sentence.pos[j:k],
offset=shift+j,
offset=shift + j,
sentence_id=i)

def longest_pos_sequence_selection(self, valid_pos=None):
@@ -265,10 +264,10 @@ def longest_sequence_selection(self, key, valid_values):
bias = 1

# add the ngram to the candidate container
self.add_candidate(words=sentence.words[seq[0]:seq[-1]+1],
stems=sentence.stems[seq[0]:seq[-1]+1],
pos=sentence.pos[seq[0]:seq[-1]+1],
offset=shift+j-len(seq)+bias,
self.add_candidate(words=sentence.words[seq[0]:seq[-1] + 1],
stems=sentence.stems[seq[0]:seq[-1] + 1],
pos=sentence.pos[seq[0]:seq[-1] + 1],
offset=shift + j - len(seq) + bias,
sentence_id=i)

# flush sequence container
@@ -318,13 +317,14 @@ def grammar_selection(self, grammar=None):
last = int(leaves[-1][0])

# add the NP to the candidate container
self.add_candidate(words=sentence.words[first:last+1],
stems=sentence.stems[first:last+1],
pos=sentence.pos[first:last+1],
offset=shift+first,
self.add_candidate(words=sentence.words[first:last + 1],
stems=sentence.stems[first:last + 1],
pos=sentence.pos[first:last + 1],
offset=shift + first,
sentence_id=i)

def _is_alphanum(self, word, valid_punctuation_marks='-'):
@staticmethod
def _is_alphanum(word, valid_punctuation_marks='-'):
"""Check if a word is valid, i.e. it contains only alpha-numeric
characters and valid punctuation marks.
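
Another non-cosmetic change in this hunk: _is_alphanum becomes a @staticmethod, which fits since it never reads instance state. A hedged sketch of how it can then be called directly on the class (assuming, as in pke, that this helper lives on the LoadFile base class in pke/base.py):

# illustrative only: a staticmethod can be called without instantiating the class
from pke.base import LoadFile
print(LoadFile._is_alphanum('state-of-the-art'))  # True: letters plus the default '-' mark
print(LoadFile._is_alphanum('n/a'))               # False: '/' is not a valid punctuation mark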
@@ -403,4 +403,3 @@ def candidate_filtering(self,
if only_alphanum and k in self.candidates:
if not all([self._is_alphanum(w) for w in words]):
del self.candidates[k]

@@ -5,4 +5,4 @@

from pke.supervised.api import SupervisedLoadFile
from pke.supervised.feature_based.kea import Kea
from pke.supervised.feature_based.wingnus import WINGNUS
from pke.supervised.feature_based.wingnus import WINGNUS
@@ -7,12 +7,12 @@

import os
import six
import pickle

from pke.base import LoadFile
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib


class SupervisedLoadFile(LoadFile):
""" The SupervisedLoadFile class that provides extra base functions for
supervised models. """
@@ -25,7 +25,6 @@ def __init__(self):
self.instances = {}
""" The instances container. """


def feature_scaling(self):
""" Scale features to [0,1]. """

@@ -35,12 +34,10 @@ def feature_scaling(self):
for i, candidate in enumerate(candidates):
self.instances[candidate] = X[i]


def feature_extraction(self):
""" Skeletton for feature extraction. """
pass


def classify_candidates(self, model=None):
""" Classify the candidates as keyphrase or not keyphrase.
@@ -55,10 +52,10 @@ def classify_candidates(self, model=None):
# model = os.path.join(self._models, instance+"-semeval2010.pickle")
if six.PY2:
model = os.path.join(self._models,
instance+"-semeval2010.py2.pickle")
instance + "-semeval2010.py2.pickle")
else:
model = os.path.join(self._models,
instance+"-semeval2010.py3.pickle")
instance + "-semeval2010.py3.pickle")

# load the model
clf = joblib.load(model)
@@ -75,10 +72,8 @@ def classify_candidates(self, model=None):
for i, candidate in enumerate(candidates):
self.weights[candidate] = y[i][1]


def candidate_weighting(self):
""" Extract features and classify candidates with default parameters."""

self.feature_extraction()
self.classify_candidates()

@@ -1,2 +1,2 @@
# -*- coding: utf-8 -*-
# Python Keyphrase Extraction toolkit: supervised feature-based ranking models
# Python Keyphrase Extraction toolkit: supervised feature-based ranking models
@@ -17,17 +17,15 @@
from __future__ import division
from __future__ import print_function

from pke.supervised.api import SupervisedLoadFile
from pke.utils import load_document_frequency_file

import math
import string
import numpy as np

from nltk.corpus import stopwords

from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.externals import joblib
from sklearn.naive_bayes import MultinomialNB

from pke.supervised.api import SupervisedLoadFile
from pke.utils import load_document_frequency_file


class Kea(SupervisedLoadFile):
@@ -99,7 +97,6 @@ def candidate_selection(self, stoplist=None, **kwargs):
if words[0] in stoplist or words[-1] in stoplist:
del self.candidates[k]


def feature_extraction(self, df=None, training=False):
"""Extract features (tf*idf, first occurrence and length) for each
candidate.
@@ -137,12 +134,11 @@ def feature_extraction(self, df=None, training=False):

# add the features to the instance container
self.instances[k] = np.array([len(v.surface_forms) * idf,
v.offsets[0]/maximum_offset])
v.offsets[0] / maximum_offset])

# scale features
self.feature_scaling()


def candidate_weighting(self, model_file=None, df=None):
"""Extract features and classify candidates.
@@ -151,11 +147,10 @@ def candidate_weighting(self, model_file=None, df=None):
df (dict): document frequencies, the number of documents should
be specified using the "--NB_DOC--" key.
"""

self.feature_extraction(df=df)
self.classify_candidates(model=model_file)


@staticmethod
def train(training_instances, training_classes, model_file):
""" Train a Naive Bayes classifier and store the model in a file.
@@ -169,4 +164,3 @@ def train(training_instances, training_classes, model_file):
clf = MultinomialNB()
clf.fit(training_instances, training_classes)
joblib.dump(clf, model_file)
