Sourcery refactored master branch #1

Merged: 1 commit, Nov 23, 2021
3 changes: 1 addition & 2 deletions langid/examples/_twokenize.py
@@ -294,6 +294,5 @@ def normalizeTextForTagger(text):
 # So the tokens you get back may not exactly correspond to
 # substrings of the original text.
 def tokenizeRawTweetText(text):
-    tokens = tokenize(normalizeTextForTagger(text))
-    return tokens
+    return tokenize(normalizeTextForTagger(text))

27 changes: 8 additions & 19 deletions langid/langid.py
@@ -271,18 +271,16 @@ def instance2fv(self, text):
             statecount[state] += 1
 
         # Update all the productions corresponding to the state
-        for state in statecount:
+        for state, value in statecount.items():
             for index in self.tk_output.get(state, []):
-                arr[index] += statecount[state]
+                arr[index] += value

         return arr
 
     def nb_classprobs(self, fv):
         # compute the partial log-probability of the document given each class
         pdc = np.dot(fv,self.nb_ptc)
         # compute the partial log-probability of the document in each class
-        pd = pdc + self.nb_pc
-        return pd
+        return pdc + self.nb_pc

     def classify(self, text):
         """
@@ -394,7 +392,7 @@ def application(environ, start_response):
         # Catch shift_path_info's failure to handle empty paths properly
         path = ''
 
-    if path == 'detect' or path == 'rank':
+    if path in {'detect', 'rank'}:
         data = None
 
         # Extract the data component from different access methods
@@ -445,7 +443,7 @@ def application(environ, start_response):
         headers = [('Content-type', 'text/html; charset=utf-8')] # HTTP Headers
         start_response(status, headers)
         return [query_form.format(**environ)]
 
     else:
         # Incorrect URL
         status = '404 Not Found'
@@ -502,12 +500,7 @@ def _process(text):
"""
Set up a local function to do output, configured according to our settings.
"""
if options.dist:
payload = identifier.rank(text)
else:
payload = identifier.classify(text)

return payload
return identifier.rank(text) if options.dist else identifier.classify(text)


if options.url:
@@ -567,12 +560,8 @@ def _process(text):
 def generate_paths():
     for line in sys.stdin:
         path = line.strip()
-        if path:
-            if os.path.isfile(path):
-                yield path
-            else:
-                # No such path
-                pass
+        if path and os.path.isfile(path):
+            yield path

writer = csv.writer(sys.stdout)
pool = mp.Pool()
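The langid.py hunks above combine several small rewrites: iterating over `statecount.items()` instead of re-indexing by key, testing the request path against a set literal, and returning a conditional expression instead of a temporary. A minimal standalone sketch of these patterns, with invented names (`counts`, `weights`, `is_service_path`, `process`) that are not from the codebase:

```python
# Illustrative only: toy data standing in for the model's state counts.
counts = {"s1": 2, "s2": 3}
weights = {"s1": 0.5, "s2": 1.5}

# Iterate over key/value pairs directly rather than indexing counts[key] again.
total = sum(weights[key] * value for key, value in counts.items())

# Membership test against a set literal instead of chained '==' comparisons.
def is_service_path(path):
    return path in {"detect", "rank"}

# Return the conditional expression directly instead of assigning a temporary.
def process(text, ranked=False):
    return sorted(text) if ranked else len(text)

print(total, is_service_path("rank"), process("abc"))  # 5.5 True 3
```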
4 changes: 2 additions & 2 deletions langid/train/DFfeatureselect.py
@@ -108,10 +108,10 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P
     # Work out the set of features to compute IG
     features = set()
     for i in range(1, max_order+1):
-        d = dict( (k, doc_count[k]) for k in doc_count if len(k) == i)
+        d = {k: doc_count[k] for k in doc_count if len(k) == i}
         features |= set(sorted(d, key=d.get, reverse=True)[:tokens_per_order])
     features = sorted(features)

     return features


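The DFfeatureselect.py change swaps a `dict(...)` generator call for a dict comprehension inside the per-order selection loop. A small sketch of the same top-k-per-order logic; the document frequencies and the `max_order`/`tokens_per_order` values below are invented for illustration:

```python
from collections import Counter

# Invented document frequencies: n-gram -> number of documents it appears in.
doc_count = Counter({b"a": 10, b"b": 7, b"ab": 5, b"ba": 9, b"abc": 2})
max_order = 3
tokens_per_order = 1

features = set()
for i in range(1, max_order + 1):
    # Dict comprehension: n-grams of length i and their document counts.
    d = {k: doc_count[k] for k in doc_count if len(k) == i}
    # Keep the most frequent n-grams of this order.
    features |= set(sorted(d, key=d.get, reverse=True)[:tokens_per_order])

print(sorted(features))  # [b'a', b'abc', b'ba']
```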
2 changes: 1 addition & 1 deletion langid/train/LDfeatureselect.py
@@ -70,7 +70,7 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False):

     terms = sorted(term_index, key=term_index.get)
     # compile the final feature set
-    selected_features = dict()
+    selected_features = {}
     for lang_id, lang_w in enumerate(ld):
         term_inds = numpy.argsort(lang_w)[-feats_per_lang:]
         selected_features[lang_id] = [terms[t] for t in term_inds]
7 changes: 2 additions & 5 deletions langid/train/NBtrain.py
@@ -251,13 +251,10 @@ def pass_ptc_progress():

 @atexit.register
 def cleanup():
     global outdir
     try:
         shutil.rmtree(outdir)
-    except NameError:
-        pass
-    except OSError:
-        # sometimes we try to clean up files that are not there
+    except (NameError, OSError):
         pass

if __name__ == "__main__":
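The NBtrain.py cleanup handler now catches both exceptions in one clause. A self-contained sketch of the same idea; here `outdir` is deliberately never assigned so the NameError path is exercised at interpreter exit:

```python
import atexit
import shutil

@atexit.register
def cleanup():
    # A single except clause with a tuple replaces two clauses that share a body:
    # NameError if outdir was never assigned, OSError if the directory is already gone.
    try:
        shutil.rmtree(outdir)
    except (NameError, OSError):
        pass
```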
4 changes: 2 additions & 2 deletions langid/train/common.py
@@ -80,7 +80,7 @@ def write_weights(weights, path, sort_by_weight=False):
 def read_weights(path):
     with open(path) as f:
         reader = csv.reader(f)
-        retval = dict()
+        retval = {}
         for row in reader:
             key = eval(row[0])
             #val = numpy.array( map(float,row[1:]) )
@@ -116,7 +116,7 @@ def index(seq):
     @param seq the sequence to index
     @returns a dictionary from item to position in the sequence
     """
-    return dict((k,v) for (v,k) in enumerate(seq))
+    return {k: v for (v,k) in enumerate(seq)}



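The `index()` helper in common.py now uses a dict comprehension over `enumerate()`. A quick check of the rewritten form; the sample list is made up:

```python
def index(seq):
    """Map each item of seq to its position, mirroring common.index()."""
    return {k: v for (v, k) in enumerate(seq)}

# enumerate yields (position, item); swapping the pair gives item -> position.
print(index(["en", "de", "fr"]))  # {'en': 0, 'de': 1, 'fr': 2}
```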
19 changes: 11 additions & 8 deletions langid/train/index.py
@@ -83,16 +83,16 @@ def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=Non
             self.lang_index = defaultdict(Enumerator())
         else:
             # pre-specified lang set
-            self.lang_index = dict((k,v) for v,k in enumerate(langs))
+            self.lang_index = {k: v for v,k in enumerate(langs)}
 
         if domains is None:
             self.domain_index = defaultdict(Enumerator())
         else:
             # pre-specified domain set
-            self.domain_index = dict((k,v) for v,k in enumerate(domains))
+            self.domain_index = {k: v for v,k in enumerate(domains)}
 
         self.coverage_index = defaultdict(set)
-        self.items = list()
+        self.items = []

         if os.path.isdir(root):
             # root supplied was the root of a directory structure
@@ -173,22 +173,25 @@ def index(self, candidates):

     def prune_min_domain(self, min_domain):
         # prune files for all languages that do not occur in at least min_domain
 
         # Work out which languages to reject as they are not present in at least
         # the required number of domains
         lang_domain_count = defaultdict(int)
         for langs in self.coverage_index.values():
             for lang in langs:
                 lang_domain_count[lang] += 1
-        reject_langs = set( l for l in lang_domain_count if lang_domain_count[l] < min_domain)
+        reject_langs = {
+            l
+            for l in lang_domain_count if lang_domain_count[l] < min_domain
+        }
 
         # Remove the languages from the indexer
         if reject_langs:
             #print "reject (<{0} domains): {1}".format(min_domain, sorted(reject_langs))
-            reject_ids = set(self.lang_index[l] for l in reject_langs)
+            reject_ids = {self.lang_index[l] for l in reject_langs}
 
             new_lang_index = defaultdict(Enumerator())
-            lm = dict()
+            lm = {}
             for k,v in self.lang_index.items():
                 if v not in reject_ids:
                     new_id = new_lang_index[k]
7 changes: 3 additions & 4 deletions langid/train/scanner.py
@@ -61,7 +61,7 @@ def from_file(cls, path):
         # tk_output is a mapping from state to a list of feature indices.
         # because of the way the scanner class is written, it needs a mapping
         # from state to the feature itself. We rebuild this here.
-        tk_output_f = dict( (k,[feats[i] for i in v]) for k,v in tk_output.iteritems() )
+        tk_output_f = {k: [feats[i] for i in v] for k,v in tk_output.iteritems()}
         scanner = cls.__new__(cls)
         scanner.__setstate__((tk_nextmove, tk_output_f))
         return scanner
@@ -173,8 +173,7 @@ def search(self, string):
         state = 0
         for letter in map(ord,string):
             state = self.nm_arr[(state << 8) + letter]
-            for key in self.output.get(state, []):
-                yield key
+            yield from self.output.get(state, [])

 def build_scanner(features):
     """
@@ -209,7 +208,7 @@ def index(seq):
     @param seq the sequence to index
     @returns a dictionary from item to position in the sequence
     """
-    return dict((k,v) for (v,k) in enumerate(seq))
+    return {k: v for (v,k) in enumerate(seq)}

if __name__ == "__main__":
parser = argparse.ArgumentParser()
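The `Scanner.search()` rewrite delegates to the per-state output list with `yield from`. A standalone sketch with an invented state/output table; `emit_features`, `toy_output`, and `visited` are illustrative names, not from the codebase:

```python
def emit_features(output, visited):
    """Yield every feature index attached to the visited states."""
    for state in visited:
        # 'yield from' replaces the explicit loop that re-yielded each item.
        yield from output.get(state, [])

toy_output = {1: [10, 11], 3: [12]}
visited = [0, 1, 2, 3]
print(list(emit_features(toy_output, visited)))  # [10, 11, 12]
```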
34 changes: 8 additions & 26 deletions langid/train/tokenize.py
@@ -68,7 +68,7 @@ def __call__(self, seq):
         max_order = self.max_order
         t = tee(seq, max_order)
         for i in xrange(max_order):
-            for j in xrange(i):
+            for _ in xrange(i):
                 # advance iterators, ignoring result
                 t[i].next()
         while True:
@@ -91,7 +91,7 @@ def __call__(self, seq):
         max_order = self.max_order
         t = tee(_seq, max_order)
         for i in xrange(max_order):
-            for j in xrange(i):
+            for _ in xrange(i):
                 # advance iterators, ignoring result
                 t[i].next()
         while True:
@@ -110,12 +110,9 @@ def cleanup():
         if not complete:
             for d in b_dirs:
                 shutil.rmtree(d)
-    except NameError:
+    except (NameError, OSError):
         # Failed before globals defined, nothing to clean
         pass
-    except OSError:
-        # sometimes we try to clean up files that are not there
-        pass

 def setup_pass_tokenize(tokenizer, b_dirs, sample_count, sample_size, term_freq, line_level):
     global __tokenizer, __b_dirs, __sample_count, __sample_size, __term_freq, __line_level
@@ -137,7 +134,7 @@ def pass_tokenize(chunk_items):
     than by document.
     """
     global __maxorder, __b_dirs, __tokenizer, __sample_count, __sample_size, __term_freq, __line_level

     extractor = __tokenizer
     term_lng_freq = defaultdict(lambda: defaultdict(int))
     term_dom_freq = defaultdict(lambda: defaultdict(int))
@@ -152,38 +149,23 @@ def pass_tokenize(chunk_items):
                     offsets = random.sample(xrange(poss), count)
                     for offset in offsets:
                         tokens = extractor(text[offset: offset+__sample_size])
-                        if args.__term_freq:
-                            # Term Frequency
-                            tokenset = Counter(tokens)
-                        else:
-                            # Document Frequency
-                            tokenset = Counter(set(tokens))
+                        tokenset = Counter(tokens) if args.__term_freq else Counter(set(tokens))
                         for token, count in tokenset.iteritems():
                             term_lng_freq[token][lang_id] += count
                             term_dom_freq[token][domain_id] += count
                 elif __line_level:
                     # line-model - each line in a file should be interpreted as a document
                     for line in f:
                         tokens = extractor(line)
-                        if __term_freq:
-                            # Term Frequency
-                            tokenset = Counter(tokens)
-                        else:
-                            # Document Frequency
-                            tokenset = Counter(set(tokens))
+                        tokenset = Counter(tokens) if __term_freq else Counter(set(tokens))
                         for token, count in tokenset.iteritems():
                             term_lng_freq[token][lang_id] += count
                             term_dom_freq[token][domain_id] += count
 
                 else:
                     # whole-document tokenization
                     tokens = extractor(f.read())
-                    if __term_freq:
-                        # Term Frequency
-                        tokenset = Counter(tokens)
-                    else:
-                        # Document Frequency
-                        tokenset = Counter(set(tokens))
+                    tokenset = Counter(tokens) if __term_freq else Counter(set(tokens))
                     for token, count in tokenset.iteritems():
                         term_lng_freq[token][lang_id] += count
                         term_dom_freq[token][domain_id] += count
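The repeated if/else blocks in pass_tokenize() collapse into a conditional expression that chooses between term frequency and document frequency. A small sketch of that choice with an invented token list; `count_tokens` is an illustrative name, not a function from the codebase:

```python
from collections import Counter

def count_tokens(tokens, term_freq):
    """Term frequency counts every occurrence; document frequency counts each token once."""
    return Counter(tokens) if term_freq else Counter(set(tokens))

tokens = ["th", "he", "th"]
print(count_tokens(tokens, term_freq=True))   # Counter({'th': 2, 'he': 1})
print(count_tokens(tokens, term_freq=False))  # each token counted once
```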