From 4ee2a2e59045350b2d2b051712f308b6558e5e35 Mon Sep 17 00:00:00 2001 From: Piotr Kasprzyk Date: Mon, 11 Apr 2016 11:22:10 +0200 Subject: [PATCH 1/2] pep8 + reformatting on pl196x.py --- nltk/corpus/reader/pl196x.py | 517 ++++++++++++++++++----------------- 1 file changed, 264 insertions(+), 253 deletions(-) diff --git a/nltk/corpus/reader/pl196x.py b/nltk/corpus/reader/pl196x.py index e782785a67..881d68235d 100644 --- a/nltk/corpus/reader/pl196x.py +++ b/nltk/corpus/reader/pl196x.py @@ -5,274 +5,285 @@ # URL: # For license information, see LICENSE.TXT -import os -import re - -from nltk import compat -from nltk import tokenize, tree - -from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * from nltk.corpus.reader.xmldocs import XMLCorpusReader -# (?:something) -- non-capturing parentheses! PARA = re.compile(r']*){0,1}>(.*?)

') SENT = re.compile(r']*){0,1}>(.*?)') TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)') -WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)') +WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)') TYPE = re.compile(r'type="(.*?)"') -ANA = re.compile(r'ana="(.*?)"') +ANA = re.compile(r'ana="(.*?)"') TEXTID = re.compile(r'text id="(.*?)"') class TEICorpusView(StreamBackedCorpusView): - def __init__(self, corpus_file, - tagged, group_by_sent, group_by_para, - tagset=None, headLen=0, textids=None): - self._tagged = tagged - self._textids = textids - - self._group_by_sent = group_by_sent - self._group_by_para = group_by_para - # WARNING -- skip header - StreamBackedCorpusView.__init__(self, corpus_file, startpos=headLen) - - _pagesize = 4096 - - def read_block(self, stream): - block = stream.readlines(self._pagesize) - block = concat(block) - while (block.count(' block.count('')) \ - or block.count('')+len('') - block = block[ :beg]+block[beg+end: ] - - output = [] - for para_str in PARA.findall(block): - para = [] - for sent_str in SENT.findall(para_str): - if not self._tagged: - sent = WORD.findall(sent_str) - else: - sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str))) - if self._group_by_sent: - para.append(sent) - else: - para.extend(sent) - if self._group_by_para: - output.append(para) - else: - output.extend(para) - return output - - def _parse_tag(self, tag_word_tuple): - (tag, word) = tag_word_tuple - if tag.startswith('w'): - tag = ANA.search(tag).group(1) - else: # tag.startswith('c') - tag = TYPE.search(tag).group(1) - return (word, tag) + def __init__(self, corpus_file, + tagged, group_by_sent, group_by_para, + tagset=None, head_len=0, textids=None): + + self._tagged = tagged + self._textids = textids + + self._group_by_sent = group_by_sent + self._group_by_para = group_by_para + # WARNING -- skip header + StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len) + + _pagesize = 4096 + + def read_block(self, stream): + block = stream.readlines(self._pagesize) + block = concat(block) + while (block.count(' block.count('')) \ + or block.count('') + len('') + block = block[:beg] + block[beg + end:] + + output = [] + for para_str in PARA.findall(block): + para = [] + for sent_str in SENT.findall(para_str): + if not self._tagged: + sent = WORD.findall(sent_str) + else: + sent = list( + map(self._parse_tag, TAGGEDWORD.findall(sent_str))) + if self._group_by_sent: + para.append(sent) + else: + para.extend(sent) + if self._group_by_para: + output.append(para) + else: + output.extend(para) + return output + + def _parse_tag(self, tag_word_tuple): + (tag, word) = tag_word_tuple + if tag.startswith('w'): + tag = ANA.search(tag).group(1) + else: # tag.startswith('c') + tag = TYPE.search(tag).group(1) + return word, tag class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader): - - headLen = 2770 - - def __init__(self, *args, **kwargs): - if 'textid_file' in kwargs: self._textids = kwargs['textid_file'] - else: self._textids = None - - XMLCorpusReader.__init__(self, *args) - CategorizedCorpusReader.__init__(self, kwargs) - - self._init_textids() - - def _init_textids(self): - self._f2t = defaultdict(list) - self._t2f = defaultdict(list) - if self._textids is not None: - for line in self.open(self._textids).readlines(): - line = line.strip() - file_id, text_ids = line.split(' ', 1) - if file_id not in self.fileids(): - raise ValueError('In text_id mapping file %s: %s ' - 'not found' % (catfile, file_id)) - for text_id in text_ids.split(self._delimiter): - self._add_textids(file_id, text_id) - - def _add_textids(self, file_id, text_id): - self._f2t[file_id].append(text_id) - self._t2f[text_id].append(file_id) - - def _resolve(self, fileids, categories, textids=None): - tmp = None - if fileids is not None: - if not tmp: - tmp = fileids, None - else: - raise ValueError('Specify only fileids, categories or textids') - if categories is not None: - if not tmp: - tmp = self.fileids(categories), None - else: - raise ValueError('Specify only fileids, categories or textids') - if textids is not None: - if not tmp: - if isinstance(textids, compat.string_types): textids = [textids] - files = sum((self._t2f[t] for t in textids), []) - tdict = dict() - for f in files: - tdict[f] = (set(self._f2t[f]) & set(textids)) - tmp = files, tdict - else: - raise ValueError('Specify only fileids, categories or textids') - return None, None - - def decode_tag(self, tag): - # to be implemented - return tag - - def textids(self, fileids=None, categories=None): - """ - In the pl196x corpus each category is stored in single - file and thus both methods provide identical functionality. In order - to accommodate finer granularity, a non-standard textids() method was - implemented. All the main functions can be supplied with a list - of required chunks---giving much more control to the user. - """ - fileids, _ = self._resolve(fileids, categories) - if fileids is None: return sorted(self._t2f) - - if isinstance(fileids, compat.string_types): - fileids = [fileids] - return sorted(sum((self._f2t[d] for d in fileids), [])) - - def words(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: fileids = self._fileids - elif isinstance(fileids, compat.string_types): fileids = [fileids] - - if textids: - return concat([TEICorpusView(self.abspath(fileid), - False, False, False, - headLen=self.headLen, - textids=textids[fileid]) - for fileid in fileids]) - else: - return concat([TEICorpusView(self.abspath(fileid), - False, False, False, - headLen=self.headLen) - for fileid in fileids]) - - def sents(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: fileids = self._fileids - elif isinstance(fileids, compat.string_types): fileids = [fileids] - - if textids: - return concat([TEICorpusView(self.abspath(fileid), - False, True, False, - headLen=self.headLen, - textids=textids[fileid]) - for fileid in fileids]) - else: - return concat([TEICorpusView(self.abspath(fileid), - False, True, False, - headLen=self.headLen) - for fileid in fileids]) - - def paras(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: fileids = self._fileids - elif isinstance(fileids, compat.string_types): fileids = [fileids] - - if textids: - return concat([TEICorpusView(self.abspath(fileid), - False, True, True, - headLen=self.headLen, - textids=textids[fileid]) - for fileid in fileids]) - else: - return concat([TEICorpusView(self.abspath(fileid), - False, True, True, - headLen=self.headLen) - for fileid in fileids]) - - def tagged_words(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: fileids = self._fileids - elif isinstance(fileids, compat.string_types): fileids = [fileids] - - if textids: - return concat([TEICorpusView(self.abspath(fileid), - True, False, False, - headLen=self.headLen, - textids=textids[fileid]) - for fileid in fileids]) - else: - return concat([TEICorpusView(self.abspath(fileid), - True, False, False, - headLen=self.headLen) - for fileid in fileids]) - - def tagged_sents(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: fileids = self._fileids - elif isinstance(fileids, compat.string_types): fileids = [fileids] - - if textids: - return concat([TEICorpusView(self.abspath(fileid), - True, True, False, - headLen=self.headLen, - textids=textids[fileid]) - for fileid in fileids]) - else: - return concat([TEICorpusView(self.abspath(fileid), - True, True, False, - headLen=self.headLen) - for fileid in fileids]) - - def tagged_paras(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: fileids = self._fileids - elif isinstance(fileids, compat.string_types): fileids = [fileids] - - if textids: - return concat([TEICorpusView(self.abspath(fileid), - True, True, True, - headLen=self.headLen, - textids=textids[fileid]) - for fileid in fileids]) - else: - return concat([TEICorpusView(self.abspath(fileid), - True, True, True, - headLen=self.headLen) - for fileid in fileids]) - - def xml(self, fileids=None, categories=None): - fileids, _ = self._resolve(fileids, categories) - if len(fileids) == 1: return XMLCorpusReader.xml(self, fileids[0]) - else: raise TypeError('Expected a single file') - - def raw(self, fileids=None, categories=None): - fileids, _ = self._resolve(fileids, categories) - if fileids is None: fileids = self._fileids - elif isinstance(fileids, compat.string_types): fileids = [fileids] - return concat([self.open(f).read() for f in fileids]) - + head_len = 2770 + + def __init__(self, *args, **kwargs): + if 'textid_file' in kwargs: + self._textids = kwargs['textid_file'] + else: + self._textids = None + + XMLCorpusReader.__init__(self, *args) + CategorizedCorpusReader.__init__(self, kwargs) + + self._init_textids() + + def _init_textids(self): + self._f2t = defaultdict(list) + self._t2f = defaultdict(list) + if self._textids is not None: + for line in self.open(self._textids).readlines(): + line = line.strip() + file_id, text_ids = line.split(' ', 1) + if file_id not in self.fileids(): + raise ValueError('In text_id mapping file %s: %s ' + 'not found' % (catfile, file_id)) + for text_id in text_ids.split(self._delimiter): + self._add_textids(file_id, text_id) + + def _add_textids(self, file_id, text_id): + self._f2t[file_id].append(text_id) + self._t2f[text_id].append(file_id) + + def _resolve(self, fileids, categories, textids=None): + tmp = None + if fileids is not None: + if not tmp: + tmp = fileids, None + else: + raise ValueError('Specify only fileids, categories or textids') + if categories is not None: + if not tmp: + tmp = self.fileids(categories), None + else: + raise ValueError('Specify only fileids, categories or textids') + if textids is not None: + if not tmp: + if isinstance(textids, compat.string_types): + textids = [textids] + files = sum((self._t2f[t] for t in textids), []) + tdict = dict() + for f in files: + tdict[f] = (set(self._f2t[f]) & set(textids)) + tmp = files, tdict + else: + raise ValueError('Specify only fileids, categories or textids') + return None, None + + def decode_tag(self, tag): + # to be implemented + return tag + + def textids(self, fileids=None, categories=None): + """ + In the pl196x corpus each category is stored in single + file and thus both methods provide identical functionality. In order + to accommodate finer granularity, a non-standard textids() method was + implemented. All the main functions can be supplied with a list + of required chunks---giving much more control to the user. + """ + fileids, _ = self._resolve(fileids, categories) + if fileids is None: return sorted(self._t2f) + + if isinstance(fileids, compat.string_types): + fileids = [fileids] + return sorted(sum((self._f2t[d] for d in fileids), [])) + + def words(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, compat.string_types): + fileids = [fileids] + + if textids: + return concat([TEICorpusView(self.abspath(fileid), + False, False, False, + head_len=self.head_len, + textids=textids[fileid]) + for fileid in fileids]) + else: + return concat([TEICorpusView(self.abspath(fileid), + False, False, False, + head_len=self.head_len) + for fileid in fileids]) + + def sents(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, compat.string_types): + fileids = [fileids] + + if textids: + return concat([TEICorpusView(self.abspath(fileid), + False, True, False, + head_len=self.head_len, + textids=textids[fileid]) + for fileid in fileids]) + else: + return concat([TEICorpusView(self.abspath(fileid), + False, True, False, + head_len=self.head_len) + for fileid in fileids]) + + def paras(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, compat.string_types): + fileids = [fileids] + + if textids: + return concat([TEICorpusView(self.abspath(fileid), + False, True, True, + head_len=self.head_len, + textids=textids[fileid]) + for fileid in fileids]) + else: + return concat([TEICorpusView(self.abspath(fileid), + False, True, True, + head_len=self.head_len) + for fileid in fileids]) + + def tagged_words(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, compat.string_types): + fileids = [fileids] + + if textids: + return concat([TEICorpusView(self.abspath(fileid), + True, False, False, + head_len=self.head_len, + textids=textids[fileid]) + for fileid in fileids]) + else: + return concat([TEICorpusView(self.abspath(fileid), + True, False, False, + head_len=self.head_len) + for fileid in fileids]) + + def tagged_sents(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, compat.string_types): + fileids = [fileids] + + if textids: + return concat([TEICorpusView(self.abspath(fileid), + True, True, False, + head_len=self.head_len, + textids=textids[fileid]) + for fileid in fileids]) + else: + return concat([TEICorpusView(self.abspath(fileid), + True, True, False, + head_len=self.head_len) + for fileid in fileids]) + + def tagged_paras(self, fileids=None, categories=None, textids=None): + fileids, textids = self._resolve(fileids, categories, textids) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, compat.string_types): + fileids = [fileids] + + if textids: + return concat([TEICorpusView(self.abspath(fileid), + True, True, True, + head_len=self.head_len, + textids=textids[fileid]) + for fileid in fileids]) + else: + return concat([TEICorpusView(self.abspath(fileid), + True, True, True, + head_len=self.head_len) + for fileid in fileids]) + + def xml(self, fileids=None, categories=None): + fileids, _ = self._resolve(fileids, categories) + if len(fileids) == 1: + return XMLCorpusReader.xml(self, fileids[0]) + else: + raise TypeError('Expected a single file') + + def raw(self, fileids=None, categories=None): + fileids, _ = self._resolve(fileids, categories) + if fileids is None: + fileids = self._fileids + elif isinstance(fileids, compat.string_types): + fileids = [fileids] + return concat([self.open(f).read() for f in fileids]) From dd878379b406ca80cac8bd9a7a014e2a7cb0baca Mon Sep 17 00:00:00 2001 From: Piotr Kasprzyk Date: Mon, 11 Apr 2016 11:35:33 +0200 Subject: [PATCH 2/2] fixed _resolve return value bug in pl196x.py --- nltk/corpus/reader/pl196x.py | 55 ++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/nltk/corpus/reader/pl196x.py b/nltk/corpus/reader/pl196x.py index 881d68235d..6abcabb046 100644 --- a/nltk/corpus/reader/pl196x.py +++ b/nltk/corpus/reader/pl196x.py @@ -102,14 +102,17 @@ def _init_textids(self): self._f2t = defaultdict(list) self._t2f = defaultdict(list) if self._textids is not None: - for line in self.open(self._textids).readlines(): - line = line.strip() - file_id, text_ids = line.split(' ', 1) - if file_id not in self.fileids(): - raise ValueError('In text_id mapping file %s: %s ' - 'not found' % (catfile, file_id)) - for text_id in text_ids.split(self._delimiter): - self._add_textids(file_id, text_id) + with open(self._textids) as fp: + for line in fp: + line = line.strip() + file_id, text_ids = line.split(' ', 1) + if file_id not in self.fileids(): + raise ValueError( + 'In text_id mapping file %s: %s not found' + % (self._textids, file_id) + ) + for text_id in text_ids.split(self._delimiter): + self._add_textids(file_id, text_id) def _add_textids(self, file_id, text_id): self._f2t[file_id].append(text_id) @@ -117,28 +120,26 @@ def _add_textids(self, file_id, text_id): def _resolve(self, fileids, categories, textids=None): tmp = None + if len(filter(lambda accessor: accessor is None, + (fileids, categories, textids))) != 1: + + raise ValueError('Specify exactly one of: fileids, ' + 'categories or textids') + if fileids is not None: - if not tmp: - tmp = fileids, None - else: - raise ValueError('Specify only fileids, categories or textids') + return fileids, None + if categories is not None: - if not tmp: - tmp = self.fileids(categories), None - else: - raise ValueError('Specify only fileids, categories or textids') + return self.fileids(categories), None + if textids is not None: - if not tmp: - if isinstance(textids, compat.string_types): - textids = [textids] - files = sum((self._t2f[t] for t in textids), []) - tdict = dict() - for f in files: - tdict[f] = (set(self._f2t[f]) & set(textids)) - tmp = files, tdict - else: - raise ValueError('Specify only fileids, categories or textids') - return None, None + if isinstance(textids, compat.string_types): + textids = [textids] + files = sum((self._t2f[t] for t in textids), []) + tdict = dict() + for f in files: + tdict[f] = (set(self._f2t[f]) & set(textids)) + return files, tdict def decode_tag(self, tag): # to be implemented