Skip to content

Commit

Permalink
buildtagger
Browse files Browse the repository at this point in the history
  • Loading branch information
elishowk committed Mar 17, 2011
1 parent c8446be commit ff8bab1
Showing 1 changed file with 142 additions and 0 deletions.
142 changes: 142 additions & 0 deletions buildtagger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Copyright (C) 2010 elishowk
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__author__="Elias Showk"

# warning : nltk imports it's own copy of pyyaml
import nltk.corpus
import nltk.tag
import itertools
import string
#from nltk.tag import brill
from nltk import pos_tag
import cPickle as pickle

import logging
logging.basicConfig(level=logging.DEBUG, format="%(levelname)-8s %(message)s")

class SequentialPosTagger():
"""
integration of nltk Part Of Speech taggers
static posTag method implemets default MaxentClassifier algorithm tagger
needs nltk data 'taggers/maxent_treebank_pos_tagger/english.pickle'
otherwise, instance of this class provides a self trained and composite tagger
thanks to http://streamhacker.com/2010/04/12/pos-tag-nltk-brill-classifier/
needs nltk data 'corpus/brown' and 'corpus/conll2000'
"""
# default word patterns for the RegexpTagger
word_patterns = [
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
(r'.*ould$', 'MD'),
(r'.*ing$', 'VBG'),
(r'.*ed$', 'VBD'),
(r'.*ness$', 'NN'),
(r'.*ment$', 'NN'),
(r'.*ful$', 'JJ'),
(r'.*ious$', 'JJ'),
(r'.*ble$', 'JJ'),
(r'.*ic$', 'JJ'),
(r'.*ive$', 'JJ'),
(r'.*ic$', 'JJ'),
(r'.*est$', 'JJ'),
(r'^a$', 'PREP'),
(r'['+string.punctuation+']', 'PUN'),
]

def __init__(self, training_corpus_size, trained_pickle):
"""
Prepares the hybrid tagger
Composes training sentences
"""
if trained_pickle is not None:
self._loading(training_corpus_size, trained_pickle)
else:
self._training(training_corpus_size, trained_pickle)

def _loading(self, training_corpus_size, trained_pickle):
try:
self.tagger = pickle.load(file(trained_pickle,'r'))
except pickle.PickleError, pickerr:
_logger.warning("%s"%pickerr)
self._training(training_corpus_size, trained_pickle)
except IOError, ioerr:
_logger.warning("Pickled tagger %s not found, will train its own tagger"%trained_pickle)
self._training(training_corpus_size, trained_pickle)

def _training(self, training_corpus_size, trained_pickle):
# get the training sentences
_logger.debug( "Training and saving a new tagger" )
if training_corpus_size is None:
brown_train = list(nltk.corpus.brown.tagged_sents())
conll_train = list(nltk.corpus.conll2000.tagged_sents())
else:
brown_train = list(nltk.corpus.brown.tagged_sents()[:training_corpus_size])
conll_train = list(nltk.corpus.conll2000.tagged_sents()[:training_corpus_size])

train_sents = list(itertools.chain( brown_train, conll_train ))
# base tagger classes for initial tagger
tagger_classes = [nltk.tag.AffixTagger, nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger]

backoff = nltk.tag.RegexpTagger(self.word_patterns)
self.tagger = self._backoff(train_sents, tagger_classes, backoff=backoff)
pickle.dump(self.tagger, file(trained_pickle,'w+'))


def _backoff(self, tagged_sents, tagger_classes, backoff=None):
"""Init backoff classes, takes a lot of time to train..."""
if not backoff:
backoff = tagger_classes[0](tagged_sents)
del tagger_classes[0]

for cls in tagger_classes:
tagger = cls(tagged_sents, backoff=backoff)
backoff = tagger

return backoff

@staticmethod
def posTag( tokens ):
"""
ALTERNATE TAGGER
gives access to the default nltk pos tagger (slower but smarter)
each tokens becomes ['token','TAG']
"""
return map( list, pos_tag( tokens ) )

def _normalize(self,tagtok):
"""returns an empty string when no tag was found"""
tagtok = list(tagtok)
if tagtok[1] is None:
tagtok[1] = '?'
return tagtok

def tag(self, tokens):
"""returns the tagged sentence using trained self.tagger"""
tag_tokens = map(
self._normalize,
self.tagger.tag( tokens )
)
return tag_tokens

@staticmethod
def getContent( sentence ):
"""return words from a tagged list like [["the","DET"],["python","NN"]]"""
return [tagged[0] for tagged in sentence]

@staticmethod
def getTag( sentence ):
"""return TAGS from a tagged list like [["the","DET"],["python","NN"]]"""
return [tagged[1] for tagged in sentence]

0 comments on commit ff8bab1

Please sign in to comment.