In [1]:
import string
from nltk.stem.snowball import SnowballStemmer
 

def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))
    
def features(tokens, index, history):
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
 
    # init the stemmer
    stemmer = SnowballStemmer('english')
 
    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)
 
    # shift the index with 2, to accommodate the padding
    index += 2
 
    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]
    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all([True for c in word if c in string.ascii_lowercase])
    isNumeric = word.isdigit()
 
    firstcaps = word == word.capitalize()
    allcaps = word == word.upper()
    capitalized = word[0] in string.ascii_uppercase
 
    prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase
 
    nextallcaps = prevword == prevword.capitalize()
    nextcapitalized = prevword[0] in string.ascii_uppercase
 
    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,
 
        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,
 
        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,
 
        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,
 
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
 
        'prev-iob': previob,
 
        'contains-dash': contains_dash,
        'contains-dot': contains_dot,
        
        'isNumeric': isNumeric,
 
        'all-caps': allcaps,
        'first-caps': firstcaps,
        'capitalized': capitalized,
 
        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,
 
        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
        'tags_since_dt' : tags_since_dt(tokens, index)
    }
 

In [2]:
import pickle
from collections import Iterable
from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI
 
 
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)
 
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger( train=train_sents, feature_detector=features, **kwargs)
 
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
 
        # Transform the result from [((w1, t1), iob1), ...] 
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
 
        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

In [3]:
import os
import collections
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
import string
from nltk.stem.snowball import SnowballStemmer
from nltk import pos_tag, word_tokenize

  
from nltk.corpus import ConllCorpusReader
my_corpus = ConllCorpusReader('.\dataset', '.*\.txt', columntypes=('words', 'pos','chunk'), encoding="utf-8")

type(list(my_corpus.iob_sents('2.txt')))


sents = list(my_corpus.iob_sents())

sentences = []

for sent in sents:
    reader = [((word,tag),iob) for word,tag,iob in sent]
    sentences.append(reader)


print (sentences[:1])



OSError: No such file or directory: 'C:\\Users\\kashyapak\\Documents\\ryerson\\ds8008\\project\\Project-NER\\dataset'

In [48]:
training_samples = sentences[:1]
test_samples = sentences[:2]



In [68]:
chunker = NamedEntityChunker(training_samples)

In [69]:
score = chunker.evaluate([conlltags2tree([(w, t, iob) for (w, t), iob in iobs]) for iobs in test_samples])


In [70]:
print(score.accuracy())

0.926829268292683


In [24]:
word = "2131231"

print(type(word))

allcaps = word == word.capitalize()

print (allcaps, word.capitalize(), word)

word.capitalize()

word.isdigit()

<class 'str'>
True 2131231 2131231


True