In [4]:
import nltk

nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\lmanw\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\lmanw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


True

In [5]:
from nltk.corpus import treebank

sentences = treebank.tagged_sents(tagset='universal')
sentences

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], ...]

In [6]:
tags = set([tag for sentence in treebank.tagged_sents()
           for _, tag in sentence])
print('nb_tags: {} : {}'.format(len(tags), tags))

nb_tags: 46 : {'CC', 'WDT', 'VBZ', 'VBP', 'NNS', 'LS', 'NNP', 'DT', 'VBD', 'RB', '``', 'WRB', 'MD', 'EX', 'TO', 'PRP', 'VBN', '.', 'FW', 'WP$', 'IN', 'RBS', 'NNPS', 'CD', 'VBG', 'WP', 'JJ', 'VB', 'RP', '#', '-LRB-', ':', 'UH', '-NONE-', 'POS', 'NN', 'PDT', 'SYM', "''", 'PRP$', 'JJR', '$', 'JJS', '-RRB-', ',', 'RBR'}


In [12]:
train_test_cutoff = int(0.8 * len(sentences))
training_sentences = sentences[:train_test_cutoff]
test_sentences = sentences[train_test_cutoff:]

train_val_cutoff = int(0.75 * len(training_sentences))
training_sentences = training_sentences[:train_val_cutoff]
validation_sentences = training_sentences[train_val_cutoff:]

In [13]:
def add_basic_features(sentence_terms, index):
    """ Compute some very basic word features.
        :param sentence_terms: [w1, w2, ...] 
        :type sentence_terms: list
        :param index: the index of the word 
        :type index: int
        :return: dict containing features
        :rtype: dict
    """
    term = sentence_terms[index]
    return {
        'nb_terms': len(sentence_terms),
        'term': term,
        'is_first': index == 0,
        'is_last': index == len(sentence_terms) - 1,
        'is_capitalized': term[0].upper() == term[0],
        'is_all_caps': term.upper() == term,
        'is_all_lower': term.lower() == term,
        'prefix-1': term[0],
        'prefix-2': term[:2],
        'prefix-3': term[:3],
        'suffix-1': term[-1],
        'suffix-2': term[-2:],
        'suffix-3': term[-3:],
        'prev_word': '' if index == 0 else sentence_terms[index - 1],
        'next_word': '' if index == len(sentence_terms) - 1 else sentence_terms[index + 1]
    }

In [14]:
def untag(tagged_sentence):
    """ 
    Remove the tag for each tagged term.
    :param tagged_sentence: a POS tagged sentence
    :type tagged_sentence: list
    :return: a list of tags
    :rtype: list of strings
    """
    return [w for w, _ in tagged_sentence]

"""
Append each word with features. 
i.e. word = 'Pierre' =>  { 'term' : 'Pierre', 'feature_1' : '', etc. }
"""
def transform_to_dataset(tagged_sentences):
    """
    Split tagged sentences to X and y datasets and append some basic features.
    :param tagged_sentences: a list of POS tagged sentences
    :param tagged_sentences: list of list of tuples (term_i, tag_i)
    :return: 
    """
    X, y = [], []
    for pos_tags in tagged_sentences:
        for index, (term, class_) in enumerate(pos_tags):
            # Add basic NLP features for each sentence term
            X.append(add_basic_features(untag(pos_tags), index))
            y.append(class_)
    return X, y

In [16]:
X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)
X_val, y_val = transform_to_dataset(validation_sentences)

In [17]:
X_train

[{'nb_terms': 18,
  'term': 'Pierre',
  'is_first': True,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': False,
  'is_all_lower': False,
  'prefix-1': 'P',
  'prefix-2': 'Pi',
  'prefix-3': 'Pie',
  'suffix-1': 'e',
  'suffix-2': 're',
  'suffix-3': 'rre',
  'prev_word': '',
  'next_word': 'Vinken'},
 {'nb_terms': 18,
  'term': 'Vinken',
  'is_first': False,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': False,
  'is_all_lower': False,
  'prefix-1': 'V',
  'prefix-2': 'Vi',
  'prefix-3': 'Vin',
  'suffix-1': 'n',
  'suffix-2': 'en',
  'suffix-3': 'ken',
  'prev_word': 'Pierre',
  'next_word': ','},
 {'nb_terms': 18,
  'term': ',',
  'is_first': False,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': True,
  'is_all_lower': True,
  'prefix-1': ',',
  'prefix-2': ',',
  'prefix-3': ',',
  'suffix-1': ',',
  'suffix-2': ',',
  'suffix-3': ',',
  'prev_word': 'Vinken',
  'next_word': '61'},
 {'nb_terms': 18,
  'term': '61',
  'is_first': False,

In [20]:
from sklearn.feature_extraction import DictVectorizer

# Fit dictvectorizer with our set of features.
dict_vectorizer = DictVectorizer(sparse=False)
dict_vectorizer.fit(X_train + X_test + X_val)

# Convert dict features to vectors
X_train = dict_vectorizer.transform(X_train)
X_test = dict_vectorizer.transform(X_test)
X_val = dict_vectorizer.transform(X_val)

In [22]:
len(X_train)

61014

In [23]:
X_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [24]:
y_train

['NOUN',
 'NOUN',
 '.',
 'NUM',
 'NOUN',
 'ADJ',
 '.',
 'VERB',
 'VERB',
 'DET',
 'NOUN',
 'ADP',
 'DET',
 'ADJ',
 'NOUN',
 'NOUN',
 'NUM',
 '.',
 'NOUN',
 'NOUN',
 'VERB',
 'NOUN',
 'ADP',
 'NOUN',
 'NOUN',
 '.',
 'DET',
 'NOUN',
 'VERB',
 'NOUN',
 '.',
 'NOUN',
 'NOUN',
 '.',
 'NUM',
 'NOUN',
 'ADJ',
 'CONJ',
 'ADJ',
 'NOUN',
 'ADP',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 '.',
 'VERB',
 'VERB',
 'X',
 'DET',
 'ADJ',
 'NOUN',
 'ADP',
 'DET',
 'ADJ',
 'ADJ',
 'NOUN',
 '.',
 'DET',
 'NOUN',
 'ADP',
 'NOUN',
 'ADV',
 'VERB',
 'X',
 'X',
 'PRT',
 'VERB',
 'NOUN',
 'NOUN',
 'NOUN',
 'VERB',
 'VERB',
 'DET',
 'ADJ',
 'NOUN',
 'ADP',
 'NOUN',
 'NOUN',
 'ADP',
 'DET',
 'NOUN',
 'ADP',
 'NOUN',
 'VERB',
 'X',
 'PRT',
 'PRON',
 'ADV',
 'ADP',
 'NUM',
 'NOUN',
 'ADP',
 '.',
 'NOUN',
 'VERB',
 'X',
 'X',
 '.',
 'DET',
 'NOUN',
 'NOUN',
 '.',
 'NOUN',
 '.',
 'VERB',
 'ADV',
 'ADJ',
 'ADP',
 'PRON',
 'VERB',
 'DET',
 'NOUN',
 '.',
 'ADP',
 'ADV',
 'ADJ',
 'NOUN',
 'PRT',
 'PRON',
 'VERB',
 'NOUN',
 '