In [138]:
import nltk
import pprint

In [8]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [10]:
print(tagged_sentences[1])

[('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')]


In [12]:
def features(sentence, index):
    return{
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

In [15]:
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 3))

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': True,
 'is_numeric': False,
 'next_word': '',
 'prefix-1': 's',
 'prefix-2': 'se',
 'prefix-3': 'sen',
 'prev_word': 'a',
 'suffix-1': 'e',
 'suffix-2': 'ce',
 'suffix-3': 'nce',
 'word': 'sentence'}


In [16]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

In [17]:
cutoff = int(0.70 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

In [18]:
print(len(training_sentences))
print(len(test_sentences))

2739
1175


In [19]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []
    #X contains list of features, y contains the original tagging
    
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
            
    return X, y

In [20]:
X, y = transform_to_dataset(training_sentences)

In [21]:
##Now we need to train classifier

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
from sklearn.feature_extraction import DictVectorizer

In [24]:
from sklearn.pipeline import Pipeline

In [25]:
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

In [27]:
clf.fit(X[:10000], y[:10000])

Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('classifier', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [28]:
X_test, y_test = transform_to_dataset(test_sentences)

In [30]:
print("Accuracy:", clf.score(X_test, y_test))

Accuracy: 0.892704007563


In [31]:
def pos_tag_predict(sentence):
    tagged_sentence = []
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return list(zip(sentence, tags))

In [34]:
print(pos_tag_predict(word_tokenize('Shhe sells seashells on the seashore')))

[('Shhe', 'PRP'), ('sells', 'VBZ'), ('seashells', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('seashore', 'VBP')]


In [35]:
print(test_sentences)

[[('And', 'CC'), ('the', 'DT'), ('practice', 'NN'), ('should', 'MD'), ("n't", 'RB'), ('be', 'VB'), ('stopped', 'VBN'), ('*-1', '-NONE-'), (',', ','), ('he', 'PRP'), ('says', 'VBZ'), ('0', '-NONE-'), ('*T*-2', '-NONE-'), (',', ','), ('because', 'IN'), ('``', '``'), ('even', 'RB'), ('big', 'JJ'), ('players', 'NNS'), ('are', 'VBP'), ("n't", 'RB'), ('immune', 'JJ'), ('to', 'TO'), ('the', 'DT'), ('rigors', 'NNS'), ('of', 'IN'), ('program', 'NN'), ('trading', 'NN'), ('.', '.'), ("''", "''")], [('*-2', '-NONE-'), ('Also', 'RB'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP'), (',', ','), ('Israel', 'NNP'), ('Silverman', 'NNP'), (',', ','), ('an', 'DT'), ('insurance-company', 'NN'), ('lawyer', 'NN'), (',', ','), ('comments', 'VBZ'), ('that', 'IN'), ('program', 'NN'), ('trading', 'NN'), ('``', '``'), ('increases', 'VBZ'), ('volatility', 'NN'), (',', ','), ('but', 'CC'), ('I', 'PRP'), ('do', 'VBP'), ("n't", 'RB'), ('think', 'VB'), ('0', '-NONE-'), ('it', 'PRP'), ('should', 'MD'), ('be', 'VB'), ('

In [36]:
theseSentences = []
for a in test_sentences:
    theseSentTemp = [w for w, l in a]
    theseSentJoin = ' '.join(word for word in theseSentTemp)
    theseSentences.append(theseSentJoin)

In [62]:
predictSentence = []
for sentence in theseSentences:
    predictSentence.append(pos_tag_predict(word_tokenize(sentence)))

In [105]:
idx = []
for i in range(len(predictSentence)):
    if(len(predictSentence[i]) != len(test_sentences[i])):
        idx.append(i)

In [106]:
idx

[34,
 35,
 38,
 44,
 70,
 150,
 151,
 157,
 164,
 173,
 189,
 425,
 462,
 466,
 468,
 469,
 476,
 486,
 489,
 514,
 520,
 648,
 649,
 653,
 655,
 844,
 930,
 961,
 1159,
 1166,
 1167]

In [107]:
predictSentence[35]

[('But', 'CC'),
 ('``', '``'),
 ('it', 'PRP'),
 ('wo', 'NN'),
 ("n't", 'RB'),
 ('lead', 'VB'),
 ('to', 'TO'),
 ('imminent', 'NN'),
 ('use', 'NN'),
 ('``', '``'),
 ('of', 'IN'),
 ('new', 'JJ'),
 ('superconductors', 'NNS'),
 (',', ','),
 ('cautioned', 'VBD'),
 ('0', '-NONE-'),
 ('*T*-1', '-NONE-'),
 ('Robert', 'NNP'),
 ('B.', 'NNP'),
 ('van', 'NN'),
 ('Dover', 'NNP'),
 (',', ','),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('AT', 'WDT'),
 ('&', 'CC'),
 ('T', 'NNP'),
 ('researchers', 'NNS'),
 ('.', '.')]

In [108]:
test_sentences[35]

[('But', 'CC'),
 ('``', '``'),
 ('it', 'PRP'),
 ('wo', 'MD'),
 ("n't", 'RB'),
 ('lead', 'VB'),
 ('to', 'TO'),
 ('imminent', 'JJ'),
 ('use', 'NN'),
 ("''", "''"),
 ('of', 'IN'),
 ('new', 'JJ'),
 ('superconductors', 'NNS'),
 (',', ','),
 ('cautioned', 'VBD'),
 ('0', '-NONE-'),
 ('*T*-1', '-NONE-'),
 ('Robert', 'NNP'),
 ('B.', 'NNP'),
 ('van', 'NNP'),
 ('Dover', 'NNP'),
 (',', ','),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('AT&T', 'NNP'),
 ('researchers', 'NNS'),
 ('.', '.')]

In [132]:
def word_accuracy(predict, test):
    count = 0
    ptags = []
    ttags = []
    for i in range(len(predict)): # numer of sentences
        if (len(predict[i]) == len(test[i])): # length of sentence
            for j in range(len(predict[i])):
                if(predict[i][j][0] == test[i][j][0]):
                    if(predict[i][j][1] == test[i][j][1]):
                        count = count + 1
            
        elif(len(predict[i]) > len(test[i])):
            for j in range(len(predict[i])):
                for k in range(len(test[i])):
                    if(predict[i][j][0] == test[i][k][0]):
                        if(predict[i][j][1] == test[i][k][1]):
                            count = count + 1
                            j = j + 1
                            if(j > len(predict[i])):
                                break
            
            
    #for p_sent1 in predict:
    #    for tups in p_sent1:
    #        ptags.append(tups[1])
    #        #ptags = [n for tup in p_sent1 for m, n in tup]
    #for t_sent1 in test:
    #    for tups in t_sent1:
    #        ttags.append(tups[1])
            #ttags = [n  for m, n in tup]
    #print(len(ptags))
    #print(len(ttags))
    #for i in range(len(ptags)):
     #   if(ptags[i] == ttags[i]):
      #      count = count + 1
    print(count)

In [None]:
## WORD ACCURACY

In [139]:
def check_length(predict, test):
    for i in range(len(predict)):
        if(len(predict[i]) > len(test[i])):
            print("Predict longer than test: ", i)
        elif(len(predict[i]) > len(test[i])):
            print("Test longer than predict: ", i)

In [140]:
check_length(predictSentence, test_sentences)

Predict longer than test:  34
Predict longer than test:  35
Predict longer than test:  38
Predict longer than test:  44
Predict longer than test:  70
Predict longer than test:  150
Predict longer than test:  151
Predict longer than test:  157
Predict longer than test:  164
Predict longer than test:  173
Predict longer than test:  189
Predict longer than test:  425
Predict longer than test:  462
Predict longer than test:  466
Predict longer than test:  468
Predict longer than test:  469
Predict longer than test:  476
Predict longer than test:  486
Predict longer than test:  489
Predict longer than test:  514
Predict longer than test:  520
Predict longer than test:  648
Predict longer than test:  649
Predict longer than test:  653
Predict longer than test:  655
Predict longer than test:  844
Predict longer than test:  930
Predict longer than test:  961
Predict longer than test:  1159
Predict longer than test:  1166
Predict longer than test:  1167


In [146]:
def check_word_sim(predict, test):
    count = 0
    for i in range(len(predict)):
        pred_list = []
        test_list = []
        pred_list = untag(predict[i])
        test_list = untag(test[i])
        for j in test_list:
            if(j in pred_list):
                if(predict[i][pred_list.index(j)][1] == test[i][test_list.index(j)][1]):
                    count = count + 1
    return count

In [137]:
check_sim(predictSentence, test_sentences)

26257

In [None]:
def test_size(test):
    return len([w for sent in test for w in sent])

In [142]:
print(test_size(test_sentences))

29619


In [147]:
print(check_word_sim(predictSentence, test_sentences) / test_size(test_sentences))

0.8864917789256896


In [144]:
## SENTENCE SIMILARITY

In [155]:
def check_sent_sim(predict, test):
    count = 0
    i = 0
    while(i < len(predict)):
        if(len(predict[i]) == len(test[i])):
            if(predict[i] == test[i]):
                count = count + 1
        
        pred_list = []
        test_list = []
        pred_list = untag(predict[i])
        test_list = untag(test[i])
        temp = 0
        for j in test_list:
            if(not (j in pred_list)):
                break
            elif(j in pred_list):
                if(predict[i][pred_list.index(j)][1] == test[i][test_list.index(j)][1]):
                    temp = temp + 1
                if(temp == len(test[i])):
                    count = count + 1
        i = i + 1
    return count

In [157]:
check_sent_sim(predictSentence, test_sentences) / len(predictSentence)

0.2

1175

29619
