In [1]:
%cd drive/My\ Drive/NLP

/content/drive/My Drive/NLP


In [2]:
%ls

corpus.txt  hi-ud-test.conllu  hi-ud-train.conllu  imdb.csv


In [3]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [4]:
def readfile(filepath, sep=','):
    with open(filepath, 'r') as f:
        lines = [line.rstrip() for line in f.readlines()[1:]]
    sentences = []
    sentence = []
    for line in lines:

        if line:
            word, tag = line.split(sep)[1], line.split(sep)[2]
        else:
            word, tag = None, None
    
        if not word:
            sentences.append(sentence)
            sentence = []
        else:
            sentence.append((word, tag))
  
    return sentences

In [5]:
train_sents = readfile('hi-ud-train.conllu', sep=',')
test_sents = readfile('hi-ud-test.conllu', sep='\t')

In [6]:
print(len(train_sents))
print(len(test_sents))

613
99


In [7]:
def word2features(sent, i):
    word = sent[i][0]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),     
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True
                
    return features

Features
1. Derived from current word -> word itself, word suffixes, is it uppercase, is it titlecase, is it a number
2. Derived from next word -> word itself, is it uppercase, is it title
3. Derived from prev word -> word itself, is it uppercase,
is it tile
4. If current word is the first word then 'EOS' = true
5. If current word is the end word then 'EOS' = false

In [8]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2tokens(sent):
    return [token for token, postag in sent]

def sent2postags(sent):
    return [postag for token, postag in sent]

In [9]:
sent2features(train_sents[0])[0]

{'+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': 'esiya',
 'BOS': True,
 'bias': 1.0,
 'word.isdigit()': False,
 'word.istitle()': False,
 'word.isupper()': False,
 'word.lower()': 'yaha',
 'word[-2:]': 'ha',
 'word[-3:]': 'aha'}

In [10]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2postags(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2postags(s) for s in test_sents]

CPU times: user 15.5 ms, sys: 5.91 ms, total: 21.5 ms
Wall time: 21.5 ms


In [11]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 1.33 s, sys: 5.68 ms, total: 1.34 s
Wall time: 1.34 s


In [12]:
labels = list(crf.classes_)
labels

['DET',
 'PROPN',
 'ADP',
 'ADV',
 'ADJ',
 'NOUN',
 'NUM',
 'AUX',
 'PUNCT',
 'PRON',
 'VERB',
 'CCONJ',
 'PART',
 'SCONJ',
 'X']

In [13]:
print("Train Scores")

y_pred = crf.predict(X_train)
print("F1-Score : {}".format(metrics.flat_f1_score(y_train, y_pred, 
                      average='weighted', labels=labels)))
print("Precision : {}".format(metrics.flat_precision_score(y_train, y_pred, 
                      average='weighted', labels=labels)))
print("Recall : {}".format(metrics.flat_recall_score(y_train, y_pred, 
                      average='weighted', labels=labels)))
print("Accuracy : {}".format(metrics.flat_accuracy_score(y_train, y_pred)))

Train Scores
F1-Score : 0.9961206850932236
Precision : 0.9961531627089814
Recall : 0.996125066809193
Accuracy : 0.996125066809193


In [14]:
print("Train Scores")

sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_train, y_pred, labels=sorted_labels, digits=3
))

Train Scores
              precision    recall  f1-score   support

           X      1.000     1.000     1.000         2
        PART      1.000     0.994     0.997       163
       CCONJ      0.993     1.000     0.997       150
       SCONJ      0.984     1.000     0.992        61
         ADJ      0.998     0.998     0.998       569
         ADP      0.998     0.999     0.999      1384
         ADV      1.000     0.973     0.986       110
        VERB      0.998     0.981     0.990       639
         DET      0.996     0.996     0.996       230
        NOUN      0.994     0.999     0.997      1596
        PRON      0.998     0.995     0.997       430
       PROPN      1.000     0.992     0.996       707
         NUM      1.000     1.000     1.000       152
       PUNCT      1.000     1.000     1.000       563
         AUX      0.985     1.000     0.993       728

    accuracy                          0.996      7484
   macro avg      0.996     0.995     0.996      7484
weighted avg 

In [15]:
print("Test Scores")

y_pred = crf.predict(X_test)
print("F1-Score : {}".format(metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)))
print("Precision : {}".format(metrics.flat_precision_score(y_test, y_pred, 
                      average='weighted', labels=labels)))
print("Recall : {}".format(metrics.flat_recall_score(y_test, y_pred, 
                      average='weighted', labels=labels)))
print("Accuracy : {}".format(metrics.flat_accuracy_score(y_test, y_pred)))

Test Scores
F1-Score : 0.8408460437174989
Precision : 0.844214976334156
Recall : 0.8421052631578947
Accuracy : 0.8421052631578947


  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print("Test Scores")

sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Test Scores
              precision    recall  f1-score   support

           X      0.000     0.000     0.000         0
        PART      0.938     0.909     0.923        33
       CCONJ      1.000     1.000     1.000        25
       SCONJ      0.750     1.000     0.857         3
         ADJ      0.643     0.787     0.708        94
         ADP      0.955     0.970     0.962       303
         ADV      0.600     0.429     0.500        21
        VERB      0.837     0.778     0.806        99
         DET      0.842     0.889     0.865        36
        NOUN      0.791     0.855     0.822       324
        PRON      0.797     0.846     0.821        65
       PROPN      0.639     0.542     0.586       144
         NUM      0.957     0.880     0.917        25
       PUNCT      1.000     0.828     0.906       134
         AUX      0.921     0.935     0.928       138

   micro avg      0.842     0.842     0.842      1444
   macro avg      0.778     0.777     0.773      1444
weighted avg  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
from collections import Counter

print("Train Sentences")
def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-10:])

Train Sentences
Top likely transitions:
VERB   -> AUX     4.005966
PROPN  -> PROPN   2.544374
AUX    -> AUX     2.297002
ADJ    -> NOUN    2.219601
PROPN  -> ADP     2.204116
PRON   -> ADP     1.958806
NUM    -> NOUN    1.869655
AUX    -> SCONJ   1.824453
NOUN   -> ADP     1.763550
VERB   -> SCONJ   1.743490

Top unlikely transitions:
AUX    -> ADP     -1.179227
ADV    -> AUX     -1.183643
PROPN  -> PART    -1.200343
PROPN  -> AUX     -1.251415
PROPN  -> DET     -1.259998
AUX    -> ADJ     -1.384617
CCONJ  -> AUX     -1.498200
DET    -> ADP     -1.911078
ADJ    -> PRON    -1.970602
ADJ    -> ADP     -2.037182


In [25]:
crf_test = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf_test.fit(X_test, y_test)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [27]:
from collections import Counter

print("Test Sentences")

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf_test.transition_features_).most_common(10))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf_test.transition_features_).most_common()[-10:])

Test Sentences
Top likely transitions:
VERB   -> AUX     3.492938
AUX    -> AUX     2.566234
PROPN  -> ADP     2.298786
PROPN  -> PROPN   2.229434
AUX    -> PUNCT   1.957406
NUM    -> NUM     1.665870
ADJ    -> NOUN    1.580236
PRON   -> ADP     1.502749
NUM    -> NOUN    1.469844
PART   -> NUM     1.430389

Top unlikely transitions:
VERB   -> PRON    -0.955955
PROPN  -> AUX     -1.062427
NOUN   -> NOUN    -1.101574
NOUN   -> DET     -1.148694
DET    -> PROPN   -1.203906
AUX    -> VERB    -1.227718
ADJ    -> ADP     -1.261130
ADP    -> PUNCT   -1.307790
VERB   -> PROPN   -1.319346
PROPN  -> ADJ     -1.367929
