# Parameter tuning

In [3]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import segmenter as seg

In [2]:
# Load data
s = seg.Segmenter()

X_train = s.X_train[::10]
X_test = s.X_test[::10]
y_train = s.y_train[::10]
y_test = s.y_test[::10]

In [3]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# Use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='macro')

# Search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

In [4]:
rs.fit(s.X_train, s.y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 64.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 233.5min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
          fit_params=None, iid='warn', n_iter=50, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022AA29C5BA8>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022AA29C5C88>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn',
          scoring=make_scorer(flat_f1_score, average=macro), verbose=1)

In [5]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.7646475742048033, 'c2': 0.014400734610933441}
best CV score: 0.7782539287205765
model size: 0.95M


In [6]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

_x = [x['c1'] for x in rs.cv_results_['params']]
_y = [x['c2'] for x in rs.cv_results_['params']]
_c = rs.cv_results_['mean_test_score']

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

Dark blue => 0.7584, dark red => 0.7783


In [7]:
crf = rs.best_estimator_
y_pred = crf.predict(s.X_test)
print(metrics.flat_classification_report(
    s.y_test, y_pred, labels=['P', 'S'], digits=3
))

              precision    recall  f1-score   support

           P      0.762     0.482     0.591      5734
           S      0.986     0.996     0.991    213049

   micro avg      0.982     0.982     0.982    218783
   macro avg      0.874     0.739     0.791    218783
weighted avg      0.980     0.982     0.981    218783



In [8]:
from collections import Counter

def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
14.142314 S        EOS
11.331954 P        +1:word.lower=.
6.964167 P        +1:word.lower=shotgun-type
6.707807 P        -1:word.lower=fads
6.381126 P        -1:word.lower=peptides
6.351835 P        -1:word.lower=wade
6.319168 P        word.lower=crankshaft
6.058116 P        +1:word.lower=restudy
5.361333 P        -1:word.lower=3:20
5.084530 P        +1:word.lower=criticality
5.067970 S        +1:word.lower=?
4.886310 P        -1:word.lower=crumbling
4.879298 P        +1:word.lower=syllabification
4.810751 P        postag=.
4.564678 P        -1:word.lower=wised
4.534484 P        -1:word.lower=strictest
4.506029 P        word.lower=sapling
4.488560 S        +1:word.lower=b.c.
4.426212 P        word.lower=hinterlands
4.412403 P        word.lower=christy
4.393101 P        word.lower=ponchartrain
4.391096 P        word.lower=cotman
4.389398 P        word.lower=skyjacked
4.388058 P        word.lower=unchanging
4.325175 S        +1:word.lower=r
4.316314 P        word.lower=4000

# Picking sentences

In [4]:
import nltk
from nltk.corpus import brown
from nltk.tokenize.moses import MosesDetokenizer

mdetok = MosesDetokenizer()

sents = brown.sents(categories=['adventure', 'fiction', 'hobbies',
'humor', 'mystery', 'news', 'religion', 'reviews', 'romance',
'science_fiction'])[:-1]

brown_natural = [mdetok.detokenize(' '.join(sent).replace('``', '”').replace("''", '”').replace('`', "”").replace("'", '”').split(), return_str=True)  for sent in sents]

data = []
for i in range(0, len(brown_natural), 2):
    a = brown_natural[i][:-1]
    b = brown_natural[i + 1][0].lower() + brown_natural[i + 1][1:-1]
    # index of 'P' in the original sents
    ind = len(sents[i]) - 2
    data.append((a + " " + b, ind))

In [6]:
import segmenter
pred = segmenter.Segmenter.predict(data[3009][0])
print(pred)
if pred.index('P') == data[3009][1]:
        print('Found: ', sent)

['P', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S']


In [8]:
# Compares the identified position of P with the correct position of P
import segmenter as seg

final_sents = []
for sent, ind in data:
    pred = seg.Segmenter.predict(sent)
    if 'P' in pred:
        if pred.index('P') == ind:
            final_sents.append(sent)

In [21]:
[sent.replace("”", "'").replace(" ' ", ' "').replace("' ", '"').replace(";", '') for sent in final_sents[2000:3200] if ('?' not in sent) and ('!' not in sent)]

['Mickey felt his shoulders come up against the wall beside the heavy slab front door this was going to be it now, any second, and what he had to remember was to keep his eye on the razor, no matter what, even if Roberts should feint with a kick to the groin, the deadly hand was his exclusive concern',
 'It moved in a silver arc toward his throat, then veered downward he hunched his left shoulder into it and slashed at Roberts"forearm with his own, felt the blade slide off his sleeve',
 'Ducking, Mickey tripped and fell to one side, landing heavily on the wood floor then Roberts was on him, gasping for breath and for a couple of seconds Mickey lost sight of the blade',
 'He felt it rip at the side of his jacket and a momentary sting under his left ribs he got a knee up into Roberts"belly, used both hands and heaved him clear, then scrambled to his feet',
 'The sound of his head striking the solid wood was an ultimate, sudden-end sound he fell on his side across the lowest step, rolled 

# N-Grams

In [143]:
import nltk
from nltk import ngrams
from nltk.corpus import brown
sents = brown.sents()

data = []
for s in sents:

    # Obtain the list of tokens in the document
    tokens = [t for t in s]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label
    data.append(tagged)
data

[[('The', 'DT'),
  ('Fulton', 'NNP'),
  ('County', 'NNP'),
  ('Grand', 'NNP'),
  ('Jury', 'NNP'),
  ('said', 'VBD'),
  ('Friday', 'NNP'),
  ('an', 'DT'),
  ('investigation', 'NN'),
  ('of', 'IN'),
  ("Atlanta's", 'NNP'),
  ('recent', 'JJ'),
  ('primary', 'JJ'),
  ('election', 'NN'),
  ('produced', 'VBD'),
  ('``', '``'),
  ('no', 'DT'),
  ('evidence', 'NN'),
  ("''", "''"),
  ('that', 'IN'),
  ('any', 'DT'),
  ('irregularities', 'NNS'),
  ('took', 'VBD'),
  ('place', 'NN'),
  ('.', '.')],
 [('The', 'DT'),
  ('jury', 'NN'),
  ('further', 'RB'),
  ('said', 'VBD'),
  ('in', 'IN'),
  ('term-end', 'JJ'),
  ('presentments', 'NNS'),
  ('that', 'IN'),
  ('the', 'DT'),
  ('City', 'NNP'),
  ('Executive', 'NNP'),
  ('Committee', 'NNP'),
  (',', ','),
  ('which', 'WDT'),
  ('had', 'VBD'),
  ('over-all', 'JJ'),
  ('charge', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('election', 'NN'),
  (',', ','),
  ('``', '``'),
  ('deserves', 'VBZ'),
  ('the', 'DT'),
  ('praise', 'NN'),
  ('and', 'CC'),
  ('than

In [144]:
ndata = data[::20]

### For POS Tags

In [147]:
def get_pos_tags(sent):
    return [y for x,y in sent]

tags = [get_pos_tags(sent) for sent in ndata]
tags

[['DT',
  'NNP',
  'NNP',
  'NNP',
  'NNP',
  'VBD',
  'NNP',
  'DT',
  'NN',
  'IN',
  'NNP',
  'JJ',
  'JJ',
  'NN',
  'VBD',
  '``',
  'DT',
  'NN',
  "''",
  'IN',
  'DT',
  'NNS',
  'VBD',
  'NN',
  '.'],
 ['DT',
  'NN',
  'VBD',
  'PRP',
  'VBD',
  'DT',
  'NN',
  '``',
  'VBZ',
  'VBN',
  'IN',
  'PRP$',
  'NN',
  'VBZ',
  'DT',
  'NNS',
  "''",
  'IN',
  'CD',
  'JJ',
  'JJ',
  'NNS',
  ',',
  'DT',
  'NNP',
  'NNP',
  'NNP',
  'CC',
  'DT',
  'JJ',
  'NNS',
  'NN',
  '.'],
 ['NNP',
  'VBZ',
  'VBN',
  'NN',
  'IN',
  'NNP',
  ',',
  'IN',
  'NN',
  'IN',
  'CD',
  'NN',
  'NN',
  ',',
  'IN',
  'CD',
  '.'],
 ['NNP',
  'RB',
  'MD',
  'VB',
  'DT',
  'NNP',
  'CD',
  'NN',
  'NN',
  'NN',
  'VBD',
  'RBR',
  'IN',
  'DT',
  'NN',
  'IN',
  'PRP$',
  'JJ',
  'NN',
  'NN',
  '.'],
 ['PRP',
  'VBZ',
  'IN',
  '``',
  'IN',
  'DT',
  'NN',
  'NNP',
  'VBZ',
  'VB',
  'DT',
  'NN',
  'IN',
  'JJ',
  'NNS',
  "''",
  ',',
  'DT',
  'NNP',
  'NNP',
  'IN',
  'NNP',
  'MD',
  'VB',
  

In [54]:
from itertools import chain
#Create your 5-grams on 1/3 of data
grams = ngrams(tags[0], 5)
for sent in tags:
    grams = chain(grams, ngrams(sent, 5))
    
    
#compute frequency distribution for all the 5-grams in the tags
tags_fdist = nltk.FreqDist(grams)
#compute probability distribution for all the 5-grams in the tags
tags_pdist = nltk.MLEProbDist(tags_fdist)
tags_pdist

<MLEProbDist based on 312892 samples>

In [152]:
def get_perplexity(prob_dist, items):
    """ Returns the perplexity based on the probability distribution and the items to predict """
    grams = ngrams(items, 5)
    probs = [prob_dist.prob(x) for x in grams]
    
    product = 1
    for el in probs:
        if el != 0:
            product *= el
    if(len(probs) == 0):
        return 1
    return product ** (-1/len(probs))

In [142]:
test_sent = brown.sents()[0]
print(test_sent)
test_sent_tags = [get_pos_tags(sent) for sent in [nltk.pos_tag(test_sent)]][0]
print(get_perplexity(tags_pdist, tags[0]))

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
74678.69859706283


In [155]:
test_probs = []
for sent_tags in tags[100:200:10]:
    sent_tags = sent_tags[:-1]
    
    no_period = get_perplexity(tags_pdist, sent_tags)
    
    mistakes = []
    for i in range(len(sent_tags)):
        mistakes.append(get_perplexity(tags_pdist, sent_tags[:i] + ['.'] + sent_tags[i:]))
        
    correct_period = get_perplexity(tags_pdist, sent_tags + ['.'])
    
    test_probs.append((no_period, mistakes, correct_period))
    
test_probs

  after removing the cwd from sys.path.


[(3366.7193997578784,
  [2748.063402745226,
   2146.5691650784443,
   1668.0482498841668,
   1280.5631406304421,
   1207.9389521094904,
   1192.211436249348,
   1324.0901146202398,
   1286.1416240983579,
   1102.3678233433075,
   1526.7909330124874,
   1121.6368734792572,
   1466.3230471480294,
   1197.6579857390761,
   1207.534373626993,
   941.2775463560959,
   982.0883830323196,
   1046.1002941113952,
   1518.8502832209401,
   1493.442271829444,
   1206.1719187878466,
   1535.1796497420466,
   1238.1294090038984,
   1556.639520970628,
   1760.4534369782857,
   1718.8115079816962,
   1970.9383816714594,
   2037.8285239425554,
   2037.8285239425554,
   2844.9736493543132,
   1537.567799338035,
   1120.596892195317,
   875.3214110221205,
   890.6217597776762,
   1168.4883368481912,
   1201.026011255584,
   1537.567799338035,
   2002.819849633808,
   2143.108547632496,
   1736.3230644492057,
   1419.8604846722687,
   1914.7178425901504,
   2498.1189524146894,
   2247.200831931416],
  32

In [None]:
test_probs = []
for sent_tags in tags[100:200:10]:
    sent_tags = sent_tags[:-1]
    
    no_period = get_perplexity(tags_pdist, sent_tags)
    
    mistakes = []
    for i in range(len(sent_tags)):
        mistakes.append(get_perplexity(tags_pdist, sent_tags[:i] + ['.'] + sent_tags[i:]))
        
    correct_period = get_perplexity(tags_pdist, sent_tags + ['.'])
    
    test_probs.append((no_period, mistakes, correct_period))
    
test_probs

In [136]:
test_sent = brown.sents()[4][:-6] + ["."] + brown.sents()[4][-6:-1]
print(test_sent)
test_sent_tags = [get_pos_tags(sent) for sent in [nltk.pos_tag(test_sent)]][0]
test_tags_ngrams = ngrams(test_sent_tags, 5)
probs = [tags_pdist.prob(x) for x in  test_tags_ngrams]
print(get_perplexity(probs))

['The', 'jury', 'said', 'it', 'did', 'find', 'that', 'many', 'of', "Georgia's", 'registration', 'and', 'election', 'laws', '``', 'are', 'outmoded', 'or', '.', 'inadequate', 'and', 'often', 'ambiguous', "''"]
19.667339525401513


In [134]:
test_sent = brown.sents()[4]
print(test_sent)
test_sent_tags = [get_pos_tags(sent) for sent in [nltk.pos_tag(test_sent)]][0]
test_tags_ngrams = ngrams(test_sent_tags, 5)
probs = [tags_pdist.prob(x) for x in  test_tags_ngrams]
print(get_perplexity(probs))

['The', 'jury', 'said', 'it', 'did', 'find', 'that', 'many', 'of', "Georgia's", 'registration', 'and', 'election', 'laws', '``', 'are', 'outmoded', 'or', 'inadequate', 'and', 'often', 'ambiguous', "''", '.']
37.02672821024691


# Language model
https://www.kaggle.com/alvations/n-gram-language-model-with-nltk

In [4]:
from nltk.corpus import brown
data = brown.sents()

In [5]:
from nltk.lm.preprocessing import padded_everygram_pipeline
# Preprocess the tokenized text for 3-grams language modelling
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, brown.sents())

In [6]:
from nltk.lm import MLE
model = MLE(n) # Lets train a 3-grams model, previously we set n=3
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 56060 items>


In [7]:
import pickle
with open('words_language.model', 'wb') as f:
    pickle.dump(model, f)

In [8]:
model.score('by', 'was won'.split())  # P('never'|'language is')

0.5

In [142]:
sent = brown.sents()[26]

In [143]:
from nltk import ngrams
padded_grams = ngrams(sent, n)
model.perplexity(padded_grams)

inf

# Language model (for POS tags)

In [9]:
import nltk
from nltk.corpus import brown
sents = brown.sents()

data = []
for s in sents:

    # Obtain the list of tokens in the document
    tokens = [t for t in s]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label
    data.append(tagged)
data

[[('The', 'DT'),
  ('Fulton', 'NNP'),
  ('County', 'NNP'),
  ('Grand', 'NNP'),
  ('Jury', 'NNP'),
  ('said', 'VBD'),
  ('Friday', 'NNP'),
  ('an', 'DT'),
  ('investigation', 'NN'),
  ('of', 'IN'),
  ("Atlanta's", 'NNP'),
  ('recent', 'JJ'),
  ('primary', 'JJ'),
  ('election', 'NN'),
  ('produced', 'VBD'),
  ('``', '``'),
  ('no', 'DT'),
  ('evidence', 'NN'),
  ("''", "''"),
  ('that', 'IN'),
  ('any', 'DT'),
  ('irregularities', 'NNS'),
  ('took', 'VBD'),
  ('place', 'NN'),
  ('.', '.')],
 [('The', 'DT'),
  ('jury', 'NN'),
  ('further', 'RB'),
  ('said', 'VBD'),
  ('in', 'IN'),
  ('term-end', 'JJ'),
  ('presentments', 'NNS'),
  ('that', 'IN'),
  ('the', 'DT'),
  ('City', 'NNP'),
  ('Executive', 'NNP'),
  ('Committee', 'NNP'),
  (',', ','),
  ('which', 'WDT'),
  ('had', 'VBD'),
  ('over-all', 'JJ'),
  ('charge', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('election', 'NN'),
  (',', ','),
  ('``', '``'),
  ('deserves', 'VBZ'),
  ('the', 'DT'),
  ('praise', 'NN'),
  ('and', 'CC'),
  ('than

In [10]:
def get_pos_tags(sent):
    return [y for x,y in sent]

tags = [get_pos_tags(sent) for sent in data]
tags

[['DT',
  'NNP',
  'NNP',
  'NNP',
  'NNP',
  'VBD',
  'NNP',
  'DT',
  'NN',
  'IN',
  'NNP',
  'JJ',
  'JJ',
  'NN',
  'VBD',
  '``',
  'DT',
  'NN',
  "''",
  'IN',
  'DT',
  'NNS',
  'VBD',
  'NN',
  '.'],
 ['DT',
  'NN',
  'RB',
  'VBD',
  'IN',
  'JJ',
  'NNS',
  'IN',
  'DT',
  'NNP',
  'NNP',
  'NNP',
  ',',
  'WDT',
  'VBD',
  'JJ',
  'NN',
  'IN',
  'DT',
  'NN',
  ',',
  '``',
  'VBZ',
  'DT',
  'NN',
  'CC',
  'NNS',
  'IN',
  'DT',
  'NNP',
  'IN',
  'NNP',
  "''",
  'IN',
  'DT',
  'NN',
  'IN',
  'WDT',
  'DT',
  'NN',
  'VBD',
  'VBN',
  '.'],
 ['DT',
  'NNP',
  'NN',
  'NN',
  'VBD',
  'VBN',
  'VBN',
  'IN',
  'NNP',
  'NNP',
  'NNP',
  'NNP',
  'NNP',
  'NNP',
  'TO',
  'VB',
  'NNS',
  'IN',
  'JJ',
  '``',
  'NNS',
  "''",
  'IN',
  'DT',
  'JJ',
  'NN',
  'WDT',
  'VBD',
  'VBN',
  'IN',
  'NNP',
  'NNP',
  'NNP',
  'NNP',
  '.'],
 ['``',
  'RB',
  'DT',
  'JJ',
  'NN',
  'IN',
  'JJ',
  'NNS',
  'VBD',
  'VBN',
  "''",
  ',',
  'DT',
  'NN',
  'VBD',
  ',',
  '``

In [11]:
from nltk.lm.preprocessing import padded_everygram_pipeline
# Preprocess the tokenized text for 3-grams language modelling
n = 5
train_data, padded_sents = padded_everygram_pipeline(n, tags)

In [12]:
from nltk.lm import MLE
model = MLE(n) # Lets train a 3-grams model, previously we set n=3
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 47 items>


In [13]:
import pickle
with open('pos_tags_language.model', 'wb') as f:
    pickle.dump(model, f)

In [14]:
sent_tags = get_pos_tags(nltk.pos_tag(brown.sents()[26]))[:-1]

In [15]:
from nltk import ngrams
padded_grams = ngrams(sent_tags, n)
model.perplexity(padded_grams)

5.711725613707041

In [None]:
def is_perplexity_decreasing(items):
    """ Returns true if per-word perplexity of the sentence decreases if a period is inserted in front of the items """
    arr = []
    for i in (items, items + ['.']):
        padded_grams = ngrams(i, n)
        arr.append(model.perplexity(padded_grams))
    return arr[0] > arr[1]