# Random fields

In [55]:
#pip install python-crfsuite

import json
from sklearn.model_selection import train_test_split
import pycrfsuite

In [78]:
def alternative_data_base(data):
    new_data = []
    for element in data:
        for key,value in element.items():
            list_temp = []
            for dico in value:
                for word in dico['text'].split(' '):
                    unite_dico = {'text' : word}
                    if 'entity' in dico.keys():
                        unite_dico['entity'] = dico['entity']
                    else :
                        unite_dico['entity'] = 'None'
                    list_temp.append(unite_dico)
        new_data.append(list_temp)
    return(new_data)

In [79]:
number = 0
list_of_query_type = ['AddToPlaylist','GetWeather','SearchCreativeWork' , 'BookRestaurant','PlayMusic','RateBook' ,'SearchScreeningEvent']
data = json.load(open(type_of_query+ '.json'))[type_of_query]
data = alternative_data_base(data)

In [58]:
def word2features(query, i):
    word = query[i]['text']
    postag = query[i]['entity']

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a query
    if i > 0:
        word1 = query[i-1]['text']
        postag1 = query[i-1]['entity']
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a query'
        features.append('BOS')

    # Features for words that are not
    # at the end of a queryument
    if i < len(query)-1:
        word1 = query[i+1]['text']
        postag1 = query[i+1]['entity']
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a query'
        features.append('EOS')

    return features

In [59]:
# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_entities(doc):
    return [word['entity'] for word in doc]

X = [extract_features(doc) for doc in data]
y = [get_entities(doc) for doc in data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [60]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 12526
Seconds required: 0.105

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 26914.322589
Feature norm: 1.000000
Error norm: 21401.158349
Active features: 12456
Line search trials: 1
Line search step: 0.000025
Seconds required for this iteration: 0.034

***** Iteration #2 *****
Loss: 22498.832779
Feature norm: 0.858308
Error norm: 11057.721966
Active features: 12160
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #3 *****
Loss: 19787.897715
Feature norm: 1.011396
Error norm: 11354.113030
Active features: 12214
Line search trials: 1
Line search step: 1.000000
Seconds required for

***** Iteration #40 *****
Loss: 13.446306
Feature norm: 22.531052
Error norm: 0.103517
Active features: 23
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #41 *****
Loss: 13.438577
Feature norm: 22.607636
Error norm: 0.100381
Active features: 23
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #42 *****
Loss: 13.432686
Feature norm: 22.522645
Error norm: 0.047214
Active features: 23
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #43 *****
Loss: 13.430355
Feature norm: 22.540070
Error norm: 0.049359
Active features: 22
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #44 *****
Loss: 13.429315
Feature norm: 22.536902
Error norm: 0.038742
Active features: 22
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024



In [61]:
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at a random sample in the testing set
i = 12
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    print("%s (%s)" % (y, x))

i (None)
want (None)
to (None)
add (None)
another (None)
 (None)
album (music_item)
 (None)
to (None)
the (None)
 (None)
wine (playlist)
& (playlist)
dine (playlist)
 (None)
playlist. (None)
 (None)
