# Random fields

In [1]:
#pip install python-crfsuite
import json
from sklearn.model_selection import train_test_split
import pycrfsuite

In [2]:
def alternative_data_base(data):
    new_data = []
    for element in data:
        for key,value in element.items():
            list_temp = []
            for dico in value:
                for word in dico['text'].split(' '):
                    unite_dico = {'text' : word}
                    if 'entity' in dico.keys():
                        unite_dico['entity'] = dico['entity']
                    else :
                        unite_dico['entity'] = 'None'
                    list_temp.append(unite_dico)
        new_data.append(list_temp)
    return(new_data)

In [3]:
number = 0
list_query_type = ['AddToPlaylist','GetWeather','SearchCreativeWork' , 'BookRestaurant','PlayMusic','RateBook' ,'SearchScreeningEvent']
type_of_query = list_query_type[number]
data = json.load(open(type_of_query+ '.json'))[type_of_query]
data = alternative_data_base(data)

In [4]:
def word2features(query, i):
    word = query[i]['text']
    postag = query[i]['entity']

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a query
    if i > 0:
        word1 = query[i-1]['text']
        postag1 = query[i-1]['entity']
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a query'
        features.append('BOS')

    # Features for words that are not
    # at the end of a queryument
    if i < len(query)-1:
        word1 = query[i+1]['text']
        postag1 = query[i+1]['entity']
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a query'
        features.append('EOS')

    return features

In [5]:
# A function for extracting features in queries
def extract_features(query):
    return [word2features(query, i) for i in range(len(query))]

# A function fo generating the list of labels for each query
def get_entities(query):
    return [word['entity'] for word in query]

X = [extract_features(query) for query in data]
y = [get_entities(query) for query in data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13025
Seconds required: 0.107

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 28150.133542
Feature norm: 1.000000
Error norm: 22280.396470
Active features: 12931
Line search trials: 1
Line search step: 0.000024
Seconds required for this iteration: 0.032

***** Iteration #2 *****
Loss: 23569.966351
Feature norm: 0.858279
Error norm: 11549.787167
Active features: 12612
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #3 *****
Loss: 20735.318671
Feature norm: 1.009736
Error norm: 11859.195920
Active features: 12679
Line search trials: 1
Line search step: 1.000000
Seconds required for

***** Iteration #43 *****
Loss: 13.518767
Feature norm: 22.638337
Error norm: 0.053938
Active features: 25
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #44 *****
Loss: 13.516715
Feature norm: 22.621378
Error norm: 0.076561
Active features: 23
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #45 *****
Loss: 13.514881
Feature norm: 22.621270
Error norm: 0.034436
Active features: 24
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.040

***** Iteration #46 *****
Loss: 13.514395
Feature norm: 22.638021
Error norm: 0.032458
Active features: 24
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #47 *****
Loss: 13.513496
Feature norm: 22.654278
Error norm: 0.028586
Active features: 23
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.018



In [7]:
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at a random sample in the testing set
i = 12
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    print("%s (%s)" % (y, x))

please (None)
add (None)
a (None)
 (None)
track (music_item)
 (None)
by (None)
 (None)
david (artist)
freiberg (artist)
 (None)
to (None)
 (None)
my (playlist_owner)
 (None)
 (None)
workout (playlist)
 (None)
playlist. (None)
 (None)
