# Random fields

## Import packages and list the databases

### Import packages

In [32]:
import sys, os, json, numpy as np, pycrfsuite
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Import functions and list the databases

In [33]:
from utils.cleaning import alternative_data_base, extract_features, get_entities

# Input data-bases and output models live next to the project root that was
# added to sys.path in the import cell.
dbs_path = f'{module_path}/query_dbs/'
models_path = f'{module_path}/models/'

# os.listdir returns every directory entry (hidden files, checkpoints folders,
# ...), so keep only the JSON data-bases; otherwise a stray entry would be
# treated as a query type and fail when its '<name>.json' file is opened.
list_files = [file_name for file_name in os.listdir(dbs_path)
              if file_name.endswith('.json')]
# splitext removes only the trailing extension, whereas replace('.json', '')
# would also strip a '.json' occurring in the middle of a file name.
list_query_type = [os.path.splitext(file_name)[0] for file_name in list_files]

## Define a class that loads the database, trains a model, and tests it

In [64]:
class RandomFields:
    """Train and evaluate a pycrfsuite tagger for one type of query.

    Intended call order: ``load_db`` -> ``train_model`` -> ``test_sample`` /
    ``accuracy_report``. The class relies on the notebook-level names
    ``dbs_path``, ``models_path``, ``alternative_data_base``,
    ``extract_features`` and ``get_entities``.
    """

    def __init__(self, type_of_query):
        """Store the query type and derive the path of its model file.

        Args:
            type_of_query: Name of the query type. It is both the stem of the
                JSON file read from ``dbs_path`` and the top-level key inside
                that file, and the stem of the model file in ``models_path``.
        """
        self.type_of_query = type_of_query
        self.model_file_path = f'{models_path}{self.type_of_query}.model'
        # Predictions cached by the evaluation methods; None until computed.
        self.y_pred = None

    def load_db(self, index, random_state=None):
        """Load the data-base, build features/labels and split train/test.

        Args:
            index: Position of this query type in the caller's loop; a preview
                of the data is printed only when it equals 0.
            random_state: Forwarded to ``train_test_split``. The default
                (``None``) keeps the original unseeded split; pass an int to
                make the split, and therefore the reports, reproducible.
        """
        db_file_path = dbs_path + self.type_of_query + '.json'
        # Context manager so the file handle is closed even if parsing fails.
        with open(db_file_path) as db_file:
            data = json.load(db_file)[self.type_of_query]
        self.alternative_data = alternative_data_base(data)
        X = [extract_features(query) for query in self.alternative_data]
        y = [get_entities(query) for query in self.alternative_data]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state)
        # A new split invalidates predictions made on the previous one.
        self.y_pred = None
        if index == 0:
            # Use the instance attribute, not the notebook's loop variable, so
            # the method also works when called outside that loop.
            print(f'For the type of query : {self.type_of_query} \n')
            print("Raw data looks like : ")
            pprint(data[0])
            print("\n Alternative data looks like :")
            pprint(self.alternative_data[0])
            print('\n Data to feed the algorithm looks like : ')
            pprint(X[0])

    def train_model(self):
        """Fit a CRF on the training split and save it to ``model_file_path``."""
        trainer = pycrfsuite.Trainer(verbose=True)
        for xseq, yseq in zip(self.X_train, self.y_train):
            trainer.append(xseq, yseq)
        # NOTE(review): the original carried a comment about including
        # "transitions that are possible, but not observed" without setting
        # any such option; it is still not set here, so the trainer's default
        # applies. Confirm whether that option was meant to be enabled.
        trainer.set_params({'c1': 0.1,  # L1 penalty
                            'c2': 0.01,  # L2 penalty
                            'max_iterations': 400,
                            })
        trainer.train(self.model_file_path)

    @staticmethod
    def _token_text(token_features):
        """Return the token stored in a feature list's second entry.

        The preview printed by ``load_db`` shows feature lists starting with
        ``'bias', 'word.lower=<token>'``; the token is everything after the
        first ``=`` so that tokens containing ``=`` are not truncated.
        """
        return token_features[1].split("=", 1)[1]

    def _predict(self):
        """Tag every test sequence with the saved model and cache the result."""
        tagger = pycrfsuite.Tagger()
        tagger.open(self.model_file_path)
        try:
            self.y_pred = [tagger.tag(xseq) for xseq in self.X_test]
        finally:
            # Release the model file once tagging is done.
            tagger.close()
        return self.y_pred

    def test_sample(self, sample_index=23):
        """Print one test query as ``token (predicted tag)`` lines.

        Args:
            sample_index: Position of the query inside the test split
                (default 23, the value previously hard-coded).

        Raises:
            IndexError: If ``sample_index`` is outside the test split.
        """
        predictions = self._predict()
        if not -len(predictions) <= sample_index < len(predictions):
            raise IndexError(
                f'sample_index {sample_index} is out of range for a test set '
                f'of {len(predictions)} queries')
        words = [self._token_text(features)
                 for features in self.X_test[sample_index]]
        for word, tag in zip(words, predictions[sample_index]):
            print("%s (%s)" % (word, tag))

    def accuracy_report(self):
        """Print per-label precision/recall/F1 on the test split."""
        # Works on its own: predict now if test_sample has not been run since
        # the last load_db.
        if getattr(self, 'y_pred', None) is None:
            self._predict()
        # Convert the sequences of tags into a 1-dimensional array
        predictions = np.array([tag for row in self.y_pred for tag in row])
        truths = np.array([tag for row in self.y_test for tag in row])
        print(classification_report(truths, predictions))

## Train and save a model for each type of query (currently only the first query type is trained)

In [65]:
# Only the first query type is processed: the slice yields at most one entry,
# and an empty list simply skips the loop. The loop variables and
# `random_fields` remain defined at notebook level for the cells below.
for index, type_of_query in enumerate(list_query_type[:1]):
    random_fields = RandomFields(type_of_query)
    random_fields.load_db(index=index)
    random_fields.train_model()

For the type of query : PlayMusic 

Raw data looks like : 
{'data': [{'text': 'I need to hear the '},
          {'entity': 'music_item', 'text': 'song'},
          {'text': ' '},
          {'entity': 'track', 'text': 'Aspro Mavro'},
          {'text': ' from '},
          {'entity': 'artist', 'text': 'Bill Szymczyk'},
          {'text': ' on '},
          {'entity': 'service', 'text': 'Youtube'}]}

 Alternative data looks like :
[{'entity': 'None', 'text': 'I'},
 {'entity': 'None', 'text': 'need'},
 {'entity': 'None', 'text': 'to'},
 {'entity': 'None', 'text': 'hear'},
 {'entity': 'None', 'text': 'the'},
 {'entity': 'music_item', 'text': 'song'},
 {'entity': 'track', 'text': 'Aspro'},
 {'entity': 'track', 'text': 'Mavro'},
 {'entity': 'None', 'text': 'from'},
 {'entity': 'artist', 'text': 'Bill'},
 {'entity': 'artist', 'text': 'Szymczyk'},
 {'entity': 'None', 'text': 'on'},
 {'entity': 'service', 'text': 'Youtube'}]

 Data to feed the algorithm looks like : 
[['bias',
  'word.lower=i',

***** Iteration #16 *****
Loss: 3231.384982
Feature norm: 27.002444
Error norm: 1127.669642
Active features: 8511
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #17 *****
Loss: 2866.108105
Feature norm: 29.852272
Error norm: 866.612085
Active features: 8486
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.031

***** Iteration #18 *****
Loss: 2519.782761
Feature norm: 33.965618
Error norm: 996.580956
Active features: 8485
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #19 *****
Loss: 2194.066773
Feature norm: 37.917889
Error norm: 611.596180
Active features: 8483
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #20 *****
Loss: 1910.280552
Feature norm: 42.543286
Error norm: 589.825861
Active features: 8468
Line search trials: 1
Line search step: 1.000000
Seconds requi

***** Iteration #60 *****
Loss: 476.602752
Feature norm: 89.995106
Error norm: 32.858663
Active features: 2834
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.030

***** Iteration #61 *****
Loss: 476.130770
Feature norm: 90.081322
Error norm: 46.841519
Active features: 2814
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.029

***** Iteration #62 *****
Loss: 475.402194
Feature norm: 90.217441
Error norm: 37.448556
Active features: 2795
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #63 *****
Loss: 474.923124
Feature norm: 90.274712
Error norm: 47.014901
Active features: 2791
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #64 *****
Loss: 474.188688
Feature norm: 90.371797
Error norm: 33.753050
Active features: 2783
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #107 *****
Loss: 462.760822
Feature norm: 91.821932
Error norm: 22.700336
Active features: 2501
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #108 *****
Loss: 462.534926
Feature norm: 91.830966
Error norm: 20.209685
Active features: 2495
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.026

***** Iteration #109 *****
Loss: 462.393812
Feature norm: 91.831857
Error norm: 25.659751
Active features: 2494
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #110 *****
Loss: 462.175932
Feature norm: 91.836243
Error norm: 25.498300
Active features: 2487
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.028

***** Iteration #111 *****
Loss: 461.991844
Feature norm: 91.819150
Error norm: 25.360655
Active features: 2484
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #158 *****
Loss: 457.214867
Feature norm: 91.333330
Error norm: 16.899742
Active features: 2438
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #159 *****
Loss: 457.141811
Feature norm: 91.320083
Error norm: 12.390362
Active features: 2434
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #160 *****
Loss: 457.098646
Feature norm: 91.315392
Error norm: 17.366650
Active features: 2429
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #161 *****
Loss: 457.024622
Feature norm: 91.304556
Error norm: 13.289658
Active features: 2429
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #162 *****
Loss: 456.989170
Feature norm: 91.295846
Error norm: 18.656110
Active features: 2427
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #199 *****
Loss: 455.118835
Feature norm: 90.905161
Error norm: 13.299437
Active features: 2371
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #200 *****
Loss: 455.083017
Feature norm: 90.897246
Error norm: 15.355403
Active features: 2371
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #201 *****
Loss: 455.031060
Feature norm: 90.888056
Error norm: 11.496261
Active features: 2383
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #202 *****
Loss: 455.007708
Feature norm: 90.879551
Error norm: 15.481477
Active features: 2380
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #203 *****
Loss: 454.954340
Feature norm: 90.868665
Error norm: 10.284111
Active features: 2382
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #244 *****
Loss: 453.701635
Feature norm: 90.637228
Error norm: 11.607724
Active features: 2380
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #245 *****
Loss: 453.667810
Feature norm: 90.632579
Error norm: 8.137677
Active features: 2378
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #246 *****
Loss: 453.654262
Feature norm: 90.625690
Error norm: 11.630839
Active features: 2373
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #247 *****
Loss: 453.621172
Feature norm: 90.620708
Error norm: 8.367888
Active features: 2375
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #248 *****
Loss: 453.615947
Feature norm: 90.612496
Error norm: 12.986521
Active features: 2369
Line search trials: 1
Line search step: 1.000000
Seconds required for 

***** Iteration #289 *****
Loss: 452.678073
Feature norm: 90.429816
Error norm: 3.616594
Active features: 2339
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.038

***** Iteration #290 *****
Loss: 452.668540
Feature norm: 90.429819
Error norm: 8.198510
Active features: 2339
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.038

***** Iteration #291 *****
Loss: 452.646049
Feature norm: 90.426278
Error norm: 6.091339
Active features: 2333
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.034

***** Iteration #292 *****
Loss: 452.632182
Feature norm: 90.425406
Error norm: 8.372551
Active features: 2332
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.041

***** Iteration #293 *****
Loss: 452.608997
Feature norm: 90.421750
Error norm: 6.755934
Active features: 2328
Line search trials: 2
Line search step: 0.500000
Seconds required for thi

***** Iteration #333 *****
Loss: 452.016034
Feature norm: 90.234417
Error norm: 3.502035
Active features: 2287
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.028

***** Iteration #334 *****
Loss: 452.009633
Feature norm: 90.232100
Error norm: 6.028399
Active features: 2286
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.028

***** Iteration #335 *****
Loss: 451.994483
Feature norm: 90.228074
Error norm: 3.705907
Active features: 2284
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.028

***** Iteration #336 *****
Loss: 451.986258
Feature norm: 90.226471
Error norm: 5.226725
Active features: 2282
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.032

***** Iteration #337 *****
Loss: 451.975205
Feature norm: 90.223472
Error norm: 4.335367
Active features: 2283
Line search trials: 2
Line search step: 0.500000
Seconds required for thi

***** Iteration #379 *****
Loss: 451.614146
Feature norm: 90.188714
Error norm: 4.207852
Active features: 2264
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.040

***** Iteration #380 *****
Loss: 451.606896
Feature norm: 90.189493
Error norm: 4.478214
Active features: 2263
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.042

***** Iteration #381 *****
Loss: 451.598934
Feature norm: 90.187547
Error norm: 3.882793
Active features: 2263
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.035

***** Iteration #382 *****
Loss: 451.592564
Feature norm: 90.187268
Error norm: 4.482218
Active features: 2264
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.031

***** Iteration #383 *****
Loss: 451.585004
Feature norm: 90.185907
Error norm: 3.810674
Active features: 2264
Line search trials: 2
Line search step: 0.500000
Seconds required for thi

In [66]:
# Print one tagged query from the test split, then the per-label report
# (precision / recall / f1 / support) over the whole test split.
random_fields.test_sample()
random_fields.accuracy_report()

play (None)
street (artist)
dad (artist)
from (None)
hiromitsu (artist)
agatsuma (artist)
through (None)
pandora (service)
             precision    recall  f1-score   support

       None       0.99      0.99      0.99      1544
      album       0.52      0.40      0.45       117
     artist       0.86      0.96      0.91       502
      genre       0.71      0.67      0.69        43
 music_item       0.99      1.00      1.00       180
   playlist       1.00      0.62      0.77        82
    service       0.99      0.99      0.99       187
       sort       0.99      1.00      0.99        83
      track       0.71      0.74      0.72       155
       year       0.99      0.99      0.99       127

avg / total       0.94      0.94      0.93      3020



In [62]:
# Same evaluation calls as the previous cell.
# NOTE(review): this cell's execution count is lower than the training cell's,
# so its output presumably comes from an earlier run; the figures differ
# presumably because the train/test split is not seeded — confirm.
random_fields.test_sample()
random_fields.accuracy_report()

play (None)
me (None)
some (None)
music (None)
by (None)
prince (artist)
alla (artist)
from (None)
the (None)
twenties (year)
             precision    recall  f1-score   support

       None       0.99      0.99      0.99      1554
      album       0.59      0.48      0.53       114
     artist       0.86      0.97      0.91       494
      genre       1.00      0.73      0.84        48
 music_item       1.00      1.00      1.00       177
   playlist       0.78      0.72      0.75        82
    service       0.99      0.99      0.99       188
       sort       0.99      0.97      0.98        75
      track       0.68      0.60      0.63       136
       year       1.00      0.99      1.00       122

avg / total       0.94      0.94      0.94      2990



In [54]:
# Same evaluation calls as the previous cells.
# NOTE(review): this cell's execution count is lower than the training cell's,
# so its output presumably comes from an earlier run; the figures differ
# presumably because the train/test split is not seeded — confirm.
random_fields.test_sample()
random_fields.accuracy_report()

play (None)
google (service)
music (service)
tunes (None)
             precision    recall  f1-score   support

       None       0.98      1.00      0.99      1584
      album       0.40      0.29      0.34       113
     artist       0.86      0.96      0.91       522
      genre       0.83      0.44      0.58        34
 music_item       0.99      1.00      1.00       164
   playlist       0.75      0.62      0.68        68
    service       1.00      1.00      1.00       197
       sort       0.97      1.00      0.98        87
      track       0.67      0.56      0.61       174
       year       0.99      0.99      0.99       127

avg / total       0.92      0.93      0.92      3070

