# Random fields

## Import and list data-bases

### Import packages

In [7]:
import sys, os, json
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pprint import pprint
from sklearn.model_selection import train_test_split
import pycrfsuite
#pip install python-crfsuite

### Import functions and list data-bases

In [12]:
from utils.cleaning import alternative_data_base, extract_features, get_entities
dbs_path = f'{module_path}/query_dbs/'
models_path = f'{module_path}/models/'
list_files = os.listdir(dbs_path)
list_query_type = [file_name.replace('.json','') for file_name in list_files]

## Define a class that load the data-base, train a model, and test it

In [22]:
class RandomFields:
    def __init__(self, type_of_query):
        self.type_of_query = type_of_query
        self.model_file_path = f'{models_path}{self.type_of_query}.model'
        
    def load_db(self,index):
        data = json.load(open(dbs_path  + self.type_of_query+ '.json'))[self.type_of_query]
        alternative_data = alternative_data_base(data)
        X = [extract_features(query) for query in alternative_data]
        y = [get_entities(query) for query in alternative_data]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2)
        if index == 0:
            print(f'For the type of query : {type_of_query} \n')
            print("Raw data looks like : ")
            pprint(data[0])
            print(f"\n Alternative data looks like :")
            pprint(alternative_data[0])
            print('\n Data to feed the algorithm looks like : ')
            pprint(X[0])
            
    def train_model(self):
        trainer = pycrfsuite.Trainer(verbose=True)
        for xseq, yseq in zip(self.X_train, self.y_train):
            trainer.append(xseq, yseq)
        trainer.set_params({'c1': 0.1,# L1 penalty
                            'c2': 0.01,# L2 penalty
                            'max_iterations': 200,
                            # whether to include transitions that
                            # are possible, but not observed
                            'feature.possible_transitions': True})
        trainer.train(self.model_file_path)
        
    def test_sample(self):
        tagger = pycrfsuite.Tagger()
        tagger.open(self.model_file_path)
        y_pred = [tagger.tag(xseq) for xseq in self.X_test]
        i = 23
        for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in self.X_test[i]]):
            print("%s (%s)" % (y, x))

## Train and save a model for each type of query

In [23]:
for index, type_of_query in enumerate(list_query_type):
    random_fields = RandomFields(type_of_query)
    random_fields.load_db(index=index)
    random_fields.train_model()
    break

For the type of query : PlayMusic 

Raw data looks like : 
{'data': [{'text': 'I need to hear the '},
          {'entity': 'music_item', 'text': 'song'},
          {'text': ' '},
          {'entity': 'track', 'text': 'Aspro Mavro'},
          {'text': ' from '},
          {'entity': 'artist', 'text': 'Bill Szymczyk'},
          {'text': ' on '},
          {'entity': 'service', 'text': 'Youtube'}]}

 Alternative data looks like :
[{'entity': 'None', 'text': 'I'},
 {'entity': 'None', 'text': 'need'},
 {'entity': 'None', 'text': 'to'},
 {'entity': 'None', 'text': 'hear'},
 {'entity': 'None', 'text': 'the'},
 {'entity': 'music_item', 'text': 'song'},
 {'entity': 'track', 'text': 'Aspro'},
 {'entity': 'track', 'text': 'Mavro'},
 {'entity': 'None', 'text': 'from'},
 {'entity': 'artist', 'text': 'Bill'},
 {'entity': 'artist', 'text': 'Szymczyk'},
 {'entity': 'None', 'text': 'on'},
 {'entity': 'service', 'text': 'Youtube'}]

 Data to feed the algorithm looks like : 
[['bias',
  'word.lower=i',

***** Iteration #17 *****
Loss: 2548.126795
Feature norm: 32.151621
Error norm: 666.039525
Active features: 8554
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #18 *****
Loss: 2221.055126
Feature norm: 35.917069
Error norm: 714.891819
Active features: 8534
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #19 *****
Loss: 1892.371361
Feature norm: 40.894347
Error norm: 402.038116
Active features: 8499
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #20 *****
Loss: 1607.167848
Feature norm: 46.346535
Error norm: 267.862427
Active features: 7857
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #21 *****
Loss: 1342.931092
Feature norm: 53.337604
Error norm: 362.354863
Active features: 7207
Line search trials: 1
Line search step: 1.000000
Seconds requir

***** Iteration #68 *****
Loss: 474.803861
Feature norm: 88.932049
Error norm: 8.565909
Active features: 2677
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #69 *****
Loss: 474.310952
Feature norm: 88.920769
Error norm: 18.480751
Active features: 2628
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #70 *****
Loss: 474.197850
Feature norm: 88.893293
Error norm: 41.215274
Active features: 2602
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #71 *****
Loss: 473.602036
Feature norm: 88.934990
Error norm: 15.611083
Active features: 2597
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #72 *****
Loss: 473.368295
Feature norm: 88.930734
Error norm: 17.258370
Active features: 2596
Line search trials: 1
Line search step: 1.000000
Seconds required for this

***** Iteration #109 *****
Loss: 468.113628
Feature norm: 89.836261
Error norm: 11.860889
Active features: 2414
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #110 *****
Loss: 468.066662
Feature norm: 89.840195
Error norm: 18.134549
Active features: 2406
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #111 *****
Loss: 467.950167
Feature norm: 89.855581
Error norm: 12.996534
Active features: 2405
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #112 *****
Loss: 467.911389
Feature norm: 89.857510
Error norm: 18.837065
Active features: 2404
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #113 *****
Loss: 467.801327
Feature norm: 89.873295
Error norm: 13.144041
Active features: 2404
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #161 *****
Loss: 465.137771
Feature norm: 90.086602
Error norm: 14.879199
Active features: 2340
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #162 *****
Loss: 465.077657
Feature norm: 90.080717
Error norm: 9.712823
Active features: 2340
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #163 *****
Loss: 465.059634
Feature norm: 90.080289
Error norm: 14.069061
Active features: 2339
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #164 *****
Loss: 465.004829
Feature norm: 90.074800
Error norm: 9.017829
Active features: 2344
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #165 *****
Loss: 464.983826
Feature norm: 90.072851
Error norm: 12.628229
Active features: 2344
Line search trials: 1
Line search step: 1.000000
Seconds required for 

In [24]:
random_fields.test_sample()

play (None)
the (None)
song (music_item)
american (track)
patrol (track)
by (None)
lauryn (artist)
hill (artist)
