# Random fields

## Import and list data-bases

### Import packages

In [32]:
import sys, os, json, numpy as np, pycrfsuite
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Import functions and list data-bases

In [33]:
from utils.cleaning import alternative_data_base, extract_features, get_entities
dbs_path = f'{module_path}/query_dbs/'
models_path = f'{module_path}/models/'
list_files = os.listdir(dbs_path)
list_query_type = [file_name.replace('.json','') for file_name in list_files]

## Define a class that load the data-base, train a model, and test it

In [92]:
class RandomFields:
    def __init__(self, type_of_query):
        self.type_of_query = type_of_query
        self.model_file_path = f'{models_path}{self.type_of_query}.model'
    
    def drop_duplicates(self):
        temp = [json.dumps(liste) for liste in self.alternative_data_with_duplicates]
        temp = list(set(temp))
        temp = [json.loads(liste) for liste in temp]
        self.alternative_data = temp
        
    def load_db(self,index):
        data = json.load(open(dbs_path  + self.type_of_query+ '.json'))[self.type_of_query]
        print(len(data))
        self.alternative_data_with_duplicates = alternative_data_base(data)
        self.drop_duplicates()
        X = [extract_features(query) for query in self.alternative_data]
        y = [get_entities(query) for query in self.alternative_data]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2)
        if index == 0:
            print(f'For the type of query : {type_of_query} \n')
            print("Raw data looks like : ")
            pprint(data[0])
            print(f"\n Alternative data looks like :")
            pprint(self.alternative_data[0])
            print('\n Data to feed the algorithm looks like : ')
            pprint(X[0])
            
    def train_model(self,l1_penalty=0.1,l2_penalty=0.01,max_iterations=200):
        trainer = pycrfsuite.Trainer(verbose=True)
        for xseq, yseq in zip(self.X_train, self.y_train):
            trainer.append(xseq, yseq)
        trainer.set_params({'c1': l1_penalty,
                            'c2': l2_penalty,
                            'max_iterations': max_iterations
                            })
        trainer.train(self.model_file_path)
    def pred(self):
        tagger = pycrfsuite.Tagger()
        tagger.open(self.model_file_path)
        self.y_pred = [tagger.tag(xseq) for xseq in self.X_test]        
        
    def test_sample(self):
        self.pred()
        i = 23
        for x, y in zip(self.y_pred[i], [x[1].split("=")[1] for x in self.X_test[i]]):
            print("%s (%s)" % (y, x))
    
    def accuracy_report(self):
        self.pred()
        # Convert the sequences of tags into a 1-dimensional array
        predictions = np.array([tag for row in self.y_pred for tag in row])
        truths = np.array([tag for row in self.y_test for tag in row])
        print(classification_report(
            truths, predictions
             ))

## Train and save a model for each type of query

In [69]:
for index, type_of_query in enumerate(list_query_type):
    random_fields = RandomFields(type_of_query)
    random_fields.load_db(index=index)
    random_fields.train_model()
    break

For the type of query : PlayMusic 

Raw data looks like : 
{'data': [{'text': 'I need to hear the '},
          {'entity': 'music_item', 'text': 'song'},
          {'text': ' '},
          {'entity': 'track', 'text': 'Aspro Mavro'},
          {'text': ' from '},
          {'entity': 'artist', 'text': 'Bill Szymczyk'},
          {'text': ' on '},
          {'entity': 'service', 'text': 'Youtube'}]}

 Alternative data looks like :
[{'entity': 'None', 'text': 'I'},
 {'entity': 'None', 'text': 'need'},
 {'entity': 'None', 'text': 'to'},
 {'entity': 'None', 'text': 'hear'},
 {'entity': 'None', 'text': 'the'},
 {'entity': 'music_item', 'text': 'song'},
 {'entity': 'track', 'text': 'Aspro'},
 {'entity': 'track', 'text': 'Mavro'},
 {'entity': 'None', 'text': 'from'},
 {'entity': 'artist', 'text': 'Bill'},
 {'entity': 'artist', 'text': 'Szymczyk'},
 {'entity': 'None', 'text': 'on'},
 {'entity': 'service', 'text': 'Youtube'}]

 Data to feed the algorithm looks like : 
[['bias',
  'word.lower=i',

TypeError: train_model() missing 1 required positional argument: 'c1'

In [90]:
print(len(random_fields.alternative_data))
len(random_fields.alternative_data_with_duplicates)

14294


14484

In [93]:
random_fields = RandomFields("AllQueries")
random_fields.load_db(index=index)

14484
For the type of query : PlayMusic 

Raw data looks like : 
{'data': [{'text': 'I need to hear the '},
          {'entity': 'music_item', 'text': 'song'},
          {'text': ' '},
          {'entity': 'track', 'text': 'Aspro Mavro'},
          {'text': ' from '},
          {'entity': 'artist', 'text': 'Bill Szymczyk'},
          {'text': ' on '},
          {'entity': 'service', 'text': 'Youtube'}]}

 Alternative data looks like :
[{'entity': 'None', 'text': 'I'},
 {'entity': 'None', 'text': 'need'},
 {'entity': 'None', 'text': 'a'},
 {'entity': 'restaurant_type', 'text': 'restaurant'},
 {'entity': 'None', 'text': 'in'},
 {'entity': 'state', 'text': 'Iowa'},
 {'entity': 'None', 'text': 'for'},
 {'entity': 'timeRange', 'text': '0'},
 {'entity': 'timeRange', 'text': "o'clock"}]

 Data to feed the algorithm looks like : 
[['bias',
  'word.lower=i',
  'word.isupper=True',
  'word.istitle=True',
  'word.isdigit=False',
  'place_in_query=0',
  'len_query=9',
  'BOS',
  '+1:word.lower=nee

In [94]:
random_fields.train_model()
#random_fields.test_sample()
random_fields.accuracy_report()

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 48078
Seconds required: 0.740

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 271794.706236
Feature norm: 1.000000
Error norm: 75221.128434
Active features: 47874
Line search trials: 1
Line search step: 0.000008
Seconds required for this iteration: 1.093

***** Iteration #2 *****
Loss: 252181.674455
Feature norm: 1.738644
Error norm: 59168.342952
Active features: 47147
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.093

***** Iteration #3 *****
Loss: 242530.953753
Feature norm: 1.635144
Error norm: 30748.471458
Active features: 47854
Line search trials: 1
Line search step: 1.000000
Seconds required 

***** Iteration #39 *****
Loss: 6702.062858
Feature norm: 212.694379
Error norm: 1887.850439
Active features: 33263
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.625

***** Iteration #40 *****
Loss: 6391.277581
Feature norm: 216.870261
Error norm: 611.208065
Active features: 32977
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.521

***** Iteration #41 *****
Loss: 6104.676583
Feature norm: 221.769768
Error norm: 569.388594
Active features: 32178
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.528

***** Iteration #42 *****
Loss: 5900.480263
Feature norm: 225.217361
Error norm: 211.750369
Active features: 31733
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.488

***** Iteration #43 *****
Loss: 5689.930613
Feature norm: 228.807382
Error norm: 446.303333
Active features: 31307
Line search trials: 1
Line search step: 1.000000
Sec

***** Iteration #82 *****
Loss: 4306.620900
Feature norm: 255.776754
Error norm: 275.892522
Active features: 19334
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.443

***** Iteration #83 *****
Loss: 4298.666016
Feature norm: 256.034393
Error norm: 224.772825
Active features: 19234
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.432

***** Iteration #84 *****
Loss: 4292.686882
Feature norm: 256.336886
Error norm: 246.946280
Active features: 19149
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.419

***** Iteration #85 *****
Loss: 4285.825560
Feature norm: 256.587749
Error norm: 227.621537
Active features: 19082
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.438

***** Iteration #86 *****
Loss: 4280.417002
Feature norm: 256.871319
Error norm: 245.172696
Active features: 18972
Line search trials: 1
Line search step: 1.000000
Seco

***** Iteration #125 *****
Loss: 4132.042825
Feature norm: 263.719816
Error norm: 160.882363
Active features: 17050
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.427

***** Iteration #126 *****
Loss: 4129.202975
Feature norm: 263.843140
Error norm: 181.813067
Active features: 17031
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.425

***** Iteration #127 *****
Loss: 4125.718721
Feature norm: 263.984139
Error norm: 160.266954
Active features: 16980
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.418

***** Iteration #128 *****
Loss: 4123.044096
Feature norm: 264.107363
Error norm: 181.256831
Active features: 16931
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.416

***** Iteration #129 *****
Loss: 4119.523093
Feature norm: 264.234820
Error norm: 156.481061
Active features: 16895
Line search trials: 1
Line search step: 1.000000

***** Iteration #167 *****
Loss: 4034.684188
Feature norm: 265.720969
Error norm: 126.291097
Active features: 16033
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.420

***** Iteration #168 *****
Loss: 4033.027306
Feature norm: 265.735557
Error norm: 104.280554
Active features: 16018
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.415

***** Iteration #169 *****
Loss: 4031.956229
Feature norm: 265.748969
Error norm: 120.815999
Active features: 15993
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.417

***** Iteration #170 *****
Loss: 4030.366973
Feature norm: 265.760523
Error norm: 102.434843
Active features: 15970
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.413

***** Iteration #171 *****
Loss: 4029.392880
Feature norm: 265.771411
Error norm: 118.249292
Active features: 15939
Line search trials: 1
Line search step: 1.000000

In [62]:
random_fields.test_sample()
random_fields.accuracy_report()

play (None)
me (None)
some (None)
music (None)
by (None)
prince (artist)
alla (artist)
from (None)
the (None)
twenties (year)
             precision    recall  f1-score   support

       None       0.99      0.99      0.99      1554
      album       0.59      0.48      0.53       114
     artist       0.86      0.97      0.91       494
      genre       1.00      0.73      0.84        48
 music_item       1.00      1.00      1.00       177
   playlist       0.78      0.72      0.75        82
    service       0.99      0.99      0.99       188
       sort       0.99      0.97      0.98        75
      track       0.68      0.60      0.63       136
       year       1.00      0.99      1.00       122

avg / total       0.94      0.94      0.94      2990



In [54]:
random_fields.test_sample()
random_fields.accuracy_report()

play (None)
google (service)
music (service)
tunes (None)
             precision    recall  f1-score   support

       None       0.98      1.00      0.99      1584
      album       0.40      0.29      0.34       113
     artist       0.86      0.96      0.91       522
      genre       0.83      0.44      0.58        34
 music_item       0.99      1.00      1.00       164
   playlist       0.75      0.62      0.68        68
    service       1.00      1.00      1.00       197
       sort       0.97      1.00      0.98        87
      track       0.67      0.56      0.61       174
       year       0.99      0.99      0.99       127

avg / total       0.92      0.93      0.92      3070

