# Random fields

## Import and list data-bases

### Import packages

In [76]:
import sys, os, json
cwd = os.getcwd()
if cwd not in sys.path:
    sys.path.append(cwd)
from pprint import pprint
from sklearn.model_selection import train_test_split
import pycrfsuite
#pip install python-crfsuite

### Import functions and list data-bases

In [None]:
from utils.cleaning import alternative_data_base, extract_features, get_entities
dbs_path = 'query_dbs/'
list_files = os.listdir(dbs_path)
list_query_type = [file_name.replace('.json','') for file_name in list_files]

## Define a class that load the data-base, train a model, and test it

In [96]:
class RandomFields:
    def __init__(self, type_of_query):
        self.type_of_query = type_of_query
        self.model_file_name = f'{self.type_of_query}.model'
        
    def load_db(self,index):
        data = json.load(open(dbs_path  + self.type_of_query+ '.json'))[self.type_of_query]
        alternative_data = alternative_data_base(data)
        X = [extract_features(query) for query in alternative_data]
        y = [get_entities(query) for query in alternative_data]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2)
        if index == 0:
            print(f'For the type of query : {type_of_query} \n')
            print("Raw data looks like : ")
            pprint(data[0])
            print(f"\n Alternative data looks like :")
            pprint(alternative_data[0])
            print('\n Data to feed the algorithm looks like : ')
            pprint(X[0])
            
    def train_model(self):
        trainer = pycrfsuite.Trainer(verbose=True)
        for xseq, yseq in zip(self.X_train, self.y_train):
            trainer.append(xseq, yseq)
        trainer.set_params({'c1': 0.1,# L1 penalty
                            'c2': 0.01,# L2 penalty
                            'max_iterations': 200,
                            # whether to include transitions that
                            # are possible, but not observed
                            'feature.possible_transitions': True})
        trainer.train(self.model_file_name)
        
    def test_sample(self):
        tagger = pycrfsuite.Tagger()
        tagger.open(self.model_file_name)
        y_pred = [tagger.tag(xseq) for xseq in self.X_test]
        i = 10
        for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
            print("%s (%s)" % (y, x))

## Train and save a model for each type of query

In [97]:
for index, type_of_query in enumerate(list_query_type):
    random_fields = RandomFields(type_of_query)
    random_fields.load_db(index=index)
    #random_fields.train_model()

For the type of query : PlayMusic 

Raw data looks like : 
{'data': [{'text': 'I need to hear the '},
          {'entity': 'music_item', 'text': 'song'},
          {'text': ' '},
          {'entity': 'track', 'text': 'Aspro Mavro'},
          {'text': ' from '},
          {'entity': 'artist', 'text': 'Bill Szymczyk'},
          {'text': ' on '},
          {'entity': 'service', 'text': 'Youtube'}]}

 Alternative data looks like :
[{'entity': 'None', 'text': 'I'},
 {'entity': 'None', 'text': 'need'},
 {'entity': 'None', 'text': 'to'},
 {'entity': 'None', 'text': 'hear'},
 {'entity': 'None', 'text': 'the'},
 {'entity': 'music_item', 'text': 'song'},
 {'entity': 'track', 'text': 'Aspro'},
 {'entity': 'track', 'text': 'Mavro'},
 {'entity': 'None', 'text': 'from'},
 {'entity': 'artist', 'text': 'Bill'},
 {'entity': 'artist', 'text': 'Szymczyk'},
 {'entity': 'None', 'text': 'on'},
 {'entity': 'service', 'text': 'Youtube'}]

 Data to feed the algorithm looks like : 
[['bias',
  'word.lower=i',

In [82]:
random_fields.test_sample()

open (None)
spotify (None)
and (None)
play (None)
a (object_name)
song (object_name)
from (object_name)
the (rating_value)
twenties (None)
by (None)
richard (best_rating)
