# Random fields

## Import and list data-bases

### Import packages

In [4]:
import sys, os, json, numpy as np, pycrfsuite
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Import functions and list data-bases

In [5]:
from utils.cleaning import alternative_data_base, extract_features, get_entities
dbs_path = f'{module_path}/query_dbs/'
models_path = f'{module_path}/models/'
list_files = os.listdir(dbs_path)
list_query_type = [file_name.replace('.json','') for file_name in list_files]

## Define a class that load the data-base, train a model, and test it

In [69]:
random_fields.y_test

[['None',
  'None',
  'None',
  'None',
  'object_type',
  'None',
  'object_name',
  'object_name'],
 ['None',
  'None',
  'None',
  'None',
  'None',
  'current_location',
  'None',
  'timeRange',
  'None'],
 ['object_name',
  'object_name',
  'object_name',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'rating_value',
  'None',
  'None',
  'None',
  'None',
  'None',
  'best_rating'],
 ['None',
  'None',
  'object_select',
  'object_type',
  'rating_value',
  'None',
  'None',
  'best_rating'],
 ['None',
  'None',
  'None',
  'None',
  'music_item',
  'None',
  'None',
  'music_item',
  'None',
  'album'],
 ['None',
  'None',
  'object_type',
  'object_type',
  'None',
  'location_name',
  'location_name'],
 ['None', 'None', 'artist', 'artist', 'None', 'None', 'year'],
 ['None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'restaurant_type',
  'None',
  'party_size_number'],
 ['None',
  'None',
  'music_item',
  'None',
  'None',
  'playlist',
  'playlist',
  

In [79]:
class RandomFields:
    def __init__(self, type_of_query):
        self.type_of_query = type_of_query
        self.model_file_path = f'{models_path}{self.type_of_query}.model'
    
    def drop_duplicates(self):
        temp = [json.dumps(liste) for liste in self.alternative_data_with_duplicates]
        temp = list(set(temp))
        temp = [json.loads(liste) for liste in temp]
        self.alternative_data = temp
        
    def load_db(self,index):
        data = json.load(open(dbs_path  + self.type_of_query+ '.json'))[self.type_of_query]
        self.alternative_data_with_duplicates = alternative_data_base(data)
        self.drop_duplicates()
        X = [extract_features(query) for query in self.alternative_data]
        y = [get_entities(query) for query in self.alternative_data]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2)
        if index == 0:
            print(f'For the type of query : {type_of_query} \n')
            print("Raw data looks like : ")
            pprint(data[0])
            print(f"\n Alternative data looks like :")
            pprint(self.alternative_data[0])
            print('\n Data to feed the algorithm looks like : ')
            pprint(X[0])
            
    def train_model(self,l1_penalty=0.1,l2_penalty=0.01,max_iterations=200):
        trainer = pycrfsuite.Trainer(verbose=True)
        for xseq, yseq in zip(self.X_train, self.y_train):
            trainer.append(xseq, yseq)
        trainer.set_params({'c1': l1_penalty,
                            'c2': l2_penalty,
                            'max_iterations': max_iterations
                            })
        trainer.train(self.model_file_path)
    def pred(self):
        tagger = pycrfsuite.Tagger()
        tagger.open(self.model_file_path)
        self.y_pred = [tagger.tag(xseq) for xseq in self.X_test] 
        self.correspondance = [(extract_words_from_X(xseq),yseq,tagger.tag(xseq)) for xseq, yseq in zip(self.X_test,self.y_test)]
        
    def test_sample(self):
        self.pred()
        i = 23
        for x, y in zip(self.y_pred[i], [x[1].split("=")[1] for x in self.X_test[i]]):
            print("%s (%s)" % (y, x))
    
    def accuracy_report(self):
        self.pred()
        # Convert the sequences of tags into a 1-dimensional array
        self.predictions = np.array([tag for row in self.y_pred for tag in row])
        self.truths = np.array([tag for row in self.y_test for tag in row])
        print(classification_report(
            self.truths, self.predictions
             ))

In [71]:
random_fields.X_train[0]

[['bias',
  'word.lower=i',
  'word.isupper=True',
  'word.istitle=True',
  'word.isdigit=False',
  'place_in_query=0',
  'len_query=14',
  'BOS',
  '+1:word.lower=need',
  '+1:word.istitle=False',
  '+1:word.isdigit=False'],
 ['bias',
  'word.lower=need',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'place_in_query=1',
  'len_query=14',
  '-1:word.lower=i',
  '-1:word.istitle=True',
  '-1:word.isdigit=False',
  '+1:word.lower=a',
  '+1:word.istitle=False',
  '+1:word.isdigit=False'],
 ['bias',
  'word.lower=a',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'place_in_query=2',
  'len_query=14',
  '-1:word.lower=need',
  '-1:word.istitle=False',
  '-1:word.isdigit=False',
  '+1:word.lower=reservation',
  '+1:word.istitle=False',
  '+1:word.isdigit=False'],
 ['bias',
  'word.lower=reservation',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'place_in_query=3',
  'len_query=14',
  '-1:word.lower=a',
  '-1:

## Train and save a model for each type of query

In [29]:
for index, type_of_query in enumerate(list_query_type):
    random_fields = RandomFields(type_of_query)
    random_fields.load_db(index=index)
    #random_fields.train_model()
    random_fields.accuracy_report()
    #break

For the type of query : PlayMusic 

Raw data looks like : 
{'data': [{'text': 'I need to hear the '},
          {'entity': 'music_item', 'text': 'song'},
          {'text': ' '},
          {'entity': 'track', 'text': 'Aspro Mavro'},
          {'text': ' from '},
          {'entity': 'artist', 'text': 'Bill Szymczyk'},
          {'text': ' on '},
          {'entity': 'service', 'text': 'Youtube'}]}

 Alternative data looks like :
[{'entity': 'None', 'text': 'play'},
 {'entity': 'playlist', 'text': 'Punk'},
 {'entity': 'playlist', 'text': 'Essentials'},
 {'entity': 'None', 'text': 'on'},
 {'entity': 'service', 'text': 'Zvooq'}]

 Data to feed the algorithm looks like : 
[['bias',
  'word.lower=play',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'place_in_query=0',
  'len_query=5',
  'BOS',
  '+1:word.lower=punk',
  '+1:word.istitle=True',
  '+1:word.isdigit=False'],
 ['bias',
  'word.lower=punk',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=

                            precision    recall  f1-score   support

                      None       1.00      1.00      1.00      1467
               best_rating       1.00      1.00      1.00       211
               object_name       1.00      1.00      1.00       810
object_part_of_series_type       1.00      1.00      1.00        58
             object_select       0.99      0.99      0.99       204
               object_type       1.00      1.00      1.00       200
               rating_unit       1.00      1.00      1.00       239
              rating_value       1.00      1.00      1.00       405

               avg / total       1.00      1.00      1.00      3594



In [80]:
random_fields = RandomFields("AllQueries")
random_fields.load_db(index=index)

In [81]:
random_fields.accuracy_report()

                            precision    recall  f1-score   support

                      None       1.00      1.00      1.00     12718
                     album       0.93      0.86      0.89       106
                    artist       0.98      0.98      0.98       818
               best_rating       1.00      1.00      1.00       215
                      city       0.95      0.98      0.96       360
     condition_description       1.00      1.00      1.00        94
     condition_temperature       1.00      1.00      1.00        74
                   country       0.99      0.96      0.98       247
                   cuisine       0.98      0.92      0.95        48
          current_location       1.00      1.00      1.00        74
               entity_name       0.93      0.93      0.93       430
                  facility       1.00      1.00      1.00        39
                     genre       0.97      0.97      0.97        30
            geographic_poi       0.99      0.97

# Etudes des erreurs

In [83]:
random_fields.correspondance[0]
future_df = []
for query_id, query in enumerate(random_fields.correspondance):
    sentence = query[0]
    truths = query[1]
    predictions = query[2]
    for word, truth_entity, entity_pred in zip(sentence,truths, predictions):
        row={}
        row ={"query_id":query_id, "word":word, "truth_entity":truth_entity,"entity_pred":entity_pred}
        future_df.append(row)

In [84]:
new = pd.DataFrame(future_df)

In [92]:
new["algo_is_not_right"] = new.apply(lambda row : row["truth_entity"]!=row["entity_pred"],axis=1)

In [98]:
new.groupby('query_id')["algo_is_not_right"].sum().reset_index(name="nb_error").groupby('nb_error').size()

nb_error
0.0    2740
1.0      57
2.0      25
3.0      21
4.0       7
5.0       4
6.0       3
7.0       1
8.0       1
dtype: int64

In [72]:
def extract_words_from_X(X):
    words = []
    for features in X:
        word = features[1]
        word = word.replace('word.lower=','')
        words.append(word)
    return words

In [42]:
tryed = pd.Series(random_fields.X_test).map(extract_words_from_X)

In [51]:
new=pd.DataFrame(index=range(2858))
new["text"] = tryed
new["pred"] = pd.Series(random_fields.predictions)
new["truths"] = pd.Series(random_fields.truths)

In [55]:
len(new)

2858

In [54]:
new[new.pred == new.truths]

Unnamed: 0,text,pred,truths
0,the,object_name,object_name
1,add,object_name,object_name
2,give,object_name,object_name
3,put,object_part_of_series_type,object_part_of_series_type
4,add,,
5,book,,
6,where,,
7,will,rating_value,rating_value
8,use,rating_unit,rating_unit
9,i'd,,


In [53]:
new[new.pred != new.truths]

Unnamed: 0,text,pred,truths
393,find,party_size_number,cuisine
641,give,object_type,object_part_of_series_type
821,show,,object_name
860,what's,music_item,object_type
1020,add,object_select,
1186,book,object_name,playlist
1781,play,object_type,
1784,add,city,movie_name
1799,will,rating_value,party_size_number
2044,i,object_select,


In [58]:
random_fields.accuracy_report()

                            precision    recall  f1-score   support

                      None       1.00      1.00      1.00     12860
                     album       0.91      0.84      0.87       179
                    artist       0.98      0.97      0.98       825
               best_rating       0.99      1.00      0.99       219
                      city       0.97      0.99      0.98       364
     condition_description       1.00      1.00      1.00        77
     condition_temperature       1.00      1.00      1.00       100
                   country       0.99      0.98      0.98       278
                   cuisine       1.00      0.91      0.95        55
          current_location       1.00      1.00      1.00        94
               entity_name       0.95      0.94      0.94       394
                  facility       1.00      1.00      1.00        37
                     genre       0.98      0.98      0.98        54
            geographic_poi       1.00      0.98

# Combinaison des deux modèles

In [15]:
data = json.load(open(dbs_path  + 'AllQueriesNoDuplicates.json'))

In [16]:
type(data)

list

In [20]:
import pandas as pd
new = pd.DataFrame(data)

In [22]:
len(new)#.head()

14294