# Random fields

## Import and list data-bases

### Import packages

In [75]:
import sys, os, json, numpy as np, pandas as pd, pycrfsuite
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Import functions and list data-bases

In [2]:
from utils.rf_utils import alternative_data_base, extract_features, get_entities, extract_words_from_X
dbs_path = f'{module_path}/query_dbs/'
models_path = f'{module_path}/models/'
list_files = os.listdir(dbs_path)
list_query_type = [file_name.replace('.json','') for file_name in list_files if '.json' in file_name]

## Define a class that load the data-base, train a model, and test it

In [86]:
from utils.classif_model import predict
# The model to predict the type of query given the query is imported here and it will be used in the smart_pred method below
test = ["Rate Harry Potter first book zero of 6 ","Add Also sprach Zarathustra to legend movie soundtracks"]
test = np.array(test)
predict(test)

['RateBook', 'AddToPlaylist']

In [76]:
class RandomFields:
    def __init__(self, type_of_query):
        self.type_of_query = type_of_query
        self.model_file_path = f'{models_path}{self.type_of_query}.model'
        self.list_query_type = [query_type for query_type in list_query_type if query_type!='AllQueries' ]
    
    def drop_duplicates(self):
        temp = [json.dumps(liste) for liste in self.alternative_data_with_duplicates]
        temp = list(set(temp))
        temp = [json.loads(liste) for liste in temp]
        self.alternative_data = temp
        
    def load_db(self,index=1):
        data = json.load(open(dbs_path  + self.type_of_query+ '.json'))[self.type_of_query]
        self.alternative_data_with_duplicates = alternative_data_base(data)
        self.drop_duplicates()
        X = [extract_features(query) for query in self.alternative_data]
        y = [get_entities(query) for query in self.alternative_data]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.25,random_state=12)
        if index == 0:
            print(f'For the type of query : {type_of_query} \n')
            print("Raw data looks like : ")
            pprint(data[0])
            print(f"\n Alternative data looks like :")
            pprint(self.alternative_data[0])
            print('\n Data to feed the algorithm looks like : ')
            pprint(X[0])
            
    def train_model(self,l1_penalty=0.1,l2_penalty=0.01,max_iterations=200):
        trainer = pycrfsuite.Trainer(verbose=True)
        for xseq, yseq in zip(self.X_train, self.y_train):
            trainer.append(xseq, yseq)
        trainer.set_params({'c1': l1_penalty,
                            'c2': l2_penalty,
                            'max_iterations': max_iterations
                            })
        trainer.train(self.model_file_path)
    def pred(self):
        tagger = pycrfsuite.Tagger()
        tagger.open(self.model_file_path)
        self.y_pred = [tagger.tag(xseq) for xseq in self.X_test] 
        self.correspondance = [(extract_words_from_X(xseq),yseq,tagger.tag(xseq)) for xseq, yseq in zip(self.X_test,self.y_test)]
    
    def smart_pred(self):
        test_type_of_query = np.array([xseq[0][-1] for xseq in self.X_test])
        self.predictions_type_of_query  = predict(test_type_of_query)
        predictions = {}
        for type_of_query in self.list_query_type:
            tagger = pycrfsuite.Tagger()
            tagger.open(f'{models_path}{type_of_query}.model')
            for index, xseq, type_of_query_predict in zip(range(len(test_type_of_query)),self.X_test,self.predictions_type_of_query):
                if type_of_query_predict==type_of_query:
                    predictions[index]= tagger.tag(xseq)
        self.y_smart_pred = [predictions[i] for i in range(len(predictions))]

        
    def test_sample(self):
        self.pred()
        i = 23
        for x, y in zip(self.y_pred[i], [x[1].split("=")[1] for x in self.X_test[i]]):
            print("%s (%s)" % (y, x))
    
    def accuracy_report(self,smart_pred=False):
        if smart_pred:
            self.smart_pred()
            y_pred=self.y_smart_pred
        else:
            self.pred()
            y_pred=self.y_pred
        # Convert the sequences of tags into a 1-dimensional array
        self.predictions = np.array([tag for row in y_pred for tag in row])
        self.truths = np.array([tag for row in self.y_test for tag in row])
        print(classification_report(
            self.truths, self.predictions
             ))
    def accuracy_report_smart(self):
        self.smart_pred()
        # Convert the sequences of tags into a 1-dimensional array
        self.predictions = np.array([tag for row in self.y_smart_pred for tag in row])
        self.truths = np.array([tag for row in self.y_test for tag in row])
        print(classification_report(
            self.truths, self.predictions
             ))

## Train and save a model for each type of query

In [84]:
for index, type_of_query in enumerate(list_query_type):
    random_fields = RandomFields(type_of_query)
    random_fields.load_db(index=index)
    #random_fields.train_model()
    #model has already be trained and saved in models, since a random has been set for train_test_split, we are sure to be iso
    random_fields.accuracy_report()

For the type of query : PlayMusic 

Raw data looks like : 
{'data': [{'text': 'I need to hear the '},
          {'entity': 'music_item', 'text': 'song'},
          {'text': ' '},
          {'entity': 'track', 'text': 'Aspro Mavro'},
          {'text': ' from '},
          {'entity': 'artist', 'text': 'Bill Szymczyk'},
          {'text': ' on '},
          {'entity': 'service', 'text': 'Youtube'}]}

 Alternative data looks like :
[{'entity': 'None', 'text': 'Play'},
 {'entity': 'track', 'text': 'All'},
 {'entity': 'track', 'text': 'Things'},
 {'entity': 'track', 'text': 'Must'},
 {'entity': 'track', 'text': 'Pass'}]

 Data to feed the algorithm looks like : 
[['bias',
  'word.lower=play',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=False',
  'place_in_query=0',
  'len_query=5',
  'BOS',
  '+1:word.lower=all',
  '+1:word.istitle=True',
  '+1:word.isdigit=False',
  'Play All Things Must Pass'],
 ['bias',
  'word.lower=all',
  'word.isupper=False',
  'word.istitle=True',

                      precision    recall  f1-score   support

                None       0.99      0.99      0.99      2073
       location_name       0.99      0.99      0.99       348
          movie_name       0.99      0.97      0.98       790
          movie_type       1.00      0.99      1.00       220
object_location_type       0.99      0.99      0.99       179
         object_type       0.97      1.00      0.99       313
    spatial_relation       0.98      1.00      0.99       292
           timeRange       0.97      0.95      0.96       175

         avg / total       0.99      0.99      0.99      4390

                            precision    recall  f1-score   support

                      None       1.00      0.99      0.99      1776
               best_rating       1.00      1.00      1.00       259
               object_name       1.00      0.99      1.00       956
object_part_of_series_type       1.00      1.00      1.00        82
             object_select       0.9

In [66]:
random_fields = RandomFields("PlayMusic")
random_fields.load_db()
random_fields.accuracy_report()
random_fields.test_sample()

             precision    recall  f1-score   support

       None       0.98      0.99      0.99      1903
      album       0.69      0.36      0.47       168
     artist       0.86      0.97      0.91       630
      genre       0.80      0.68      0.74        47
 music_item       1.00      1.00      1.00       211
   playlist       0.86      0.53      0.65       112
    service       1.00      0.97      0.99       236
       sort       0.95      0.99      0.97        98
      track       0.64      0.75      0.69       185
       year       0.99      1.00      1.00       144

avg / total       0.93      0.93      0.92      3734

i'd (None)
like (None)
to (None)
listen (None)
to (None)
tom (artist)
cochrane (artist)
's (None)
1990 (year)
ep (music_item)
on (None)
zvooq (service)


In [67]:
random_fields = RandomFields("AddToPlaylist")
random_fields.load_db()
random_fields.accuracy_report()
random_fields.test_sample()

                precision    recall  f1-score   support

          None       0.99      0.98      0.98      1877
        artist       0.78      0.83      0.81       373
   entity_name       0.82      0.71      0.76       457
    music_item       1.00      1.00      1.00       237
      playlist       0.94      0.97      0.96      1362
playlist_owner       0.96      0.95      0.96       269

   avg / total       0.94      0.94      0.94      4575

add (None)
jeff (artist)
burrows (artist)
album (music_item)
to (None)
my (playlist_owner)
country (playlist)
hits (playlist)
playlist (None)


# Etudes des erreurs

In [77]:
random_fields = RandomFields("AllQueries")
random_fields.load_db()
random_fields.accuracy_report()

                            precision    recall  f1-score   support

                      None       0.99      0.99      0.99     16005
                     album       0.84      0.21      0.33       156
                    artist       0.84      0.87      0.85       927
               best_rating       0.99      1.00      0.99       265
                      city       0.76      0.90      0.82       472
     condition_description       0.98      0.98      0.98       128
     condition_temperature       0.99      0.99      0.99       144
                   country       0.93      0.74      0.82       335
                   cuisine       0.90      0.67      0.77        66
          current_location       1.00      0.99      0.99        97
               entity_name       0.77      0.70      0.74       470
                  facility       1.00      1.00      1.00        33
                     genre       0.92      0.60      0.73        60
            geographic_poi       0.92      0.84

In [80]:
future_df = []
for query_id, query in enumerate(random_fields.correspondance):
    sentence = query[0]
    truths = query[1]
    predictions = query[2]
    for word, truth_entity, entity_pred in zip(sentence,truths, predictions):
        row={}
        row ={"query_id":query_id, "word":word, "truth_entity":truth_entity,"entity_pred":entity_pred}
        future_df.append(row)

In [81]:
df = pd.DataFrame(future_df)
df["algo_is_not_right"] = df.apply(lambda row : row["truth_entity"]!=row["entity_pred"],axis=1)
df.groupby('query_id')["algo_is_not_right"].sum().reset_index(name="nb_error_in_query").groupby('nb_error_in_query').size()

nb_error_in_query
0.0     2874
1.0      276
2.0      180
3.0      109
4.0       70
5.0       22
6.0       20
7.0        7
8.0        9
9.0        5
10.0       1
11.0       1
dtype: int64

# Combinaison des deux modèles

accuracy_report(smart_pred=False) is a method that run method pred or smart_pred (according to the parameter smart_pred)

In [83]:
random_fields.accuracy_report(smart_pred=True)

['find animated movies at Amco Entertainment'
 "I'd like to watch the TV series called Fires of Life"
 'I need a reservation for february 27, 2020 at a bar that serves paté'
 ... 'this series gets 2 out of 6 stars'
 'Look for the novel Behind closed doors'
 'Find Business Ethics: A European Review']
                            precision    recall  f1-score   support

                      None       1.00      1.00      1.00     16005
                     album       0.97      0.81      0.89       156
                    artist       0.95      0.98      0.97       927
               best_rating       1.00      1.00      1.00       265
                      city       0.93      0.96      0.94       472
     condition_description       1.00      1.00      1.00       128
     condition_temperature       1.00      0.99      1.00       144
                   country       0.96      0.91      0.94       335
                   cuisine       0.98      0.94      0.96        66
          current_