# Random fields

## Import and list data-bases

### Import packages

In [1]:
import sys, os, json, numpy as np, pycrfsuite
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Import functions and list data-bases

In [2]:
from utils.rf_utils import alternative_data_base, extract_features, get_entities, extract_words_from_X
dbs_path = f'{module_path}/query_dbs/'
models_path = f'{module_path}/models/'
list_files = os.listdir(dbs_path)
list_query_type = [file_name.replace('.json','') for file_name in list_files]

## Define a class that load the data-base, train a model, and test it

In [3]:
class RandomFields:
    def __init__(self, type_of_query):
        self.type_of_query = type_of_query
        self.model_file_path = f'{models_path}{self.type_of_query}.model'
    
    def drop_duplicates(self):
        temp = [json.dumps(liste) for liste in self.alternative_data_with_duplicates]
        temp = list(set(temp))
        temp = [json.loads(liste) for liste in temp]
        self.alternative_data = temp
        
    def load_db(self,index):
        data = json.load(open(dbs_path  + self.type_of_query+ '.json'))[self.type_of_query]
        self.alternative_data_with_duplicates = alternative_data_base(data)
        self.drop_duplicates()
        X = [extract_features(query) for query in self.alternative_data]
        y = [get_entities(query) for query in self.alternative_data]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2)
        if index == 0:
            print(f'For the type of query : {type_of_query} \n')
            print("Raw data looks like : ")
            pprint(data[0])
            print(f"\n Alternative data looks like :")
            pprint(self.alternative_data[0])
            print('\n Data to feed the algorithm looks like : ')
            pprint(X[0])
            
    def train_model(self,l1_penalty=0.1,l2_penalty=0.01,max_iterations=200):
        trainer = pycrfsuite.Trainer(verbose=True)
        for xseq, yseq in zip(self.X_train, self.y_train):
            trainer.append(xseq, yseq)
        trainer.set_params({'c1': l1_penalty,
                            'c2': l2_penalty,
                            'max_iterations': max_iterations
                            })
        trainer.train(self.model_file_path)
    def pred(self):
        tagger = pycrfsuite.Tagger()
        tagger.open(self.model_file_path)
        self.y_pred = [tagger.tag(xseq) for xseq in self.X_test] 
        self.correspondance = [(extract_words_from_X(xseq),yseq,tagger.tag(xseq)) for xseq, yseq in zip(self.X_test,self.y_test)]
        
    def test_sample(self):
        self.pred()
        i = 23
        for x, y in zip(self.y_pred[i], [x[1].split("=")[1] for x in self.X_test[i]]):
            print("%s (%s)" % (y, x))
    
    def accuracy_report(self):
        self.pred()
        # Convert the sequences of tags into a 1-dimensional array
        self.predictions = np.array([tag for row in self.y_pred for tag in row])
        self.truths = np.array([tag for row in self.y_test for tag in row])
        print(classification_report(
            self.truths, self.predictions
             ))

## Train and save a model for each type of query

In [4]:
for index, type_of_query in enumerate(list_query_type):
    random_fields = RandomFields(type_of_query)
    random_fields.load_db(index=index)
    #random_fields.train_model()
    random_fields.accuracy_report()
    #break

For the type of query : PlayMusic 

Raw data looks like : 
{'data': [{'text': 'I need to hear the '},
          {'entity': 'music_item', 'text': 'song'},
          {'text': ' '},
          {'entity': 'track', 'text': 'Aspro Mavro'},
          {'text': ' from '},
          {'entity': 'artist', 'text': 'Bill Szymczyk'},
          {'text': ' on '},
          {'entity': 'service', 'text': 'Youtube'}]}

 Alternative data looks like :
[{'entity': 'None', 'text': 'Play'},
 {'entity': 'None', 'text': 'the'},
 {'entity': 'sort', 'text': 'greatest'},
 {'entity': 'music_item', 'text': 'soundtrack'},
 {'entity': 'None', 'text': 'by'},
 {'entity': 'artist', 'text': 'Nhat'},
 {'entity': 'artist', 'text': 'Son'},
 {'entity': 'None', 'text': 'on'},
 {'entity': 'service', 'text': 'Last'},
 {'entity': 'service', 'text': 'Fm'}]

 Data to feed the algorithm looks like : 
[['bias',
  'word.lower=play',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=False',
  'place_in_query=0',
  'len_query

FileNotFoundError: [Errno 2] No such file or directory: '/Users/francois/Documents/Python_files/NLP/slot_filling/query_dbs/.DS_Store.json'

In [5]:
random_fields = RandomFields("AllQueries")
random_fields.load_db(index=index)

In [6]:
random_fields.accuracy_report()

                            precision    recall  f1-score   support

                      None       1.00      1.00      1.00     12697
                     album       0.90      0.92      0.91       124
                    artist       0.99      0.97      0.98       836
               best_rating       1.00      1.00      1.00       202
                      city       0.98      0.98      0.98       381
     condition_description       1.00      1.00      1.00        86
     condition_temperature       1.00      1.00      1.00       108
                   country       1.00      0.99      0.99       250
                   cuisine       0.98      0.93      0.95        44
          current_location       1.00      1.00      1.00        76
               entity_name       0.93      0.96      0.94       385
                  facility       1.00      0.97      0.99        40
                     genre       0.93      0.96      0.95        28
            geographic_poi       0.98      1.00

# Etudes des erreurs

In [10]:
import pandas as pd

In [11]:
random_fields.correspondance[0]
future_df = []
for query_id, query in enumerate(random_fields.correspondance):
    sentence = query[0]
    truths = query[1]
    predictions = query[2]
    for word, truth_entity, entity_pred in zip(sentence,truths, predictions):
        row={}
        row ={"query_id":query_id, "word":word, "truth_entity":truth_entity,"entity_pred":entity_pred}
        future_df.append(row)

In [12]:
new = pd.DataFrame(future_df)

In [13]:
new["algo_is_not_right"] = new.apply(lambda row : row["truth_entity"]!=row["entity_pred"],axis=1)

In [14]:
new.groupby('query_id')["algo_is_not_right"].sum().reset_index(name="nb_error_in_query").groupby('nb_error_in_query').size()

nb_error_in_query
0.0     2736
1.0       65
2.0       22
3.0       19
4.0        6
5.0        5
6.0        4
8.0        1
16.0       1
dtype: int64

# Combinaison des deux modèles

In [None]:
data = json.load(open(dbs_path  + 'AllQueriesNoDuplicates.json'))

In [None]:
type(data)

In [None]:
import pandas as pd
new = pd.DataFrame(data)

In [None]:
len(new)#.head()