In [1]:
#!pip install spacy

In [2]:
# One-time setup: uncomment and run once per environment to fetch the small
# English model that is imported below as `en_core_web_sm`.
#!python -m spacy download en_core_web_sm

In [3]:
import spacy
import en_core_web_sm
import json
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
#from sklearn.model_selection import train_test_split
from spacy.util import minibatch, compounding

In [4]:
# Load the labelled training examples. The context manager closes the file
# handle even if parsing fails, and the explicit encoding keeps non-ASCII
# sentences from being mis-decoded on platforms whose default is not UTF-8.
with open('../data/processed/training_set.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [5]:
# Pull the sentences and their intent labels out of the loaded records,
# keeping both arrays in the same order as `data`.
x_training = np.array([record["sentence"] for record in data])
y_training_raw = np.array([record["intent"] for record in data])

# One-hot encode the intent labels; the labels are reshaped into a single
# column before being handed to the encoder.
encoder = OneHotEncoder(sparse=False)
y_training = encoder.fit_transform(y_training_raw.reshape(-1, 1))

# Sanity check: show the first ten labels next to their encoded rows.
first_rows = y_training[:10]
print("converted from : \n", encoder.inverse_transform(first_rows))
print("\nto one-hot encoded format : \n", first_rows)


converted from : 
 [['irrelevant']
 ['irrelevant']
 ['purchase']
 ['find-hotel']
 ['irrelevant']
 ['irrelevant']
 ['irrelevant']
 ['irrelevant']
 ['purchase']
 ['purchase']]

to one-hot encoded format : 
 [[0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]


In [6]:
# Display the label vocabulary learned by the encoder. `categories_` is a list
# holding one array of category names; the notebook reads the intent labels
# from its first entry (`categories_[0]`) further down.
encoder.categories_

[array(['find-around-me', 'find-flight', 'find-hotel', 'find-restaurant',
        'find-train', 'irrelevant', 'provide-showtimes', 'purchase'],
       dtype='<U17')]

In [7]:
# Spot-check one example: its one-hot label row next to the raw sentence.
sample_idx = 2
print(y_training[sample_idx], " : ", x_training[sample_idx])

# Labels and sentences should come out with the same count.
print(*map(len, (y_training, x_training)))

[0. 0. 0. 0. 0. 0. 0. 1.]  :  Le meilleur cabriolet hybrid moins de 5m10 minimum 400 litres de coffre ?
6035 6035


In [8]:
# Start from the small English pipeline.
# NOTE(review): the sample sentence printed earlier looks French while this
# model is English; confirm that the mismatch is intended.
nlp = en_core_web_sm.load()

# Create the built-in text categorizer and append it as the last component
# of the pipeline.
textcat_config = {"exclusive_classes": True, "architecture": "simple_cnn"}
textcat = nlp.create_pipe("textcat", config=textcat_config)
nlp.add_pipe(textcat, last=True)

# Show the resulting component order.
nlp.pipe_names

['tagger', 'parser', 'ner', 'textcat']

In [9]:
# Register every intent label with the text categorizer, echoing each one
# as it is added.
for label in encoder.categories_[0]:
    print(label)
    textcat.add_label(label)

find-around-me
find-flight
find-hotel
find-restaurant
find-train
irrelevant
provide-showtimes
purchase


In [10]:
# Build one {'cats': {label: score}} annotation per example by pairing each
# one-hot row with the encoder's label names.
intent_labels = encoder.categories_[0]
list_cats_dic = [{'cats': dict(zip(intent_labels, row))} for row in y_training]

# Convert the sentences to built-in str: spaCy cannot handle the numpy.str_
# elements that the array holds.
str_x_training = list(map(str, x_training))

# Final training set: (text, annotation) pairs.
train_data = list(zip(str_x_training, list_cats_dic))

In [11]:
# Train the text categorizer for a fixed number of epochs while every other
# pipeline component is disabled.
n_iter = 10

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()

    for i in range(n_iter):
        losses = {}
        # Reshuffle the examples every epoch so the minibatches differ between
        # passes. Sampling a shuffled copy leaves `train_data` itself in its
        # original order, which keeps this cell safe to re-run.
        epoch_data = random.sample(train_data, len(train_data))
        # Batch size grows from 4 towards 32 by a factor of 1.001 per batch.
        batches = minibatch(epoch_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        # Accumulated textcat loss for this epoch.
        print(losses)
        # TODO: hold out a dev split and report precision / recall / F-score
        # per epoch (evaluating under `textcat.model.use_params(optimizer.averages)`);
        # no dev set or evaluate() helper exists in this notebook yet.

{'textcat': 14.801161134173526}
{'textcat': 8.810288310758327}
{'textcat': 5.672059410231191}
{'textcat': 3.6777939388219076}
{'textcat': 2.9034173682477036}
{'textcat': 2.0157200870486207}
{'textcat': 1.5973034939801913}
{'textcat': 1.3015691981009705}
{'textcat': 1.1956559094297956}
{'textcat': 1.4486953478119189}
