##### Copyright 2018 The TensorFlow Authors.

In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Text classification with an RNN

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/text/tutorials/text_classification_rnn"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/text_classification_rnn.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/text/blob/master/docs/tutorials/text_classification_rnn.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/text/docs/tutorials/text_classification_rnn.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

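This text classification tutorial trains a [recurrent neural network](https://developers.google.com/machine-learning/glossary/#recurrent_neural_network) to classify the intent of user utterances from the MultiWOZ 2.2 dialogue dataset. It is adapted from the TensorFlow tutorial that uses the [IMDB large movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/) for sentiment analysis.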

## Setup

In [2]:
import pandas as pd
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

tfds.disable_progress_bar()

Import `matplotlib` and create a helper function to plot graphs:

In [3]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  """Plot a training metric and its validation counterpart per epoch.

  Args:
    history: object whose ``history`` attribute maps metric names to
      per-epoch value sequences (as returned by ``model.fit``).
    metric: name of the training metric; the validation series is read
      from the key ``'val_' + metric``.
  """
  val_metric = 'val_' + metric
  series = history.history
  plt.plot(series[metric])
  plt.plot(series[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])



# Get Datasets and Dataframes

In [4]:
# Base URL of the MultiWOZ 2.2 dialogue shards on GitHub.
MULTIWOZ_BASE_URL = 'https://raw.githubusercontent.com/budzianowski/multiwoz/master/data/MultiWOZ_2.2'


def load_dialogue_shards(split, n_shards):
  """Download and concatenate the dialogue shards of one dataset split.

  Args:
    split: sub-directory of the dataset ('test' or 'train').
    n_shards: number of shards to fetch; files are named
      ``dialogues_001.json`` ... ``dialogues_<n_shards>.json`` (zero-padded
      to three digits).

  Returns:
    A single DataFrame with the rows of all shards, in shard order. The
    per-shard row index is kept as-is (no re-indexing).
  """
  shards = [
      pd.read_json('{}/{}/dialogues_{:03d}.json'.format(MULTIWOZ_BASE_URL, split, shard))
      for shard in range(1, n_shards + 1)
  ]
  return pd.concat(shards)


# 2 test shards (001-002) and 17 training shards (001-017).
testingDatasets = load_dialogue_shards('test', 2)
trainingDatasets = load_dialogue_shards('train', 17)
testingDatasets

KeyboardInterrupt: 

In [None]:
trainingDatasets

In [None]:
# Selects the relevant fields from the JSON. Only user utterances are kept (System turns are ignored).
def getDataframes(dataframe):
  """Build the classifier and extractor tables from raw dialogues.

  Args:
    dataframe: DataFrame with a ``'turns'`` column; each cell is a list of
      turn dicts with the keys ``'speaker'``, ``'utterance'`` and ``'frames'``.

  Returns:
    ``(classifier, extractor)``: two DataFrames with one row per USER turn.
    ``classifier`` has the columns ``entry`` and ``intent``; ``extractor``
    additionally has ``slot_values`` and ``requested_slots``. The values come
    from the last frame whose active intent is not ``'NONE'``; when there is
    none, ``intent`` is ``''`` and both slot fields are empty lists.
  """
  classifier_rows = []
  extractor_rows = []

  for dialogue_turns in dataframe['turns']:
    user_turns = (turn for turn in dialogue_turns if turn['speaker'] == 'USER')
    for user_turn in user_turns:
      utterance = user_turn['utterance']
      intent, slot_values, requested_slots = '', [], []
      for frame in user_turn['frames']:
        if frame['state']['active_intent'] == 'NONE':
          continue
        # A later active frame overrides an earlier one.
        state = frame['state']
        intent = state['active_intent']
        slot_values = state['slot_values']
        requested_slots = state['requested_slots']
      classifier_rows.append({'entry': utterance, 'intent': intent})
      extractor_rows.append({
          'entry': utterance,
          'intent': intent,
          'slot_values': slot_values,
          'requested_slots': requested_slots,
      })

  return pd.DataFrame.from_dict(classifier_rows), pd.DataFrame.from_dict(extractor_rows)

test_classifier, test_extractor = getDataframes(testingDatasets)
train_classifier, train_extractor = getDataframes(trainingDatasets)

test_classifier

In [None]:
trainingDatasets['turns']

# Classify Intents


## Recognize intents from other domains/services

In [None]:
# Intents that are not relevant to the project are marked as "Invalid".
def recognizeIntents(intent):
  """Return ``intent`` unchanged for the two restaurant intents, else ``'Invalid'``."""
  return intent if intent in ('find_restaurant', 'book_restaurant') else 'Invalid'

train_classifier['intent'] = train_classifier['intent'].apply(recognizeIntents)
test_classifier['intent'] = test_classifier['intent'].apply(recognizeIntents)
train_extractor['intent'] = train_extractor['intent'].apply(recognizeIntents)
test_extractor['intent'] = test_extractor['intent'].apply(recognizeIntents)
train_classifier['intent'].unique()


In [None]:
test_classifier['intent'].unique()

In [None]:
train_extractor['intent'].unique()

In [None]:
test_extractor['intent'].unique()

## Create categories for intents


In [None]:
# Turn the categorical intent strings into integer codes so they can be processed.
# The category order is spelled out so that code i always means intent_categories[i]
# (0 = 'Invalid', 1 = 'book_restaurant', 2 = 'find_restaurant'), regardless of
# which intents happen to occur in this split.
intent_categories = ['Invalid', 'book_restaurant', 'find_restaurant']
train_intent = pd.Categorical(train_classifier['intent'], categories=intent_categories)
# A value outside the list would silently become code -1 (for example when this
# cell is re-run on already-encoded labels), so fail loudly instead.
assert not train_intent.isna().any(), 'unexpected intent label in train_classifier'
train_classifier['intent'] = train_intent.codes
train_classifier['intent'].unique()

In [None]:
# Encode the test labels with an explicit category order. Letting pandas infer the
# categories from the test split alone would shift the codes whenever one of the
# intents is missing from it, making them disagree with the training labels.
intent_categories = ['Invalid', 'book_restaurant', 'find_restaurant']
test_intent = pd.Categorical(test_classifier['intent'], categories=intent_categories)
# A value outside the list would silently become code -1 (for example when this
# cell is re-run on already-encoded labels), so fail loudly instead.
assert not test_intent.isna().any(), 'unexpected intent label in test_classifier'
test_classifier['intent'] = test_intent.codes
test_classifier['intent'].unique()

## Setup input pipeline


In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_classifier['entry'], train_classifier['intent']))
test_dataset = tf.data.Dataset.from_tensor_slices((test_classifier['entry'], test_classifier['intent']))
for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [None]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# The dataset is now batched, so one element holds a batch of 64 entries.
for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

## Create the text encoder

The raw text in the dataset needs to be processed before it can be used in a model. The simplest way to process text for training is using the `TextVectorization` layer. This layer has many capabilities, but this tutorial sticks to the default behavior.

Create the layer, and pass the dataset's text to the layer's `.adapt` method:

In [None]:
# Upper bound on the vocabulary size learned by the vectorization layer.
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
# Take our training texts (dropping the labels) and build the vocabulary from them.
encoder.adapt(train_dataset.map(lambda text, label: text))

The `.adapt` method sets the layer's vocabulary. Here are the first 20 tokens. After the padding and unknown tokens they're sorted by frequency: 

In [None]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

Once the vocabulary is set, the layer can encode text into indices. The tensors of indices are 0-padded to the longest sequence in the batch (unless you set a fixed `output_sequence_length`):

In [None]:
# Take the first three entries of "example"
# (texts drawn from train_dataset)
# to serve as an illustration.
encoded_example = encoder(example)[:3].numpy()
encoded_example

In [None]:
# As can be seen, each number above stands for one word of these texts.
example.numpy()[:3]

With the default settings, the process is not completely reversible. There are two main reasons for that:

1. The default value for `preprocessing.TextVectorization`'s `standardize` argument is `"lower_and_strip_punctuation"`.
2. The limited vocabulary size and lack of character-based fallback results in some unknown tokens.

In [None]:
for n in range(3):
  print("Original: ", example[n].numpy())
  print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
  print()

## Create the model

![A drawing of the information flow in the model](https://github.com/tensorflow/text/blob/master/docs/tutorials/images/bidirectional.png?raw=1)

Above is a diagram of the model. 

1. This model can be built as a `tf.keras.Sequential`.

2. The first layer is the `encoder`, which converts the text to a sequence of token indices.

3. After the encoder is an embedding layer. An embedding layer stores one vector per word. When called, it converts the sequences of word indices to sequences of vectors. These vectors are trainable. After training (on enough data), words with similar meanings often have similar vectors.

  This index-lookup is much more efficient than the equivalent operation of passing a one-hot encoded vector through a `tf.keras.layers.Dense` layer.

4. A recurrent neural network (RNN) processes sequence input by iterating through the elements. RNNs pass the outputs from one timestep to their input on the next timestep.

  The `tf.keras.layers.Bidirectional` wrapper can also be used with an RNN layer. This propagates the input forward and backwards through the RNN layer and then concatenates the final output. 

  * The main advantage of a bidirectional RNN is that the signal from the beginning of the input doesn't need to be processed all the way through every timestep to affect the output.  

  * The main disadvantage of a bidirectional RNN is that you can't efficiently stream predictions as words are being added to the end.

5. After the RNN has converted the sequence to a single vector the two `layers.Dense` do some final processing, and convert from this vector representation to a 3-way softmax (one probability per intent class) as the classification output. 


The code to implement this is below:

In [None]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])
model.summary()

Please note that Keras sequential model is used here since all the layers in the model only have single input and produce single output. In case you want to use stateful RNN layer, you might want to build your model with Keras functional API or model subclassing so that you can retrieve and reuse the RNN layer states. Please check [Keras RNN guide](https://www.tensorflow.org/guide/keras/rnn#rnn_state_reuse) for more details.

The embedding layer [uses masking](https://www.tensorflow.org/guide/keras/masking_and_padding) to handle the varying sequence-lengths. All the layers after the `Embedding` support masking:

In [None]:
print([layer.supports_masking for layer in model.layers])

Compile the Keras model to configure the training process:

In [None]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

## Train the model

In [None]:
history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset,
                    validation_steps=30)

In [None]:
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

In [None]:
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')

In [None]:
predictions = model.predict(['i need a place to dine in the center thats expensive'])
# Class names indexed by integer label. The labels were produced with
# pd.Categorical(...).codes, whose inferred categories are sorted lexically:
# 'Invalid' (0) < 'book_restaurant' (1) < 'find_restaurant' (2).
c = np.array(['Invalid', 'book_restaurant', 'find_restaurant'])
print(predictions[0])
print(c[np.argmax(predictions[0])])

# Extract entities


In [None]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
#   "restaurant-area",
#   "restaurant-bookday",
#   "restaurant-bookpeople",
#   "restaurant-booktime",
#   "restaurant-food",
#   "restaurant-name",
#   "restaurant-pricerange",
test_extractor['slot_values']

In [None]:
train_extractor

In [None]:
# Drop everything that is not restaurant-related (rows whose intent is 'Invalid').
train_extractor = train_extractor[train_extractor['intent']!='Invalid']
test_extractor = test_extractor[test_extractor['intent']!='Invalid']
test_extractor

In [None]:
train_extractor['slot_values'][16] 

In [None]:
import math
import re
from collections import Counter

from nltk.util import ngrams

# Raw strings: "\d" in a plain string literal is an invalid escape sequence.
# Matches clock times such as "9:30" or "19:00".
timepat = re.compile(r"\d{1,2}[:]\d{1,2}")
# Matches decimal amounts such as "10.00" or "125.5".
pricepat = re.compile(r"\d{1,3}[.]\d{1,2}")

# (from, to) substring pairs applied by normalize(); empty, so nothing is replaced.
replacements = []

def insertSpace(token, text):
    """Surround every occurrence of ``token`` in ``text`` with spaces.

    An occurrence that sits between two digits (the character before the token
    and the character right after the token's first character are both 0-9,
    e.g. the "." in "10.50") is left untouched. No space is added before a
    token at the very start of the text or after a token at the very end.

    Args:
        token: the substring to isolate (e.g. "?", ",", "'s").
        text: the string to process.

    Returns:
        The text with spaces inserted around the token where they were missing.
    """
    sidx = 0
    while True:
        sidx = text.find(token, sidx)
        if sidx == -1:
            break
        # Only look at neighbours that exist: text[sidx - 1] with sidx == 0
        # would silently wrap around to the last character of the string.
        prev_char = text[sidx - 1] if sidx > 0 else ''
        next_char = text[sidx + 1] if sidx + 1 < len(text) else ''
        if re.match('[0-9]', prev_char) and re.match('[0-9]', next_char):
            sidx += 1
            continue
        if sidx > 0 and prev_char != ' ':
            text = text[:sidx] + ' ' + text[sidx:]
            sidx += 1
        # Insert the trailing space after the *whole* token, not after its
        # first character (matters for multi-character tokens such as "'s").
        end = sidx + len(token)
        if end < len(text) and text[end] != ' ':
            text = text[:end] + ' ' + text[end:]
        sidx += 1
    return text

def normalize(text):
    """Normalize a raw utterance and recover the time and price values in it.

    The text is lower-cased and trimmed, "b&b" is expanded, phone numbers and
    postcodes are collapsed into single tokens, punctuation is separated from
    words and adjacent digit-only tokens are merged. Times and prices are
    temporarily replaced by the placeholders ``[value_time]`` and
    ``[value_price]`` and put back at the end.

    Args:
        text: the utterance to normalize (a ``str``).

    Returns:
        A tuple ``(normalized_text, prices, times)``. ``prices`` and ``times``
        are the matches of ``pricepat`` and ``timepat`` found before the
        punctuation handling; a time shorter than five characters is prefixed
        with a single "0" (e.g. "9:30" -> "09:30").
    """
    # lower case every word
    text = text.lower()

    # strip white space at the front and at the end
    text = re.sub(r'^\s*|\s*$', '', text)

    # hotel domain
    text = re.sub(r"b&b", "bed and breakfast", text)
    text = re.sub(r"b and b", "bed and breakfast", text)

    # normalize phone number: join the three captured digit groups
    ms = re.findall(r'\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4,5})', text)
    if ms:
        sidx = 0
        for m in ms:
            sidx = text.find(m[0], sidx)
            if text[sidx - 1] == '(':
                sidx -= 1
            eidx = text.find(m[-1], sidx) + len(m[-1])
            text = text.replace(text[sidx:eidx], ''.join(m))

    # normalize postcode: drop the separators inside a matched postcode
    ms = re.findall(r'([a-z]{1}[\. ]?[a-z]{1}[\. ]?\d{1,2}[, ]+\d{1}[\. ]?[a-z]{1}[\. ]?[a-z]{1}|[a-z]{2}\d{2}[a-z]{2})',
                    text)
    if ms:
        sidx = 0
        for m in ms:
            sidx = text.find(m, sidx)
            eidx = sidx + len(m)
            text = text[:sidx] + re.sub(r'[,\. ]', '', m) + text[eidx:]

    # map typographic single quotes to a plain apostrophe
    text = re.sub(u"(\u2018|\u2019)", "'", text)

    value_time_list = re.findall(timepat, text)
    value_price_list = re.findall(pricepat, text)

    # replace time and price with placeholders so the punctuation handling
    # below cannot split them
    text = re.sub(timepat, ' [value_time] ', text)
    text = re.sub(pricepat, ' [value_price] ', text)

    # unify separators
    text = text.replace(';', ',')
    # NOTE(review): this pattern requires a "/" after the end of the string, so
    # it looks like it can never match; kept as in the original — confirm intent.
    text = re.sub(r'$\/', '', text)
    text = text.replace('/', ' and ')

    # replace other special characters
    text = text.replace('-', ' ')
    text = re.sub(r'[":<>@()]', '', text)

    # insert white space before and after tokens:
    for token in ['?', '.', ',', '!']:
        text = insertSpace(token, text)

    # insert white space for 's
    text = insertSpace("'s", text)

    # drop apostrophes at the text boundaries or next to white space
    text = re.sub(r"^'", '', text)
    text = re.sub(r"'$", '', text)
    text = re.sub(r"'\s", ' ', text)
    text = re.sub(r"\s'", ' ', text)
    for fromx, tox in replacements:
        # pad with spaces so replacements can match at the text boundaries
        text = ' ' + text + ' '
        text = text.replace(fromx, tox)[1:-1]

    # remove multiple spaces
    text = re.sub(' +', ' ', text)

    # concatenate numbers: merge a digit-only token into a preceding digit-only token
    tokens = text.split()
    i = 1
    while i < len(tokens):
        if re.match(r'^\d+$', tokens[i]) and \
                re.match(r'\d+$', tokens[i - 1]):
            tokens[i - 1] += tokens[i]
            del tokens[i]
        else:
            i += 1
    text = ' '.join(tokens)

    # left-pad short times with a single "0"
    value_time_list = [
        value if len(value) >= 5 else '0' + value
        for value in value_time_list
    ]

    # Put the real time and price values back in place of their placeholders,
    # in order of appearance. Iterators are used so that having fewer
    # placeholders than extracted values (e.g. "1:30.50", where the price
    # overlaps the time) cannot walk past the end of the token list and raise
    # IndexError.
    time_values = iter(value_time_list)
    price_values = iter(value_price_list)
    word_list = text.split()
    for position, word in enumerate(word_list):
        if word == '[value_time]':
            word_list[position] = next(time_values, word)
        elif word == '[value_price]':
            word_list[position] = next(price_values, word)

    return ' '.join(word_list), value_price_list, value_time_list

In [None]:
n = normalize("Yes, I do. I'll need it booked for the same day, same people, and we'd like to eat at 19:00, with a of 10.00.")


In [None]:
n

In [None]:
def has_compound_word(values): # keys of slot values that contain compound words (pricerange values, area, bookday...)
  """Return the keys whose value list has at least one entry containing a space.

  Args:
    values: mapping from slot name to an iterable of string values.

  Returns:
    A list of the matching keys without duplicates (order not guaranteed).
  """
  compound_keys = {
      key
      for key, val in values.items()
      for unit in val
      if ' ' in unit
  }
  return list(compound_keys)

In [None]:
# Pre-processing data: build a character-level vocabulary from the training entries.
all_text = ' '.join(train_extractor['entry']).lower()
# Sorted array of the distinct characters that occur in the corpus.
all_chars = np.unique(list(all_text))

# Lookup tables between characters and their integer ids.
char_to_int = {c: i for i, c in enumerate(all_chars)}
int_to_char = dict(enumerate(all_chars))

# total number of characters across all texts
n_chars = len(all_text)
# number of distinct characters
n_vocab = len(all_chars)

print("Todos os chars:", n_chars, "\n")
print("Vocabulario:", n_vocab, "\n")
print("Char to int\n", char_to_int, "\n")
print("Int to char\n", int_to_char)

In [None]:
from keras.utils import np_utils

In [None]:
# Prepares a dataset where the input is a sequence of 100 characters and the target is the next character.
seq_length = 100

dataX = []
dataY = []

for i in range(0, n_chars - seq_length, 1):
    # A window that slides one character at a time: it takes seq_length
    # characters (seq_in) and then the character that follows them (seq_out).
    seq_in = all_text[i:i + seq_length]
    seq_out = all_text[i + seq_length]
    # dataX collects the windows as lists of seq_length character ids
    dataX.append([char_to_int[char] for char in seq_in])
    # dataY collects the id of the character that follows each window
    dataY.append(char_to_int[seq_out])

n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

# X: one row of seq_length character ids per pattern, with a trailing axis of size 1.
# NOTE(review): an Embedding layer normally takes (batch, seq_length) input —
# confirm the trailing axis is wanted by the model that consumes X.
X = np.reshape(dataX, (n_patterns, seq_length, 1))

# One-hot encode the output variable. The public tf.keras.utils API is used
# instead of the private keras.utils.np_utils module, which recent Keras
# releases no longer ship.
y = tf.keras.utils.to_categorical(dataY)

In [None]:
print(len(X[0]))
print(len(y[0]))

In [None]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense


In [None]:
# Build the model the data is fed to.
# The Embedding layer maps each of the n_vocab token ids to a vector of size 100.
# A Dropout layer is used to help prevent overfitting.

embedding_dim =100
max_length =100
model = Sequential()
model.add(Embedding(n_vocab, embedding_dim, input_length=max_length))
model.add(LSTM(256))
model.add(Dropout(0.2))
# One softmax output per column of the one-hot target matrix y.
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

In [None]:
model.fit(X, y, epochs = 20, batch_size=128)

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): `data_formatted` is not defined in any visible cell of this
# notebook — it must be created before this cell can run on a fresh kernel.
# 70% of the data goes to training; the remaining 30% is split in half into
# validation and test sets.
train_data, valid_test_data = train_test_split(data_formatted,test_size=0.30, random_state=42)
valid_data, test_data = train_test_split(valid_test_data,test_size=0.50, random_state=42)

In [None]:
# training generator
def gen_train_series():
    """Yield the first two items of every example in ``train_data`` as a pair."""
    for example in train_data:
        first, second = example[0], example[1]
        yield first, second

# validation generator
def gen_valid_series():
    """Yield the first two items of every example in ``valid_data`` as a pair."""
    for example in valid_data:
        first, second = example[0], example[1]
        yield first, second

# test generator
def gen_test_series():
    """Yield the first two items of every example in ``test_data`` as a pair."""
    for example in test_data:
        first, second = example[0], example[1]
        yield first, second
  
# create Dataset objects for train, test and validation sets
# Each element is a pair of int32 tensors with unspecified shape.
series = tf.data.Dataset.from_generator(gen_train_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))
series_valid = tf.data.Dataset.from_generator(gen_valid_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))
series_test = tf.data.Dataset.from_generator(gen_test_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))

# NOTE: these re-bind the BATCH_SIZE / BUFFER_SIZE names used earlier in the
# notebook for the intent classifier (64 and 10000 there).
BATCH_SIZE = 128
BUFFER_SIZE=1000

# create padded batch series objects for train, test and validation sets
# Only the training series is shuffled; incomplete final batches are dropped.
ds_series_batch = series.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)
ds_series_batch_valid = series_valid.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)
ds_series_batch_test = series_test.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)

# print example batches
for input_example_batch, target_example_batch in ds_series_batch_valid.take(1):
    print(input_example_batch)
    print(target_example_batch)

In [None]:
# NOTE(review): the only visible definition of `vocab` is the intent
# classifier's TextVectorization vocabulary — confirm that is the vocabulary
# this tagging model is meant to use.
vocab_size = len(vocab)+1

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

# NOTE(review): `labels` is not defined in any visible cell of this notebook.
label_size = len(labels)  

# build LSTM model
def build_model(vocab_size,label_size, embedding_dim, rnn_units, batch_size):
      """Assemble the Embedding -> stateful LSTM -> Dense sequence model.

      Args:
        vocab_size: number of rows of the embedding table.
        label_size: number of units of the final Dense layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of LSTM units.
        batch_size: fixed batch dimension of the model input.

      Returns:
        An uncompiled ``tf.keras.Sequential`` model. The LSTM returns the
        full sequence, so the Dense layer is applied at every timestep, and
        the Dense layer has no activation.
      """
      embedding = tf.keras.layers.Embedding(
          vocab_size,
          embedding_dim,
          batch_input_shape=[batch_size, None],
          mask_zero=True)
      recurrent = tf.keras.layers.LSTM(
          rnn_units,
          return_sequences=True,
          stateful=True,
          recurrent_initializer='glorot_uniform')
      projection = tf.keras.layers.Dense(label_size)
      return tf.keras.Sequential([embedding, recurrent, projection])

# NOTE(review): the model is built with len(labels)+1 output units, while the
# module-level `label_size` variable (which is what gets printed afterwards)
# is len(labels) — confirm which of the two is intended.
# This also re-binds the name `model` used by earlier cells.
model = build_model(
      vocab_size = len(vocab)+1,
      label_size=len(labels)+1,
      embedding_dim=embedding_dim,
      rnn_units=rnn_units,
      batch_size=BATCH_SIZE)

model.summary()

In [None]:
print(vocab_size, label_size)

In [None]:
import os

# define loss function
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
    return per_example_loss

model.compile(optimizer='adam', loss=loss,metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
history = model.fit(ds_series_batch, epochs=10, validation_data=ds_series_batch_valid,callbacks=[checkpoint_callback])

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

preds = np.array([])
y_trues= np.array([])

# iterate through test set, make predictions based on trained model
for input_example_batch, target_example_batch in ds_series_batch_test:

  pred=model.predict_on_batch(input_example_batch)
  # most likely label id per position, flattened over batch and time
  pred_max=tf.argmax(tf.nn.softmax(pred),2).numpy().flatten()
  y_true=target_example_batch.numpy().flatten()

  preds=np.concatenate([preds,pred_max])
  y_trues=np.concatenate([y_trues,y_true])

# remove padding from evaluation: positions whose true label is 0 are dropped
keep = y_trues != 0
r_p = preds[keep]
r_t = y_trues[keep]

# print confusion matrix and classification report
# scikit-learn expects (y_true, y_pred); with the arguments swapped the matrix
# is transposed and precision and recall are exchanged in the report.
print(confusion_matrix(r_t, r_p))
print(classification_report(r_t, r_p))

In [None]:
import seaborn as sn

# Class names used for both axes, listed in label-id order.
# NOTE(review): this assumes exactly these nine label ids occur in r_t / r_p —
# otherwise the matrix shape will not match the label list.
slot_labels = ['0', 'O', 'restaurant-area', 'restaurant-bookday',
               'restaurant-bookpeople', 'restaurant-booktime', 'restaurant-food',
               'restaurant-name', 'restaurant-pricerange']

# scikit-learn expects (y_true, y_pred): rows are true labels, columns are predictions.
df_cm = pd.DataFrame(confusion_matrix(r_t, r_p), index=slot_labels, columns=slot_labels)
plt.figure(figsize = (10,7))
ax = sn.heatmap(df_cm, annot=True, fmt="d")
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
test_extractor

In [None]:
labels