<a href="https://colab.research.google.com/github/ameasure/colab_tutorials/blob/master/Product_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries
* tensorflow 2
* transformers

In [0]:
# upgrade to tensorflow 2
pip install --upgrade tensorflow-gpu

Requirement already up-to-date: tensorflow-gpu in /usr/local/lib/python3.6/dist-packages (2.0.0)


In [0]:
# install huggingface transformers library
pip install transformers



# Read in Data
Read the spreadsheet and randomly split its rows into a training set (80%) and a validation set (the remaining 20%).

In [0]:
import pandas as pd

# Load the source spreadsheet into a single DataFrame.
df = pd.read_excel(r'/content/Stats Poland ECOICOP data translated to English and French.xlsx')

# Draw 80% of the rows for training. The fixed random_state makes the split
# identical on every run, so results stay comparable after a kernel restart.
df_train = df.sample(frac=.8, random_state=42)
# Validation set: every row whose index label was not drawn for training.
df_valid = df[~df.index.isin(df_train.index)]

In [0]:
# Show the row counts of the two splits: training first, validation second,
# one number per line.
print(len(df_train), len(df_valid), sep='\n')

13679
3420


# Load pretrained Tokenizer and Model

In [0]:
import tensorflow as tf
import tensorflow_datasets
from transformers import *

# Fetch the pretrained 'distilbert-base-uncased' vocabulary and weights.
# `tokenizer` is used further down to encode text into token ids; `pt_model`
# is the TensorFlow sequence-classification model whose first layer is reused
# by the custom network defined later in the notebook.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
)
pt_model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
)

# Tokenize and Pad Data

In [0]:
import numpy as np

def pad_sequences(texts, max_seq_len):
  """Encode texts into a fixed-width matrix of token ids.

  Each text is encoded with the module-level ``tokenizer`` (special tokens
  included), then right-padded with ``tokenizer.pad_token_id`` or truncated so
  that every row holds exactly ``max_seq_len`` ids. When a row had to be
  truncated, its final id is overwritten with ``tokenizer.sep_token_id`` so
  the sequence still ends with a separator.

  Args:
    texts: iterable of strings to encode (it is iterated exactly once).
    max_seq_len: number of token ids in every output row; expected to be a
      positive integer.

  Returns:
    Integer array of shape ``(number of texts, max_seq_len)``. An empty
    ``texts`` yields an array of shape ``(0, max_seq_len)`` instead of raising.

  Side effects:
    Prints the longest token length seen before padding/truncation
    (0 when ``texts`` is empty).
  """
  token_lengths = []
  rows = []
  # Ids that may legitimately end a row that was NOT truncated.
  closing_ids = (tokenizer.pad_token_id, tokenizer.sep_token_id)
  for text in texts:
    tokens = tokenizer.encode(text, add_special_tokens=True)
    token_lengths.append(len(tokens))
    # Pad in one step (a non-positive count adds nothing), then cut to width.
    tokens = tokens + [tokenizer.pad_token_id] * (max_seq_len - len(tokens))
    tokens = tokens[:max_seq_len]
    # A row that ends in neither padding nor a separator lost its tail to
    # truncation: replace its last id with the separator.
    if tokens and tokens[-1] not in closing_ids:
      tokens[-1] = tokenizer.sep_token_id
    assert len(tokens) == max_seq_len, f'{len(tokens)} != {max_seq_len}, {tokens}'
    rows.append(np.array(tokens, dtype=int))
  # max(..., default=0) keeps the report working when no text was given.
  print(f'max token length: {max(token_lengths, default=0)}')
  if not rows:
    # np.stack rejects an empty list; return a correctly shaped empty matrix.
    return np.zeros((0, max_seq_len), dtype=int)
  return np.stack(rows)

# Encode the 'Desc_E' column of both splits to the same fixed width of 40
# token ids (training split first, then validation). The recorded run reported
# a longest encoding of 36 tokens, so 40 leaves some headroom.
X_train, X_valid = [
    pad_sequences(split_df['Desc_E'], max_seq_len=40)
    for split_df in (df_train, df_valid)
]

max token length: 36
max token length: 36


# Convert Codes to Indexes

In [0]:
from sklearn.preprocessing import LabelEncoder

# Map every distinct value of 'Code_E' to an integer class index. The encoder
# is fitted on the full table, so codes that occur only in the validation
# split still receive an index.
le = LabelEncoder().fit(df['Code_E'])
y_train, y_valid = (
    le.transform(split_df['Code_E']) for split_df in (df_train, df_valid)
)

# Specify Model

In [0]:
import tensorflow as tf
from tensorflow.python.keras.utils.data_utils import Sequence
from tensorflow.python.keras.layers import Layer

# Training ingredients handed to model.compile() further down:
# optimizer, loss and the accuracy metric.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-8,
    clipnorm=1.0,
)
# The labels are integer class indexes, hence the "sparse" loss and metric.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

# Custom classifier assembled from the pretrained encoder plus a small head.
class MyNet(tf.keras.Model):
  """Pretrained encoder followed by average pooling, dropout and softmax.

  The encoder is the first layer of the module-level ``pt_model``. The first
  element of its output is averaged by ``GlobalAveragePooling1D``, regularised
  with dropout and projected to ``output_dim`` softmax probabilities.
  """

  def __init__(self, output_dim, **kwargs):
    """Create the layers.

    Args:
      output_dim: number of units (classes) in the final softmax layer.
      **kwargs: forwarded unchanged to ``tf.keras.Model.__init__``.
    """
    super().__init__(**kwargs)
    # Reuse the already-loaded pretrained layer rather than building a new one.
    self.model = pt_model.layers[0]
    self.mean = tf.keras.layers.GlobalAveragePooling1D()
    self.do = tf.keras.layers.Dropout(rate=0.5)
    self.dense = tf.keras.layers.Dense(output_dim, activation='softmax')

  def call(self, x):
    """Run the forward pass.

    Args:
      x: input passed straight to the pretrained layer (in this notebook, the
        padded token-id matrix).

    Returns:
      Output of the softmax layer, with ``output_dim`` values per example.
    """
    encoder_outputs = self.model(x)
    # Element 0 of the encoder output is what gets pooled
    # (presumably the per-token hidden states; verify against the library).
    pooled = self.mean(encoder_outputs[0])
    dropped = self.do(pooled)
    return self.dense(dropped)

# One output unit per class known to the label encoder.
model = MyNet(output_dim=len(le.classes_))
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

# Train the Model

In [0]:
# First training pass with tf.keras.Model.fit(): 10 epochs in batches of 256,
# evaluating on the validation split after every epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=10,
    validation_data=(X_valid, y_valid),
)

Train on 13679 samples, validate on 3420 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
# Second training pass on the same model object, so it continues from the
# weights left by the preceding fit() call. Batch size is reduced to 128.
# `history` is rebound here and now holds only this pass's record.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_valid, y_valid),
)

Train on 13679 samples, validate on 3420 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
# Third training pass: another 10 epochs at batch size 128 on the same model
# object, again continuing from its current weights.
# `history` is rebound here and now holds only this pass's record.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_valid, y_valid),
)

Train on 13679 samples, validate on 3420 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
