In [None]:
!pip install transformers

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical

# Show complete cell contents (e.g. whole review texts) when displaying DataFrames.
pd.options.display.max_colwidth = None

# Hugging Face checkpoint id: DistilBERT-base-uncased fine-tuned on SST-2.
# NOTE(review): the cells shown in this notebook load 'distilbert-base-uncased'
# directly and never read MODEL_NAME.
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

# Mini-batch size passed to the training calls.
BATCH_SIZE = 16

# Planned number of epochs. Author's note: more epochs may be worthwhile, since
# evaluation showed a large gap between loss and an accuracy of 1.0.
# NOTE(review): the fit() calls shown here hard-code their own epoch counts.
N_EPOCHS = 3



## Prepare data

In [None]:
# Load the training reviews from the tab-separated file.
loaded_df = pd.read_csv('drugsComTrain_raw.tsv', sep='\t')

# Work on the first 10,000 rows only. `.copy()` makes `df` an independent
# DataFrame, so the `sentiment` column added later is written to `df` itself
# instead of to a slice of `loaded_df` (which triggers pandas'
# SettingWithCopyWarning).
df = loaded_df[:10000].copy()


def get_sentiment(rating):
    """Bucket a numeric rating into a coarse sentiment label.

    Returns:
        'neg' for ratings below 4.0, 'neutral' for ratings from 4.0 through
        7.0 inclusive, and 'pos' for every other value.
    """
    if rating < 4.0:
        return 'neg'
    # Anything reaching this point is not below 4.0, so only the upper bound
    # of the neutral band still needs checking.
    if rating <= 7.0:
        return 'neutral'
    return 'pos'

# Derive a sentiment label for every row from its numeric rating.
df['sentiment'] = df['rating'].map(get_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# encode class names to integers
labelencoder = preprocessing.LabelEncoder()
labels = labelencoder.fit_transform(df['sentiment'])

cat_labels = to_categorical(labels)

train_texts, val_texts, train_labels, val_labels = train_test_split(df['review'], cat_labels, random_state=1)

train_texts = train_texts.to_list()
val_texts = val_texts.to_list()

In [None]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint.
# TODO (from the original author): switch to MODEL_NAME.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode the training texts first, then the validation texts, with
# truncation and padding enabled.
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts)
)

In [None]:
# Pair every tokenized training example with its one-hot label.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))

# The matching validation dataset is currently disabled:
# val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))

In [None]:
# Load test data
loaded_test_df = pd.read_csv('drugsComTest_raw.tsv', sep='\t')
test_df = loaded_test_df[:2000]

test_df['sentiment'] = test_df['rating'].map(lambda x: get_sentiment(x))

# enocde test labels
labelencoder = preprocessing.LabelEncoder()
labels = labelencoder.fit_transform(test_df['sentiment'])

cat_labels = to_categorical(labels)

X_test = test_df['review']
X_test = X_test.to_list()

y_test = cat_labels

# tokenzie and encode dataset
test_encodings = tokenizer(X_test, truncation=True, padding=True)
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

test_dataset




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


<TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(3,), dtype=tf.float32, name=None))>

## Build and evaluate model

In [None]:
def evaluate_model(model_obj, batch_size=16):
    """Print predictions and a per-class classification report for the test set.

    Reads the module-level ``test_dataset``, ``y_test`` (one-hot labels) and
    ``labelencoder`` (supplies the class names for the report).

    Args:
        model_obj: Model whose ``predict`` result exposes a ``.logits`` array.
        batch_size: Number of test examples per prediction batch. Defaults to
            16, the value that was previously hard-coded.
    """
    # Run the model over the batched test set.
    predictions = model_obj.predict(test_dataset.batch(batch_size))
    # Pick the highest-scoring class for every example.
    predicted_labels = np.argmax(predictions.logits, axis=1)
    print('model2_pred_labels:', predicted_labels)
    # Turn the one-hot ground truth back into integer class ids.
    true_labels = np.argmax(y_test, axis=1)
    print('y_test_labels:', true_labels)
    print(classification_report(true_labels, predicted_labels, target_names=labelencoder.classes_))


In [None]:
# DistilBERT with a 3-class sequence-classification head.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# The model outputs raw logits (evaluate_model reads `.logits`), so the loss
# must be told to apply softmax itself. With the default from_logits=False the
# logits would be treated as probabilities and the training signal is wrong.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['categorical_accuracy'])

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_39', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
# Print the per-layer parameter summary of the model.
model.summary()

Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
Total params: 66,955,779
Trainable params: 66,955,779
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Tokenize only the first 2000 training texts to get a smaller training subset.
train_encodings_small = tokenizer(train_texts[:2000], truncation=True, padding=True)
# The validation counterpart is currently disabled:
# val_encodings_small = tokenizer(val_texts[:2000], truncation=True, padding=True)


In [None]:
# Pair the 2000 tokenized examples with the first 2000 one-hot training labels.
train_dataset_small = tf.data.Dataset.from_tensor_slices((dict(train_encodings_small), train_labels[:2000]))

# The matching small validation dataset is currently disabled:
# val_dataset_small = tf.data.Dataset.from_tensor_slices((dict(val_encodings_small), val_labels[:2000]))

DistilBERT

In [None]:
# Batching is done on the tf.data pipeline itself. `batch_size` is deliberately
# not passed to fit(): Keras expects it to be omitted for Dataset inputs, which
# already yield batches.
model.fit(train_dataset_small.batch(BATCH_SIZE), epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f1250643610>

In [None]:
# Report test-set predictions and per-class metrics for the baseline model.
evaluate_model(model)

model2_pred_labels: [2 2 2 ... 2 2 2]
y_test_labels: [2 2 2 ... 2 0 2]
              precision    recall  f1-score   support

         neg       0.67      0.47      0.55       446
     neutral       0.00      0.00      0.00       348
         pos       0.69      0.96      0.80      1206

    accuracy                           0.69      2000
   macro avg       0.45      0.48      0.45      2000
weighted avg       0.56      0.69      0.61      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Models below were other variations

In [None]:
# Variation: 3-class DistilBERT classifier trained for a single epoch.
model_am2 = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# The model outputs raw logits (evaluate_model reads `.logits`), so the loss
# has to apply softmax itself; the default from_logits=False would treat the
# logits as probabilities.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model_am2.compile(optimizer=optimizer, loss=loss_fn, metrics=['categorical_accuracy'])

# The dataset is already batched, so `batch_size` is not passed to fit().
model_am2.fit(train_dataset_small.batch(BATCH_SIZE), epochs=1)
evaluate_model(model_am2)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use i

model2_pred_labels: [2 2 2 ... 2 2 2]
y_test_labels: [2 2 2 ... 2 0 2]
              precision    recall  f1-score   support

         neg       0.00      0.00      0.00       446
     neutral       0.00      0.00      0.00       348
         pos       0.60      1.00      0.75      1206

    accuracy                           0.60      2000
   macro avg       0.20      0.33      0.25      2000
weighted avg       0.36      0.60      0.45      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Variation: another 3-class DistilBERT classifier (trained in the next cell).
model2 = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Labels are one-hot encoded, hence CategoricalCrossentropy rather than the
# sparse variant. The model outputs raw logits (evaluate_model reads
# `.logits`), so from_logits=True is required for a correct loss.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model2.compile(optimizer=optimizer, loss=loss_fn, metrics=['categorical_accuracy'])

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_39', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
# Shuffle the 2000-example subset, then batch it. `batch_size` is not passed to
# fit() because the Dataset already yields batches.
# To train on the full training set instead, swap `train_dataset_small` for
# `train_dataset`.
model2.fit(train_dataset_small.shuffle(2000).batch(BATCH_SIZE), epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f8b827d0050>

In [None]:
# Report test-set predictions and per-class metrics for model2.
evaluate_model(model2)

model2_pred_labels: [1 0 0 ... 1 0 0]
y_test_labels: [2 2 2 ... 2 0 2]
              precision    recall  f1-score   support

         neg       0.18      0.62      0.28       446
     neutral       0.19      0.24      0.21       348
         pos       0.00      0.00      0.00      1206

    accuracy                           0.18      2000
   macro avg       0.12      0.28      0.16      2000
weighted avg       0.07      0.18      0.10      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Save model

In [None]:
# SAVE MODEL  ( https://huggingface.co/docs/transformers/main_classes/model )
model.save_pretrained('04-09-distilbert_model_epoch2_batch16')
!zip -r 04-09-distilbert_model_epoch2_batch16.zip 04-09-distilbert_model_epoch2_batch16

  adding: 04-09-distilbert_model_epoch2_batch16/ (stored 0%)
  adding: 04-09-distilbert_model_epoch2_batch16/config.json (deflated 48%)
  adding: 04-09-distilbert_model_epoch2_batch16/tf_model.h5 (deflated 8%)


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Disabled scratch check that writes a file to Drive and reads it back:
# with open('/content/drive/My Drive/NLP Models/foo.txt', 'w') as f:
#   f.write('Hello Google Drive!')
# !cat /content/drive/My\ Drive/NLP\ Models/foo.txt

# Copy the zipped model into the "NLP Models" folder on the mounted Drive.
!cp 04-09-distilbert_model_epoch2_batch16.zip "/content/drive/My Drive/NLP Models/04-09-distilbert_model_epoch2_batch16.zip"

Mounted at /content/drive


In [None]:
# Download to my local computer

# Trigger a browser download of the zipped model from the Colab runtime.
from google.colab import files
files.download('04-09-distilbert_model_epoch2_batch16.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Evaluate model with test data

In [None]:
# Save model2 with Keras `save` under the path '61_acc_distilbert.model'.
# NOTE(review): the name suggests 61% accuracy, but the evaluation output shown
# for model2 in this notebook reports 0.18 accuracy. Confirm which run this
# file name refers to.
model2.save('61_acc_distilbert.model')





INFO:tensorflow:Assets written to: 61_acc_distilbert.model/assets


INFO:tensorflow:Assets written to: 61_acc_distilbert.model/assets


In [None]:
# Variation: lower learning rate (2e-5) and up to 12 epochs.
model3 = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# Labels are one-hot encoded, hence CategoricalCrossentropy rather than the
# sparse variant. The model outputs raw logits (evaluate_model reads
# `.logits`), so from_logits=True is required for a correct loss.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model3.compile(optimizer=optimizer, loss=loss_fn, metrics=['categorical_accuracy'])

# Shuffle, then batch on the dataset; `batch_size` is not passed to fit()
# because the Dataset already yields batches.
model3.fit(train_dataset_small.shuffle(2000).batch(BATCH_SIZE), epochs=12)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_179', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use 

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12


KeyboardInterrupt: ignored

In [None]:
# Variation: learning rate 4e-5, 12 epochs.
model4 = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

optimizer = tf.keras.optimizers.Adam(learning_rate=4e-5)
# Labels are one-hot encoded, hence CategoricalCrossentropy rather than the
# sparse variant. The model outputs raw logits (evaluate_model reads
# `.logits`), so from_logits=True is required for a correct loss.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model4.compile(optimizer=optimizer, loss=loss_fn, metrics=['categorical_accuracy'])

# Shuffle, then batch on the dataset; `batch_size` is not passed to fit()
# because the Dataset already yields batches.
model4.fit(train_dataset_small.shuffle(2000).batch(BATCH_SIZE), epochs=12)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_199']
You should probably TRAIN this model on a down-stream task to be able to use 

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f6af5adf4d0>

In [None]:
# Variation: learning rate 4e-5, batches of 8, 3 epochs.
model5 = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

optimizer = tf.keras.optimizers.Adam(learning_rate=4e-5)
# Labels are one-hot encoded, hence CategoricalCrossentropy rather than the
# sparse variant. The model outputs raw logits (evaluate_model reads
# `.logits`), so from_logits=True is required for a correct loss.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model5.compile(optimizer=optimizer, loss=loss_fn, metrics=['categorical_accuracy'])

# First 2000 examples of the full training dataset.
train_sample = train_dataset.take(2000)

# Shuffle, then batch on the dataset; `batch_size` is not passed to fit()
# because the Dataset already yields batches of 8.
model5.fit(train_sample.shuffle(2000).batch(8), epochs=3)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_299', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use 

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f6a6addb310>

In [None]:
# Display the dataset object (and its element spec) obtained by taking the
# first 2000 training examples.
train_dataset.take(2000)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(3,), dtype=tf.float32, name=None))>

In [None]:
# NOTE(review): this prints only the repr of the iterator object, not the
# examples it yields. Iterate over it if the intent is to look at the data.
print(train_dataset.take(2000).as_numpy_iterator())

<tensorflow.python.data.ops.dataset_ops._NumpyIterator object at 0x7f6a6ab7c410>
