In [2]:
import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
# Show full cell contents when displaying DataFrames (no column-width truncation).
pd.options.display.max_colwidth = None

# Path of the pretrained checkpoint used for both the tokenizer and the model.
MODEL_NAME = '../input/distilbertbaseuncasedfinetunedsst2english/distilbert-base-uncased-finetuned-sst-2-english'
# Training hyper-parameters.
BATCH_SIZE, N_EPOCHS = 32, 1

In [3]:
# Read the whole CSV file into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer="input-file.csv")

In [4]:
# Distinct discourse types, in order of first appearance in the data.
discourse_labels = train_df['discourse_type'].unique().tolist()

# tgt_num: integer id -> label text; tgt_txt: label text -> integer id.
tgt_num = {label_id: label for label_id, label in enumerate(discourse_labels)}
tgt_txt = {label: label_id for label_id, label in enumerate(discourse_labels)}

for mapping in (tgt_num, tgt_txt):
    print(mapping)

In [5]:
# Translate each row's discourse type into its integer id and store it
# as a new 'targets' column on the same DataFrame.
target_ids = train_df['discourse_type'].map(tgt_txt)
train_df['targets'] = target_ids
print(train_df.head(1))

In [6]:
# Draw a reproducible 80% sample of the rows for training; every row whose
# index label was not sampled becomes the test split.
training_data = train_df.sample(frac=0.8, random_state=25)
held_out_rows = ~train_df.index.isin(training_data.index)
testing_data = train_df[held_out_rows]

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

In [7]:
# Split each partition into its input text (X) and integer target (Y).
X_train, Y_train = training_data["discourse_text"], training_data["targets"]
X_test, Y_test = testing_data["discourse_text"], testing_data["targets"]


In [8]:
# Display the shapes of the four series as one tuple.
(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

In [9]:
#X_train = X_train.apply(lambda x: str(x[0], 'utf-8'))
#X_test = X_test.apply(lambda x:  str(x[0], 'utf-8'))

In [10]:
# Load the tokenizer stored alongside the pretrained checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Both splits are encoded with identical settings: truncation enabled and
# padding enabled.
encode_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(X_train.values.tolist(), **encode_options)
test_encodings = tokenizer(X_test.values.tolist(), **encode_options)

In [11]:
# Sanity check: show the first training text next to its encoding.
# .iloc[0] selects the first row by position, so it lines up with index 0 of
# the encodings; slicing with [:1] would interpolate a one-row Series repr
# (index label, name and dtype) instead of the text itself.
print(f'1st text: \'{X_train.iloc[0]}\'')
print(f'Input ids: {train_encodings["input_ids"][0]}')
print(f'Attention mask: {train_encodings["attention_mask"][0]}')

In [12]:
def _as_labelled_dataset(encodings, labels):
    """Build a tf.data.Dataset of (feature dict, label) pairs from encodings and a label Series."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), list(labels.values)))


train_dataset = _as_labelled_dataset(train_encodings, Y_train)
test_dataset = _as_labelled_dataset(test_encodings, Y_test)

In [13]:
# Judging by its name, the checkpoint was fine-tuned on SST-2 (binary
# sentiment), so its classification head does not match the number of
# discourse types. Size the head to one output per label id in tgt_num;
# ignore_mismatched_sizes lets the remaining pretrained weights load while the
# differently-shaped head is freshly initialised.
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(tgt_num),
    ignore_mismatched_sizes=True,
)
# choose the optimizer
optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5)
# define the loss function: targets are integer label ids and the loss is
# computed from raw logits (from_logits=True)
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# compile the model
model.compile(optimizer=optimizerr,
              loss=losss,
              metrics=['accuracy'])
# train the model; the dataset is batched explicitly, so batch_size is not
# passed to fit() (Keras expects it to be omitted for tf.data.Dataset inputs)
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE),
          epochs=N_EPOCHS)

In [None]:
# model evaluation on the test set
# Aggregated metrics do not depend on example order, so the test set is not
# shuffled; the dataset is batched explicitly, so batch_size is not passed to
# evaluate() (Keras expects it to be omitted for tf.data.Dataset inputs).
model.evaluate(test_dataset.batch(BATCH_SIZE),
               return_dict=True)


In [None]:
def predict_proba(text_list, model, tokenizer, max_length=None):
    """Return class probabilities for each text in ``text_list``.

    Args:
        text_list: Texts to classify. A single string is treated as a
            one-element list; any other iterable is materialised as a list.
        model: Object whose ``predict(dataset)`` result exposes a ``logits``
            attribute.
        tokenizer: Callable accepting the texts plus ``max_length``,
            ``truncation`` and ``padding`` keyword arguments.
        max_length: Optional cap on the number of tokens per text, passed to
            the tokenizer. The default ``None`` leaves the limit to the
            tokenizer, matching how the training texts were encoded (they
            were tokenized without an explicit ``max_length``).

    Returns:
        Array with one row per input text, holding the softmax of the logits
        taken along axis 1.
    """
    # A bare string would otherwise be handled as a single un-batched example.
    if isinstance(text_list, str):
        text_list = [text_list]
    else:
        text_list = list(text_list)

    # tokenize the text
    encodings = tokenizer(text_list,
                          max_length=max_length,
                          truncation=True,
                          padding=True)
    # transform to tf.Dataset (features only, no labels)
    dataset = tf.data.Dataset.from_tensor_slices(dict(encodings))
    # predict
    preds = model.predict(dataset.batch(1)).logits

    # transform logits to an array of probabilities
    res = tf.nn.softmax(preds, axis=1).numpy()

    return res
