In [1]:
import tensorflow as tf
# Look up the default GPU device once; TensorFlow returns an empty string
# when no GPU is visible to it.
gpu_device_name = tf.test.gpu_device_name()
if not gpu_device_name:
    print("Please install GPU version of TF")
else:
    print(f'Default GPU Device:{gpu_device_name}')

Default GPU Device:/device:GPU:0


In [2]:
#!pip install tensorflow_hub
#!pip install tensorflow_text
#!pip install tf-models-official

In [12]:
import numpy
import pandas as pd
# Load the tab-separated training set. The columns used below are
# `title`, `content` and `label`.
all_df = pd.read_csv('./kaggle_train_dataset.csv', sep='\t', encoding='utf-8')

# Fill missing values in BOTH text columns before concatenating them:
# `str + NaN` yields NaN, so a row with a missing title would otherwise end up
# with a float NaN as its text and break the string-input model.
all_df["content"] = all_df["content"].fillna("NoName")
all_df["title"] = all_df["title"].fillna("")
all_df["text"] = all_df.title + all_df.content

texts = all_df["text"].values
labels = all_df["label"].values

# Label name -> integer class index. Plain ints are used (rather than the
# digit strings "0".."7") so downstream consumers do not depend on implicit
# str -> int coercion.
mydict = {'informative': 0, 'happy': 1, 'angry': 2, 'depressing': 3,
          'odd': 4, 'boring': 5, 'warm': 6, 'worried': 7}

# Direct indexing (not .get) so an unexpected label fails loudly with KeyError.
final_label = [mydict[label_name] for label_name in labels]

In [13]:
from tensorflow.keras import utils

from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled data for the final evaluation (fixed seed so the
# split is reproducible).
x_train, x_test, y_train, y_test = train_test_split(texts, final_label, test_size=0.2, random_state=5)

# Pin the one-hot width to the number of known classes. Without `num_classes`,
# to_categorical infers the width from the largest label present in each split,
# so a split that happens to miss the highest class would produce a narrower
# matrix than the classifier head expects.
num_classes = len(mydict)
y_trainOneHot = utils.to_categorical(y_train, num_classes=num_classes)
y_testOneHot = utils.to_categorical(y_test, num_classes=num_classes)

In [14]:
# from sklearn.utils import class_weight
# class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
# class_weights = dict(enumerate(class_weights))
# print(class_weights)

In [15]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

# TF Hub handles for the Chinese BERT text-preprocessing model and the encoder
# it is paired with here.
tfhub_handle_preprocess = "https://hub.tensorflow.google.cn/tensorflow/bert_zh_preprocess/3"
tfhub_handle_encoder = "https://hub.tensorflow.google.cn/tensorflow/bert_zh_L-12_H-768_A-12/4"


def build_classifier_model(num_classes=8, dropout_rate=0.5):
    """Build a BERT-based text classifier that takes raw strings as input.

    The model chains the TF Hub preprocessing layer, the trainable BERT
    encoder, a dropout layer and a softmax dense layer applied to the
    encoder's ``pooled_output``.

    Args:
        num_classes: Number of output units of the softmax classifier.
            Defaults to 8, the number of labels used in this notebook.
        dropout_rate: Dropout rate applied to the pooled output before the
            classifier. Defaults to 0.5.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping a batch of strings to
        per-class probabilities.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    # trainable=True: the encoder weights are fine-tuned, not frozen.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    encoder_inputs = preprocessing_layer(text_input)
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    net = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(net)
    return tf.keras.Model(text_input, net)

model = build_classifier_model()
epochs = 10
batch_size = 32

# The learning-rate schedule must span the optimizer steps that will actually
# run. model.fit() is called with validation_split=0.1, so only the first 90%
# of x_train is trained on, and the last partial batch of each epoch still
# counts as one step (ceil division).
validation_split = 0.1
num_fit_samples = int(len(x_train) * (1 - validation_split))
steps_per_epoch = (num_fit_samples + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm the learning rate up over the first 10% of the training steps.
num_warmup_steps = int(0.1 * num_train_steps)

optimizer = optimization.create_optimizer(init_lr=3e-5,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

# Targets are one-hot and the classifier ends in a softmax, hence categorical
# cross-entropy on probabilities. The metric's default name,
# `categorical_accuracy`, is what the training callbacks monitor.
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=[tf.metrics.CategoricalAccuracy()])

In [16]:
import numpy as np
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
# Path of the checkpoint holding the best weights seen during training.
filepath = "weights.best.hdf5"
callbacks_list = [
    # Stop after 5 epochs without a validation-accuracy improvement. `mode` is
    # explicit so it agrees with the checkpoint callback, and the best weights
    # are put back into the model when training is stopped early.
    EarlyStopping(verbose=True, patience=5, monitor='val_categorical_accuracy',
                  mode='max', restore_best_weights=True),
    # Keep only the best epoch. The file is only ever read back with
    # model.load_weights(), so store weights only instead of a full model.
    ModelCheckpoint(filepath, monitor='val_categorical_accuracy', verbose=1,
                    save_best_only=True, save_weights_only=True, mode='max')
]

# The last 10% of the training split is held out for validation.
train_history = model.fit(x_train, y_trainOneHot,
                          batch_size=batch_size, epochs=epochs, verbose=1,
                          validation_split=0.1, callbacks=callbacks_list)

Epoch 1/10
Epoch 1: val_categorical_accuracy improved from -inf to 0.62053, saving model to weights.best.hdf5
Epoch 2/10
Epoch 2: val_categorical_accuracy improved from 0.62053 to 0.67324, saving model to weights.best.hdf5
Epoch 3/10
Epoch 3: val_categorical_accuracy improved from 0.67324 to 0.68744, saving model to weights.best.hdf5
Epoch 4/10
Epoch 4: val_categorical_accuracy improved from 0.68744 to 0.70761, saving model to weights.best.hdf5
Epoch 5/10
Epoch 5: val_categorical_accuracy improved from 0.70761 to 0.71219, saving model to weights.best.hdf5
Epoch 6/10
Epoch 6: val_categorical_accuracy did not improve from 0.71219
Epoch 7/10
Epoch 7: val_categorical_accuracy did not improve from 0.71219
Epoch 8/10
Epoch 8: val_categorical_accuracy did not improve from 0.71219
Epoch 9/10
Epoch 9: val_categorical_accuracy did not improve from 0.71219
Epoch 10/10
Epoch 10: val_categorical_accuracy improved from 0.71219 to 0.72090, saving model to weights.best.hdf5


In [17]:
import numpy as np
from sklearn import metrics
# Evaluate the same weights that are used for the submission (the best
# checkpoint), not whatever the last training epoch left in memory.
model.load_weights("weights.best.hdf5")

# argmax over the class axis turns probabilities into predicted class indices.
result = model.predict(x_test).argmax(axis=-1)
# Class ids are categorical: keep them as integers so the report lists classes
# as 0..7 instead of 0.0..7.0.
result = np.asarray(result, dtype=np.int64)
y_test = np.asarray(y_test, dtype=np.int64)

print("Macro-average: {0}".format(metrics.f1_score(y_test, result, average = 'macro')))
print("Micro-average: {0}".format(metrics.f1_score(y_test, result, average = 'micro')))
print(metrics.classification_report(y_test, result))
print(metrics.confusion_matrix(y_test, result))
print(metrics.accuracy_score(y_test, result))

Macro-average: 0.7045545490902929
Micro-average: 0.7244729605866178
              precision    recall  f1-score   support

         0.0       0.78      0.76      0.77      2373
         1.0       0.64      0.67      0.66      1047
         2.0       0.69      0.70      0.69       722
         3.0       0.77      0.79      0.78       393
         4.0       0.65      0.70      0.68       365
         5.0       0.72      0.65      0.68       334
         6.0       0.77      0.80      0.78       163
         7.0       0.57      0.62      0.60        58

    accuracy                           0.72      5455
   macro avg       0.70      0.71      0.70      5455
weighted avg       0.73      0.72      0.73      5455

[[1793  266  128   47   49   39   32   19]
 [ 221  704   46   22   24   23    5    2]
 [ 115   41  505    7   40   12    0    2]
 [  40   15   12  311    8    4    2    1]
 [  46   24   27    2  257    6    0    3]
 [  48   42   14    6    8  216    0    0]
 [  15    9    0    5  

### Generate submission file

In [9]:
# Load the tab-separated test set and build the model input the same way as
# for training: title immediately followed by content.
df_test = pd.read_csv('./kaggle_test_dataset.csv', sep='\t', encoding='utf-8')

# Fill missing values in BOTH columns: `str + NaN` yields NaN, which would put
# a float into the string features passed to model.predict().
df_test["content"] = df_test["content"].fillna("NoName")
df_test["title"] = df_test["title"].fillna("")
df_test["text"] = df_test.title + df_test.content
TEST_features = df_test["text"].values

In [10]:
# Restore the best checkpoint written during training, then score the test
# texts and keep the index of the most probable class for each row.
best_weights_path = "weights.best.hdf5"
model.load_weights(best_weights_path)
TEST_probabilities = model.predict(TEST_features)
TEST_predict_result = TEST_probabilities.argmax(axis=-1)



In [11]:
# One "<row index>,<predicted class index>" line per test row, in input order.
submission_rows = [
    f'{row_id},{predicted_label}\n'
    for row_id, predicted_label in enumerate(TEST_predict_result)
]

with open("./kaggle_submission.csv", "w", encoding="utf-8") as submission_file:
    submission_file.write('Id,Label\n')  # header row
    submission_file.writelines(submission_rows)