<a href="https://colab.research.google.com/github/carlosinator/cil-sentiment/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Setup

In [1]:
from google.colab import auth
auth.authenticate_user()

In [2]:
%%bash
pip3 install transformers emoji==0.6.0 keras_nlp

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.2/7.2 MB 85.4 MB/s eta 0:00:00
Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 51.0/51.0 kB 6.6 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting keras_nlp
  Downloading keras_nlp-0.6.0-py3-none-any.whl (576 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 576.5/576.5 kB 57.4 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 30.2 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 83.9 M

In [3]:
!git clone https://github.com/carlosinator/cil-sentiment.git

Cloning into 'cil-sentiment'...
remote: Enumerating objects: 95, done.[K
remote: Counting objects: 100% (95/95), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 95 (delta 42), reused 17 (delta 4), pack-reused 0[K
Unpacking objects: 100% (95/95), 130.67 KiB | 2.72 MiB/s, done.


In [4]:
import tensorflow as tf
import tensorflow_probability as tfp
import keras_nlp
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
import transformers
from transformers import AutoTokenizer, TFAutoModel, AutoConfig, TFAutoModelForSequenceClassification
import matplotlib.pyplot as plt
import pickle
import re
import subprocess as sp
import os
from threading import Thread , Timer
import sched, time

import sys
sys.path.append("./cil-sentiment/models")
sys.path.append("./cil-sentiment/")
from gru_models import GRUModel, VGRUModel
import utils

# reproducibility
transformers.set_seed(0) # sets the seed in random, numpy, and tf

Using TensorFlow backend


In [5]:
!gsutil cp "gs://cil_2023/train_pos_full_preprocessed_without_duplicates.txt" .
!gsutil cp "gs://cil_2023/train_neg_full_preprocessed_without_duplicates.txt" .

model_name = "vinai/bertweet-base"
filename_train_pos = "train_pos_full_preprocessed_without_duplicates.txt"
filename_train_neg = "train_neg_full_preprocessed_without_duplicates.txt"

# tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# tf.config.experimental_connect_to_cluster(tpu)
# tf.tpu.experimental.initialize_tpu_system(tpu)
# tpu_strategy = tf.distribute.TPUStrategy(tpu)

Copying gs://cil_2023/train_pos_full_preprocessed_without_duplicates.txt...
\ [1 files][ 74.6 MiB/ 74.6 MiB]                                                
Operation completed over 1 objects/74.6 MiB.                                     
Copying gs://cil_2023/train_neg_full_preprocessed_without_duplicates.txt...
|
Operation completed over 1 objects/91.6 MiB.                                     


In [6]:
# read dataset
dataset_pos_pd = pd.read_table(filename_train_pos, sep='\r\n', header=None, names=['text'])
dataset_neg_pd = pd.read_table(filename_train_neg, sep='\r\n', header=None, names=['text'])
dataset_pos_pd['label'] = 0
dataset_neg_pd['label'] = 1
dataset_pd = pd.concat([dataset_pos_pd, dataset_neg_pd])

# shuffle
dataset_pd = dataset_pd.sample(frac=1, random_state=0).reset_index(drop=True)

# tokenize data set
tokenizer = AutoTokenizer.from_pretrained(model_name)
texts = tokenizer.batch_encode_plus(dataset_pd['text'].tolist(),
                                    padding=True, truncation=True,
                                    return_tensors='tf')

dataset = tf.data.Dataset.from_tensor_slices((dict(texts), dataset_pd['label']))

# split training / validation
batch_size = 1024 # * tpu_strategy.num_replicas_in_sync
AUTOTUNE = tf.data.experimental.AUTOTUNE

val_data_size = int(0.1 * len(dataset_pd.index))
test_data_size = int(0.1 * len(dataset_pd.index))
train_data_size = len(dataset_pd.index) - val_data_size - test_data_size
val_ds = dataset.take(val_data_size).batch(batch_size, drop_remainder=True)
test_ds = dataset.take(test_data_size).batch(batch_size, drop_remainder=True)
train_ds = dataset.skip(val_data_size).batch(batch_size, drop_remainder=True)
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

  dataset_pos_pd = pd.read_table(filename_train_pos, sep='\r\n', header=None, names=['text'])
  dataset_neg_pd = pd.read_table(filename_train_neg, sep='\r\n', header=None, names=['text'])


Downloading (…)lve/main/config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Train model for inference

In [7]:
import subprocess as sp
import os
from threading import Thread , Timer
import sched, time

gpu_hist = {
    "mib" : [],
    "percent" : [],
    "counter" : 0,
}

def get_gpu_memory():

    out = !nvidia-smi --query-gpu=memory.used,utilization.gpu --format=csv

    float_pattern = r'\d+\.\d+|\d+'
    numbers = re.findall(float_pattern, out[1])

    # Extract the MiB value and the percentage
    mib_val = float(numbers[0])
    perc = float(numbers[1])

    gpu_hist["counter"] += 1

    # print(mib_val, perc)

    gpu_hist["mib"].append(mib_val)
    gpu_hist["percent"].append(perc)

    return


def track_gpu_mem(interval=5.0, dictname="gpu_hist.pkl"):
    """
        This function calls itself every 5 secs and print the gpu_memory.
    """
    thd = Timer(interval, track_gpu_mem)
    thd.start()
    get_gpu_memory()

    gpu_hist["interval"] = interval

    with open(dictname, 'wb') as f:
      pickle.dump(gpu_hist, f)

    return thd

In [None]:
USE_MODEL = "read"
LEARNING_RATE = 1e-3
EPOCHS = 1
model = utils.get_model(model_name, LEARNING_RATE, USE_MODEL)
history = {}

run_name = "inference_" + USE_MODEL + "_fullmodel"
hd_name = run_name + "_dict.pkl"
gpu_hist_name = "gpu_hist_" + USE_MODEL + ".pkl"

In [None]:
callback = tf.keras.callbacks.EarlyStopping(patience=3) # early stopping def
gpu_mem_proc = track_gpu_mem(10.0, dictname=gpu_hist_name) # initialize gpu tracking

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    verbose=1,
    callbacks=[callback]
    )

gpu_mem_proc.join()

In [None]:
# model.save(run_name)
# with open(hd_name, 'wb') as f:
#     pickle.dump(history, f)

!gsutil cp -r {run_name + "/"} "gs://cil_2023/models/"
!gsutil cp {hd_name} "gs://cil_2023/models/"
!gsutil cp {gpu_hist_name} "gs://cil_2023/models/"

# Retrieve model for inference

In [None]:
# uncomment if trained_model is not on local machine
# !gsutil cp -r {"gs://cil_2023/models/" + run_name} .
trained_model = tf.keras.models.load_model(run_name)
trained_model.summary()

# Test accuracy

In [None]:
preds = trained_model.predict(test_ds)

if USE_MODEL == "basemodel":
  preds = tf.keras.layers.Softmax()(preds)