V1 uses accuracy as the performance metric.

# Imports

In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Tue Nov 29 06:02:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    43W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers --quiet
!pip install sentencepiece --quiet

[K     |████████████████████████████████| 5.5 MB 4.8 MB/s 
[K     |████████████████████████████████| 182 kB 79.1 MB/s 
[K     |████████████████████████████████| 7.6 MB 67.5 MB/s 
[K     |████████████████████████████████| 1.3 MB 4.7 MB/s 
[?25h

In [None]:
import os
import sys
import time
import string
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc

from google.colab import drive

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model, load_model
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, \
  LearningRateScheduler, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError
from transformers import BertTokenizer, TFBertModel, AutoModel, AutoTokenizer
from transformers import DebertaConfig, DebertaTokenizer, TFDebertaModel
import transformers

## Global Variables and Google Drive Connection

In [None]:
# --- Run mode ---
# When True, only a small slice of the data is used so the notebook runs quickly.
TESTING = False

# --- Reproducibility / identity ---
RANDOM_STATE = 42
USER = 'Kurt'  # selects the Google Drive layout in the mount cell

# --- Data ---
SEQUENCE_LENGTH = 512  # tokens per essay after truncation / padding
RUBRIC_COLS = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]

# --- Training ---
BATCH_SIZE = 1
EPOCHS = 4
PATIENCE = 2
LEARNING_RATE = 1e-4
LEARNING_RATE_DECAY = 0.1

# Uncomment the two lines below to repeat calculations exactly on the same
# hardware. The model will then run more slowly (approx. 1/3 speed).
# tf.keras.utils.set_random_seed(RANDOM_STATE)
# tf.config.experimental.enable_op_determinism()

In [None]:
# Mount Google Drive, then pick the project location for the current USER.
drive.mount('/content/drive')

# USER -> (drive root, project folder). Each collaborator keeps the shared
# project under a different path.
_drive_locations = {
  'Alex': ("/content/drive/MyDrive/w266/", "Final_Project/"),
  'Kurt': ("/content/drive/My Drive/266/", "Final Project/"),
  'Tom': ("/content/drive/My Drive/UC Berkeley/W266 Natural Language Processing/", "Final Project/"),
}

if USER not in _drive_locations:
  raise Exception("User unrecognized, must connect to shared drive")

root_dir, project_folder = _drive_locations[USER]

def create_and_set_working_directory(project_folder, root=None):
  """Make ``root + project_folder`` the working directory, creating it if needed.

  Args:
    project_folder: Folder name (may contain several path components) that is
      appended to ``root`` by plain string concatenation.
    root: Directory prefix, expected to end with a path separator. Defaults to
      the notebook-level ``root_dir``.

  Side effects:
    Creates the folder (including missing parents) when it does not exist,
    prints a notice in that case, and changes the process working directory.
  """
  if root is None:
    root = root_dir
  target = root + project_folder

  # makedirs (not mkdir) so a missing parent folder does not raise.
  if not os.path.isdir(target):
    os.makedirs(target, exist_ok=True)
    print(target + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(target)

  # create a test file to make sure it shows up in the right place
  # to test if all is working, you can uncomment these two lines below--it should write a file to the shared drive
  # !touch 'new_file_in_working_directory.txt'
  # print('\nYour working directory was changed to ' + root_dir + project_folder + \
  #       "\n\nAn empty text file was created there. You can also run !pwd to confirm the current working directory." )

# Switch to the project folder so the relative `data/...` paths used below resolve.
# NOTE(review): this bypasses create_and_set_working_directory() defined above,
# so it raises if the folder does not exist yet.
os.chdir(root_dir + project_folder)

Mounted at /content/drive


# Load Data and Create Data Sets

In [None]:
# Features (X_*) and rubric scores (y_*) are stored in separate CSVs per split.
X_train = pd.read_csv('data/processed/X_train.csv')
y_train = pd.read_csv('data/processed/y_train.csv')
X_val = pd.read_csv('data/processed/X_val.csv')
y_val = pd.read_csv('data/processed/y_val.csv')

# The test split is not loaded in this notebook.
# X_test = pd.read_csv('data/processed/X_test.csv')
# y_test = pd.read_csv('data/processed/y_test.csv')

# Join features with labels on `essay_index`, one frame per split. Both sides
# carry a `vocabulary` column, so the merge suffixes them; the label-side copy
# (`vocabulary_y`) is exposed under the plain rubric name.
_label_rename = {'vocabulary_y': 'vocabulary'}
train_data_from_file = (
    pd.merge(X_train, y_train, on='essay_index', how='outer')
    .rename(columns=_label_rename)
)
validation_data_from_file = (
    pd.merge(X_val, y_val, on='essay_index', how='outer')
    .rename(columns=_label_rename)
)


In [None]:
# Inspect the merged schema: feature columns, suffixed duplicates and rubric labels.
train_data_from_file.columns

Index(['Unnamed: 0_x', 'essay_index', 'text_id_x', 'full_text', 'spacy_doc',
       'paragraph_count', 'punctuation_count', 'character_count',
       'contraction_count', 'SCONJ', 'NUM', 'ADV', 'PRON', 'DET', 'AUX', 'ADJ',
       'SYM', 'CCONJ', 'PART', 'INTJ', 'PROPN', 'X', 'NOUN', 'SPACE', 'VERB',
       'PUNCT', 'ADP', 'syllable_count', 'words', 'word_count', 'title_count',
       'mean_word_length', 'variance_word_length', 'vocabulary_x', 'stopwords',
       'stopword_count', 'sentence_count', 'mean_sentence_length',
       'variance_sentence_length', 'polarity', 'subjectivity', 'fk_score',
       'Unnamed: 0_y', 'text_id_y', 'cohesion', 'syntax', 'vocabulary',
       'phraseology', 'grammar', 'conventions'],
      dtype='object')

In [None]:
# Other useful helper functions
def scores_to_ints(x):
  """Map a rubric score to its class index: 1.0 -> 0, 1.5 -> 1, ..., 5.0 -> 8.

  The result is still a float; callers cast it with ``astype(int)``.
  """
  offset_from_lowest = x - 1.0
  return offset_from_lowest * 2

def ints_to_scores(x):
  """Map a class index back to a rubric score: 0 -> 1.0, 1 -> 1.5, ..., 8 -> 5.0.

  ``x`` is truncated with ``int()`` before the conversion.
  """
  class_index = int(x)
  return 1.0 + class_index / 2.0

# Element-wise version for NumPy arrays of class indices.
ints_to_scores_vectorized = np.vectorize(ints_to_scores)

def select_data(train_data, val_data, rubric_col):
    """Reduce both splits to the essay text plus one integer-encoded rubric.

    Args:
        train_data: DataFrame with a ``full_text`` column and ``rubric_col``.
        val_data: DataFrame with the same columns as ``train_data``.
        rubric_col: Name of the rubric column to keep as the target.

    Returns:
        ``(train_subset, val_subset)``: new DataFrames with the columns
        ``['full_text', rubric_col]``, where the rubric has been converted with
        ``scores_to_ints`` and cast to ``int``. The input frames are not modified.
    """
    def _text_and_label(frame):
        # Explicit copy: assigning into a column-slice of the caller's frame is
        # what triggers pandas' SettingWithCopyWarning.
        subset = frame[['full_text', rubric_col]].copy()
        subset[rubric_col] = subset[rubric_col].apply(scores_to_ints).astype(int)
        return subset

    return _text_and_label(train_data), _text_and_label(val_data)

In [None]:
# In TESTING mode keep only a few rows so a full pass finishes quickly;
# otherwise keep the row counts of the full splits.
if TESTING:
  print("=========================================\nIN TESTING MODE\n=========================================")

train_size, val_size = (250, 3) if TESTING else (2347, 782)

train_data_from_file = train_data_from_file.iloc[:train_size]
validation_data_from_file = validation_data_from_file.iloc[:val_size]

for size_message, frame in (
    ("input_data size is: {}", train_data_from_file),
    ("validation data size is: {}", validation_data_from_file),
):
  print(size_message.format(frame.shape))

input_data size is: (2347, 50)
validation data size is: (782, 50)


## Tokenize Data

In [None]:
# Only the tokenizer is needed at this point. Do not load a PyTorch `AutoModel`
# here: nothing uses it, it costs an extra checkpoint download, and
# `deberta_model` is bound to the TensorFlow backbone in the "Building Models"
# section before its first use.
deberta_tokenizer = transformers.AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# bert_model = transformers.TFAutoModel.from_pretrained('bert-base-cased') # changed from deberta_base_fresh
# bert_tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-cased') # changed from deberta_base_fresh

auto_tune = tf.data.experimental.AUTOTUNE

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def bert_encode(texts, tokenizer, attn_mask):
    """Tokenize every text to a fixed length of ``SEQUENCE_LENGTH`` tokens.

    Args:
        texts: Object with a ``tolist()`` method yielding the texts to encode
            (called with a DataFrame column in this notebook).
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` entries for one text.
        attn_mask: When truthy, the attention masks are returned as well.

    Returns:
        An int32 array of input ids, or the pair ``(input_ids, attention_mask)``
        of int32 arrays when ``attn_mask`` is truthy.
    """
    # One tokenizer call per text; each call truncates / pads to SEQUENCE_LENGTH.
    encodings = [
        tokenizer(text,
                  add_special_tokens=True,
                  max_length=SEQUENCE_LENGTH,
                  return_attention_mask=True,
                  return_tensors="np",
                  truncation=True,
                  padding='max_length')
        for text in texts.tolist()
    ]

    # Each encoding is a batch of one, hence the [0].
    ids = np.array([enc['input_ids'][0] for enc in encodings], dtype="int32")
    masks = np.array([enc['attention_mask'][0] for enc in encodings], dtype="int32")

    if not attn_mask:
        return ids
    return ids, masks


def get_data(df, rubric_col, attn_mask=True): # changed
    """Build the ``(inputs, targets)`` pair for one rubric.

    Args:
        df: DataFrame with a ``full_text`` column and ``rubric_col``.
        rubric_col: Name of the target column.
        attn_mask: Forwarded to ``bert_encode``; when truthy, ``inputs`` is the
            pair ``(input_ids, attention_mask)``.

    Returns:
        ``(inputs, targets)`` where ``targets`` is a float32 array of the
        rubric column.
    """
    encoded_inputs = bert_encode(texts=df['full_text'],
                                 tokenizer=deberta_tokenizer,
                                 attn_mask=attn_mask)
    return encoded_inputs, np.array(df[rubric_col], dtype="float32") # changed

# Loss and Metrics Functions

In [None]:
# Huber loss: quadratic for small errors, linear for large ones, which makes it
# less sensitive to outliers/edge cases than a pure squared error.
def huber_loss(y_true, y_pred, clip_delta=1.0):
  """Element-wise Huber loss between ``y_true`` and ``y_pred``.

  Errors with absolute value below ``clip_delta`` contribute ``0.5 * error**2``;
  all others contribute ``clip_delta * (|error| - 0.5 * clip_delta)``.
  No reduction is applied here.
  """
  residual = y_true - y_pred
  abs_residual = tf.keras.backend.abs(residual)

  quadratic_branch = 0.5 * tf.keras.backend.square(residual)
  linear_branch = clip_delta * (abs_residual - 0.5 * clip_delta)

  return tf.where(abs_residual < clip_delta, quadratic_branch, linear_branch)

# Attach the custom loss to the `keras.losses` namespace (overwriting any
# attribute of that name).
# NOTE(review): presumably so that code referring to `keras.losses.huber_loss`
# resolves to this implementation — confirm it is still needed.
keras.losses.huber_loss = huber_loss

In [None]:
# Custom metric MCRMSE: mean column-wise root mean squared error
def MCRMSE(y_true, y_pred):
    """Average, over the last axis, of the per-column RMSE taken along axis 0.

    The final reduction keeps its dimension (``keepdims=True``).
    """
    squared_error = tf.square(y_true - y_pred)
    per_column_rmse = tf.sqrt(tf.reduce_mean(squared_error, axis=0))
    return tf.reduce_mean(per_column_rmse, axis=-1, keepdims=True)

# Callbacks and Learning-Rate Schedule

In [None]:
# Early stopping on the validation loss: stop after PATIENCE epochs without
# improvement and restore the best weights seen.
earlystopper = EarlyStopping(monitor='val_loss',
                             patience=PATIENCE,
                             restore_best_weights=True)

In [None]:
# Learning-rate schedule
def lr_scheduler(epoch, lr, decay_start_epoch=7, decay_rate=0.1):
    """Keep the learning rate constant at first, then decay it exponentially.

    Args:
        epoch: Zero-based epoch index supplied by the Keras callback.
        lr: Learning rate currently in use.
        decay_start_epoch: First epoch at which the decay is applied.
        decay_rate: Each decayed epoch multiplies ``lr`` by ``exp(-decay_rate)``.

    Returns:
        The learning rate for this epoch as a plain Python float (not a
        ``tf.Tensor``), which is the type ``LearningRateScheduler`` expects.
    """
    import math  # local import: `math` is not among the notebook's top-level imports

    if epoch < decay_start_epoch:
        return lr
    return lr * math.exp(-decay_rate)

# Keras callback that obtains each epoch's learning rate from lr_scheduler.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)

# Model Configurations

In [None]:
# Hyper-parameters of the base DeBERTa run, gathered from the global settings.
base_deberta_config = {
    # 'RUBRIC_COLS': ['cohesion', 'syntax', 'vocabulary',
    #                 'phraseology', 'grammar', 'conventions'], # changed
    'batch_size': BATCH_SIZE,
    'model_name': 'base_deberta',
    'epochs': EPOCHS,
    'init_learning_rate': LEARNING_RATE,
    'lr_decay_rate': LEARNING_RATE_DECAY,
    'optimizer': 'adam',
    'loss_fn': huber_loss,
    'metrics': 'accuracy', # changed
    'earlystopping_patience': PATIENCE,
}

In [None]:
# DeBERTa-v3 configuration: return all hidden states and set both dropout
# probabilities to 0.3. Keyword arguments that match configuration attributes
# override the values loaded from the checkpoint.
cfg = transformers.AutoConfig.from_pretrained(
    "microsoft/deberta-v3-base",
    output_hidden_states=True,
    hidden_dropout_prob=0.3, # changed
    attention_probs_dropout_prob=0.3, # changed
)
# cfg.save_pretrained('./tokenizer/')

## DeBERTa Experiments

#### DeBERTa with mean-pooled last hidden state

In [None]:
def create_deberta_model(deberta_model,
                      dropout = 0.3,
                      num_classes = 9,
                      learning_rate = 1e-5):
    """Wrap a DeBERTa backbone with a softmax head and compile it.

    The last hidden state is averaged over the sequence axis, layer-normalised,
    passed through dropout and fed to a dense softmax layer.

    Args:
        deberta_model: Transformers TF model exposing its main layer as
            ``.deberta``.
        dropout: Dropout rate applied before the classification layer.
        num_classes: Number of output classes. The default of 9 matches the
            half-point rubric scores 1.0 ... 5.0 encoded as indices 0 ... 8.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` taking ``(input_ids, attention_mask)``,
        each of length ``SEQUENCE_LENGTH``, and returning class probabilities.
    """
    # Model inputs: token ids and attention mask
    input_ids = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int64, name='input_ids_layer')
    attention_masks = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int64, name='attention_mask_layer')

    deberta_output = deberta_model.deberta(
        input_ids, attention_mask=attention_masks
    )
    hidden_states = deberta_output.last_hidden_state

    # NOTE(review): the average runs over all SEQUENCE_LENGTH positions,
    # padding included, because no mask is passed to the pooling layer.
    x = tf.keras.layers.GlobalAveragePooling1D()(hidden_states)
    x = tf.keras.layers.LayerNormalization()(x)
    x = tf.keras.layers.Dropout(dropout)(x)

    # Prediction layer - nominal classification over the score classes
    output = tf.keras.layers.Dense(num_classes, activation='softmax', name='classification_layer')(x)

    # Make and compile model. Targets are integer class indices, hence the
    # sparse categorical cross-entropy on probabilities (from_logits=False).
    model = tf.keras.models.Model(inputs=(input_ids, attention_masks), outputs=[output])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                    metrics=['accuracy'])

    return model

# Building Models

In [None]:
# Reset Keras' global state before building the model.
tf.keras.backend.clear_session()

# Load the TensorFlow DeBERTa-v3 backbone with the customised config (`cfg`) and
# wrap it with the classification head built by create_deberta_model.
deberta_model = transformers.TFAutoModel.from_pretrained("microsoft/deberta-v3-base", config=cfg)
deberta_pooled_model = create_deberta_model(deberta_model,
                                      dropout = 0.3)

tf.keras.backend.clear_session()

# Last expression of the cell, so its return value is what the notebook displays.
gc.collect()

Downloading:   0%|          | 0.00/736M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDebertaV2Model.

All the layers of TFDebertaV2Model were initialized from the model checkpoint at microsoft/deberta-v3-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2Model for predictions without further training.


44675

# Training Models

In [None]:
def train_model(model,
                train_df,
                val_df,
                config: dict,
                callbacks: list,
                verbose: int=0):
    """Fit ``model`` and return the Keras ``History`` object.

    Args:
        model: Compiled Keras model.
        train_df: ``(inputs, targets)`` pair used for training.
        val_df: ``(inputs, targets)`` pair passed as ``validation_data``.
        config: Run settings. ``batch_size``, ``epochs`` and
            ``earlystopping_patience`` are read from it, falling back to the
            notebook-level ``BATCH_SIZE``, ``EPOCHS`` and ``PATIENCE``.
        callbacks: Keras callbacks to use. When empty or ``None``, a default
            ``EarlyStopping`` is installed that monitors ``val_accuracy``
            (``accuracy`` if there is no validation data) and restores the best
            weights.
        verbose: Verbosity forwarded to ``model.fit``.

    Returns:
        The ``History`` returned by ``model.fit``.
    """
    config = config or {}

    if not callbacks:
        # Monitor the validation metric when available: training accuracy says
        # little about generalisation and usually keeps improving.
        monitored = 'val_accuracy' if val_df is not None else 'accuracy'
        callbacks = [tf.keras.callbacks.EarlyStopping(
            monitor=monitored,
            patience=config.get('earlystopping_patience', PATIENCE),
            restore_best_weights=True)]

    tf.keras.backend.clear_session()

    model_history = model.fit(
      x=train_df[0],
      y=train_df[1],
      validation_data=val_df,
      batch_size=config.get('batch_size', BATCH_SIZE),
      epochs=config.get('epochs', EPOCHS),
      shuffle=True,
      callbacks=list(callbacks),
      verbose=verbose,
      )

    return model_history

In [None]:
%%time

pd.options.mode.chained_assignment = None

RMSEs = list()

for rubric_col in RUBRIC_COLS:
    print('\n\n////////////////////////////////////////////////////////////////////////////////////')
    print(f'\nNow training on {rubric_col}...\n')
    train_data, val_data = select_data(train_data_from_file, validation_data_from_file, rubric_col)
    train_dataset = get_data(train_data, rubric_col)
    val_dataset = get_data(val_data, rubric_col)

    callbacks = [earlystopper]

    tf.keras.backend.clear_session()

    db_last_hidden_model = train_model(model=deberta_pooled_model,
                            train_df = train_dataset, 
                            val_df = val_dataset, 
                            config=base_deberta_config, 
                            callbacks=callbacks, 
                            verbose=1)

    tf.keras.backend.clear_session()

    preds_proba = deberta_pooled_model.predict(val_dataset[0])
    preds_0_9 = tf.argmax(preds_proba, axis=-1)
    ints_to_scores_vectorized = np.vectorize(ints_to_scores)
    preds_1_5 = ints_to_scores_vectorized(np.array(preds_0_9).astype(int))
    val_labels = ints_to_scores_vectorized(val_dataset[1].astype(int))
    RMSEs.append(sklearn.metrics.mean_squared_error(val_labels, preds_1_5, squared=False))
    

mcrmse_final = np.array(RMSEs).mean()
mcrmse_final



////////////////////////////////////////////////////////////////////////////////////

Now training on cohesion...

Epoch 1/4


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Epoch 2/4
Epoch 3/4
Epoch 4/4


////////////////////////////////////////////////////////////////////////////////////

Now training on syntax...

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


////////////////////////////////////////////////////////////////////////////////////

Now training on vocabulary...

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


////////////////////////////////////////////////////////////////////////////////////

Now training on phraseology...

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


////////////////////////////////////////////////////////////////////////////////////

Now training on grammar...

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


////////////////////////////////////////////////////////////////////////////////////

Now training on conventions...

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 2h 12min 41s, sys: 11min 8s, total: 2h 23min 49s
Wall time: 1h 19min 35s


0.6932015943317213

In [None]:
RMSEs

[0.6559778238407484,
 0.604226376506766,
 0.5012771412886455,
 0.8741089727051673,
 0.7810085944164953,
 0.7426106572325057]

# Prediction + Submission

In [None]:
# preds_proba = bert_pooled_model.predict(val_dataset[0])
# preds_proba

In [None]:
# preds_0_9 = tf.argmax(preds_proba, axis=-1)
# preds_0_9[:20]

In [None]:
# ints_to_scores_vectorized = np.vectorize(ints_to_scores)

In [None]:
# preds_1_5 = ints_to_scores_vectorized(np.array(preds_0_9).astype(int))
# preds_1_5[:20]

In [None]:
# val_labels = ints_to_scores_vectorized(val_dataset[1].astype(int))
# val_labels[:20]

In [None]:
# sklearn.metrics.mean_squared_error(val_labels, preds_1_5, squared=False)