<a href="https://colab.research.google.com/github/alliarnold/llm-fall-2023/blob/main/MultiClass_Bert_Magic4Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the two libraries the rest of the notebook imports.
# NOTE(review): versions are unpinned, so a fresh runtime may install releases
# different from the ones this notebook was last run with.
!pip install tensorflow
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
#tqdm is a progress bar
from tqdm import tqdm
from transformers import BertTokenizer

In [None]:
# Mount Google Drive at /content/drive so the CSV stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

# Load the question/answer table; the question text is in `alli_q` and the
# answer string in `answer_4`.
magic_4 = pd.read_csv('/content/drive/My Drive/GC-CUNY/2023_Fall/4answerdata.csv')
magic_4.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,alli_q,answer_4
0,Am I a bad communicator?,Cannot predict now
1,Am I a bad dancer?,Cannot predict now
2,Am I a bad listener?,Cannot predict now
3,Am I a good person?,Cannot predict now
4,Am I allergic to peanuts?,Cannot predict now


In [None]:
def c2n(answer):
  """Convert an answer string into its integer class id.

  "Cannot predict now" -> 0, "Outlook good" -> 1, "Signs point to yes" -> 2.
  Every other value falls through to 3.
  """
  coded_answers = ("Cannot predict now", "Outlook good", "Signs point to yes")
  for class_id, phrase in enumerate(coded_answers):
    if answer == phrase:
      return class_id
  return 3

# Replace the answer strings with integer class ids (see c2n).
# Guard on dtype so that re-running this cell is a no-op: feeding the already
# encoded integers back through c2n would send every row to its fallback
# value 3 and silently destroy the labels.
if not pd.api.types.is_integer_dtype(magic_4['answer_4']):
  magic_4['answer_4'] = magic_4['answer_4'].apply(c2n)
magic_4.head()

Unnamed: 0,alli_q,answer_4
0,Am I a bad communicator?,0
1,Am I a bad dancer?,0
2,Am I a bad listener?,0
3,Am I a good person?,0
4,Am I allergic to peanuts?,0


In [None]:
# Summarize columns, dtypes and non-null counts after the label encoding.
magic_4.info()

In [None]:
# Shuffle the row order. A fixed random_state makes a fresh
# Restart & Run All produce the same permutation, so runs are comparable.
magic_4 = magic_4.sample(frac=1, random_state=42)
magic_4[:100]

In [None]:
# Count how many rows fall into each encoded answer class.
magic_4['answer_4'].value_counts()

In [None]:
# Load the tokenizer for the 'bert-base-cased' checkpoint (the same checkpoint
# name the model is loaded from later in the notebook).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# NOTE: the reference video also used the regular cased checkpoint

In [None]:
# Sanity check: tokenize the first question with the same settings used for
# the full dataset (pad/truncate to 256 tokens, return TensorFlow tensors).
token = tokenizer.encode_plus(
    magic_4['alli_q'].iloc[0],
    max_length=256,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors="tf"
)

In [None]:
token # display the encoding to make sure tokenization worked

In [None]:
# Pre-allocate one row per question and 256 token slots per row.
# Token ids and attention-mask flags are integers, so store them as int32
# (the dtype the model's Input layers declare) instead of NumPy's default
# float64, which wastes memory and relies on an implicit cast later.
X_input_ids = np.zeros((len(magic_4), 256), dtype=np.int32)
X_attn_masks = np.zeros((len(magic_4), 256), dtype=np.int32)

In [None]:
X_input_ids.shape # sanity check: (number of rows, 256)

In [None]:
def generate_training_data(magic_4, ids, masks, tokenizer):
  """Tokenize every question and write the results into `ids` and `masks`.

  Args:
    magic_4: DataFrame holding the question text in its 'alli_q' column.
    ids: pre-allocated 2-D array; row i receives the token ids of question i.
    masks: pre-allocated 2-D array shaped like `ids`; row i receives the
      attention mask of question i.
    tokenizer: object exposing `encode_plus` (a BertTokenizer in this notebook).

  Returns:
    The same `ids` and `masks` arrays, filled in place.
  """
  questions = magic_4['alli_q']
  # Pad/truncate to the width of the destination arrays so the tokenizer
  # output and the arrays can never disagree on sequence length.
  max_length = ids.shape[1]
  # enumerate() has no length, so pass the total explicitly; without it tqdm
  # can only show a running counter instead of a progress bar.
  for i, text in tqdm(enumerate(questions), total=len(questions)):
    tokenized_text = tokenizer.encode_plus(
        text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    ids[i,:] = tokenized_text.input_ids
    masks[i,:] = tokenized_text.attention_mask
  return ids, masks

In [None]:
# Tokenize all questions; the two arrays are filled in place and returned.
X_input_ids, X_attn_masks = generate_training_data(magic_4, X_input_ids, X_attn_masks, tokenizer)

In [None]:
# One row per question and one column per answer class, all zeros for now;
# the one-hot 1s are written in a later cell.
labels = np.zeros((len(magic_4), 4))

# Class ids assigned by c2n:
# "Cannot predict now": 0
# "Outlook good": 1
# "Signs point to yes": 2
# "Outlook not good": 3  (c2n returns 3 for any value not listed above)

In [None]:
labels.shape # sanity check: (number of rows, 4)

(1000, 4)

In [None]:
# One-hot encode: for each row i, set the column selected by that row's
# integer class id in `answer_4` to 1.
labels[np.arange(len(magic_4)), magic_4['answer_4'].values] = 1

In [None]:
labels # display the matrix to check the one-hot encoding

In [None]:
# Slice the three arrays row-wise so each dataset element is one
# (input_ids, attention_mask, label) triple.
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [None]:
dataset.take(1) # more checks: a dataset limited to the first element

In [None]:
def SentimentDatasetMapFunctions(input_ids, attn_masks, labels):
  """Repack one dataset element as a (features, labels) pair.

  The features dict uses the keys 'input_ids' and 'attention_mask';
  `labels` is passed through unchanged.
  """
  features = dict(input_ids=input_ids, attention_mask=attn_masks)
  return features, labels

In [None]:
# Convert every element to the ({'input_ids', 'attention_mask'}, labels) form.
dataset = dataset.map(SentimentDatasetMapFunctions)

In [None]:
# Shuffle once, then group into batches of 8 (dropping any incomplete batch).
# reshuffle_each_iteration=False is required because the train/validation
# subsets are carved out of this dataset with take()/skip(): with the default
# (reshuffle on every iteration) each pass would draw a different permutation,
# so examples would leak between training and validation across epochs.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(8, drop_remainder=True)

In [None]:
# using 80% for training, 20% for validation
# Count whole batches (batch size 8) and cut once: the first `train_batches`
# go to training and everything after them to validation. Skipping only 20%
# of the batches, as this cell did before, left the validation set
# overlapping with most of the training batches.
# NOTE(review): take()/skip() only stay disjoint across epochs if the upstream
# shuffle does not reshuffle on every iteration.
train_batches = int((len(magic_4)//8)*.8)
train_dataset = dataset.take(train_batches)
val_dataset= dataset.skip(train_batches)

In [None]:
from transformers import TFBertModel

In [None]:
# Load the pretrained BERT weights for the same checkpoint name the tokenizer uses.
bert_model = TFBertModel.from_pretrained('bert-base-cased')

In [None]:
# Two symbolic int32 inputs of length 256; their names match the keys of the
# feature dict produced by SentimentDatasetMapFunctions.
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attention_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

# Run BERT's main layer and keep element [1] of its output.
# NOTE(review): presumably the pooled sentence representation - confirm
# against the transformers documentation.
bert_embds = bert_model.bert(input_ids, attention_mask=attention_masks)[1]
# Classification head: a 512-unit ReLU layer followed by a 4-way softmax
# (one output per answer class).
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(4, activation='softmax', name='output_layer')(intermediate_layer)

# Wire the inputs and the softmax output into a Keras model and print its layers.
model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output_layer)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 256)]                0         []                            
 )                                                                                                
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1083102   ['input_ids[0][0]',           
                             ngAndCrossAttentions(last_   72         'attention_mask[0][0]']      
                             hidden_state=(None, 256, 7                                     

In [None]:
# vid doesn't use legacy Adam
optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [None]:
# Attach the optimizer, loss and accuracy metric defined above to the model.
model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [None]:
# Train for 10 epochs, evaluating on val_dataset after each one; the returned
# History object is kept in `hist`.
hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Create the output directory if it does not exist, then save the trained
# model under it.
!mkdir -p saved_model
model.save('saved_model/magic4model')