In [None]:
!pip install tensorflow
!pip install transformers
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
#tqdm is a progress bar
from tqdm import tqdm
from transformers import BertTokenizer

In [None]:
# Location of the four-answer dataset, relative to the notebook's working directory
DATA_PATH = 'Final-Project/data/4answerdata.csv'

# Read the raw data and preview the first rows
magic_4 = pd.read_csv(DATA_PATH)
magic_4.head()

In [None]:
def c2n(answer):
  """Convert an answer string into its integer class id.

  "Cannot predict now" -> 0, "Outlook good" -> 1, "Signs point to yes" -> 2.
  Every other value falls through to 3.
  """
  # The position of each label in this tuple is its class id
  known_answers = ("Cannot predict now", "Outlook good", "Signs point to yes")
  for class_id, label in enumerate(known_answers):
    if answer == label:
      return class_id
  # Catch-all class for anything not listed above
  return 3

# Encode the answer strings as integer class ids.
# c2n maps every value it does not recognise to 3, and that includes integers that
# were already encoded. Re-running this cell on an encoded column would therefore
# collapse all labels to class 3, so only encode while the column is not yet integer-typed.
if not pd.api.types.is_integer_dtype(magic_4['answer_4']):
  magic_4['answer_4'] = magic_4['answer_4'].apply(c2n)
magic_4.head()

In [None]:
# Summarise the frame: column names, dtypes and non-null counts
magic_4.info()

In [None]:
# Shuffle the row order. A fixed random_state makes the shuffle deterministic, so a
# fresh Restart & Run All starts from the same row order every time.
magic_4 = magic_4.sample(frac=1, random_state=42)
# Preview a handful of rows rather than dumping 100 into the notebook output
magic_4.head(10)

In [None]:
# Count the rows for each distinct 'answer_4' value to see how balanced the classes are
magic_4['answer_4'].value_counts()

In [None]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Tokenize a single question as a quick check of the tokenizer settings:
# padded/truncated to 256 tokens, special tokens included, TensorFlow tensors returned.
sample_question = magic_4['alli_q'].iloc[0]
token = tokenizer.encode_plus(
    sample_question,
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors="tf"
)

In [None]:
# Pre-allocate one 256-wide row per question for the token ids and attention masks.
# Both hold integers, so allocate them as int32 (the dtype of the Keras inputs declared
# for the model) instead of NumPy's float64 default, which doubles the memory footprint
# and relies on an implicit cast when the data reaches the model.
X_input_ids = np.zeros((len(magic_4), 256), dtype='int32')
X_attn_masks = np.zeros((len(magic_4), 256), dtype='int32')

In [None]:
def generate_training_data(magic_4, ids, masks, tokenizer):
  """Tokenize every question in ``magic_4['alli_q']`` and fill ``ids``/``masks`` row by row.

  Args:
    magic_4: DataFrame with an 'alli_q' column holding the text to tokenize.
    ids: pre-allocated 2-D array; row i receives the input ids of question i.
    masks: pre-allocated 2-D array; row i receives the attention mask of question i.
    tokenizer: object exposing ``encode_plus`` (a BertTokenizer in this notebook).

  Returns:
    The same ``ids`` and ``masks`` arrays, which are filled in place.
  """
  # Pad/truncate to the width of the pre-allocated arrays so each row assignment fits
  max_length = ids.shape[1]
  questions = magic_4['alli_q']
  # Wrap the Series itself (not enumerate) so tqdm knows the total and can show
  # a percentage and ETA instead of a bare counter
  for i, text in enumerate(tqdm(questions, total=len(questions))):
    tokenized_text = tokenizer.encode_plus(
        text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    ids[i,:] = tokenized_text.input_ids
    masks[i,:] = tokenized_text.attention_mask
  return ids, masks

In [None]:
# Fill the pre-allocated id/mask arrays with the tokenized 'alli_q' text of every row
X_input_ids, X_attn_masks = generate_training_data(magic_4, X_input_ids, X_attn_masks, tokenizer)

In [None]:
# One row per question and one column per answer class, all zeros for now.
# Column meaning:
#   0 -> "Cannot predict now"
#   1 -> "Outlook good"
#   2 -> "Signs point to yes"
#   3 -> "Outlook not good"
n_rows = len(magic_4)
labels = np.zeros((n_rows, 4))

In [None]:
# One-hot encode: in row i, set the column selected by that row's integer answer to 1
row_idx = np.arange(len(magic_4))
class_idx = magic_4['answer_4'].values
labels[row_idx, class_idx] = 1

In [None]:
# Build a tf.data pipeline whose elements are (input-id row, attention-mask row, label row)
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [None]:
  return {
      'input_ids': input_ids,
      'attention_mask': attn_masks
  }, labels

In [None]:
# Pass every dataset element through SentimentDatasetMapFunctions
dataset = dataset.map(SentimentDatasetMapFunctions)

In [None]:
# Shuffle once into a fixed order, then group into batches of 8 (dropping a short last batch).
# reshuffle_each_iteration=False matters because the train/validation split is made with
# take()/skip() on this dataset: if it were reshuffled on every pass, examples would move
# between the two splits from epoch to epoch and validation would see training data.
dataset = dataset.shuffle(10000, seed=42, reshuffle_each_iteration=False).batch(8, drop_remainder=True)

In [None]:
# using 80% for training, 20% for validation
train_dataset = dataset.take(int((len(magic_4)//8)*.8))
val_dataset= dataset.skip(int((len(magic_4)//8)*.2))

In [None]:
from transformers import TFBertModel

In [None]:
# Load the pretrained TensorFlow BERT model for the 'bert-base-cased' checkpoint
bert_model = TFBertModel.from_pretrained('bert-base-cased')

In [None]:
# Two integer inputs of length 256; their names match the feature-dict keys of the dataset
input_ids = tf.keras.Input(shape=(256,), dtype='int32', name='input_ids')
attention_masks = tf.keras.Input(shape=(256,), dtype='int32', name='attention_mask')

# Run BERT's main layer and keep element [1] of its output
# (presumably the pooled sentence representation; confirm for the installed transformers version)
bert_embds = bert_model.bert(input_ids, attention_mask=attention_masks)[1]

# Classification head: a 512-unit ReLU layer followed by a 4-way softmax
intermediate_layer = tf.keras.layers.Dense(
    units=512, activation='relu', name='intermediate_layer'
)(bert_embds)
output_layer = tf.keras.layers.Dense(
    units=4, activation='softmax', name='output_layer'
)(intermediate_layer)

model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output_layer)
model.summary()

In [None]:
#using legacy Adam for mac
optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [None]:
# Attach the optimizer, loss function and accuracy metric to the model
model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [None]:
# Train for 10 epochs, evaluating on the validation split after each one;
# `hist` holds the object returned by fit()
hist = model.fit(train_dataset, epochs=10, validation_data=val_dataset)

In [None]:
# Persist the trained model under saved_models/magic4model
model.save('saved_models/magic4model')