In [76]:
# Imports
import json
import numpy as np
import pandas as pd

# Load in the data file
with open('../question-data.json') as data_file:
    data = json.loads(data_file.read())

# Prepare the data
td_text = []
td_labels = []
for index in data:
    td_text += [data[index][x]for x in range(len(data[index]))]
    td_labels += [index for x in range(len(data[index]))]
#final = {"text": td_text, "label": td_labels}
str_to_num = {'injury': 0, 'trade': 1}
final = [{"text": td_text[i], "label": str_to_num[td_labels[i]], "idx": i} for i in range(len(td_text))]

In [85]:
from transformers import BertTokenizer, TFBertModel, BertConfig
from sklearn.model_selection import train_test_split

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
input_ids=[]
attention_masks=[]

for sent in td_text:
    bert_inp=bert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =64,pad_to_max_length = True,return_attention_mask = True)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(td_labels)

train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.2)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [77]:
# Writing to a json file
import json
json_object = json.dumps(final, indent=4)
with open("final_data.json", "w") as outfile:
    outfile.write(json_object)

In [78]:
# Load the dataset
from datasets import load_dataset
dataset = load_dataset('json', data_files='final_data.json') #, split=['train[:80%]', 'train[20%:]'])

Using custom data configuration default-743fe2d2f16f0deb


Downloading and preparing dataset json/default to /Users/danielstefanescu/.cache/huggingface/datasets/json/default-743fe2d2f16f0deb/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /Users/danielstefanescu/.cache/huggingface/datasets/json/default-743fe2d2f16f0deb/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [79]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
def tokenize_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [80]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    batch_size=16
)

# tf_validation_dataset = tokenized_datasets["test"].to_tf_dataset(
#     columns=["attention_mask", "input_ids", "token_type_ids"],
#     label_cols=["labels"],
#     shuffle=False,
#     collate_fn=data_collator,
#     batch_size=8,
# )

ValueError: Label column labels not found in dataset!

In [None]:
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification,TFBertForSequenceClassification
from transformers import AutoTokenizer
import tensorflow as tf

# Training the model
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2 )

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=5e-5,
    decay_steps=10000,
    decay_rate=0.9)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=tf.metrics.SparseCategoricalAccuracy()
              ) # can also use any keras loss fn
model.fit(tf_train_dataset)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2022-10-20 14:46:00.128626: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




<keras.callbacks.History at 0x6bf041580>

In [None]:
from transformers import TextClassificationPipeline

text = ["is [player] staying at [team]?", "is [player] injured?", "Is [player] staying at [team]?"]
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)
pipe(text)

[{'label': 'LABEL_0', 'score': 0.5298250913619995},
 {'label': 'LABEL_0', 'score': 0.5871033072471619},
 {'label': 'LABEL_0', 'score': 0.5464733839035034}]

In [83]:
def extract_predictions(text):
  a_preds = []
  a_scores = []
  data_encodings = tokenizer(text, truncation=True, padding=True)
  ds = tf.data.Dataset.from_tensor_slices(dict(data_encodings)).batch(64)
  preds = model.predict(ds)["logits"]
  classes = np.argmax(preds, axis=1).tolist()
  probs = tf.nn.softmax(preds) # get probabilities from logits
  scores = np.amax(probs, axis=1).tolist()

  a_preds.extend(classes)
  a_scores.extend(scores)
  # return predictions and scores
  ans = []
  num_to_str = {0: 'injury', 1: 'trade'}
  for i in range(len(a_preds)):  # map labels to label titles
    ans.append({
        "label": num_to_str[a_preds[i]],
        "score": a_scores[i]
    })
  ansdf = pd.DataFrame(ans)
  return ansdf

text = ["is [player] staying at [team]?", "is [player] injured?", "Is [player] staying at [team]?"]
result = extract_predictions(text)
print (result)

2022-10-20 15:01:41.289275: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


    label     score
0  injury  0.529825
1  injury  0.587103
2  injury  0.546473
