In [3]:
#the following notebook was written to tune a BERT model to classify text
#and is only intended to work when executed in a kaggle notebook
#the competition is https://www.kaggle.com/competitions/nlp-getting-started

#this adapts the approach of the huggingface classify text tutorial (https://huggingface.co/docs/transformers/tasks/sequence_classification)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#import torch
import tensorflow as tf
import seaborn as sns

!pip install transformers datasets evaluate
from transformers import BertModel, AutoModel, AutoTokenizer, BertTokenizer

from datasets import Dataset
import evaluate

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.8/6.8 MB 22.4 MB/s eta 0:00:00
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 468.7/468.7 kB 13.1 MB/s eta 0:00:00
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.4/81.4 kB 2.9 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-macosx_10_11_x86_64.whl (4.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.0/4.0 MB 35.4 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
     ━━

In [None]:
#load the competition data, then hold out 20% of the training rows for validation
from sklearn.model_selection import train_test_split

test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

#the 'target' column is renamed to 'label' right after loading
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv').rename(
    columns={'target': 'label'}
)

#fixed random_state so the same rows land in each part on every run
train_t, train_v = train_test_split(train, random_state=43, test_size=0.2)

In [1]:
#preprocessing: pandas dataframes are turned into huggingface datasets,
#which are then converted to tensorflow datasets for tuning the model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

#tokenization function handed to Dataset.map
def preproc_func(examples):
    """Run the module-level tokenizer over the "text" field of `examples`.

    Truncation is switched on in the tokenizer call; the tokenizer's output
    is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

def toTok_ds(df):
    """Convert a tweets DataFrame into a tokenized Hugging Face ``Dataset``.

    The keyword and location are prepended to the tweet text because they
    might carry some signal about whether a tweet is really about a disaster.
    Only the combined text (stored as column ``text``) and, when present, the
    ``label`` column are kept; the author observed that without dropping the
    other columns the dataset is not parsed correctly by the Keras metric
    callback.

    The input frame is left untouched: the combined text is written into a
    fresh frame, so this can safely be called on a slice of another frame
    (e.g. the output of ``train_test_split``) and re-run any number of times.

    Args:
        df: DataFrame with ``keyword``, ``location`` and ``text`` columns and,
            optionally, a ``label`` column.

    Returns:
        The ``Dataset`` built from the reduced frame after mapping
        ``preproc_func`` over it in batches. The frame's index is passed to
        ``Dataset.from_pandas`` as-is.
    """
    # astype(str) also stringifies missing keyword/location values, so the
    # concatenation does not become NaN when either of them is absent.
    full_text = df['keyword'].astype(str) + ' ' + df['location'].astype(str) + ' ' + df['text']

    # Start from the label column only (an empty frame if there is none) and
    # attach the combined text under the name the tokenizer function reads.
    reduced = df[[col for col in df.columns if col == 'label']].copy()
    reduced['text'] = full_text

    ds = Dataset.from_pandas(reduced)
    tokenized_ds = ds.map(preproc_func, batched=True)
    return tokenized_ds

tok_t = toTok_ds(train_t)
tok_v = toTok_ds(train_v)

# The pandas index travels into the dataset as "__index_level_0__"; it is not
# a model input, so remove it.
tok_t = tok_t.remove_columns(["__index_level_0__"])
tok_v = tok_v.remove_columns(["__index_level_0__"])

from transformers import DataCollatorWithPadding

# Pads every batch to the length of its longest sequence and returns TF tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# The datasets are built with Dataset.to_tf_dataset rather than
# model.prepare_tf_dataset: the model is only created further down the
# notebook, so referring to it here fails on a fresh kernel.
# These are the tokenizer outputs that are fed to the model; the raw "text"
# column is left out because it cannot be padded into a tensor.
feature_cols = ["input_ids", "token_type_ids", "attention_mask"]

#final tensorflow datasets that get used to tune the model
tf_train_set = tok_t.to_tf_dataset(
    columns=feature_cols,
    label_cols="labels",  # the collator renames "label" to "labels"
    shuffle=True,
    batch_size=16,
    drop_remainder=True,  # prepare_tf_dataset drops the last partial batch when shuffling
    collate_fn=data_collator,
)

tf_validation_set = tok_v.to_tf_dataset(
    columns=feature_cols,
    label_cols="labels",
    shuffle=False,
    batch_size=16,
    drop_remainder=False,
    collate_fn=data_collator,
)


NameError: name 'BertTokenizer' is not defined

In [None]:
#accuracy metric used by the evaluation callback

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a (predictions, labels) pair with the accuracy metric.

    The predictions are reduced to class indices by taking the argmax along
    axis 1 and are then compared with the labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

from transformers.keras_callbacks import KerasMetricCallback

# Callback that feeds the validation set through compute_metrics.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_validation_set,
)

# List handed to model.fit.
callbacks = [metric_callback]



In [None]:
#now, we load the model and tune it with the prepared tensorflow datasets

In [None]:
# Class index -> label name, and the inverse mapping derived from it.
id2label = {0: "FALSE", 1: "TRUE"}
label2id = {name: index for index, name in id2label.items()}

In [None]:
from transformers import create_optimizer

# Hyperparameters from which the length of the learning-rate schedule is derived.
batch_size, num_epochs = 16, 5

# Whole batches per pass over the training set, times the number of passes.
batches_per_epoch = len(tok_t) // batch_size
total_train_steps = int(num_epochs * batches_per_epoch)

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
)

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Pretrained checkpoint loaded with a two-class sequence-classification
# configuration; the label maps are passed along so predictions carry the
# "TRUE"/"FALSE" names.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# No `loss` argument is given: Transformers TF models then use their own
# built-in, task-appropriate loss.
model.compile(optimizer=optimizer)

In [None]:
# Train for num_epochs, the same value the learning-rate schedule was sized
# with; a hard-coded smaller epoch count would stop training before the
# schedule has finished decaying.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)

In [None]:
#now that the model is trained, we apply it to the test data to classify the still-unlabeled tweets

In [None]:
# Model input for the test tweets: keyword, location and tweet text joined by
# single spaces, the same layout toTok_ds builds for the training data.
keyword_str = test['keyword'].astype(str)
location_str = test['location'].astype(str)
test['fullText'] = keyword_str + ' ' + location_str + ' ' + test['text']

In [None]:
# `classifier` is not created anywhere else in this notebook, so it is built
# here from the fine-tuned model and its tokenizer. The indexing below
# ([0]['label']) matches the output of a text-classification pipeline.
# NOTE(review): confirm this is the pipeline the original run used.
from transformers import pipeline

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# One predicted label name ("TRUE"/"FALSE") per test tweet, in row order.
output = [classifier(text)[0]['label'] for text in test['fullText']]

In [None]:
def toTarget(label):
    """Map a predicted label name to the numeric target: 1 for 'TRUE', otherwise 0."""
    return 1 if label == 'TRUE' else 0

# Numeric target for every predicted label, stored as a new column on the test frame.
final_targets = [toTarget(predicted) for predicted in output]

test['target'] = final_targets

In [None]:
# Write only the columns a Kaggle submission needs, without the pandas index;
# the previous export also contained the index and every feature column.
# NOTE(review): assumes test.csv has an `id` column, as in the competition's
# sample submission - confirm.
test[['id', 'target']].to_csv('output.csv', index=False)