# Overview

This notebook trains a sentiment model on Hostelworld reviews. The trained model was pushed to the HuggingFace model hub.

The code from the following HuggingFace tutorial was used to train the model:
https://huggingface.co/docs/transformers/tasks/sequence_classification

# Import Packages

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, DataCollatorWithPadding, create_optimizer, TFAutoModelForSequenceClassification
from transformers.keras_callbacks import KerasMetricCallback
from datasets import load_dataset, Dataset
from transformers.keras_callbacks import PushToHubCallback
from huggingface_hub import notebook_login
import evaluate

# Read In and Prepare Data

In [None]:
# Load the labelled reviews, keep only the rows that have a rating, and reduce
# the frame to the two columns used for training, renamed to "text"/"label".
data = (
    pd.read_csv('data/message_df_labelled.csv')
    .loc[lambda frame: frame['rating'].notnull(), ['split_text', 'rating']]
    .rename(columns={"split_text": "text", "rating": "label"})
)
# Collapse the MIXED and NEUTRAL ratings into a single OTHER class.
data.loc[data['label'].isin(["MIXED", "NEUTRAL"]), "label"] = 'OTHER'

In [None]:
# Convert the DataFrame into a Hugging Face Dataset, encode the string labels
# as integer class ids, then hold out 20% of the rows as a stratified test split.
df = Dataset.from_pandas(data)
df = df.class_encode_column("label")
# A fixed seed keeps the train/test split, and therefore the reported
# accuracy, reproducible between runs of the notebook.
df = df.train_test_split(test_size=0.2, stratify_by_column="label", seed=42)

In [None]:
# Load the tokenizer for the "distilbert-base-uncased" checkpoint; it is used
# both to preprocess the training data and to encode text at prediction time.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of ``examples`` with truncation enabled.

    Relies on the module-level ``tokenizer`` and returns its output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Run preprocess_function over the dataset in batches (batched=True).
tokenized_data = df.map(preprocess_function, batched=True)
# Bare expression: displays the tokenized dataset as the notebook cell output.
tokenized_data

In [None]:
# Collator handed to prepare_tf_dataset as collate_fn; return_tensors="tf"
# makes it produce TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Prepare Model and Train

In [None]:
# Load the "accuracy" metric from the evaluate library; compute_metrics calls it.
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Return the accuracy for a ``(predictions, labels)`` pair.

    ``eval_pred`` is unpacked into per-class scores and reference labels. The
    predicted class of each row is the index of its highest score along axis 1,
    and the result is whatever the module-level ``accuracy`` metric computes.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Mapping between integer class ids and human-readable sentiment names.
# NOTE(review): the ids are assumed to match the codes produced by
# class_encode_column on the "label" column -- confirm against the dataset's
# label feature names.
id2label = dict(enumerate(("NEGATIVE", "OTHER", "POSITIVE")))
# The reverse lookup is derived from id2label so the two can never disagree.
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Training hyperparameters.
batch_size = 16
num_epochs = 5

# Number of optimizer steps for the whole run: full batches per epoch times
# the number of epochs. Both operands are ints, so the product is already an int.
batches_per_epoch = len(tokenized_data["train"]) // batch_size
total_train_steps = batches_per_epoch * num_epochs

# Build the optimizer and its learning-rate schedule, sized to the step count
# above, with no warmup steps.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
# Load DistilBERT with a sequence-classification head. The head size is taken
# from the label map so it cannot drift out of sync if a class is added or removed.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Build the tf.data pipelines. Both use the shared `batch_size` so the batches
# actually fed to the model match the value the learning-rate schedule's step
# count was computed from.
tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Validation data is not shuffled.
tf_validation_set = model.prepare_tf_dataset(
    tokenized_data["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# No loss is passed: per the Hugging Face tutorial this notebook follows,
# Transformers models fall back to their default task-relevant loss.
model.compile(optimizer=optimizer)

In [None]:
# Log in to the Hugging Face Hub from the notebook; the push-to-hub callback
# configured below uploads the model to the Hub.
notebook_login()

In [None]:
# Computes accuracy on the validation set through compute_metrics.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_validation_set,
)
# Pushes the model and tokenizer to the Hub from the given output directory.
push_to_hub_callback = PushToHubCallback(
    output_dir="hostel-reviews-sentiment-model",
    tokenizer=tokenizer,
)
# Keras receives the callbacks in this order: metrics first, then the hub push.
callbacks = [metric_callback, push_to_hub_callback]

In [None]:
# Train for `num_epochs` so the run length always matches the number of steps
# the learning-rate schedule was built for.
model.fit(
    x=tf_train_set,
    validation_data=tf_validation_set,
    epochs=num_epochs,
    callbacks=callbacks,
)

In [None]:
# If the model is not pushed to the Hub, the lines below save it locally and load it back.
# model.save_pretrained('hostelworld_sentiment_model')
# test_model = TFAutoModelForSequenceClassification.from_pretrained("hostelworld_sentiment_model")

# Predict on One Sample

In [None]:
# Load the published checkpoint "atowey01/hostel-reviews-sentiment-model".
test_model = TFAutoModelForSequenceClassification.from_pretrained("atowey01/hostel-reviews-sentiment-model")

In [None]:
# Sample review text to classify.
text = "Amazing hostel"

In [None]:
# Encode the sample and score it with the model loaded from the Hub
# (`test_model`), so this cell exercises the published checkpoint rather than
# the in-memory training model.
inputs = tokenizer(text, return_tensors="tf")
logits = test_model(**inputs).logits

# Single input, so take row 0 of the batch dimension.
predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
probabilities = tf.nn.softmax(logits, axis=-1)

print(test_model.config.id2label[predicted_class_id])
# Index down to a scalar before converting; float() on a shape-(1,) tensor
# relies on deprecated NumPy array-to-scalar conversion.
print(float(probabilities[0, predicted_class_id]))