In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the processed course JSON file into a DataFrame.
df_courses = pd.read_json(
    './data/processed_data/course_data_clean.json'
)

### Here's what the data looks like:

In [None]:
# Bare last expression: the notebook renders the loaded frame as the cell output.
df_courses

### And here's an example of what a review looks like:

In [None]:
# The `reviews` entry of the first row in df_courses.
df_courses['reviews'].values[0]

Subset the data to only the course code, review text, and label (the label column is currently named `course_rating_int`, so it is renamed to `label`):

In [None]:
# Keep only the three columns used downstream, then expose the rating
# column under the name `label`.
df_reviews = (
    df_courses[['course_code', 'reviews', 'course_rating_int']]
    .rename(columns={'course_rating_int': 'label'})
)

In [None]:
# Display the subset frame to check the selected columns and the `label` rename.
df_reviews

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
review_texts = list(df_reviews['reviews'].values)
review_labels = list(df_reviews['label'].values)
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    review_labels,
    test_size=0.33,
    random_state=6,
)

## Training

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForTokenClassification, create_optimizer
from transformers import TFDistilBertForSequenceClassification

In [None]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint (the same checkpoint
# name the model cell loads).
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def _encode_texts(texts):
    """Tokenize a list of review strings into padded, truncated TF tensors."""
    return tokenizer(texts, return_tensors="tf", padding=True, truncation=True)


# Each split is tokenized on its own, with identical tokenizer settings.
X_train_tokenized = _encode_texts(X_train)
X_test_tokenized = _encode_texts(X_test)

# Pair the tokenizer outputs (as a plain dict of tensors) with the labels so
# each dataset element is a (features, label) tuple.
train_ds = tf.data.Dataset.from_tensor_slices((dict(X_train_tokenized), y_train))
test_ds = tf.data.Dataset.from_tensor_slices((dict(X_test_tokenized), y_test))

In [None]:
# Set hyperparams/constants
model_name = "distilbert-base-uncased"
# NOTE(review): max_length is not passed to the tokenizer calls visible in this
# notebook -- confirm whether truncation to 96 tokens was intended.
max_length = 96
epochs = 5
batch_size = 32
learning_rate = 2e-5
weight_decay_rate = 0.01

# Optimizer steps in one pass over the training data. Rounded up because
# `Dataset.batch(batch_size)` keeps the final, smaller batch.
steps_per_epoch = (len(X_train) + batch_size - 1) // batch_size

# The learning-rate schedule has to span the WHOLE run, i.e. every epoch.
# Counting a single epoch would make the decay finish after the first epoch
# and leave the remaining epochs at the schedule's end learning rate.
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = 0

In [None]:
# Experiment: build an optimizer with transformers' `create_optimizer` helper
# from the constants set above. The second return value is discarded.
# NOTE(review): the compile cell visible in this notebook constructs its own
# Adam optimizer, so `optimizer` looks unused -- confirm.
optimizer, _ = create_optimizer(
    init_lr=learning_rate,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay_rate,
)

In [None]:
# Load the pretrained DistilBERT checkpoint with a sequence-classification head
# configured for two labels.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

In [None]:
import tensorflow_addons as tfa

metric = tfa.metrics.F1Score(num_classes=2, threshold=0.5)


def f1_m(y_true, y_pred):
    """F1 metric adapter for integer labels and logit outputs.

    `tfa.metrics.F1Score` compares one-hot targets against per-class scores
    and binarises the scores at `threshold`. The datasets built in this
    notebook carry one scalar label per example and the loss is configured
    with `from_logits=True`, so both inputs are converted first.

    Args:
        y_true: class ids, one per example.
            Assumes the ids are 0/1 (matches num_labels=2) -- TODO confirm.
        y_pred: model outputs, assumed to be raw logits with one column per
            class -- TODO confirm against the model's output structure.

    Returns:
        The value returned by calling the shared `metric` object on the
        converted inputs.
    """
    # Flatten so both (batch,) and (batch, 1) label layouts are accepted.
    class_ids = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
    y_true_one_hot = tf.one_hot(class_ids, depth=2)
    # Turn logits into probabilities so the 0.5 threshold is meaningful.
    y_prob = tf.nn.softmax(y_pred, axis=-1)
    return metric(y_true_one_hot, y_prob)


model.compile(
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    # Labels are integer class ids, not one-hot vectors, so the sparse variant
    # of the cross-entropy loss is the matching one.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[f1_m]
)

In [None]:
# Quick smoke test: encode one short sentence and run it through the model.
sample_input_ids = tokenizer.encode(
    "how are you",
    truncation=True,
    padding=True,
    return_tensors="tf",
)
model.predict(sample_input_ids)

In [None]:
# Train for `epochs` passes over the batched training set; the batched test
# split is passed as the validation data.
train_batches = train_ds.batch(batch_size)
validation_batches = test_ds.batch(batch_size)
model.fit(
    train_batches,
    epochs=epochs,
    validation_data=validation_batches,
)

In [None]:
# Persist the trained model under ./saved_model/.
# NOTE(review): `.pth` is conventionally a PyTorch checkpoint suffix; confirm
# which on-disk format Keras writes for this path and that loaders expect it.
model.save('./saved_model/tf_distilbert_course_reviews_01.pth')