In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned per-course records (aggregate ratings plus the nested
# list of reviews for every course).
df_courses = pd.read_json(
    path_or_buf='./data/processed_data/course_data_clean.json',
)

### Here's what the data looks like:

In [3]:
# Show the course table; pandas elides the middle rows in the rendered output.
df_courses

Unnamed: 0,course_code,course_title,num_ratings,useful,easy,liked,reviews,num_reviews,num_reviews_with_rating,good_course
0,CS 115,Introduction to Computer Science 1,2111,21%,10%,23%,[{'review_text': 'go to office hours and pract...,86,83,0
1,MATH 135,Algebra for Honours Mathematics,1186,84%,41%,78%,"[{'review_text': 'Welcome to Waterloo Math.', ...",253,250,1
2,ECON 101,Introduction to Microeconomics,1143,64%,70%,45%,[{'review_text': 'Took it online in W21 during...,214,210,0
3,PSYCH 101,Introductory Psychology,899,73%,67%,79%,"[{'review_text': 'Really easy, the course was ...",8,8,1
4,MATH 137,Calculus 1 for Honours Mathematics,780,86%,56%,69%,"[{'review_text': 'and then isaac newton said ""...",171,167,1
...,...,...,...,...,...,...,...,...,...,...
8474,BE 680,Consulting,0,,,,[],0,0,0
8475,KIN 658,Physical Activity and Cognition,0,,,,[],0,0,0
8476,ERS 620,Skills Identification and Career Development,0,,,,[],0,0,0
8477,KIN 659,Wearable Technology,0,,,,[],0,0,0


### And here's an example of what a review looks like:

In [4]:
# First review of the first course: a dict holding the review text, the
# textual rating and its integer encoding.
df_courses['reviews'].iloc[0][0]

{'review_text': 'go to office hours and practice',
 'course_rating': 'liked course',
 'course_rating_int': 1}

Quick preprocessing to extract reviews and labels

In [5]:
# Flatten the nested per-course review lists into one row per review.
# Each row keeps the owning course code, the raw review text and the integer
# rating (`course_rating_int`); reviews without a rating keep a missing label
# here so that nothing is silently discarded at this stage.
review_rows = [
    (course_code, review['review_text'], review['course_rating_int'])
    for course_code, reviews in zip(df_courses['course_code'], df_courses['reviews'])
    for review in reviews
]

# Explicit column names keep the schema stable even when there are no reviews.
df_reviews = pd.DataFrame(review_rows, columns=['course_code', 'review', 'label'])

In [6]:
# Inspect the flattened reviews; note that some rows (see the tail of the
# rendered table) have no label.
df_reviews

Unnamed: 0,course_code,review,label
0,CS 115,go to office hours and practice,1.0
1,CS 115,One of my least favourite courses. Although th...,0.0
2,CS 115,It starts with a very low pace but after midte...,0.0
3,CS 115,Took this in 2018 with no programming experien...,1.0
4,CS 115,I loved everything about cs 115. Great instruc...,1.0
...,...,...,...
14833,ASL 102R,Took this online with Georgia Whalen. As it wa...,1.0
14834,CS 136L,I either designed or helped design the majorit...,1.0
14835,AFM 417,Topic 3: Intro to Data Analytics,
14836,ECON 472,Not sure if I liked this course yet. Definitel...,


In [14]:
# Only reviews that have both text and a rating can be used for supervised
# training, so rows with a missing value in either column are dropped first.
df_labeled = df_reviews.dropna(subset=['review', 'label'])

# Features are the review texts; targets are the integer ratings.
# (The target must be the `label` column, not the review text itself.)
# The label column is float because of the missing values, hence the int cast.
X_train, X_test, y_train, y_test = train_test_split(
    df_labeled['review'].tolist(),
    df_labeled['label'].astype(int).tolist(),
    test_size=0.33,
    random_state=6,  # fixed seed so the split is reproducible
)

## Training

In [8]:
import tensorflow as tf
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForTokenClassification, create_optimizer
from transformers import TFDistilBertForSequenceClassification

In [9]:
# Tokenizer matching the DistilBERT checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
)

In [16]:
# Tokenize each split into padded, truncated TensorFlow tensors.
# Train is encoded first, then test, each in its own tokenizer call.
X_train_tokenized, X_test_tokenized = (
    tokenizer(texts, return_tensors="tf", padding=True, truncation=True)
    for texts in (X_train, X_test)
)

# Pair every encoded example with its target so Keras can consume
# (features, label) tuples directly.
train_ds, test_ds = (
    tf.data.Dataset.from_tensor_slices((dict(encodings), targets))
    for encodings, targets in (
        (X_train_tokenized, y_train),
        (X_test_tokenized, y_test),
    )
)

In [None]:
# Hyperparameters and constants for fine-tuning.

# Checkpoint identifier and maximum sequence length
# (neither is referenced by the cells visible here).
model_name = "distilbert-base-uncased"
max_length = 96

# Training loop settings.
epochs = 5
batch_size = 32

# Optimizer settings.
learning_rate = 2e-5
weight_decay_rate = 0.01

In [None]:
# Build an AdamW-style optimizer with a learning-rate schedule via
# transformers' `create_optimizer`.
#
# The schedule needs the total number of optimizer steps, which is the number
# of batches per epoch (rounded up, since the last batch may be partial)
# times the number of epochs.
steps_per_epoch = -(-len(X_train) // batch_size)  # ceiling division
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = 0  # no warmup by default; raise this to warm up the learning rate

# NOTE(review): the compile cell below constructs its own Adam optimizer, so
# this optimizer only takes effect if it is passed to `model.compile`.
optimizer, lr_schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay_rate,
    num_warmup_steps=num_warmup_steps,
)

In [11]:
# Pretrained DistilBERT encoder with a fresh two-class classification head.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

2022-09-20 21:48:35.023074: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-20 21:48:35.036253: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_projector', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model fro

In [45]:
# Configure training.
# - The targets are integer class ids (0/1), not one-hot vectors, so the
#   sparse variant of categorical cross-entropy is required.
# - The model outputs raw logits, hence `from_logits=True`.
# - `learning_rate` replaces the deprecated `lr` keyword.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [46]:
# Smoke test: encode one sentence and run it through the model to confirm
# that a forward pass yields one logit per class.
sample_input_ids = tokenizer.encode(
    "how are you",
    truncation=True,
    padding=True,
    return_tensors="tf",
)
model.predict(sample_input_ids)



TFSequenceClassifierOutput(loss=None, logits=array([[-0.11055946,  0.04798852]], dtype=float32), hidden_states=None, attentions=None)

In [47]:
# Fine-tune on the batched training set; the held-out test split doubles as
# the validation set evaluated after every epoch.
train_batches = train_ds.batch(batch_size)
validation_batches = test_ds.batch(batch_size)

model.fit(
    train_batches,
    validation_data=validation_batches,
    epochs=epochs,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f52c435a460>