In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df_courses = pd.read_json('./data/processed_data/course_data_clean.json')

### Here's what the data looks like:

In [10]:
df_courses

Unnamed: 0,course_code,course_title,num_ratings,useful,easy,liked,reviews,num_reviews,num_reviews_with_rating,good_course
0,CS 115,Introduction to Computer Science 1,2111,21%,10%,23%,[{'review_text': 'go to office hours and pract...,86,83,0
1,MATH 135,Algebra for Honours Mathematics,1186,84%,41%,78%,"[{'review_text': 'Welcome to Waterloo Math.', ...",253,250,1
2,ECON 101,Introduction to Microeconomics,1143,64%,70%,45%,[{'review_text': 'Took it online in W21 during...,214,210,0
3,PSYCH 101,Introductory Psychology,899,73%,67%,79%,"[{'review_text': 'Really easy, the course was ...",8,8,1
4,MATH 137,Calculus 1 for Honours Mathematics,780,86%,56%,69%,"[{'review_text': 'and then isaac newton said ""...",171,167,1
...,...,...,...,...,...,...,...,...,...,...
8474,BE 680,Consulting,0,,,,[],0,0,0
8475,KIN 658,Physical Activity and Cognition,0,,,,[],0,0,0
8476,ERS 620,Skills Identification and Career Development,0,,,,[],0,0,0
8477,KIN 659,Wearable Technology,0,,,,[],0,0,0


### And here's an example of what a review looks like:

In [11]:
df_courses['reviews'].values[0][0]

{'review_text': 'go to office hours and practice',
 'course_rating': 'liked course',
 'course_rating_int': 1}

Quick preprocessing to extract reviews and labels

In [None]:
df_reviews = {'course_code': [], 'review': [], 'label': []}
for course in df_courses[['course_code', 'reviews']].values:
    course_code = course[0]
    reviews = course[1]
    retval = []
    for review in reviews:
        review_text = review['review_text']
        label = review['course_rating_int']
        df_reviews['course_code'].append(course_code)
        df_reviews['review'].append(review_text)
        df_reviews['label'].append(label)
        
df_reviews = pd.DataFrame(df_reviews)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(list(df_reviews['review'].values), list(df_reviews['review'].values), test_size=0.33, random_state=6)

## HF Trainer

In [13]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
from torch import nn
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm
2022-09-19 22:31:23.609181: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-19 22:31:23.753520: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-19 22:31:24.356353: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-09-19 22:31:24.356452: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic l

In [14]:
dataset = {
    'train': Dataset.from_dict({'label':y_train,'text':X_train}),
    'test': Dataset.from_dict({'label':y_test,'text':X_test})
}

dataset = DatasetDict(dataset)

In [15]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [16]:
def tokenize_text(texts):
    return tokenizer(texts["text"], truncation=True, max_length=512)

In [17]:
dataset = dataset.map(tokenize_text, batched=True)

100%|███████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 41.38ba/s]
100%|███████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 54.44ba/s]


In [18]:
id2label = {0: "bad course", 1: "good course"}
label2id = {"bad course": 0, "good course": 1}

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

In [20]:
from sklearn.metrics import f1_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    return {"f1": f1}

In [21]:
batch_size = 42
logging_steps = len(X_train) // batch_size
output_dir = "hf_trainer"
training_args = TrainingArguments(
    output_dir=output_dir,
     num_train_epochs=5,
     learning_rate=2e-5,
     per_device_train_batch_size=batch_size,
     per_device_eval_batch_size=batch_size,
     weight_decay=0.01,
     evaluation_strategy="epoch",
     logging_steps=logging_steps,
     #p16=True,
     push_to_hub=False
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer
)

In [23]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4318
  Num Epochs = 5
  Instantaneous batch size per device = 42
  Total train batch size (w. parallel, distributed & accumulation) = 84
  Gradient Accumulation steps = 1
  Total optimization steps = 260
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.629812,0.547225
2,0.636900,0.629814,0.547225
3,0.636900,0.632243,0.547225
4,0.620400,0.636822,0.547225
5,0.620400,0.643361,0.546545


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2127
  Batch size = 84
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2127
  Batch size = 84
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 

TrainOutput(global_step=260, training_loss=0.6207903495201698, metrics={'train_runtime': 96.6586, 'train_samples_per_second': 223.363, 'train_steps_per_second': 2.69, 'total_flos': 1498900180289976.0, 'train_loss': 0.6207903495201698, 'epoch': 5.0})