In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the per-course overview table (course code/title, rating count and the useful/easy/liked columns).
df_courses = pd.read_json('./data/course_sample_overviews.json')

In [3]:
# Display the freshly loaded overview table to inspect its columns and rows.
df_courses

Unnamed: 0,course_code,course_title,num_ratings,useful,easy,liked
0,CS 115,Introduction to Computer Science 1,2111,21%,10%,23%
1,ECON 101,Introduction to Microeconomics,1142,64%,70%,45%
2,MATH 137,Calculus 1 for Honours Mathematics,776,86%,56%,68%
3,MATH 115,Linear Algebra for Engineering,709,83%,42%,67%
4,MATH 136,Linear Algebra 1 for Honours Mathematics,670,79%,41%,59%
...,...,...,...,...,...,...
4235,HIST 703,The History of Global Governance,0,,,
4236,ECE 763,Sustainable Distributed Power Generation,0,,,
4237,BE 680,Consulting,0,,,
4238,ERS 620,Skills Identification and Career Development,0,,,


## Initial preprocessing

- Mostly temporary steps that will be deleted later: the web-scraping scripts will be updated so that this preprocessing becomes unnecessary.

---

In [4]:
# Load the scraped reviews file that get_reviews looks entries up in.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default, which is not UTF-8 on every OS
# and can garble or reject non-ASCII review text.
with open('./data/courses_sample.json', "r", encoding="utf-8") as f:
    reviews = json.load(f)

In [5]:
def get_reviews(row):
    """Return the review texts for one course as a NumPy array.

    Parameters
    ----------
    row : pandas.Series
        One row of the course table, as passed by
        ``DataFrame.apply(..., axis=1)``. Its first value (by position) is
        used as the lookup key into the module-level ``reviews`` object.

    Returns
    -------
    numpy.ndarray
        The ``'review'`` field of every entry stored under that key; an empty
        array when the course has no entries.

    Notes
    -----
    A failed lookup in ``reviews`` or an entry without a ``'review'`` field
    propagates the underlying exception unchanged.
    """
    # Explicit positional access: plain ``row[0]`` on a label-indexed Series
    # relies on a positional fallback that pandas has deprecated.
    course_code = row.iloc[0]
    return np.array([entry['review'] for entry in reviews[course_code]])

In [6]:
# Attach each course's review texts (looked up row by row via get_reviews) as a new 'reviews' column.
df_courses['reviews'] = df_courses.apply(get_reviews, axis=1)

In [7]:
# Inspect the table now that the 'reviews' column has been added.
df_courses

Unnamed: 0,course_code,course_title,num_ratings,useful,easy,liked,reviews
0,CS 115,Introduction to Computer Science 1,2111,21%,10%,23%,"[go to office hours and practice, One of my le..."
1,ECON 101,Introduction to Microeconomics,1142,64%,70%,45%,"[Took it online in W21 during COVID, concepts ..."
2,MATH 137,Calculus 1 for Honours Mathematics,776,86%,56%,68%,"[and then isaac newton said ""it's calculating ..."
3,MATH 115,Linear Algebra for Engineering,709,83%,42%,67%,[you really have to do assignments and practic...
4,MATH 136,Linear Algebra 1 for Honours Mathematics,670,79%,41%,59%,"[Interesting course, just don't take it with F..."
...,...,...,...,...,...,...,...
4235,HIST 703,The History of Global Governance,0,,,,[]
4236,ECE 763,Sustainable Distributed Power Generation,0,,,,[]
4237,BE 680,Consulting,0,,,,[]
4238,ERS 620,Skills Identification and Career Development,0,,,,[]


In [8]:
def good_course(row, col_name='liked', threshold=50):
    """Label a course as good (1) or not good (0) from a percentage column.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row holding the rating columns; ``row[col_name]`` is normally a
        percentage string such as ``'68%'``.
    col_name : str, default 'liked'
        Column whose percentage is compared against ``threshold``.
    threshold : int or float, default 50
        The course counts as good only when its percentage is strictly
        greater than this value.

    Returns
    -------
    int
        ``1`` when the percentage exceeds ``threshold``; ``0`` otherwise,
        including when the rating is missing or cannot be parsed.
    """
    percent = row[col_name]
    if pd.isna(percent):
        # Missing ratings may arrive as None/NaN rather than '': treat them
        # as "no rating" instead of failing on ``.replace``.
        return 0

    text = str(percent).replace('%', '').strip()
    # ``str.isnumeric`` also accepts characters such as '²' or '½' that the
    # numeric constructors reject, so validate with ``isdecimal`` while
    # allowing at most one decimal point (e.g. '50.5').
    if not text.replace('.', '', 1).isdecimal():
        # If no (valid) rating, return 0 (not a good course)
        return 0

    return 1 if float(text) > threshold else 0

In [9]:
# Preview the good/not-good labels using the defaults (col_name='liked', threshold=50); the result is only displayed here, not stored.
df_courses.apply(good_course, axis=1)

0       0
1       0
2       1
3       1
4       1
       ..
4235    0
4236    0
4237    0
4238    0
4239    0
Length: 4240, dtype: int64

In [10]:
# Number of review texts per course: the length of each array in 'reviews'.
df_courses['num_reviews'] = df_courses['reviews'].map(len)

## Temporary:

Need to retrieve the rating from each specific review somehow; it is not immediately clear how to do that yet based on the website layout, so for now we just generate fake data to show how testing will work.

In [11]:
def tmp(row, rng=None):
    """Generate placeholder (fake) like/dislike labels for one course's reviews.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row providing ``'num_reviews'``, the number of labels to draw.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Pass a seeded generator to make the fake labels
        reproducible across runs. When omitted, NumPy's global random state
        is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_reviews,)`` containing 0s and 1s, where 1 is
        drawn with probability 2/3.
    """
    num_reviews = row['num_reviews']
    # Both the ``np.random`` module and Generator/RandomState objects expose
    # a compatible ``choice`` method.
    sampler = np.random if rng is None else rng
    review_likes = sampler.choice([0, 1], size=(num_reviews,), p=[1./3, 2./3])
    return review_likes

In [12]:
# Store the placeholder per-review labels generated by tmp in the 'review_likes' column.
df_courses['review_likes'] = df_courses.apply(tmp, axis=1)

In [13]:
# Scratch check of the sampler used in tmp: ten draws from {0, 1} where 1 has probability 2/3.
np.random.choice([0, 1], size=(10,), p=[1./3, 2./3])

array([1, 1, 1, 0, 0, 1, 0, 1, 1, 1])

In [14]:
# Store the per-course 0/1 label computed by good_course (default 'liked' column, threshold 50).
df_courses['good_course'] = df_courses.apply(good_course, axis=1)

In [15]:
# Inspect the table with the added 'num_reviews', 'review_likes' and 'good_course' columns.
df_courses

Unnamed: 0,course_code,course_title,num_ratings,useful,easy,liked,reviews,num_reviews,review_likes,good_course
0,CS 115,Introduction to Computer Science 1,2111,21%,10%,23%,"[go to office hours and practice, One of my le...",86,"[1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, ...",0
1,ECON 101,Introduction to Microeconomics,1142,64%,70%,45%,"[Took it online in W21 during COVID, concepts ...",214,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, ...",0
2,MATH 137,Calculus 1 for Honours Mathematics,776,86%,56%,68%,"[and then isaac newton said ""it's calculating ...",171,"[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, ...",1
3,MATH 115,Linear Algebra for Engineering,709,83%,42%,67%,[you really have to do assignments and practic...,136,"[1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, ...",1
4,MATH 136,Linear Algebra 1 for Honours Mathematics,670,79%,41%,59%,"[Interesting course, just don't take it with F...",132,"[1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, ...",1
...,...,...,...,...,...,...,...,...,...,...
4235,HIST 703,The History of Global Governance,0,,,,[],0,[],0
4236,ECE 763,Sustainable Distributed Power Generation,0,,,,[],0,[],0
4237,BE 680,Consulting,0,,,,[],0,[],0
4238,ERS 620,Skills Identification and Career Development,0,,,,[],0,[],0


## Completed initial preprocessing!

---

In [16]:
# Keep only courses that have at least 5 ratings and at least 5 review texts.
has_enough_ratings = df_courses['num_ratings'] >= 5
has_enough_reviews = df_courses['num_reviews'] >= 5
subset = df_courses.loc[has_enough_ratings & has_enough_reviews]

In [17]:
# Inspect the filtered courses.
subset

Unnamed: 0,course_code,course_title,num_ratings,useful,easy,liked,reviews,num_reviews,review_likes,good_course
0,CS 115,Introduction to Computer Science 1,2111,21%,10%,23%,"[go to office hours and practice, One of my le...",86,"[1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, ...",0
1,ECON 101,Introduction to Microeconomics,1142,64%,70%,45%,"[Took it online in W21 during COVID, concepts ...",214,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, ...",0
2,MATH 137,Calculus 1 for Honours Mathematics,776,86%,56%,68%,"[and then isaac newton said ""it's calculating ...",171,"[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, ...",1
3,MATH 115,Linear Algebra for Engineering,709,83%,42%,67%,[you really have to do assignments and practic...,136,"[1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, ...",1
4,MATH 136,Linear Algebra 1 for Honours Mathematics,670,79%,41%,59%,"[Interesting course, just don't take it with F...",132,"[1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, ...",1
...,...,...,...,...,...,...,...,...,...,...
539,ME 548,Numerical Control of Machine Tools 1,9,81%,78%,89%,[Kaan Erkorkmaaz and his teaching team are ama...,5,"[1, 1, 1, 1, 0]",1
541,MTE 544,Autonomous Mobile Robots,9,94%,6%,67%,[This course has been reworked as of Fall 2021...,5,"[1, 0, 1, 0, 1]",1
623,REC 251,Therapeutic Recreation: Developmental and Emot...,7,57%,46%,57%,"[I like the content for sure, it is informativ...",5,"[1, 1, 0, 1, 1]",1
670,MTE 545,Introduction to MEMS Fabrication,6,95%,40%,67%,[Took this one online during COVID. Kind of li...,5,"[0, 1, 0, 1, 1]",1


In [18]:
# Fraction of the filtered courses labelled as good (the mean of a 0/1 column).
subset['good_course'].mean()

0.7781065088757396

In [19]:
# Flatten the per-course arrays so there is one entry per individual review.
all_reviews = list(np.concatenate(subset['reviews'].values).flat)
likes = list(np.concatenate(subset['review_likes'].values).flat)

# One-hot encode the 0/1 labels: row i of the 2x2 identity matrix encodes class i.
oh_likes = np.eye(2)[likes]

X_train, X_test, y_train, y_test = train_test_split(
    all_reviews,
    oh_likes,
    test_size=0.33,
    random_state=6,
)

### Ensure that the test set is representative of the training set

In [20]:
# Per-class share of the one-hot labels in each split (column means).
y_train.mean(axis=0), y_test.mean(axis=0)

(array([0.33997221, 0.66002779]), array([0.33850494, 0.66149506]))

## Training

In [1]:
import tensorflow as tf
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForTokenClassification, create_optimizer
from transformers import TFDistilBertForSequenceClassification

ModuleNotFoundError: No module named 'tensorflow.python.keras'

In [21]:
# Load the tokenizer for the 'distilbert-base-uncased' checkpoint (the same checkpoint name the model is loaded from further down).
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

NameError: name 'AutoTokenizer' is not defined

In [None]:
def batch_encode(tokenizer, texts, batch_size=96, max_length=128):
    """Tokenize ``texts`` in chunks and return dense id / attention-mask tensors.

    Parameters
    ----------
    tokenizer
        Object exposing ``batch_encode_plus`` that returns a mapping with
        ``'input_ids'`` and ``'attention_mask'`` entries.
    texts : sequence of str
        Texts to encode; it is sliced into chunks of ``batch_size``.
    batch_size : int, default 96
        Number of texts passed to the tokenizer per call.
    max_length : int, default 128
        Every encoded row is truncated or padded to exactly this length.

    Returns
    -------
    tuple
        ``(input_ids, attention_mask)``, each the result of
        ``tf.convert_to_tensor`` over the rows collected from all chunks.
    """
    input_ids = []
    attention_mask = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer.batch_encode_plus(
            batch,
            max_length=max_length,
            # Pad to a fixed width. 'longest' pads only to the longest text in
            # *this* chunk, so different chunks can end up with different
            # widths and the combined rows cannot form a rectangular tensor.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False
        )
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

In [None]:
# Encode both splits with batch_encode using its defaults (batch_size=96, max_length=128).
# NOTE(review): these four tensors are not referenced by the later cells shown here — confirm whether they are still needed.
X_train_ids, X_train_attention = batch_encode(tokenizer, X_train)
X_test_ids, X_test_attention = batch_encode(tokenizer, X_test)

In [None]:
# Check the shape of the one-hot training labels.
y_train.shape

In [None]:
# Tokenize each split in one call, padding and truncating, and returning TensorFlow tensors.
tokenize_kwargs = dict(return_tensors="tf", padding=True, truncation=True)
X_train_tokenized = tokenizer(X_train, **tokenize_kwargs)
X_test_tokenized = tokenizer(X_test, **tokenize_kwargs)

# Pair the tokenizer outputs (as plain dicts) with the one-hot labels, one dataset element per review.
train_features = dict(X_train_tokenized)
train_ds = tf.data.Dataset.from_tensor_slices((train_features, y_train))

test_features = dict(X_test_tokenized)
test_ds = tf.data.Dataset.from_tensor_slices((test_features, y_test))

In [None]:
# Set hyperparams/constants
model_name = "distilbert-base-uncased"
max_length = 96
epochs = 5
batch_size = 32
learning_rate = 2e-5
weight_decay_rate=0.01
num_warmup_steps = 0
# One optimizer step is taken per batch, not per sample, so the total number
# of training steps is (batches per epoch) * epochs. Ceiling division counts
# the final, possibly partial, batch.
batches_per_epoch = (len(X_train) + batch_size - 1) // batch_size
num_train_steps = batches_per_epoch * epochs

In [None]:
# trying create_optimizer
optimizer, _ = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay_rate,
    num_warmup_steps=num_warmup_steps
)

In [None]:
# Load DistilBERT with a 2-label classification head, using the checkpoint name from the hyperparameter cell.
model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Display the tokenizer object to inspect it.
tokenizer

In [None]:
# `learning_rate` is the supported keyword for Keras optimizers; `lr` is a
# deprecated alias that newer Keras versions reject. The value comes from the
# hyperparameter cell instead of repeating the literal 2e-5.
# The labels are one-hot encoded, hence CategoricalCrossentropy.
# NOTE(review): from_logits=True assumes the model outputs raw logits — confirm.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

In [None]:
# Smoke test: encode one short sentence and run it through the model.
sample_input_ids = tokenizer.encode(
    "how are you",
    truncation=True,
    padding=True,
    return_tensors="tf",
)
model.predict(sample_input_ids)

In [None]:
# The datasets are batched here, so `batch_size` is not also passed to fit():
# Keras does not accept it for tf.data inputs. `steps_per_epoch` is left at
# its default so each epoch iterates over the finite training dataset exactly
# once, instead of asking for more batches than the dataset contains.
model.fit(
    train_ds.batch(batch_size),
    epochs=epochs,
    validation_data=test_ds.batch(batch_size)
)

ERROR! Session/line number was not unique in database. History logging moved to new session 677
