In [2]:
!pip install jsonlines
!pip install transformers
!pip install tensorflow
!pip install torch

# import libraries
import os
import tensorflow as tf
import torch
import urllib.request, jsonlines
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt
import logging
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime 
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification

# Enumerate GPUs in PCI bus order and expose only device index 1 to CUDA.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

# Remember whether torch reports a usable CUDA device.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# AWS imports
import boto3, re, sys, math, json, os, sagemaker
from sagemaker import get_execution_role

# Region name reported by the default boto3 session of this instance
my_region = boto3.session.Session().region_name

# IAM execution role obtained from SageMaker
role = get_execution_role()

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.


In [39]:
# read in and save the train and test files
DATA_URL = "https://raw.githubusercontent.com/CS410Fall2020/ClassificationCompetition/main/data/"


def load_jsonl_frame(filename):
    """Download ``filename`` from DATA_URL and return its records as a DataFrame.

    The file is saved next to the notebook under the same name, parsed with
    jsonlines (one record per line) and returned without its 'context' column.

    If the download fails, the error is printed and a copy left behind by an
    earlier run is used instead. When no local copy exists the download error
    is re-raised, so the real cause is not hidden behind a later
    file-not-found error.
    """
    try:
        urllib.request.urlretrieve(DATA_URL + filename, filename)
        print('Success: downloaded {}.'.format(filename))
    except Exception as e:
        print('Data load error: ', e)
        if not os.path.exists(filename):
            raise

    with jsonlines.open('./' + filename) as reader:
        records = list(reader)

    # convert the records to a pandas dataframe and drop context
    return pd.DataFrame.from_dict(records).drop('context', axis=1)


train = load_jsonl_frame('train.jsonl')

# encode label as 1 for SARCASM and 0 for NOT_SARCASM
train['label'] = (train['label'] == 'SARCASM').astype(int)

# convert the pandas columns to lists
train_text = train['response'].tolist()
train_label = train['label'].tolist()

test = load_jsonl_frame('test.jsonl')

# convert the pandas column to a list
test_text = test['response'].tolist()

Success: downloaded train.jsonl.
Success: downloaded test.jsonl.


In [40]:
# split train into training and validation set: 80/20
from sklearn.model_selection import train_test_split

# a fixed seed makes the split (and therefore the reported validation scores)
# reproducible; stratify keeps the SARCASM / NOT_SARCASM ratio equal in both parts
train_texts, val_texts, train_labels, val_labels = train_test_split(train_text,
                                                                    train_label,
                                                                    test_size=.2,
                                                                    random_state=42,
                                                                    stratify=train_label)

# pass our texts to the tokenizer, truncation and padding ensure that all of our sequences are padded
# to the same length and are truncated to be no longer than the model's maximum input length
# this will allow us to feed batches of sequences into the model at the same time.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# create tensor input pipelines for train and validation
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))

# create the pretrained DistilBert model
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                              num_labels=2)

# use an Adam optimizer and fit the model
# shuffle = 100, batch size = 16, epochs = 10, learning rate = 5e-5 (all as per literature recommendation)
opt = tf.keras.optimizers.Adam(learning_rate=5e-5)
# the classification head returns two raw logits per example and the labels are
# integer class ids (0/1), so the loss and metric must be the sparse categorical
# variants operating on logits; BinaryCrossentropy would treat logits as probabilities
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
# stop as soon as val_loss gets worse and roll back to the best epoch's weights,
# otherwise the model used for prediction is the one from the worse final epoch
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1,
                                            restore_best_weights=True)
model.compile(optimizer=opt, loss=loss, metrics=[metrics])
# the datasets are already batched, so batch_size must not be passed to fit();
# the validation set is not shuffled because its metrics do not depend on order
model.fit(train_dataset.shuffle(100).batch(16), epochs=10,
          validation_data=val_dataset.batch(16), callbacks=[callback])

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_419', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Epoch 1/10
Epoch 2/10


<tensorflow.python.keras.callbacks.History at 0x7f3c97d35be0>

In [41]:
# tokenize the whole test set once and predict in batches; padding comes with an
# attention mask, so padded positions are ignored by the model. A single batched
# pass avoids the per-example overhead of calling model.predict() once per text.
test_encodings = tokenizer(test_text, truncation=True, padding=True)
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(16)
raw_output = model.predict(test_dataset)

# depending on the transformers version predict() returns a dict-like output
# with a 'logits' entry or a tuple whose first element holds the logits
if isinstance(raw_output, dict):
    logits = raw_output["logits"]
elif isinstance(raw_output, (tuple, list)):
    logits = raw_output[0]
else:
    logits = raw_output

# use softmax to turn the two logits per example into class probabilities
predictions = tf.nn.softmax(logits, axis=1).numpy()

# use a 0.5 threshold to classify as SARCASM or NOT_SARCASM
test['label'] = np.where(predictions[:, 1] > 0.5, 'SARCASM', 'NOT_SARCASM')

# save everything except the response column according to the specified style;
# `test` itself keeps its columns so this cell can be re-run without a KeyError
test.drop('response', axis=1).to_csv('answer.csv', header=False, index=False, sep=',', mode='w')