In [1]:
# Install simpletransformers; the training cell imports ClassificationModel from it.
# NOTE(review): no version is pinned here, so the installed release is whatever pip resolves at run time.
! pip install simpletransformers



In [9]:
import pandas as pd
import numpy as np
import os.path
from os import path
from google.colab import drive

def replace_labels_int(dataset_labels):
  """Encode stance names as integer class ids, modifying the input in place.

  Mapping: 'agree' -> 0, 'disagree' -> 1, 'discuss' -> 2, 'unrelated' -> 3.

  Args:
    dataset_labels: pandas DataFrame or Series. Every cell whose value is
      exactly one of the four stance names is replaced, in whichever column
      it occurs; all other cells are left untouched.

  Returns:
    None. The object passed in is mutated.
  """
  # `inplace` must be passed by keyword: newer pandas releases make it
  # keyword-only, so the old positional form `replace(a, b, True)` raises
  # TypeError there. A single dict call also replaces four separate passes.
  dataset_labels.replace(
      {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3},
      inplace=True,
  )

def replace_labels_str(dataset_labels):
  """Decode integer class ids back to stance names, modifying the input in place.

  Mapping: 0 -> 'agree', 1 -> 'disagree', 2 -> 'discuss', 3 -> 'unrelated'.

  Args:
    dataset_labels: pandas DataFrame or Series. Every cell equal to 0, 1, 2
      or 3 is replaced, in whichever column it occurs, so pass only the
      label column if other numeric columns must be preserved.

  Returns:
    None. The object passed in is mutated.
  """
  # `inplace` must be passed by keyword: newer pandas releases make it
  # keyword-only, so the old positional form `replace(a, b, True)` raises
  # TypeError there. A single dict call also replaces four separate passes.
  dataset_labels.replace(
      {0: 'agree', 1: 'disagree', 2: 'discuss', 3: 'unrelated'},
      inplace=True,
  )

drive.mount('/content/gdrive')
DATASET_LOCATION = '/content/gdrive/MyDrive/Colab Notebooks/msci598_final_project_data/'


def _load_or_build_dataset(cache_name, bodies_name, stances_name):
  """Return the joined stance/body table, using the cached CSV when present.

  If `DATASET_LOCATION/data/<cache_name>` exists it is read and returned.
  Otherwise the raw bodies and stances CSVs under `DATASET_LOCATION/raw/` are
  read, stance names are converted to integer ids with `replace_labels_int`,
  the bodies are joined onto the stances by 'Body ID', and the result is
  written to the cache path before being returned.

  Args:
    cache_name: file name of the cached, already-joined CSV inside `data/`.
    bodies_name: file name of the raw bodies CSV inside `raw/`.
    stances_name: file name of the raw stances CSV inside `raw/`.

  Returns:
    pandas DataFrame with the stance columns plus the joined body columns.
    On a cache hit the frame also carries the index column that `to_csv`
    wrote when the cache was created.
  """
  cache_path = DATASET_LOCATION + 'data/' + cache_name
  if path.exists(cache_path):
    return pd.read_csv(cache_path)

  bodies = pd.read_csv(DATASET_LOCATION + 'raw/' + bodies_name)
  stances = pd.read_csv(DATASET_LOCATION + 'raw/' + stances_name)
  replace_labels_int(stances)
  joined = stances.join(bodies.set_index('Body ID'), on='Body ID')

  # The cache directory may not exist on a fresh Drive; without this the
  # open() below fails and the freshly built table is lost.
  os.makedirs(path.dirname(cache_path), exist_ok=True)
  with open(cache_path, 'w', encoding = 'utf-8-sig') as f:
    joined.to_csv(f)
  return joined


def _to_model_frame(dataset_raw):
  """Select headline, body and stance as the text_a / text_b / labels columns."""
  return pd.DataFrame({
      'text_a': dataset_raw['Headline'],
      'text_b': dataset_raw['articleBody'],
      'labels': dataset_raw['Stance'],
  })


# Read in train set
train_dataset_raw = _load_or_build_dataset(
    'train_dataset.csv', 'train_bodies.csv', 'train_stances.csv')
train_data = _to_model_frame(train_dataset_raw)

# Read in test set
test_dataset_raw = _load_or_build_dataset(
    'test_dataset.csv', 'competition_test_bodies.csv', 'competition_test_stances.csv')
test_data = _to_model_frame(test_dataset_raw)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
# Training cell: fine-tunes a 4-label RoBERTa classifier on `train_data`.
from simpletransformers.classification import ClassificationModel

model_type = 'roberta'

# Settings forwarded to ClassificationModel via `args`.
# NOTE(review): confirm each key is one simpletransformers recognises
# (in particular 'early_stopping') - this cell passes them through as given.
train_args = {
    'learning_rate':4e-5,
    'num_train_epochs': 3,
    'early_stopping': True,
    'max_seq_length': 256,
    'process_count': 8,
    'overwrite_output_dir': True,
    'lazy_text_a_column':0,
    'lazy_text_b_column':1,
    'lazy_labels_column':2,
}

model = ClassificationModel(model_type, 'roberta-base', num_labels=4, args=train_args)

model.train_model(train_data)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/49972 [00:01<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/6247 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/6247 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/6247 [00:00<?, ?it/s]

(18741, 0.2194854107206681)

In [11]:
model_type = 'roberta'
# Inputs for model.predict: one [headline, body] pair per test row, in row
# order. Pairing is positional (zip), so it does not depend on test_data
# having a 0..n-1 index the way per-row `test_data[col][i]` lookups did.
predicted = [
    [headline, body]
    for headline, body in zip(test_data['text_a'], test_data['text_b'])
]

def accuracy(predictions, labels=None):
  """Return the fraction of predictions that equal the true label.

  Predictions are compared with labels position by position.

  Args:
    predictions: sequence of predicted class ids.
    labels: sequence of true class ids, at least as long as `predictions`.
      Defaults to the module-level `test_data['labels']`.

  Returns:
    float in [0, 1]: matches / len(predictions). Returns 0.0 for empty
    `predictions` instead of dividing by zero.

  Raises:
    ValueError: if there are more predictions than labels.
  """
  if labels is None:
    labels = test_data['labels']

  total = len(predictions)
  if total == 0:
    return 0.0
  if total > len(labels):
    raise ValueError(
        'got {} predictions but only {} labels'.format(total, len(labels)))

  # zip walks both sequences positionally, so a Series with a non-default
  # index is compared in row order rather than by index label.
  matches = sum(
      1 for predicted_label, true_label in zip(predictions, labels)
      if predicted_label == true_label
  )
  return matches / total

def save_predictions(predictions, model_type):
  """Write predictions to `data/answers/<model_type>.csv` with stance names.

  The output has the 'Headline' and 'Body ID' columns of the module-level
  `test_dataset_raw` plus a 'Stance' column holding the predicted label
  converted from integer id to stance name via `replace_labels_str`.

  Args:
    predictions: sequence of integer class ids, one per row of
      `test_dataset_raw` and in the same order.
    model_type: string used as the output file's base name.

  Returns:
    None. The CSV is written under `DATASET_LOCATION`.
  """
  # Convert ids to names on a standalone Series *before* it goes into the
  # frame. Calling an in-place method on `frame['Stance']` mutates a
  # temporary column object; under pandas Copy-on-Write that change never
  # reaches the frame, and integer ids would be written to the file.
  stance = pd.Series(list(predictions), index=test_dataset_raw.index)
  replace_labels_str(stance)

  preds_answer = pd.DataFrame({
      'Headline': test_dataset_raw['Headline'],
      'Body ID': test_dataset_raw['Body ID'],
      'Stance': stance,
  })

  answers_dir = DATASET_LOCATION + 'data/answers/'
  # Create the output directory on first use so open() cannot fail after
  # the (expensive) prediction step has already run.
  os.makedirs(answers_dir, exist_ok=True)
  with open(answers_dir + model_type + '.csv', 'w', encoding = 'utf-8-sig') as f:
    preds_answer.to_csv(f)

# Run the model over every [headline, body] pair built above.
predictions, raw_outputs = model.predict(predicted)

# Score against the test labels, persist the answers file, then report.
acc = accuracy(predictions)
save_predictions(predictions, model_type)
print(
    "{:s} Model accuracy: {:f}%".format(
        model_type,
        100 * acc,
    )
)


  0%|          | 0/25413 [00:00<?, ?it/s]

  0%|          | 0/3177 [00:00<?, ?it/s]

roberta Model accuracy: 91.425648%
