In [3]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging
prefix = 'data/'


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_df = pd.read_csv(prefix + 'train.txt', sep='\t', header=None) 
train_df = train_df[train_df.columns[1:3]]
train_df.columns = ['labels', 'text']
train_df = train_df[['text', 'labels']]

val_df = pd.read_csv(prefix + 'test.txt', sep='\t', header=None)
val_df = val_df[val_df.columns[1:3]]
val_df.columns = ['labels', 'text']
val_df = val_df[['text', 'labels']]

train_df

Unnamed: 0,text,labels
0,It feels like just a few days ago it was the l...,1
1,"I love my mom . No matter what we go through ,...",1
2,Bump that music ... #imtryingtosleep #sarcasm,1
3,Mexican and black jokes are pretty much the sa...,0
4,How to find work you love :,0
...,...,...
51184,RT My EX had one very annoying habit . Breathi...,1
51185,"Some days you're the Titanic , some days you'r...",1
51186,"Congrats on the release of 25 @Adele , let's h...",0
51187,doing my favorite thing .. crying #sarcasm,1


In [11]:
train_df = pd.read_csv(prefix + 'train_MUSTARD', sep='\t', header=None) 
val_df = pd.read_csv(prefix + 'val_MUSTARD', sep='\t', header=None)
train_df

Unnamed: 0,0
0,"Wow. I don't know which hurts worse-- my nose or my heart. Well, I'm done speaking to you. Don't be like that. You two need to talk this out. Yeah, 'cause you sound really funny. Sheldon, I'm sorry I didn't tell you about the surgery, but you were worried about nothing. Oh, you're hardly out of the woods, no. You still run the risk of infection, a blood clot, the possibility that an inattentive surgeon let a barn spider lay eggs in your nose. 0"
1,"When that guy was robbing us and I was locked in the entertainment unit for like six hours, do you know what I was doing there in all that time? I was thinking about how I let you down. Yeah. But if I had known what kind of friend you would turn out to be, I wouldn't have worried about it so much. See you around! We got a box. 0"
2,"consider changing disciplines.\nYeah, to the humanities-- perhaps history. One of the advantages of teaching history is that you don't have to create things. You know, you just have to remember stuff that happened and then parrot it back. You could have fun with that. Yeah, that's not it. Stuart's kind of interested in Amy. Oh, of course he is. She's very interesting. Did you know, when she was 14, she severed the webbing between her own toes? 1"
3,Still worried about the money you owe the government? I'm worried about whether Michael Jackson will be able to buy the remains of the elephant man. 1
4,Hey! That monkey has got a Ross on its ass! 1
...,...
547,"What are you doing? Key is stuck in the lock. I could fix it hold on hold on, watch out watch out. It still doesn't work! I am not finished. Nice jon Joe, you are quite the craftsman! 1"
548,"You liked it? You really liked it? Oh-ho-ho, yeah! Which part exactly? The whole thing! Can we go? 1"
549,"Supposed to study for my French final with a fourteen year old in the house, its hard enough with an eighty year old. Are you referring to me? Ofcourse no Ma, I am referring to Carrey Grant, he is living in the broom closet. 1"
550,"He doesn't need to be sarcastic. I mean, that was sarcasm wasn't it? Really? 1"


In [9]:
import json
import numpy as np
import torch
pd.options.display.max_colwidth = 1000
data = json.load(open('data/sarcasm_max512.json'))
sarcasm_data = [[' '.join([ *d['context'], d['utterance']]), int(d['sarcasm'])] for d in data.values()]
df = pd.DataFrame(sarcasm_data)
df.columns = ['labels', 'text']
train_df = df.sample(frac = 0.8)
val_df = df.drop(train_df.index)
with open('data/train_MUSTARD', 'a') as f:
    dfAsString = train_df.to_string(header=False, index=False)
    f.write(dfAsString)

with open('data/val_MUSTARD', 'a') as f:
    dfAsString = val_df.to_string(header=False, index=False)
    f.write(dfAsString)

In [65]:
pd.Series({c: df[c].map(lambda x: len(str(x))).max() for c in df}).sort_values(ascending =False)
mask = (df['labels'].str.len() > 512)
df[mask]

Unnamed: 0,labels,text


In [2]:
train_df = pd.read_csv('data/train_MUSTARD.csv')  
val_df = pd.read_csv('data/val_MUSTARD.csv')
train_df

Unnamed: 0,labels,text
0,And not the good Amazon with one-day shipping....,1
1,When are you going to stop making Cylon toast?...,1
2,Hi Joey. Hey! How you doin'? He has the most a...,0
3,"Uhh, well I've got an audition down the street...",0
4,Oh you've got to be kidding me. What? As a wed...,0
...,...,...
547,Pickin up wedding dresses. Wapah. Whats wapah ...,0
548,"The audible sigh is a show of exasperation, ri...",1
549,"So you like the drums! That's, that's great! Y...",1
550,What exactly do you think goes on here? Well.....,0


In [7]:

# Optional model configuration

# train_df = pd.read_csv('data/train_MUSTARD.csv')  
# val_df = pd.read_csv('data/val_MUSTARD.csv')

train_df = train_df.head(5)
val_df = val_df.head(5)
model_args = {
    'data_dir': 'data/',
    'output_dir': 'outputs/',
    'cache_dir': 'cache/',
    'do_train': True,
    'do_eval': True,
    'fp16': False,
    'fp16_opt_level': 'O1',
    'max_seq_length': 512,
    'output_mode': 'classification',
    'train_batch_size': 12,
    'eval_batch_size': 12,

    'gradient_accumulation_steps': 1,
    'num_train_epochs': 1,
    'weight_decay': 0,
    'learning_rate': 4e-5,
    'adam_epsilon': 1e-8,
    'warmup_ratio': 0.06,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,

    'logging_steps': 50,
    'evaluate_during_training': False,
    'save_steps': 1000,
    'eval_all_checkpoints': True,
    'overwrite_output_dir': False,
    'reprocess_input_data': True,
}



# Create a ClassificationModel
model = ClassificationModel(
    "roberta", "roberta-base", args=model_args, use_cuda=False
)

# Train the model
model.train_model(train_df)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(val_df)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

  0%|          | 0/5 [00:00<?, ?it/s]

ValueError: too many dimensions 'str'