In [1]:
import pandas as pd

# Use BERT for embeddings: tokenize headlines into input IDs and attention masks

In [35]:
# Read the sarcasm headline dataset from the project's data directory.
cleaned_df = pd.read_csv(filepath_or_buffer='../data/sarcasm_dataset.csv')

In [4]:
# Preview the first five rows.
cleaned_df.head(n=5)

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [5]:
!pip install transformers
!pip install torch



In [6]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the `bert-base-uncased` checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Sanity check: encode the first headline and show the ids next to their tokens.
sample = cleaned_df['headline'][0]
encoding = tokenizer.encode(text=sample)
print(f"Sample: {sample}")
print(encoding)
print(tokenizer.convert_ids_to_tokens(ids=encoding))

Sample: thirtysomething scientists unveil doomsday clock of hair loss
[101, 4228, 14045, 20744, 6529, 4895, 3726, 4014, 12677, 16150, 4710, 5119, 1997, 2606, 3279, 102]
['[CLS]', 'thirty', '##some', '##thing', 'scientists', 'un', '##ve', '##il', 'doom', '##sd', '##ay', 'clock', 'of', 'hair', 'loss', '[SEP]']


In [11]:
!pip install tqdm



In [12]:
import tqdm

def create_input_features(tokenizer, sentences, max_len):
    """Tokenize each sentence into fixed-length model inputs.

    Args:
        tokenizer: Hugging Face tokenizer; it is called directly on each sentence.
        sentences: Iterable of strings to encode (e.g. a pandas Series of headlines).
        max_len: Target length; every sequence is padded (``padding='max_length'``)
            or truncated (``truncation=True``) to this length.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of two parallel lists. Each list
        holds one entry per sentence: the value stored under ``'input_ids'`` and
        ``'attention_mask'`` respectively in the tokenizer output, requested as
        PyTorch tensors (``return_tensors='pt'``).
    """
    input_ids = []
    attention_masks = []
    for sentence in tqdm.tqdm(sentences, desc="Converting docs to features"):
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same arguments for a single sentence.
        inputs = tokenizer(
            sentence,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(inputs['input_ids'])
        attention_masks.append(inputs['attention_mask'])
    return input_ids, attention_masks

In [17]:
# Disabled: featurizes every headline in cleaned_df. Further down, features are
# built from X_train['headline'] only, after the train/test split.
# input_ids, attn_masks = create_input_features(tokenizer, cleaned_df['headline'], 15)

Converting docs to features: 100%|██████████| 28503/28503 [00:05<00:00, 5326.13it/s]


# Train/test split

In [20]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl (11.1 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl (30.3 MB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 scipy-1.13.1 threadpoolctl-3.5.0


In [21]:
from sklearn.model_selection import train_test_split

# Labels are the target column; features are every remaining column.
y = cleaned_df['is_sarcastic']
X = cleaned_df.drop('is_sarcastic', axis=1)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
(X_train.shape, X_test.shape)

((19952, 1), (8551, 1))

In [27]:
# Persist the split to disk: the two feature frames first, then their labels.
for _csv_path, _split in [
    ('../data/train.csv', X_train),
    ('../data/test.csv', X_test),
    ('../data/train_labels.csv', y_train),
    ('../data/test_labels.csv', y_test),
]:
    # index=False keeps the row index out of the written file.
    _split.to_csv(_csv_path, index=False)

In [28]:
# Build token ids and attention masks for the training headlines only,
# padding/truncating every sequence to a length of 15.
input_ids, attn_masks = create_input_features(
    tokenizer=tokenizer,
    sentences=X_train['headline'],
    max_len=15,
)

Converting docs to features: 100%|██████████| 19952/19952 [00:03<00:00, 5382.77it/s]


In [31]:
import torch

# Serialize the token-id list and the attention-mask list for later reuse.
torch.save(obj=input_ids, f='../data/embeddings/bert_input_ids')
torch.save(obj=attn_masks, f='../data/embeddings/bert_attention_masks')

In [None]:
# Read the saved attention masks back from disk; the result is the cell's output.
torch.load(f='../data/embeddings/bert_attention_masks')