In [25]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import string

In [26]:
# Load the sarcasm dataset from a CSV file resolved relative to the current working directory.
df = pd.read_csv('sarcasm_data.csv')

In [27]:
# Preview the first rows of the freshly loaded data to check the column layout.
df.head()

Unnamed: 0,id,utterance,speaker,context,context_speakers,show,sarcasm
0,1_60,It's just a privilege to watch your mind at work.,SHELDON,I never would have identified the fingerprints...,LEONARD|SHELDON,BBT,True
1,1_70,I don't think I'll be able to stop thinking ab...,PENNY,This is one of my favorite places to kick back...,HOWARD|PENNY|HOWARD|HOWARD|HOWARD|PENNY|HOWARD,BBT,True
2,1_80,"Since it's not bee season, you can have my epi...",SHELDON,"Here we go. Pad thai, no peanuts.|But does it ...",LEONARD|HOWARD|LEONARD,BBT,False
3,1_90,"Lois Lane is falling, accelerating at an initi...",SHELDON,A marathon? How many Superman movies are there...,PENNY|SHELDON|PENNY|SHELDON|SHELDON|PENNY|SHELDON,BBT,False
4,1_105,I'm just inferring this is a couch because the...,SHELDON,"Great Caesar's ghost, look at this place.|So P...",SHELDON|LEONARD|SHELDON|SHELDON|SHELDON|SHELDON,BBT,True


In [28]:
# For Columns -> Utterances & Speaker
def _english_stop_words():
    """Return NLTK's English stop-word set, building it only on first use."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def clean_text_cols12(text):
    """Normalise one cell of the `utterance` / `speaker` columns.

    Steps, in order: lowercase, strip every character in
    `string.punctuation`, drop digits, collapse whitespace, and remove
    NLTK English stop words.

    Args:
        text: Raw cell value. Missing values (None / NaN) are accepted.

    Returns:
        The cleaned text as one space-separated string; "" for missing input.
    """
    # Missing CSV cells reach `.apply` as None/NaN, which have no `.lower()`;
    # treat them as empty text instead of raising AttributeError.
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ""
        text = str(text)

    # Convert all text to lowercase
    text = text.lower()

    # Remove all punctuation. `string.punctuation` already contains "'" and
    # ".", so no separate apostrophe / full-stop pass is needed.
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove all digits
    text = re.sub(r"\d+", "", text)

    # Remove all extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Remove all stop words. The set is cached so it is not rebuilt per row.
    # NOTE(review): punctuation is stripped first, so a contraction such as
    # "don't" is compared as "dont" and may survive this filter.
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# | Stop Word | Punctuation | Lowercase | 
# Run the text cleaner element-wise over both free-text columns.
df['utterance'] = df['utterance'].map(clean_text_cols12)
df['speaker'] = df['speaker'].map(clean_text_cols12)

In [29]:
# Preview the frame after cleaning `utterance` and `speaker`.
df.head()

Unnamed: 0,id,utterance,speaker,context,context_speakers,show,sarcasm
0,1_60,privilege watch mind work,sheldon,I never would have identified the fingerprints...,LEONARD|SHELDON,BBT,True
1,1_70,dont think ill able stop thinking,penny,This is one of my favorite places to kick back...,HOWARD|PENNY|HOWARD|HOWARD|HOWARD|PENNY|HOWARD,BBT,True
2,1_80,since bee season epinephrine,sheldon,"Here we go. Pad thai, no peanuts.|But does it ...",LEONARD|HOWARD|LEONARD,BBT,False
3,1_90,lois lane falling accelerating initial rate fe...,sheldon,A marathon? How many Superman movies are there...,PENNY|SHELDON|PENNY|SHELDON|SHELDON|PENNY|SHELDON,BBT,False
4,1_105,im inferring couch evidence suggests coffee ta...,sheldon,"Great Caesar's ghost, look at this place.|So P...",SHELDON|LEONARD|SHELDON|SHELDON|SHELDON|SHELDON,BBT,True


In [30]:
from nltk.tokenize import word_tokenize

In [31]:
# df['utterance'] = df['utterance'].apply(lambda x: word_tokenize(x))

# Only for column -> context_speakers
def tokenize_context_speakers(value):
    """Lowercase a '|'-delimited speaker string and split it into a list.

    Example: "LEONARD|SHELDON" -> ["leonard", "sheldon"].
    """
    return value.lower().split("|")
   
# Replace each '|'-joined speaker string with its list of lowercase names.
df['context_speakers'] = df['context_speakers'].map(tokenize_context_speakers)

# Only for column -> context
def tokenize_context(value):
    """Split one `context` cell into its lowercased '|'-separated parts.

    Example: "Here we go.|But why?" -> ["here we go.", "but why?"].
    """
    lowered = value.lower()
    parts = lowered.split("|")
    return parts
   
# Replace each '|'-joined context string with its list of lowercase sentences.
df['context'] = df['context'].map(tokenize_context)


In [32]:
def one_hot_encoding(value):
    """Map a sarcasm flag to an integer class id.

    Any value that compares equal to False (False, 0, 0.0) becomes 0; every
    other value becomes 1. Despite the name, the result is a single integer,
    not a one-hot vector.
    """
    # Equality (not truthiness) is kept on purpose: None or "" must map to 1.
    return 0 if value == False else 1
# Turn the sarcasm flag into an integer 0/1 label.
df['sarcasm'] = df['sarcasm'].map(one_hot_encoding)

In [33]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
le = LabelEncoder()

# Fit the encoder on the `show` column and replace each show name with its integer code
df['show'] = le.fit_transform(df['show'])

In [34]:
# Preview the frame after tokenizing the context columns and encoding `show` / `sarcasm`.
df.head()

Unnamed: 0,id,utterance,speaker,context,context_speakers,show,sarcasm
0,1_60,privilege watch mind work,sheldon,[i never would have identified the fingerprint...,"[leonard, sheldon]",0,1
1,1_70,dont think ill able stop thinking,penny,[this is one of my favorite places to kick bac...,"[howard, penny, howard, howard, howard, penny,...",0,1
2,1_80,since bee season epinephrine,sheldon,"[here we go. pad thai, no peanuts., but does i...","[leonard, howard, leonard]",0,0
3,1_90,lois lane falling accelerating initial rate fe...,sheldon,[a marathon? how many superman movies are ther...,"[penny, sheldon, penny, sheldon, sheldon, penn...",0,0
4,1_105,im inferring couch evidence suggests coffee ta...,sheldon,"[great caesar's ghost, look at this place., so...","[sheldon, leonard, sheldon, sheldon, sheldon, ...",0,1


In [65]:
# from nltk.stem import PorterStemmer

# # initialize Porter stemmer
# stemmer = PorterStemmer()

# # define a function to apply stemming to a text
# def apply_stemming(words):
#     # apply stemming to each word
#     stemmed_words = [stemmer.stem(word) for word in words]
#     # join stemmed words into a single string
#     stemmed_text = " ".join(stemmed_words)
#     return stemmed_text

# df['utterance'] = df['utterance'].apply(apply_stemming)
# df['context'] = df['context'].apply(apply_stemming)

# df['utterance'] = df['utterance'].apply(lambda x: word_tokenize(x))

In [35]:
# Lemmatization

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Initialize one WordNet lemmatizer instance that the lemmatization functions in this cell share
lemmatizer = WordNetLemmatizer()

# define function to map POS tag to WordNet POS tag
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with `nltk.pos_tag`, and the first letter
    of the resulting tag selects the WordNet category: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    tag = nltk.pos_tag([word])[0][1]
    by_first_letter = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # tag[:1] is "" for an empty tag, which misses the table and yields NOUN.
    return by_first_letter.get(tag[:1], wordnet.NOUN)

# define function to perform lemmatization on a sentence of CONTEXT
def lemmatization(cell):
    """Lemmatize every sentence of one `context` cell.

    Args:
        cell: Iterable of sentence strings, one per context utterance.

    Returns:
        A list holding, for each sentence, the list of its lemmatized tokens.
        Empty or whitespace-only sentences yield an empty inner list.
    """
    lemma_result = []

    for sentence in cell:
        # split() without a separator drops the empty-string tokens that
        # split(" ") produces for empty sentences or repeated spaces; those
        # would otherwise be POS-tagged and kept as '' in the output.
        tokens = sentence.split()

        # lemmatize each token using the POS guessed for that token
        lemma_result.append(
            [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
        )

    return lemma_result

# define function to perform lemmatization on a sentence of UTTERANCES
def lemmatization_col1(sentence):
    """Lemmatize one cleaned `utterance` string.

    Args:
        sentence: Whitespace-separated text.

    Returns:
        The list of lemmatized tokens; an empty list when the sentence has no
        tokens (e.g. an utterance that consisted only of stop words).
    """
    # split() without a separator returns [] for "" instead of [''], so an
    # empty utterance no longer produces a bogus empty-string token.
    tokens = sentence.split()

    # lemmatize each token using the POS guessed for that token
    return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

In [36]:
# Apply lemmatization on column -> context
# Context cells are lists of sentences -> list of token lists per cell
df['context'] = df['context'].map(lemmatization)

# Utterance cells are single strings -> one token list per cell
df['utterance'] = df['utterance'].map(lemmatization_col1)

In [37]:
# Preview the frame after lemmatization; `utterance` and `context` now hold token lists.
df.head()

Unnamed: 0,id,utterance,speaker,context,context_speakers,show,sarcasm
0,1_60,"[privilege, watch, mind, work]",sheldon,"[[i, never, would, have, identify, the, finger...","[leonard, sheldon]",0,1
1,1_70,"[dont, think, ill, able, stop, think]",penny,"[[this, be, one, of, my, favorite, place, to, ...","[howard, penny, howard, howard, howard, penny,...",0,1
2,1_80,"[since, bee, season, epinephrine]",sheldon,"[[here, we, go., pad, thai,, no, peanuts.], [b...","[leonard, howard, leonard]",0,0
3,1_90,"[lois, lane, fall, accelerate, initial, rate, ...",sheldon,"[[a, marathon?, how, many, superman, movie, be...","[penny, sheldon, penny, sheldon, sheldon, penn...",0,0
4,1_105,"[im, infer, couch, evidence, suggests, coffee,...",sheldon,"[[great, caesar's, ghost,, look, at, this, pla...","[sheldon, leonard, sheldon, sheldon, sheldon, ...",0,1


In [39]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
     ---------------------------------------- 7.0/7.0 MB 613.6 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.0-py3-none-any.whl (224 kB)
     ------------------------------------ 224.2/224.2 kB 807.3 kB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp39-cp39-win_amd64.whl (3.5 MB)
     ---------------------------------------- 3.5/3.5 MB 889.4 kB/s eta 0:00:00
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.0 tokenizers-0.13.3 transformers-4.28.1



[notice] A new release of pip is available: 23.1 -> 23.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [40]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Split data into train and test sets: 20% of the rows are held out for testing,
# and the fixed random_state makes the split repeatable across runs
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Define BERT tokenizer: load the pretrained tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define custom dataset class
class SarcasmDataset(Dataset):
    """Torch dataset that turns one DataFrame row into BERT-ready tensors.

    Args:
        data: DataFrame with an `utterance` column (a string or a list of
            word tokens) and either a `sarcasm` column holding 0/1 or two
            columns labelled 0 and 1 holding an already one-hot label.
        tokenizer: Optional object exposing `encode_plus`. When omitted, the
            notebook-level `tokenizer` is used.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer=None, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _label_for(self, row):
        """Return the two-element float label for `row`.

        Uses columns 0 and 1 if the frame has them; otherwise derives
        [1 - sarcasm, sarcasm] from the integer `sarcasm` column, so that
        index 1 is the sarcastic class.
        """
        if 0 in row.index and 1 in row.index:
            return [float(row.loc[0]), float(row.loc[1])]
        is_sarcastic = int(row['sarcasm'])
        return [float(1 - is_sarcastic), float(is_sarcastic)]

    def __getitem__(self, index):
        # `index` is a position in 0..len-1, while the frame still carries the
        # shuffled index labels left by train_test_split. Label-based lookup
        # (`self.data['utterance'][index]`) would hit the wrong row or raise
        # KeyError, so select by position.
        row = self.data.iloc[index]

        utterance = row['utterance']
        if not isinstance(utterance, str):
            # A list of strings is taken by the tokenizer as already-tokenized
            # vocabulary entries; re-join so it is tokenized as normal text.
            utterance = " ".join(utterance)

        label = self._label_for(row)

        encoder = self.tokenizer if self.tokenizer is not None else tokenizer

        # Tokenize input text
        inputs = encoder.encode_plus(
            utterance,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack items itself.
        return {
            'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0],
            'token_type_ids': inputs['token_type_ids'][0],
            'label': torch.tensor(label, dtype=torch.float)
        }

# Define data loaders: both serve batches of 16 items; only the training loader reshuffles
train_dataset = SarcasmDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = SarcasmDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define BERT model and optimizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the old transformers defaults so the
# optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Train model
model.train()
for epoch in range(5):
    epoch_loss = 0.0
    num_batches = 0

    for batch in train_loader:
        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(
            batch['input_ids'],
            attention_mask=batch['attention_mask'],
            token_type_ids=batch['token_type_ids'],
            labels=batch['label'],
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean over all batches of the epoch. Printing only the last
    # batch's loss is noisy, and fails if the loader produced no batch.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f'Epoch {epoch+1} Loss: {mean_loss}')
    
# Evaluate model
model.eval()
total_correct = 0
total_samples = 0

# No gradients are needed anywhere in evaluation, so disable them once.
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            batch['input_ids'],
            attention_mask=batch['attention_mask'],
            token_type_ids=batch['token_type_ids'],
        )

        # Predicted class = highest logit; true class = position of the
        # largest entry in the two-element label vector.
        preds = outputs.logits.argmax(dim=1)
        targets = batch['label'].argmax(dim=1)

        total_correct += (preds == targets).sum().item()
        total_samples += len(preds)

accuracy = total_correct / total_samples
print(f'Test Accuracy: {accuracy}')


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]


KeyboardInterrupt

