<a href="https://colab.research.google.com/github/YusufDilekci/Sarcasm_Recognition/blob/main/sarcasm_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy
!python -m spacy download en_core_web_lg
!pip install pandas
!pip install scikit-learn
!pip install gensim

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import gensim.downloader as api
# Load the 'word2vec-google-news-300' vectors through the gensim downloader;
# `wv` is used later to average token vectors per text.
wv = api.load('word2vec-google-news-300')



In [None]:
import spacy

# spaCy pipeline used below for tokenisation, lemmas, stop-word/punctuation flags and doc vectors.
nlp = spacy.load("en_core_web_lg")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil
import os
# Copy the dataset from the mounted Drive folder into the current working directory.
# NOTE(review): the Drive path is hard-coded; adjust it if the file lives elsewhere.
shutil.copy('/content/drive/MyDrive/Sarcasm_Detection/sarcasm.csv', os.getcwd())

'/content/sarcasm.csv'

In [None]:
import pandas as pd
import numpy as np

# Read the copied CSV from the working directory and preview the first rows.
df = pd.read_csv("sarcasm.csv")
df.head()

Unnamed: 0,class,text
0,notsarc,"If that's true, then Freedom of Speech is doom..."
1,notsarc,Neener neener - is it time to go in from the p...
2,notsarc,"Just like the plastic gun fear, the armour pie..."
3,notsarc,So geology is a religion because we weren't he...
4,notsarc,Well done Monty. Mark that up as your first ev...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9386 entries, 0 to 9385
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   9386 non-null   object
 1   text    9386 non-null   object
dtypes: object(2)
memory usage: 146.8+ KB


In [None]:
df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
notsarc,4693
sarc,4693


In [None]:
# Binary target column: 1 for rows labelled 'sarc', 0 for everything else.
df['class_num'] = (df['class'] == 'sarc').astype('int64')

In [None]:
# Store the spaCy document vector (`Doc.vector`) of every text in a 'glove_vector' column.
# NOTE(review): no later cell in this notebook reads this column; only 'gensim_vector' is used.
df['glove_vector'] = df['text'].apply(lambda x: nlp(x).vector)

In [None]:
def preprocess_and_vectorize(text):
    """Convert raw text into one mean word2vec embedding.

    The text is run through the spaCy pipeline ``nlp``. Stop words and
    punctuation are dropped; the remaining tokens are lemmatised and
    lower-cased, then averaged with ``wv.get_mean_vector``.

    Args:
        text: Raw input string.

    Returns:
        A 1-D numpy array of length ``wv.vector_size``. When no token survives
        the filtering (empty text, or only stop words / punctuation) an
        all-zero vector is returned.
    """
    all_clean_tokens = [
        token.lemma_.lower()
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    if not all_clean_tokens:
        # get_mean_vector raises ValueError on an empty key list; fall back to
        # a zero vector so a single degenerate row cannot abort the whole apply().
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)
    return wv.get_mean_vector(all_clean_tokens)

df['gensim_vector'] = df['text'].apply(preprocess_and_vectorize)

In [None]:
df.head()

Unnamed: 0,class,text,class_num,glove_vector,gensim_vector
0,notsarc,"If that's true, then Freedom of Speech is doom...",0,"[-0.76626986, 1.4253579, -2.5212822, -1.286170...","[0.028688896, 0.0056534708, 0.007524069, 0.029..."
1,notsarc,Neener neener - is it time to go in from the p...,0,"[-0.41781142, 4.458112, -2.5987816, 0.42482427...","[-0.0014881147, 0.058390122, 0.036367316, 0.06..."
2,notsarc,"Just like the plastic gun fear, the armour pie...",0,"[-1.964692, -0.72769564, -0.93318206, 1.239436...","[0.02869091, 0.042279355, 0.007586536, 0.02459..."
3,notsarc,So geology is a religion because we weren't he...,0,"[-1.9414376, 4.8499994, -2.7247503, -0.0957217...","[0.02096812, -0.0104743205, 0.021920618, 0.038..."
4,notsarc,Well done Monty. Mark that up as your first ev...,0,"[-1.3936415, -2.0412686, -2.7562578, 0.8716953...","[0.0232634, -0.022459248, 0.012941596, 0.03351..."


In [None]:
from sklearn.model_selection import train_test_split

# Features are the per-text mean word2vec vectors; the target is the 0/1 label.
X = df['gensim_vector']
y = df['class_num']

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Each Series element is itself a vector; stack them into one 2-D array per split.
X_train_2d = np.stack(X_train)
X_test_2d =  np.stack(X_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, f1_score


# (short name, unfitted estimator) pairs; the list is passed to base_models below,
# which fits each estimator and prints its per-class F1 scores.
ml_models = [
    ("DT", DecisionTreeClassifier()),
    ("MNB", MultinomialNB()),
    ("SVM", svm.SVC(kernel='rbf')),
    ("KNN", KNeighborsClassifier(n_neighbors=8)),
    ("RF", RandomForestClassifier(max_depth=2, random_state=0)),
    ("GB", GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, max_depth=6, random_state=0)),
    ("XGB", XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.01, objective='binary:logistic'))
]

def base_models(models):
  """Fit every (name, estimator) pair and print its per-class F1 scores.

  Features come from the notebook-level ``X_train_2d`` / ``X_test_2d`` arrays
  and labels from ``y_train`` / ``y_test``. A MinMaxScaler is fitted on the
  training features only and then applied to both splits.

  Args:
    models: Iterable of ``(name, estimator)`` tuples.
  """
  feature_scaler = MinMaxScaler().fit(X_train_2d)
  train_features = feature_scaler.transform(X_train_2d)
  test_features = feature_scaler.transform(X_test_2d)

  for label, estimator in models:
    estimator.fit(train_features, y_train)
    predictions = estimator.predict(test_features)

    # average=None returns one F1 value per class instead of a single aggregate.
    per_class_f1 = f1_score(y_test, predictions, average=None)
    print("---------F1 SCORES-----------")
    print(label, "|", per_class_f1)

base_models(ml_models)

---------F1 SCORES-----------
DT | [0.60677784 0.61465472]
---------F1 SCORES-----------
MNB | [0.70304302 0.69405405]
---------F1 SCORES-----------
SVM | [0.72008663 0.72917758]
---------F1 SCORES-----------
KNN | [0.70348028 0.60087445]
---------F1 SCORES-----------
RF | [0.69802245 0.70026525]
---------F1 SCORES-----------
GB | [0.70188679 0.70910047]
---------F1 SCORES-----------
XGB | [0.70111881 0.70143693]



# Neural Network Model



In [None]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Hyperparameters for the PyTorch classifier defined below.
vocab_size = len(wv)  # number of keys in the loaded word2vec vectors
embedding_dim = 300
hidden_dim = 128
num_classes = 1  # one output per example; the loss below is BCEWithLogitsLoss
batch_size = 32
num_epochs = 10
learning_rate = 1e-3

In [None]:
# Convert the stacked numpy features and the pandas labels to float32 tensors.
# Dedicated *_tensor names are used instead of rebinding X_train / y_train /
# X_test / y_test, so this cell and the scikit-learn cells above can be re-run
# (the original Series are still needed for `.values`).
X_train_tensor = torch.tensor(X_train_2d, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(dim=1)
X_test_tensor = torch.tensor(X_test_2d, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(dim=1)

# Each dataset item is a (features, label) pair; labels have shape (1,) so a
# batch of labels lines up with the model's (batch, 1) output.
training_data = list(zip(X_train_tensor, y_train_tensor))
test_data = list(zip(X_test_tensor, y_test_tensor))

# Reshuffle the training batches every epoch; keep evaluation order fixed.
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size)


In [None]:
class TextClassifierModel(nn.Module):
    """LSTM text classifier that accepts token ids or ready-made embeddings.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden state size.
        num_classes: Number of output logits per example.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Compute logits for a batch.

        Args:
            x: Either an integer tensor of token ids with shape (batch, seq),
               or a floating-point tensor of pre-computed embeddings with
               shape (batch, embedding_dim) or (batch, seq, embedding_dim).

        Returns:
            Tensor of shape (batch, num_classes) holding raw logits.
        """
        if x.is_floating_point():
            # Already embedded (e.g. a mean word vector per text): an index
            # lookup would fail on float input, so skip the embedding table.
            # A 2-D input is treated as a sequence of length one.
            embedded = x.unsqueeze(1) if x.dim() == 2 else x
        else:
            embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        # Use the LSTM output at the last time step as the sequence summary.
        last_hidden = output[:, -1, :]
        logits = self.fc(last_hidden)
        return logits


# Build the classifier on the selected device, with a binary cross-entropy loss
# that takes raw logits and an Adam optimizer over all model parameters.
# NOTE(review): vocab_size is len(wv), so the nn.Embedding table holds
# vocab_size * embedding_dim weights; confirm this fits in memory.
model = TextClassifierModel(vocab_size, embedding_dim, hidden_dim, num_classes).to(device)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
import numpy as np

# Scratch check: thresholding at 0.5 and comparing element-wise with the labels
# needs explicit parentheses around the comparison with 0.5.
# Demo-specific names are used so the notebook-level `y` labels are not overwritten.
demo_probs = np.array([0.7, 0.3, 0.4]).reshape(-1, 1)
demo_labels = np.array([1,1,0]).reshape(-1, 1)
(demo_probs > 0.5) == demo_labels

array([[ True],
       [False],
       [ True]])

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
  """Run one training epoch over ``dataloader``.

  Args:
    dataloader: Iterable yielding ``(features, targets)`` batches.
    model: Module being trained; batches are moved to the device of its
      parameters.
    loss_fn: Callable taking ``(predictions, targets)`` and returning a loss
      tensor.
    optimizer: Optimizer that updates the model's parameters.
  """
  model.train()
  # Follow the model's own device instead of a notebook-level global, which
  # other cells may reassign.
  device = next(model.parameters()).device
  for X, y in dataloader:
    X, y = X.to(device), y.to(device)

    y_pred = model(X)
    loss = loss_fn(y_pred, y)
    loss.backward()
    optimizer.step()
    # Clear gradients after the update so they do not accumulate into the next batch.
    optimizer.zero_grad()


def test_loop(dataloader, model, loss_fn):
  model.eval()
  size = len(dataloader.dataset)
  num_batches = len(dataloader)
  test_loss, correct = 0, 0
  with torch.inference_mode():
    for X, y in dataloader:
      X, y = X.to(device), y.to(device)
      pred = model(X)
      test_loss += loss_fn(pred, y).item()
      correct += (pred > 0.5 == y).type(torch.float).sum().item()
  test_loss /= num_batches
  correct /= size
  print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


# One training pass followed by one evaluation pass per epoch.
for epoch_index in range(num_epochs):
    epoch_number = epoch_index + 1
    print(f"Epoch {epoch_number}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)

print('Done !')

# Transfer Learning: Fine-Tuning a Pre-Trained Transformer Model

In [None]:
 !pip install transformers
 !pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from datasets import Dataset

# Build a two-column frame (raw text + 0/1 label) and wrap it as a Hugging Face Dataset.
label = df['class'].map(lambda cls: int(cls == 'sarc'))
data = dict(
    text=df["text"].tolist(),
    label=label,
)
dff = pd.DataFrame(data)
dataset = Dataset.from_pandas(dff)
print(dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 9386
})


In [None]:
from transformers import AutoTokenizer


# Tokenizer for the "bert-base-uncased" checkpoint; tokenize_function below reads this name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to the tokenizer's maximum length.

    Args:
        examples: Mapping with a 'text' entry holding the raw strings.

    Returns:
        The tokenizer's encoding for those strings.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Apply the tokenizer to the whole dataset in batches and show the resulting columns.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

print(tokenized_datasets)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/9386 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9386
})


In [None]:
from collections import Counter

# 80/20 split of the tokenized dataset with a fixed seed.
# NOTE(review): this rebinds `train_test_split`, which earlier referred to the
# function imported from sklearn.model_selection. Re-running the earlier
# scikit-learn split cell after this one will fail unless that import is
# executed again; a distinct name here would avoid the clash.
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Count the labels in the training split to check class balance.
class_distribution = Counter(train_dataset['label'])
class_distribution

Counter({1: 3749, 0: 3759})

In [None]:
train_test_split

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7508
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1878
    })
})

In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated.
    eval_strategy='epoch',  # evaluate at the end of every epoch
    report_to=["none"],  # disable external experiment loggers
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [None]:
#!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# import wandb

# wandb.init(
#     project="Sarcasm-Recognition",
#     settings=wandb.Settings(init_timeout=480)
# )

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.004167291279166345, max=1.0…

CommError: Run initialization has timed out after 240.0 sec. Please try increasing the timeout with the `init_timeout` setting: `wandb.init(settings=wandb.Settings(init_timeout=120))`.

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer

# Load BERT with a two-label sequence-classification head.
bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=2)

# Trainer takes no `device` argument (passing one raises TypeError); it places
# the model on the device selected through TrainingArguments.
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5493,0.463043
2,0.3495,0.624342
3,0.2385,0.837063


TrainOutput(global_step=2817, training_loss=0.35952810112110556, metrics={'train_runtime': 2281.8974, 'train_samples_per_second': 9.871, 'train_steps_per_second': 1.234, 'total_flos': 5926313410928640.0, 'train_loss': 0.35952810112110556, 'epoch': 3.0})

In [None]:
# Evaluate the fine-tuned model on the Trainer's eval dataset and show the metrics dict.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.837062656879425, 'eval_runtime': 53.0785, 'eval_samples_per_second': 35.382, 'eval_steps_per_second': 4.427, 'epoch': 3.0}


# Load a Custom Model From Hugging Face Hub

In [None]:
from transformers import pipeline

# Ready-made sarcasm classifier from the Hugging Face Hub.
MODEL_PATH = "helinivan/english-sarcasm-detector"

# Dedicated names keep the notebook-level `tokenizer` (read by tokenize_function)
# and `model` (the LSTM classifier) from being overwritten by this cell.
sarcasm_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
sarcasm_model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
inference_pipeline = pipeline(
    "text-classification",
    model=sarcasm_model,
    tokenizer=sarcasm_tokenizer,
    device=device,
)

# Materialise the text column as a plain list of strings for the pipeline.
test_texts = list(train_test_split["test"]["text"])
# truncation=True clips inputs longer than the model's maximum sequence length
# instead of failing on them.
predictions = inference_pipeline(test_texts, truncation=True)


In [None]:

# Pair each test text with its pipeline prediction and preview the first rows.
df_result = pd.DataFrame(zip(test_texts, predictions), columns=['TEXT', 'PREDICTION'])
df_result.head()

Unnamed: 0,TEXT,PREDICTION
0,Except that the word for day in Genesis 1 is '...,"{'label': 'LABEL_0', 'score': 0.9586531519889832}"
1,And people say I make strawman arguments.Simpl...,"{'label': 'LABEL_0', 'score': 0.9902228116989136}"
2,"Man, these guys can't even get into the scienc...","{'label': 'LABEL_0', 'score': 0.709625780582428}"
3,No it means that the genome operates in a more...,"{'label': 'LABEL_0', 'score': 0.9932511448860168}"
4,Wouldn't it be wonderful if every woman in the...,"{'label': 'LABEL_0', 'score': 0.9954440593719482}"
