In [85]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [86]:
df1 = pd.read_csv("../../data/GEN-sarc-notsarc.csv")
df2 = pd.read_csv("../../data/HYP-sarc-notsarc.csv")
df3 = pd.read_csv("../../data/RQ-sarc-notsarc.csv")

In [87]:
df = pd.read_csv("../../data/sarcasm.csv")
X = df['text']
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

def predict(x_train,y_train,x_test):
    ## train
    tfidf = TfidfVectorizer()
    train_tfidf = tfidf.fit_transform(x_train)
    LogisticModel = LogisticRegression()
    LogisticModel.fit(train_tfidf, y_train)
    ## test
    test_tfidf = tfidf.transform(x_test)
    y_pred = LogisticModel.predict(test_tfidf)
    return y_pred

In [89]:
predict(X_train,y_train,X_test)

array(['sarc', 'sarc', 'sarc', ..., 'notsarc', 'notsarc', 'notsarc'],
      dtype=object)

In [90]:
from sklearn.metrics import accuracy_score
y_pred = predict(X_train,y_train,X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7273695420660277

In [91]:
def train(x_train,y_train):
    tfidf = TfidfVectorizer()
    train_tfidf = tfidf.fit_transform(x_train)
    LogisticModel = LogisticRegression()
    LogisticModel.fit(train_tfidf, y_train)
    return tfidf, LogisticModel

def test(training_model,tfidf,x_test):
    test_tfidf = tfidf.transform(x_test)
    y_pred = training_model.predict(test_tfidf)
    return y_pred

In [92]:
tfidf, LogisticModel = train(X_train,y_train)
y_pred_2 = test(LogisticModel,tfidf,X_test)
accuracy_score(y_test, y_pred_2)

0.7273695420660277

##   Long Short-Term Memory (LSTM) model

In [93]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Constants for text preprocessing
VOCAB_SIZE = 10000 # number of words to consider from the dataset
MAX_LENGTH = 100    # maximum length of each sentence
TRUNC_TYPE = 'post' # where to truncate sentences
PADDING_TYPE = 'post' # where to pad sentences

data = pd.read_csv("../../data/sarcasm.csv")
# Tokenizing the sentences
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])

# Pad the sequences
padded_sequences = pad_sequences(sequences, maxlen=MAX_LENGTH, padding=PADDING_TYPE, truncating=TRUNC_TYPE)

# Encode the labels
label_encoding = {'notsarc': 0, 'sarc': 1}
labels = data['class'].map(label_encoding).values

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [94]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Model configuration
embedding_dim = 20

# Build the model
model = Sequential([
    Embedding(VOCAB_SIZE, embedding_dim, input_length=MAX_LENGTH),
    LSTM(32),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [95]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [96]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc}")

Test Accuracy: 0.7145900130271912


In [97]:
# make predictions
def predict_sarcasm(sentence):
    sequence = tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(sequence, maxlen=MAX_LENGTH, padding=PADDING_TYPE, truncating=TRUNC_TYPE)
    prediction = model.predict(padded)
    return 'Sarcastic' if prediction[0][0] > 0.5 else 'Not Sarcastic'

# Example
predict_sarcasm("Oh, what a wonderful day!")



'Sarcastic'