In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [7]:
from google.colab import drive
drive.mount('/content/drive')
!ls /content/drive/MyDrive/


Mounted at /content/drive
BertDemo.ipynb	       BiLSTMClassification.ipynb  Image_Downloader  Resume.pdf
BiLSTM_BERT_Train.csv  glove.6B.300d.txt.zip	   MyModels


In [8]:
!mkdir -p /content/drive/MyDrive/


In [9]:
!unzip /content/drive/MyDrive/glove.6B.300d.txt.zip -d /content/drive/MyDrive/


Archive:  /content/drive/MyDrive/glove.6B.300d.txt.zip
  inflating: /content/drive/MyDrive/glove.6B.300d.txt  


In [None]:
#You must have data set and Glove pretrained Glove Embedding matrxi in given path

In [12]:


import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.utils import resample
import matplotlib.pyplot as plt
import string

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Create directory for saving models
!mkdir -p /content/drive/MyDrive/MyModels/

# Load your dataset
data = pd.read_csv("/content/drive/MyDrive/BiLSTM_BERT_Train.csv")
data['tweet'] = data['tweet'].astype(str)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['tweet'])
X = tokenizer.texts_to_sequences(data['tweet'])
X = pad_sequences(X, maxlen=100)
y = data['sarcastic']

# Oversample the minority class
data_majority = data[data.sarcastic == 0]
data_minority = data[data.sarcastic == 1]
data_minority_upsampled = resample(data_minority, replace=True, n_samples=len(data_majority), random_state=42)
data_upsampled = pd.concat([data_majority, data_minority_upsampled])

# Split the data into training and testing sets
X_upsampled = tokenizer.texts_to_sequences(data_upsampled['tweet'])
X_upsampled = pad_sequences(X_upsampled, maxlen=100)
y_upsampled = data_upsampled['sarcastic']
X_train, X_test, y_train, y_test = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

# Load pre-trained GloVe word vectors
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1
glove_path = "/content/drive/MyDrive/glove.6B.300d.txt"
embeddings_index = {}
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

# Create an embedding matrix for the Embedding layer
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Build the LSTM model with GloVe embeddings
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    input_length=100, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model with Adam optimizer
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Implement early stopping and model checkpoint
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True, mode='max')
model_checkpoint = ModelCheckpoint("/content/drive/MyDrive/MyModels/sarcasm_detection_best_model.h5", save_best_only=True)

# Train the model and calculate metrics for each epoch
for iteration in range(5):
    history = model.fit(X_train, y_train, validation_split=0.2,
                        epochs=3, batch_size=64, callbacks=[early_stopping, model_checkpoint], verbose=2)

# Evaluate the model on the testing data
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Calculate final metrics for the best model
final_precision = precision_score(y_test, y_pred)
final_recall = recall_score(y_test, y_pred)
final_f1 = f1_score(y_test, y_pred)

print(f'Final Precision = {final_precision:.4f}, Recall = {final_recall:.4f}, F1-score = {final_f1:.4f}')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1/3
52/52 - 10s - loss: 0.6894 - accuracy: 0.5352 - val_loss: 0.6658 - val_accuracy: 0.6471 - 10s/epoch - 188ms/step
Epoch 2/3


  saving_api.save_model(


52/52 - 1s - loss: 0.6342 - accuracy: 0.6445 - val_loss: 0.6096 - val_accuracy: 0.6687 - 1s/epoch - 21ms/step
Epoch 3/3
52/52 - 1s - loss: 0.5259 - accuracy: 0.7395 - val_loss: 0.5689 - val_accuracy: 0.7059 - 1s/epoch - 22ms/step
Epoch 1/3
52/52 - 2s - loss: 0.3957 - accuracy: 0.8320 - val_loss: 0.5438 - val_accuracy: 0.7455 - 2s/epoch - 35ms/step
Epoch 2/3
52/52 - 1s - loss: 0.2575 - accuracy: 0.8978 - val_loss: 0.6280 - val_accuracy: 0.7611 - 966ms/epoch - 19ms/step
Epoch 3/3
52/52 - 1s - loss: 0.1815 - accuracy: 0.9279 - val_loss: 0.5551 - val_accuracy: 0.7971 - 966ms/epoch - 19ms/step
Epoch 1/3
52/52 - 1s - loss: 0.1251 - accuracy: 0.9546 - val_loss: 0.8354 - val_accuracy: 0.7575 - 995ms/epoch - 19ms/step
Epoch 2/3
52/52 - 1s - loss: 0.0765 - accuracy: 0.9748 - val_loss: 0.6933 - val_accuracy: 0.8295 - 946ms/epoch - 18ms/step
Epoch 3/3
52/52 - 1s - loss: 0.0468 - accuracy: 0.9856 - val_loss: 0.7574 - val_accuracy: 0.8247 - 935ms/epoch - 18ms/step
Epoch 1/3
52/52 - 1s - loss: 0.0359

In [14]:
# Preprocess the input sentence
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

# Example input sentence
# input_sentence = "Congratulations! Your ability to state the obvious is truly remarkable"   #not sarcastic
input_sentence = "Oh wow, you figured that out all by yourself? You're a real genius."  #sarcastic


# Preprocess the input sentence
input_sentence = preprocess_text(input_sentence)

# Tokenize and pad the preprocessed sentence
input_sequence = tokenizer.texts_to_sequences([input_sentence])
input_sequence = pad_sequences(input_sequence, maxlen=100)

# Predict the sentiment of the input sentence
prediction = (model.predict(input_sequence) > 0.5).astype(int)[0][0]

# Interpret the prediction
if prediction == 1:
    print("The sentence is sarcastic.")
else:
    print("The sentence is not sarcastic.")

The sentence is sarcastic.
