In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Bidirectional, TimeDistributed, Dropout, Layer
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load the datasets
# The data directory defaults to the location used when this notebook was
# written, but can be redirected by setting the SARCASM_DATA_DIR environment
# variable so the notebook runs on other machines without editing this cell.
DATA_DIR = Path(os.environ.get('SARCASM_DATA_DIR', '/Users/arunaa/Python/Sracasam'))
train_df = pd.read_csv(DATA_DIR / 'sarcasm_tam_train.csv')
test_df = pd.read_csv(DATA_DIR / 'sarcasm_tam_test_without_labels.csv')

In [3]:
# Pull the model input (the 'Text' column) and the target (the 'labels'
# column) out of the training frame.
X_train, y_train = train_df['Text'], train_df['labels']

In [4]:
# Convert the class labels to integer ids. The fitted encoder is kept
# because later cells call inverse_transform on it to recover label names.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

In [5]:
# Fit the tokenizer on the training text only (vocabulary capped through
# num_words=5000), then turn every training text into a sequence of word ids
# and bring all sequences to a common length of 100.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(
    X_train_sequences,
    maxlen=100,
)

In [6]:
# Hold out 20% of the padded training data as a validation set; the fixed
# random_state makes the split repeatable.
# NOTE(review): the split is not stratified, and the classification report
# shows imbalanced classes — consider whether stratifying on the labels is
# wanted.
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(
    X_train_padded,
    y_train_encoded,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Define the Attention layer
class AttentionLayer(Layer):
    """Attention pooling that collapses axis 1 of its input.

    For every position along axis 1 the layer computes a hidden
    representation ``tanh(x @ W + b)``, scores it against the learned
    context vector ``u``, normalises the scores across axis 1 and returns
    the score-weighted sum of the inputs over that axis.

    In this notebook the layer is applied to the output of a bidirectional
    LSTM with ``return_sequences=True``, so axis 1 is the time axis and the
    last axis holds the features.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable weights, sized from the last input dimension."""
        feature_dim = input_shape[-1]
        self.W = self.add_weight(name='att_weight', shape=(feature_dim, feature_dim), initializer='uniform', trainable=True)
        self.b = self.add_weight(name='att_bias', shape=(feature_dim,), initializer='uniform', trainable=True)
        self.u = self.add_weight(name='att_var', shape=(feature_dim,), initializer='uniform', trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # Hidden representation of every position.
        u_it = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        # One unnormalised score per position (the trailing size-1 axis
        # produced by the dot product is summed away).
        scores = K.sum(K.dot(u_it, K.expand_dims(self.u, -1)), axis=-1)
        # Subtract the per-sample maximum before exponentiating: the
        # normalised weights are mathematically unchanged, but exp() can no
        # longer overflow to inf and turn the weights into NaN.
        scores = scores - K.max(scores, axis=1, keepdims=True)
        ait = K.exp(scores)
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # Restore a trailing axis so the weights broadcast over the features.
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        return K.sum(weighted_input, axis=1)

In [9]:
# Build the HAN model
# Architecture: token ids -> embedding -> bidirectional LSTM (full sequence)
# -> attention pooling -> dropout -> single sigmoid unit.
# The sequence length (100) must match the maxlen used when padding, and the
# embedding input_dim (5000) must match the tokenizer's num_words.
input_layer = Input(shape=(100,))
# The deprecated `input_length` argument is omitted: the sequence length is
# already fixed by the Input layer above.
embedding_layer = Embedding(input_dim=5000, output_dim=128)(input_layer)
lstm_layer = Bidirectional(LSTM(units=128, return_sequences=True))(embedding_layer)
attention_layer = AttentionLayer()(lstm_layer)
dropout_layer = Dropout(0.5)(attention_layer)
# One sigmoid output paired with binary cross-entropy: a two-class problem.
output_layer = Dense(units=1, activation='sigmoid')(dropout_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [10]:
# Fit the model for at most 10 epochs, evaluating on the held-out split after
# each one. Training stops early once val_loss has gone 3 epochs without
# improving, and the best weights seen are restored.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    X_train_split,
    y_train_split,
    epochs=10,
    batch_size=32,
    validation_data=(X_val_split, y_val_split),
    callbacks=[early_stopping],
)

Epoch 1/10
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 103ms/step - accuracy: 0.7456 - loss: 0.5325 - val_accuracy: 0.7900 - val_loss: 0.4442
Epoch 2/10
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 126ms/step - accuracy: 0.8264 - loss: 0.3809 - val_accuracy: 0.7940 - val_loss: 0.4400
Epoch 3/10
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 122ms/step - accuracy: 0.8549 - loss: 0.3278 - val_accuracy: 0.7856 - val_loss: 0.4682
Epoch 4/10
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 121ms/step - accuracy: 0.8772 - loss: 0.2871 - val_accuracy: 0.7748 - val_loss: 0.4930
Epoch 5/10
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 121ms/step - accuracy: 0.8934 - loss: 0.2515 - val_accuracy: 0.7775 - val_loss: 0.5804


In [11]:
# Encode the test text with the tokenizer that was fitted on the training
# text, and pad to the same length (100) that the model was built for.
X_test = test_df['Text']
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(
    X_test_sequences,
    maxlen=100,
)

In [12]:
# Score the test set, threshold the sigmoid outputs at 0.5 to get 0/1 class
# ids, then map those ids back to the original label names.
test_predictions_prob = model.predict(X_test_padded)
test_predictions = np.greater(test_predictions_prob, 0.5).astype(int)
test_predictions_labels = label_encoder.inverse_transform(
    test_predictions.flatten()
)

[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step


In [14]:
# Attach the predicted label names to the test frame as a new column and
# write the whole frame (without the index) to CSV.
output_path = '/Users/arunaa/Python/Sracasam/predictions_HAN.csv'
test_df['Predicted_Labels'] = test_predictions_labels
test_df.to_csv(path_or_buf=output_path, index=False)

In [15]:
# Score the held-out validation split the same way as the test set:
# probabilities -> 0/1 class ids at a 0.5 threshold -> label names.
val_predictions_prob = model.predict(X_val_split)
val_predictions = np.greater(val_predictions_prob, 0.5).astype(int)
val_predictions_labels = label_encoder.inverse_transform(
    val_predictions.flatten()
)

[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 38ms/step


In [16]:
# Compare the thresholded validation predictions with the encoded true
# labels: overall accuracy, weighted-average precision/recall/F1, and a
# per-class text report.
accuracy = accuracy_score(y_val_split, val_predictions)
# NOTE(review): with `average` set, scikit-learn documents the returned
# support as None — confirm before relying on `support`.
precision, recall, f1, support = precision_recall_fscore_support(
    y_val_split,
    val_predictions,
    average='weighted',
)
classification_rep = classification_report(y_val_split, val_predictions)

In [18]:
# Report the summary metrics, one per line, followed by the per-class
# classification report.
print(
    f"Accuracy: {accuracy}",
    f"F1 Score: {f1}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    sep="\n",
)
print("\nClassification Report:\n", classification_rep)

Accuracy: 0.7940480216435577
F1 Score: 0.7864257651582147
Precision: 0.7841086116420873
Recall: 0.7940480216435577

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.86      4318
           1       0.65      0.52      0.58      1596

    accuracy                           0.79      5914
   macro avg       0.74      0.71      0.72      5914
weighted avg       0.78      0.79      0.79      5914



In [19]:
# Show each test text next to the label predicted for it.
print(test_df.loc[:, ['Text', 'Predicted_Labels']])

                                                   Text Predicted_Labels
0         Kangana wow  awesome yr ye lakdi sbae alh hai    Non-sarcastic
1     விழுப்புரம்  வன்னிய கவுண்டர் சார்பாக வாழ்த்துக...    Non-sarcastic
2     திரௌபதி திரைப்படம் வெற்றி பெற வாணியர் சமுதாயம்...    Non-sarcastic
3     இந்த திரைப்படம் வெற்றிபெற, ஆதி தமிழன் அதாவது இ...    Non-sarcastic
4     dai thala pera sonnalay summa tamil naday athi...    Non-sarcastic
...                                                 ...              ...
6333                      NTR _ Ajith mutuals like here    Non-sarcastic
6334  aiyo #thala marana mass #thala love you so muc...        Sarcastic
6335                      Yan kadavula I love you thala        Sarcastic
6336  Thank you vijay sethupathi....for acted at syr...    Non-sarcastic
6337    Amitab and taapsi manu ki copy picture bnai h y    Non-sarcastic

[6338 rows x 2 columns]
