In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load the datasets
train_df = pd.read_csv('/Users/arunaa/Python/Sracasam/sarcasm_tam_train.csv')
test_df = pd.read_csv('/Users/arunaa/Python/Sracasam/sarcasm_tam_test_without_labels.csv')

In [3]:
# Separate features and labels
X_train = train_df['Text']
y_train = train_df['labels']

In [4]:
# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

In [5]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=100)

In [6]:
# Split the data for evaluation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_padded, y_train_encoded, test_size=0.2, random_state=42)

In [8]:
# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(LSTM(units=128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=64))
model.add(Dense(units=1, activation='sigmoid'))

In [9]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [10]:
# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train_split, y_train_split, epochs=10, batch_size=32, validation_data=(X_val_split, y_val_split), callbacks=[early_stopping])

Epoch 1/10
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 77ms/step - accuracy: 0.7580 - loss: 0.5094 - val_accuracy: 0.7890 - val_loss: 0.4458
Epoch 2/10
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 74ms/step - accuracy: 0.8289 - loss: 0.3764 - val_accuracy: 0.7917 - val_loss: 0.4494
Epoch 3/10
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 74ms/step - accuracy: 0.8499 - loss: 0.3327 - val_accuracy: 0.7869 - val_loss: 0.4718
Epoch 4/10
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 74ms/step - accuracy: 0.8725 - loss: 0.2829 - val_accuracy: 0.7837 - val_loss: 0.5271


In [11]:
# Vectorize the test data
X_test = test_df['Text']
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

In [12]:
# Make predictions on the test set
test_predictions_prob = model.predict(X_test_padded)
test_predictions = (test_predictions_prob > 0.5).astype(int)
test_predictions_labels = label_encoder.inverse_transform(test_predictions.flatten())

[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step


In [14]:
# Save predictions to CSV file
test_df['Predicted_Labels'] = test_predictions_labels
output_path = '/Users/arunaa/Python/Sracasam/predictions_LSTM.csv'
test_df.to_csv(output_path, index=False)

In [15]:
# Predict on the validation set
val_predictions_prob = model.predict(X_val_split)
val_predictions = (val_predictions_prob > 0.5).astype(int)
val_predictions_labels = label_encoder.inverse_transform(val_predictions.flatten())

[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step


In [16]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_val_split, val_predictions)
precision, recall, f1, support = precision_recall_fscore_support(y_val_split, val_predictions, average='weighted')
classification_rep = classification_report(y_val_split, val_predictions)

In [17]:
# Print the evaluation metrics and classification report
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Support: {support}")
print("\nClassification Report:\n", classification_rep)

Accuracy: 0.7889753128170442
F1 Score: 0.7745064058536517
Precision: 0.7753113710410229
Recall: 0.7889753128170442
Support: None

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.92      0.86      4318
           1       0.66      0.45      0.53      1596

    accuracy                           0.79      5914
   macro avg       0.74      0.68      0.70      5914
weighted avg       0.78      0.79      0.77      5914



In [18]:
# Print the dataset with text and predicted labels
print(test_df[['Text', 'Predicted_Labels']])

                                                   Text Predicted_Labels
0         Kangana wow  awesome yr ye lakdi sbae alh hai    Non-sarcastic
1     விழுப்புரம்  வன்னிய கவுண்டர் சார்பாக வாழ்த்துக...    Non-sarcastic
2     திரௌபதி திரைப்படம் வெற்றி பெற வாணியர் சமுதாயம்...    Non-sarcastic
3     இந்த திரைப்படம் வெற்றிபெற, ஆதி தமிழன் அதாவது இ...    Non-sarcastic
4     dai thala pera sonnalay summa tamil naday athi...    Non-sarcastic
...                                                 ...              ...
6333                      NTR _ Ajith mutuals like here    Non-sarcastic
6334  aiyo #thala marana mass #thala love you so muc...    Non-sarcastic
6335                      Yan kadavula I love you thala    Non-sarcastic
6336  Thank you vijay sethupathi....for acted at syr...    Non-sarcastic
6337    Amitab and taapsi manu ki copy picture bnai h y    Non-sarcastic

[6338 rows x 2 columns]
