In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load the datasets
train_df = pd.read_csv('/Users/arunaa/Python/Sracasam/Malayalam/sarcasm_mal_train.csv')
test_df = pd.read_csv('/Users/arunaa/Python/Sracasam/Malayalam/sarcasm_mal_test_without_labels.csv')

In [3]:
# Separate features and labels
X_train = train_df['Text']
y_train = train_df['labels']

In [4]:
# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

In [5]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=100)

In [6]:
# Split the data for evaluation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_padded, y_train_encoded, test_size=0.2, random_state=42)

In [7]:
# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(LSTM(units=128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=64))
model.add(Dense(units=1, activation='sigmoid'))



In [8]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train_split, y_train_split, epochs=10, batch_size=32, validation_data=(X_val_split, y_val_split), callbacks=[early_stopping])

Epoch 1/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.8087 - loss: 0.4712 - val_accuracy: 0.8491 - val_loss: 0.3930
Epoch 2/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 74ms/step - accuracy: 0.8870 - loss: 0.2769 - val_accuracy: 0.8408 - val_loss: 0.3900
Epoch 3/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 75ms/step - accuracy: 0.9187 - loss: 0.2068 - val_accuracy: 0.8366 - val_loss: 0.4203
Epoch 4/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 75ms/step - accuracy: 0.9396 - loss: 0.1570 - val_accuracy: 0.8173 - val_loss: 0.6319
Epoch 5/10
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.9577 - loss: 0.1101 - val_accuracy: 0.8165 - val_loss: 0.6335


In [10]:
# Vectorize the test data
X_test = test_df['Text']
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

In [11]:
# Make predictions on the test set
test_predictions_prob = model.predict(X_test_padded)
test_predictions = (test_predictions_prob > 0.5).astype(int)
test_predictions_labels = label_encoder.inverse_transform(test_predictions.flatten())

[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step


In [12]:
# Save predictions to CSV file
test_df['Predicted_Labels'] = test_predictions_labels
output_path = '/Users/arunaa/Python/Sracasam/Malayalam/predictions_LSTM_mal.csv'
test_df.to_csv(output_path, index=False)

In [13]:
# Predict on the validation set
val_predictions_prob = model.predict(X_val_split)
val_predictions = (val_predictions_prob > 0.5).astype(int)
val_predictions_labels = label_encoder.inverse_transform(val_predictions.flatten())

[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step


In [14]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_val_split, val_predictions)
precision, recall, f1, support = precision_recall_fscore_support(y_val_split, val_predictions, average='weighted')
classification_rep = classification_report(y_val_split, val_predictions)

In [15]:
# Print the evaluation metrics and classification report
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Support: {support}")
print("\nClassification Report:\n", classification_rep)

Accuracy: 0.8407884761182715
F1 Score: 0.8253099206097189
Precision: 0.823140517618854
Recall: 0.8407884761182715
Support: None

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.95      0.91      2142
           1       0.62      0.39      0.48       496

    accuracy                           0.84      2638
   macro avg       0.75      0.67      0.69      2638
weighted avg       0.82      0.84      0.83      2638



In [16]:
# Print the dataset with text and predicted labels
print(test_df[['Text', 'Predicted_Labels']])

                                                   Text Predicted_Labels
0     Shavakallarayile Kuzhimaadathile Peril Oru Let...    Non-sarcastic
1     ഗീതു മോഹൻദാസ് മലയാള സിനിമക്കു നൽകുന്ന വമ്പൻ ഗി...    Non-sarcastic
2                      Ente ponno ah sound🥰🥰 poli poli🤘    Non-sarcastic
3     Villain sharafudheen  ennu thonnunnavar likikk...    Non-sarcastic
4                    pulimurukan trailer ano kanunath 🤔    Non-sarcastic
...                                                 ...              ...
2821  Ente ponno oru adaaru jagapoka aanenu manasila...    Non-sarcastic
2822  എന്റെ ഇക്ക nja നമിച്ചു... ഒരു രക്ഷയില്ല ഹെവി ഐ...    Non-sarcastic
2823  ദേ ഇപ്പൊ കണ്ട് ഇറങ്ങിയതേ ഉള്ളു  96  Karikku (+...    Non-sarcastic
2824  1) Drisyam 2) Memories  3) Seconds 4) Grand ma...    Non-sarcastic
2825  Super mammoookkkaaa... ....   Oru lalettan bha...    Non-sarcastic

[2826 rows x 2 columns]
