In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd

In [7]:
# Load datasets from Google Drive
train_data_path = '/content/drive/MyDrive/Saracastic/sarcasm_tam_train.csv'
test_data_path = '/content/drive/MyDrive/Saracastic/sarcasm_tam_test_without_labels.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [8]:
# Display the first few rows of the datasets
print(train_data.head())
print(test_data.head())

                                                Text         labels
0  அர்யவுக்கு  ஒரு நல்ல வாய்ப்பு சிங்கம் சூரியா அ...  Non-sarcastic
1  பள்ளியோ கல்லூரியோ படித்துக்கொண்டிருக்கும்போது ...  Non-sarcastic
2  தல தல தல தல தல தல தல தல தல தல தல தல தல தல தல த...  Non-sarcastic
3  All the best to the team....super ah na Oru ka...  Non-sarcastic
4  Bahut over mack up kiya hua hai, it is look li...  Non-sarcastic
      ID                                               Text
0  Id_01      Kangana wow  awesome yr ye lakdi sbae alh hai
1  Id_02  விழுப்புரம்  வன்னிய கவுண்டர் சார்பாக வாழ்த்துக...
2  Id_03  திரௌபதி திரைப்படம் வெற்றி பெற வாணியர் சமுதாயம்...
3  Id_04  இந்த திரைப்படம் வெற்றிபெற, ஆதி தமிழன் அதாவது இ...
4  Id_05  dai thala pera sonnalay summa tamil naday athi...


In [9]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [10]:
# Extract features and labels
texts = train_data['Text'].astype(str).values
labels = train_data['labels'].astype(str).values

In [11]:
# Encode labels (sarcastic/non-sarcastic)
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

In [12]:
# Tokenize and pad the sequences
vocab_size = 10000
embedding_dim = 100
max_length = 100
oov_tok = "<OOV>"

In [13]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

In [14]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels_encoded, test_size=0.2, random_state=42)

In [15]:
# DAN Model
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
# Training
model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), verbose=2)

Epoch 1/10
740/740 - 23s - loss: 0.5470 - accuracy: 0.7365 - val_loss: 0.4967 - val_accuracy: 0.7301 - 23s/epoch - 31ms/step
Epoch 2/10
740/740 - 13s - loss: 0.4419 - accuracy: 0.7786 - val_loss: 0.4480 - val_accuracy: 0.7883 - 13s/epoch - 17ms/step
Epoch 3/10
740/740 - 14s - loss: 0.3824 - accuracy: 0.8251 - val_loss: 0.4557 - val_accuracy: 0.7886 - 14s/epoch - 19ms/step
Epoch 4/10
740/740 - 17s - loss: 0.3452 - accuracy: 0.8478 - val_loss: 0.4535 - val_accuracy: 0.7912 - 17s/epoch - 23ms/step
Epoch 5/10
740/740 - 20s - loss: 0.3192 - accuracy: 0.8617 - val_loss: 0.4692 - val_accuracy: 0.7902 - 20s/epoch - 27ms/step
Epoch 6/10
740/740 - 12s - loss: 0.2983 - accuracy: 0.8712 - val_loss: 0.4934 - val_accuracy: 0.7940 - 12s/epoch - 17ms/step
Epoch 7/10
740/740 - 13s - loss: 0.2818 - accuracy: 0.8789 - val_loss: 0.5141 - val_accuracy: 0.7873 - 13s/epoch - 17ms/step
Epoch 8/10
740/740 - 12s - loss: 0.2672 - accuracy: 0.8868 - val_loss: 0.5540 - val_accuracy: 0.7918 - 12s/epoch - 16ms/step


<keras.src.callbacks.History at 0x7b3221c6acb0>

In [28]:
# Preprocess the test data
test_texts = test_data['Text'].astype(str).values
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

In [29]:
# Prediction and Evaluation
val_pred = model.predict(X_val)
val_pred = (val_pred > 0.5).astype(int)



In [30]:
accuracy = accuracy_score(y_val, val_pred)
precision = precision_score(y_val, val_pred)
recall = recall_score(y_val, val_pred)
f1 = f1_score(y_val, val_pred)
classification_rep = classification_report(y_val, val_pred, target_names=label_encoder.classes_)

In [31]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(classification_rep)

Accuracy: 0.7839026039905309
Precision: 0.6193693693693694
Recall: 0.5169172932330827
F1 Score: 0.5635245901639343
               precision    recall  f1-score   support

Non-sarcastic       0.83      0.88      0.86      4318
    Sarcastic       0.62      0.52      0.56      1596

     accuracy                           0.78      5914
    macro avg       0.73      0.70      0.71      5914
 weighted avg       0.77      0.78      0.78      5914



In [32]:
# Decode the predicted labels
predicted_labels = label_encoder.inverse_transform(val_pred.flatten())

In [34]:
# Print the predictions
for i, text in enumerate(test_texts):
    print(f"Text: {text}\nPredicted Label: {predicted_labels[i]}\n")

Text: Kangana wow  awesome yr ye lakdi sbae alh hai
Predicted Label: Sarcastic

Text: விழுப்புரம்  வன்னிய கவுண்டர் சார்பாக வாழ்த்துக்கள்
Predicted Label: Non-sarcastic

Text: திரௌபதி திரைப்படம் வெற்றி பெற வாணியர் சமுதாயம் சார்பாக வாழ்த்துக்கள்
Predicted Label: Non-sarcastic

Text: இந்த திரைப்படம் வெற்றிபெற, ஆதி தமிழன் அதாவது இந்த சமுதாயம் பிரித்து வைத்த முறைபடி ராகுல் புல்லார் பறையரின் மனமார்ந்த வாழ்த்துக்கள்..
Predicted Label: Non-sarcastic

Text: dai thala pera sonnalay summa tamil naday athirum daa
Predicted Label: Non-sarcastic

Text: enakku koncham payama irukku sankar thalavar ah vachi cartoon padam ethum kattirakoodathu :(
Predicted Label: Sarcastic

Text: Thalaivaa vaa vaa .....Thaa evanum kitta vara kudadhu
Predicted Label: Non-sarcastic

Text: Haha Sema documentary thetre ku varudhu polayae
Predicted Label: Non-sarcastic

Text: Surya Ku pair illa ya ?
Predicted Label: Non-sarcastic

Text: Kaithi trailer yen trending la varala
Predicted Label: Non-sarcastic

Text: Neenga ept p

IndexError: index 5914 is out of bounds for axis 0 with size 5914