In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Read the training CSV and the unlabelled test CSV into DataFrames
train_csv_path = '/Users/arunaa/Python/Sracasam/sarcasm_tam_train.csv'
test_csv_path = '/Users/arunaa/Python/Sracasam/sarcasm_tam_test_without_labels.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [3]:
# Pull the input texts and their target labels out of the training frame
X_train, y_train = train_df['Text'], train_df['labels']

In [4]:
# Learn a TF-IDF vocabulary over unigrams and bigrams (at most 5000 features)
# from the training texts and convert those texts into a feature matrix
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)

In [5]:
# Fit a linear-kernel support vector classifier on the TF-IDF training matrix.
# The bare `fit` call is kept as the last expression so the cell still shows
# the fitted estimator's repr.
svm_model = SVC(random_state=42, kernel='linear')
svm_model.fit(X=X_train_tfidf, y=y_train)

In [6]:
# Project the test texts into the TF-IDF space already fitted on the training
# texts (transform only -- the vocabulary is not re-learned here)
X_test = test_df.loc[:, 'Text']
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [7]:
# Label every test document with the trained SVM
test_predictions = svm_model.predict(X=X_test_tfidf)

In [8]:
# Attach the predicted labels to the test frame as a new column, then write the
# whole frame to CSV without the row index
output_path = '/Users/arunaa/Python/Sracasam/predictions_SVM.csv'
test_df['Predicted_Labels'] = test_predictions
test_df.to_csv(path_or_buf=output_path, index=False)

In [9]:
# Split the data for evaluation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_val_tfidf = vectorizer.transform(X_val_split)

In [10]:
# Predict on the validation set
val_predictions = svm_model.predict(X_val_tfidf)

In [11]:
# Score the validation predictions: weighted-average precision / recall / F1,
# then overall accuracy. Per the scikit-learn docs, `support` comes back as
# None whenever an `average` is requested.
precision, recall, f1, support = precision_recall_fscore_support(
    y_true=y_val_split,
    y_pred=val_predictions,
    average='weighted',
)
accuracy = accuracy_score(y_true=y_val_split, y_pred=val_predictions)

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [17]:
# Recompute the summary metrics for the validation split and print the
# per-class precision / recall / F1 table
precision, recall, f1, support = precision_recall_fscore_support(
    y_true=y_val_split,
    y_pred=val_predictions,
    average='weighted',
)
accuracy = accuracy_score(y_true=y_val_split, y_pred=val_predictions)
classification_rep = classification_report(y_true=y_val_split, y_pred=val_predictions)
print(classification_rep)

               precision    recall  f1-score   support

Non-sarcastic       0.85      0.95      0.89      4318
    Sarcastic       0.79      0.54      0.64      1596

     accuracy                           0.84      5914
    macro avg       0.82      0.74      0.77      5914
 weighted avg       0.83      0.84      0.83      5914



In [15]:
# Show each test text next to the label the model assigned to it
print(test_df.loc[:, ['Text', 'Predicted_Labels']])

                                                   Text Predicted_Labels
0         Kangana wow  awesome yr ye lakdi sbae alh hai    Non-sarcastic
1     விழுப்புரம்  வன்னிய கவுண்டர் சார்பாக வாழ்த்துக...    Non-sarcastic
2     திரௌபதி திரைப்படம் வெற்றி பெற வாணியர் சமுதாயம்...    Non-sarcastic
3     இந்த திரைப்படம் வெற்றிபெற, ஆதி தமிழன் அதாவது இ...    Non-sarcastic
4     dai thala pera sonnalay summa tamil naday athi...    Non-sarcastic
...                                                 ...              ...
6333                      NTR _ Ajith mutuals like here    Non-sarcastic
6334  aiyo #thala marana mass #thala love you so muc...        Sarcastic
6335                      Yan kadavula I love you thala    Non-sarcastic
6336  Thank you vijay sethupathi....for acted at syr...    Non-sarcastic
6337    Amitab and taapsi manu ki copy picture bnai h y    Non-sarcastic

[6338 rows x 2 columns]
