In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [4]:
# Load the datasets
train_df = pd.read_csv('/Users/arunaa/Python/Sracasam/Malayalam/sarcasm_mal_train.csv')
test_df = pd.read_csv('/Users/arunaa/Python/Sracasam/Malayalam/sarcasm_mal_test_without_labels.csv')

In [5]:
# Separate features and labels
X_train = train_df['Text']
y_train = train_df['labels']

In [6]:
# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)

In [7]:
# Train an SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)

In [8]:
# Vectorize the test data
X_test = test_df['Text']
X_test_tfidf = vectorizer.transform(X_test)

In [9]:
# Make predictions on the test set
test_predictions = svm_model.predict(X_test_tfidf)

In [10]:
# Save predictions to CSV file
test_df['Predicted_Labels'] = test_predictions
output_path = '/Users/arunaa/Python/Sracasam/Malayalam/predictions_SVM_mal.csv'
test_df.to_csv(output_path, index=False)

In [11]:
# Split the data for evaluation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_val_tfidf = vectorizer.transform(X_val_split)

In [12]:
# Predict on the validation set
val_predictions = svm_model.predict(X_val_tfidf)

In [13]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_val_split, val_predictions)
precision, recall, f1, support = precision_recall_fscore_support(y_val_split, val_predictions, average='weighted')

In [17]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [19]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_val_split, val_predictions)
precision, recall, f1, support = precision_recall_fscore_support(y_val_split, val_predictions, average='weighted')
classification_rep = classification_report(y_val_split, val_predictions)
print(classification_rep)

               precision    recall  f1-score   support

Non-sarcastic       0.87      0.99      0.93      2142
    Sarcastic       0.90      0.37      0.52       496

     accuracy                           0.87      2638
    macro avg       0.88      0.68      0.73      2638
 weighted avg       0.88      0.87      0.85      2638



In [20]:
# Print the dataset with text and predicted labels
print(test_df[['Text', 'Predicted_Labels']])

                                                   Text Predicted_Labels
0     Shavakallarayile Kuzhimaadathile Peril Oru Let...    Non-sarcastic
1     ഗീതു മോഹൻദാസ് മലയാള സിനിമക്കു നൽകുന്ന വമ്പൻ ഗി...    Non-sarcastic
2                      Ente ponno ah sound🥰🥰 poli poli🤘    Non-sarcastic
3     Villain sharafudheen  ennu thonnunnavar likikk...    Non-sarcastic
4                    pulimurukan trailer ano kanunath 🤔    Non-sarcastic
...                                                 ...              ...
2821  Ente ponno oru adaaru jagapoka aanenu manasila...    Non-sarcastic
2822  എന്റെ ഇക്ക nja നമിച്ചു... ഒരു രക്ഷയില്ല ഹെവി ഐ...    Non-sarcastic
2823  ദേ ഇപ്പൊ കണ്ട് ഇറങ്ങിയതേ ഉള്ളു  96  Karikku (+...    Non-sarcastic
2824  1) Drisyam 2) Memories  3) Seconds 4) Grand ma...    Non-sarcastic
2825  Super mammoookkkaaa... ....   Oru lalettan bha...    Non-sarcastic

[2826 rows x 2 columns]


In [21]:
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13188 entries, 0 to 13187
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    13188 non-null  object
 1   labels  13188 non-null  object
dtypes: object(2)
memory usage: 206.2+ KB
None


In [41]:
print(train_df.head())
print(train_df.info())

                                                Text         labels
0           Screenshot edukkan vannth njan മാത്രമാണോ      Sarcastic
1  നമ്മുടെ അനു സിത്താര ചേച്ചി ഇങ്ങനെ വരുന്നത് നോക...      Sarcastic
2             Mollyhood is getting bigger and bigger  Non-sarcastic
3     Ho aaa BGM. Mammookka ithu oru pwoli pwolikkum  Non-sarcastic
4  Enthaale, sambhavam puraanam aanelum backgroun...      Sarcastic
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13188 entries, 0 to 13187
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    13188 non-null  object
 1   labels  13188 non-null  object
dtypes: object(2)
memory usage: 206.2+ KB
None
