In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

In [2]:
# Load the datasets
train_df = pd.read_csv('/Users/arunaa/Python/Sracasam/sarcasm_tam_train.csv')
test_df = pd.read_csv('/Users/arunaa/Python/Sracasam/sarcasm_tam_test_without_labels.csv')

# Separate features and labels
X_train = train_df['Text']
y_train = train_df['labels']

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Vectorize the text data using TF-IDF or CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(test_df['Text'])

# Split the data for evaluation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_vectorized, y_train_encoded, test_size=0.2, random_state=42)

# Initialize and train the Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_clf.fit(X_train_split, y_train_split)

# Make predictions on the validation set
val_predictions = gb_clf.predict(X_val_split)

# Evaluate the model
accuracy = accuracy_score(y_val_split, val_predictions)
precision, recall, f1, support = precision_recall_fscore_support(y_val_split, val_predictions, average='weighted')
classification_rep = classification_report(y_val_split, val_predictions)

# Print the evaluation metrics and classification report
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Support: {support}")
print("\nClassification Report:\n", classification_rep)

# Make predictions on the test set
test_predictions = gb_clf.predict(X_test_vectorized)

# Convert predictions back to original labels
test_predictions_labels = label_encoder.inverse_transform(test_predictions)

# Save predictions to CSV file
test_df['Predicted_Labels'] = test_predictions_labels
output_path = '/Users/arunaa/Python/Sracasam/predictions_GB.csv.csv'
test_df.to_csv(output_path, index=False)

# Print the dataset with text and predicted labels
print(test_df[['Text', 'Predicted_Labels']])

print(f"Predictions saved to {output_path}")


Accuracy: 0.7578626986810957
F1 Score: 0.6974148567929362
Precision: 0.7498483479460821
Recall: 0.7578626986810957
Support: None

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.98      0.85      4318
           1       0.72      0.17      0.27      1596

    accuracy                           0.76      5914
   macro avg       0.74      0.57      0.56      5914
weighted avg       0.75      0.76      0.70      5914

                                                   Text Predicted_Labels
0         Kangana wow  awesome yr ye lakdi sbae alh hai    Non-sarcastic
1     விழுப்புரம்  வன்னிய கவுண்டர் சார்பாக வாழ்த்துக...    Non-sarcastic
2     திரௌபதி திரைப்படம் வெற்றி பெற வாணியர் சமுதாயம்...    Non-sarcastic
3     இந்த திரைப்படம் வெற்றிபெற, ஆதி தமிழன் அதாவது இ...    Non-sarcastic
4     dai thala pera sonnalay summa tamil naday athi...    Non-sarcastic
...                                                 ...              ...
6333    