In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB  # Changed to GaussianNB

In [None]:
# Read the train / dev / test splits from CSV files in the working directory.
# The files are read in the order listed and bound to the three frames below.
train_df, dev_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('tamil_train.csv', 'tamil_dev.csv', 'tamil_test.csv')
)

In [None]:
# Pull the model inputs ('text' column) and targets ('label' column) out of each frame.
# Only the 'text' column is taken from the test frame.
X_train, y_train = train_df['text'], train_df['label']
X_dev, y_dev = dev_df['text'], dev_df['label']
X_test = test_df['text']

In [None]:
# TF-IDF features.
# The vectorizer is fitted on the training text only; the dev and test text are
# transformed with that same fitted vectorizer.
# Every result is converted to a dense array via .toarray().
# NOTE(review): presumably densified because one of the estimators (GaussianNB)
# needs dense input -- confirm; dense storage grows with documents x vocabulary.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_dev_tfidf, X_test_tfidf = (
    vectorizer.transform(texts).toarray() for texts in (X_dev, X_test)
)

In [None]:
# Candidate classifiers, keyed by the display name used in the results table and
# as the column name in the saved test predictions.
#
# SGD, Random Forest and Decision Tree draw random numbers while fitting, so they
# are given a fixed seed: without it the reported metrics and the saved
# predictions change on every Restart & Run All. The remaining estimators are
# left at their defaults.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "SGD": SGDClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE)
}

In [None]:
# Fit every candidate on the TF-IDF training features, score it on the dev split,
# and keep its predictions for the test split.
results = []      # one metrics row (dict) per model, in `models` order
predictions = {}  # model name -> predicted labels for the test split

# Column name in the results table -> metric function that fills it.
metric_columns = (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-score', f1_score),
)

for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    dev_preds = model.predict(X_dev_tfidf)

    # All three scores are computed with average='macro' on the dev labels.
    row = {'Model': name}
    for column, metric in metric_columns:
        row[column] = metric(y_dev, dev_preds, average='macro')
    results.append(row)

    # Test-split predictions are stored under the model's display name.
    predictions[name] = model.predict(X_test_tfidf)

In [None]:

# Build both output frames up front: one row per model for the dev-set metrics,
# one column per model for the test-set predictions.
results_df = pd.DataFrame(results)
prediction_df = pd.DataFrame(predictions)

# Header line followed by the metrics as a plain-text table without the index.
print("\nValidation Results:", results_df.to_string(index=False), sep="\n")

# Persist the test predictions (index column omitted) and confirm on stdout.
prediction_df.to_csv('test_predictions.csv', index=False)
print("\nTest predictions saved to test_predictions.csv")


Validation Results:
               Model  Precision   Recall  F1-score
 Logistic Regression   0.725598 0.641070  0.639221
                 SVM   0.810495 0.738721  0.751081
                 KNN   0.653742 0.533580  0.467296
Gaussian Naive Bayes   0.639669 0.638588  0.602264
                 SGD   0.713173 0.699420  0.703877
       Random Forest   0.824672 0.774957  0.787484
       Decision Tree   0.770400 0.775609  0.772602

Test predictions saved to test_predictions.csv
