In [55]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import pickle

In [56]:
# Read the tab-separated train and test splits from the working directory.
train_data = pd.read_csv('train.tsv', sep='\t')
test_data = pd.read_csv('test.tsv', sep='\t')

# Show both column indexes, one per line, to confirm the splits share a schema.
print(train_data.columns, test_data.columns, sep='\n')

Index(['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label'], dtype='object')
Index(['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label'], dtype='object')


In [57]:
# Clean both splits in place and build the single free-text feature the models consume.
# NOTE(review): 'subject' and 'date' are folded into the model input; if either is
# tied to the label by how the dataset was assembled, this leaks the target and
# inflates the test metrics -- TODO confirm against the data source.
for split in (train_data, test_data):
    # Discard every row that has a missing value in any column.
    split.dropna(inplace=True)
    split['combined_text'] = (
        split['title'] + ' ' + split['text'] + ' ' + split['subject'] + ' ' + split['date']
    )

# Learn the label -> integer mapping on the training split only, then reuse it for test.
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])
test_data['label'] = label_encoder.transform(test_data['label'])

# Model inputs (raw text) and encoded targets for each split.
X_train, y_train = train_data['combined_text'], train_data['label']
X_test, y_test = test_data['combined_text'], test_data['label']

In [58]:
# Single seed shared by every estimator that draws random numbers, so that a
# Restart & Run All reproduces the same fitted models and the same metrics.
RANDOM_STATE = 42

# Candidate classifiers keyed by display name. The name is printed in the metric
# report and is also used to derive the pickle file name in evaluate_and_save_model.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(
        max_depth=10,
        min_samples_split=10,
        class_weight='balanced',
        # Previously unseeded: split selection could differ between runs.
        random_state=RANDOM_STATE,
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        subsample=0.8,
        random_state=RANDOM_STATE,
    ),
    # Previously unseeded: bootstrap samples and feature subsets varied per run.
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
}

def evaluate_and_save_model(name, model, X_train, y_train, X_test, y_test):
    """Fit a TF-IDF + classifier pipeline, pickle it, and print its test metrics.

    Args:
        name: Display name of the model. It is printed in the report header and,
            lower-cased with spaces replaced by underscores, forms the pickle
            file name ``<name>_model.pkl``.
        model: Estimator used as the final pipeline step. The same instance is
            placed in the pipeline, not a copy.
        X_train: Training documents passed to the pipeline's ``fit``.
        y_train: Training labels.
        X_test: Test documents to predict on.
        y_test: True test labels the predictions are scored against.

    Returns:
        The pipeline's predictions for ``X_test``.
    """
    text_clf = Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('classifier', model),
    ])
    text_clf.fit(X_train, y_train)
    y_pred = text_clf.predict(X_test)

    # Column 1 of predict_proba is used as the score for ROC-AUC; skipped when
    # the pipeline does not expose predict_proba.
    if hasattr(text_clf, 'predict_proba'):
        positive_scores = text_clf.predict_proba(X_test)[:, 1]
    else:
        positive_scores = None

    # Persist the whole fitted pipeline (vectorizer + classifier) next to the notebook.
    model_path = f"{name.replace(' ', '_').lower()}_model.pkl"
    with open(model_path, 'wb') as model_file:
        pickle.dump(text_clf, model_file)

    report = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
    }
    report['ROC-AUC'] = (
        'N/A' if positive_scores is None else roc_auc_score(y_test, positive_scores)
    )

    # Header line followed by one "metric: value" line per entry.
    report_lines = [f"\n{name} Metrics:"]
    report_lines.extend(f"{metric}: {value}" for metric, value in report.items())
    print("\n".join(report_lines))

    return y_pred

In [59]:
# Train, persist and score every candidate; keep each model's test predictions by name.
results = {
    model_name: evaluate_and_save_model(model_name, estimator, X_train, y_train, X_test, y_test)
    for model_name, estimator in classifiers.items()
}

def save_results(titles, results, path='result.txt'):
    """Write each model's (title, prediction) pairs to a text file.

    For every entry in ``results`` the file receives a ``"<model> Results:"``
    header line, then the ``str()`` of the list of ``(title, prediction)``
    tuples, then a blank line. An existing file at ``path`` is overwritten.

    Args:
        titles: Sequence of titles, in the same order as the predictions.
        results: Mapping of model name to an iterable of predictions.
        path: Destination file. Defaults to ``'result.txt'``, the location
            used before this parameter existed.
    """
    # Explicit UTF-8: titles may contain non-ASCII characters, and the platform
    # default encoding (e.g. cp1252) can raise UnicodeEncodeError on them.
    with open(path, 'w', encoding='utf-8') as f:
        for model_name, predictions in results.items():
            f.write(f"{model_name} Results:\n")
            # zip stops at the shorter input, so surplus titles or predictions are dropped.
            output = list(zip(titles, predictions))
            f.write(str(output) + '\n\n')

# Pair every test-set title with each model's predictions and write them to disk.
test_titles = test_data['title'].tolist()
save_results(test_titles, results)


Logistic Regression Metrics:
Accuracy: 0.9891133422039434
Precision: 0.9874780866516404
Recall: 0.9899573186040673
F1 Score: 0.9887161484453361
ROC-AUC: 0.9988212039355872

Decision Tree Metrics:
Accuracy: 0.9970968912543849
Precision: 0.9964885879107098
Recall: 0.9974893296510168
F1 Score: 0.9969887076537014
ROC-AUC: 0.9985696094489349

Gradient Boosting Metrics:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC-AUC: 1.0

Random Forest Metrics:
Accuracy: 0.9918954880851578
Precision: 0.9934475806451613
Recall: 0.9897062515691689
F1 Score: 0.9915733869953465
ROC-AUC: 0.9997077331225401
