In [None]:
import pandas as pd
import numpy as np
import re

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Load data
# The training set is down-sampled to 100,000 rows. The sample is seeded so that
# a fresh "Restart & Run All" trains on the same rows and reproduces the same
# threshold and validation scores.
training_data = (
    pd.read_csv("data/si670_kaggle1_train.csv")
    .set_index('id')
    .sample(n=100000, random_state=42)
)
validation_data = pd.read_csv("data/si670_kaggle1_validation.csv").set_index('id')
test_data = pd.read_csv("data/test.csv").set_index('id')

# Extract features and labels (the test file is only read for its "text" column)
X_train, y_train = training_data["text"], training_data["label"]
X_val, y_val = validation_data["text"], validation_data["label"]
X_test = test_data["text"]

In [None]:
class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises raw text before vectorisation.

    Each document is lower-cased and every run of digits is replaced by the
    literal token ``<NUM>``.
    """

    # Compiled once; matches one or more consecutive digits.
    _DIGIT_RUN = re.compile(r"\d+")

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the cleaned documents.

        Parameters
        ----------
        X : pandas Series or any other iterable of str
            Documents to clean.

        Returns
        -------
        Same kind of object ``X.apply`` returns when ``X`` provides ``apply``
        (a Series for a Series input, index preserved); otherwise a list of
        cleaned strings in input order.
        """
        # Objects exposing .apply (e.g. a pandas Series) keep the original code path.
        if hasattr(X, "apply"):
            return X.apply(self._clean)
        # Plain lists / tuples / arrays have no .apply; clean them element-wise
        # so the transformer also works on ad-hoc inputs such as ["some text"].
        return [self._clean(text) for text in X]

    @classmethod
    def _clean(cls, text):
        """Lower-case one document and replace each digit run with ``<NUM>``."""
        return cls._DIGIT_RUN.sub("<NUM>", text.lower())

In [None]:
# Two TF-IDF views of the same text, combined by FeatureUnion.
features = FeatureUnion(
    transformer_list=[
        (
            # Word-level 1- to 3-grams with sublinear term-frequency scaling;
            # max_df=0.95 is a document-frequency cut-off.
            "word",
            TfidfVectorizer(
                ngram_range=(1, 3),
                max_features=100000,
                max_df=0.95,
                sublinear_tf=True,
            ),
        ),
        (
            # Character-level 3- to 5-grams.
            "char",
            TfidfVectorizer(
                analyzer="char",
                ngram_range=(3, 5),
                max_features=50000,
            ),
        ),
    ]
)

In [None]:
# Full model: text cleaning -> TF-IDF features -> linear SVM wrapped in a
# calibrator so that predict_proba is available downstream.
pipeline = Pipeline([
    ("cleaner", TextCleaner()),
    ("features", features),
    ("clf", CalibratedClassifierCV(
        # LinearSVC's solver can use a random number generator; a fixed seed
        # keeps refits (and the tuned threshold below) repeatable.
        LinearSVC(class_weight="balanced", C=1.0, random_state=42),
        method="sigmoid",
        cv=3,
    )),
])

pipeline.fit(X_train, y_train)

In [None]:
# predict_proba returns one column per class; keep column 1 as the score.
# NOTE(review): presumably column 1 is the positive label (1) — confirm via pipeline.classes_.
val_class_probs = pipeline.predict_proba(X_val)
probs_val = val_class_probs[:, 1]

In [None]:
# Sweep candidate decision thresholds on the validation set and keep the one
# with the highest F1.
precisions, recalls, thresholds = precision_recall_curve(y_val, probs_val)

# precision_recall_curve returns one more precision/recall value than
# thresholds: the final point (precision=1, recall=0) has no threshold.
# Drop it so that an index into f1_scores is always a valid index into
# thresholds (otherwise best_idx could point one past the end).
f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1])
# nanargmax skips the NaN produced where precision + recall == 0.
best_idx = np.nanargmax(f1_scores)
best_threshold = thresholds[best_idx]

# NOTE: these metrics are computed on the same data the threshold was tuned
# on, so they are an optimistic estimate.
val_preds = (probs_val >= best_threshold).astype(int)
accuracy = accuracy_score(y_val, val_preds)
f1 = f1_score(y_val, val_preds)
print(f"Best Threshold: {best_threshold:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

Best Threshold: 0.1713
Validation Accuracy: 0.7411, F1 Score: 0.7748


In [None]:
# Score the test set and binarise with the threshold tuned on validation data.
test_class_probs = pipeline.predict_proba(X_test)
probs_test = test_class_probs[:, 1]
final_predictions = (probs_test >= best_threshold).astype(int)

# Write the submission file: a header row, then one "id,label" row per test example.
print('Writing to csv file...')
with open("si670_kaggle1_predictions.csv", "w") as f:
    f.write("id,label\n")
    f.writelines(
        f"{index},{prediction}\n"
        for index, prediction in zip(X_test.index, final_predictions)
    )
print('Done')