In [1]:
# train_logistic_regression.py
import os, re, time, joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Input CSV; the script reads its "text" and "sentiment" columns.
DATA_PATH = "data/2.synthetic_wisesight_like_thai_sentiment_hard_5000 (1).csv"
# Destination of the joblib bundle (fitted pipeline plus metadata) written at the end.
OUT_PATH = "outputs/LogisticRegression.joblib"

def clean(text):
    """Return *text* with whitespace runs collapsed and the ends trimmed.

    The value is first converted with ``str()``, so non-string inputs are
    accepted. Every run of one or more whitespace characters becomes a
    single space, and leading/trailing whitespace is removed.
    """
    as_string = str(text)
    single_spaced = re.sub(r"\s+", " ", as_string)
    return single_spaced.strip()

# --- Load data ---------------------------------------------------------------
# Keep only the two columns the model uses and drop rows where either is missing,
# then normalise whitespace in the text column.
df = pd.read_csv(DATA_PATH)[["text", "sentiment"]].dropna()
df["text"] = df["text"].apply(clean)

# NOTE(review): exact-duplicate texts are not removed before splitting. If the
# dataset contains repeated rows, copies can land on both sides of the split and
# inflate the held-out scores -- confirm against the data.

# --- Split -------------------------------------------------------------------
# 80/20 hold-out, stratified on the label so both parts keep the class mix;
# the fixed seed makes the split reproducible.
Xtr, Xte, ytr, yte = train_test_split(
    df["text"], df["sentiment"],
    test_size=0.2, stratify=df["sentiment"], random_state=42
)

# --- Model -------------------------------------------------------------------
# Vectoriser and classifier live in one Pipeline, so the TF-IDF vocabulary is
# fitted on the training split only and the saved artifact accepts raw text.
# NOTE(review): the vectoriser uses its default word analyzer/token pattern.
# The data file name suggests Thai text, which is normally written without
# spaces between words -- confirm this tokenisation is what is intended.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),     # unigrams and bigrams
        min_df=5,              # drop terms seen in fewer than 5 documents
        max_df=0.9,            # drop terms seen in more than 90% of documents
        sublinear_tf=True      # use 1 + log(tf) instead of raw term frequency
    )),
    ("clf", LogisticRegression(
        C=0.3,                 # inverse regularisation strength (smaller = stronger)
        max_iter=3000,
        class_weight="balanced",   # reweight classes inversely to their frequency
        solver="lbfgs"
    ))
])

# --- Train and evaluate on the held-out split --------------------------------
pipe.fit(Xtr, ytr)
pred = pipe.predict(Xte)

acc = accuracy_score(yte, pred)
f1  = f1_score(yte, pred, average="macro")

# --- Persist -----------------------------------------------------------------
# Create the output directory first: joblib.dump does not create missing parent
# directories, and failing here would throw away the finished training run.
os.makedirs(os.path.dirname(OUT_PATH) or ".", exist_ok=True)

# The bundle stores the fitted pipeline together with its evaluation metadata.
# NOTE(review): "trained_at" is local wall-clock time without a timezone.
joblib.dump({
    "pipeline": pipe,
    "model_name": "Logistic Regression",
    "model_version": "v1.0",
    "accuracy": acc,
    "macro_f1": f1,
    "trained_at": time.strftime("%Y-%m-%d %H:%M:%S")
}, OUT_PATH)

print("Logistic Regression v1.0")
print("Accuracy :", acc)
print("Macro-F1 :", f1)


Logistic Regression v1.0
Accuracy : 0.994
Macro-F1 : 0.9940260947371126
