# In[4]:
import pandas as pd
import os
import joblib

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# In[5]:
# Path where the fitted classifier step alone is saved.
MODEL_FILE = "spam_model.pkl"
# Path where the full fitted pipeline (vectorizer + classifier) is saved
# and from which it is loaded for inference.
PIPELINE_FILE = "spam_pipeline.pkl"
# Path of the labelled training CSV.
DATA_FILE = "spam.csv"


# Build Pipeline:

def build_pipeline():
    """Assemble the untrained spam-classification pipeline.

    The pipeline has two named steps:

    * ``"tfidf"`` -- a ``TfidfVectorizer`` that drops English stop words,
      uses unigrams and bigrams, and keeps at most 5000 features.
    * ``"model"`` -- a ``LinearSVC`` with balanced class weights and a
      fixed random seed.

    Returns:
        An unfitted ``Pipeline`` ready for ``fit``.
    """
    vectorizer = TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        max_features=5000,
    )
    classifier = LinearSVC(class_weight="balanced", random_state=42)

    steps = [
        ("tfidf", vectorizer),
        ("model", classifier),
    ]
    return Pipeline(steps)



# Train Model:

def train():
    """Train the spam classifier and persist it to disk.

    Reads ``DATA_FILE`` (latin-1 encoded), keeps the ``label`` and
    ``message`` columns, encodes labels as ``spam -> 1`` / ``ham -> 0``,
    fits the pipeline on a stratified 80% split, then writes:

    * the full fitted pipeline to ``PIPELINE_FILE``,
    * the fitted classifier step alone to ``MODEL_FILE``,
    * the held-out messages to ``input.csv`` for later inference.

    Raises:
        KeyError: If the data has no ``label`` or ``message`` column.
        ValueError: If no usable (labelled) rows remain after cleaning.
    """
    df = pd.read_csv(DATA_FILE, encoding="latin-1")

    # Normalise header names BEFORE selecting columns; otherwise headers
    # such as "Label" or " message " fail the lookup below.
    df.columns = df.columns.str.lower().str.strip()

    missing = [col for col in ("label", "message") if col not in df.columns]
    if missing:
        raise KeyError(f"{DATA_FILE} is missing required column(s): {missing}")

    # .copy() so the assignments below act on an independent frame rather
    # than a slice of the original (avoids SettingWithCopyWarning).
    df = df[["label", "message"]].dropna(subset=["label", "message"]).copy()

    # Tolerate stray whitespace / capitalisation in label values.
    df["label"] = (
        df["label"].astype(str).str.strip().str.lower().map({"spam": 1, "ham": 0})
    )

    # Labels outside {spam, ham} map to NaN; drop them instead of letting
    # NaN reach the stratified split and the classifier.
    df = df.dropna(subset=["label"])
    if df.empty:
        raise ValueError(f"No rows labelled 'spam' or 'ham' found in {DATA_FILE}")
    df["label"] = df["label"].astype(int)

    # The vectorizer requires strings; numeric-looking messages are parsed
    # as numbers by read_csv, so cast explicitly.
    X = df["message"].astype(str)
    y = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    pipeline = build_pipeline()
    pipeline.fit(X_train, y_train)

    joblib.dump(pipeline, PIPELINE_FILE)
    joblib.dump(pipeline.named_steps["model"], MODEL_FILE)

    # Save the held-out messages as the default inference input.
    X_test.to_csv("input.csv", index=False)

    print("Model trained and saved successfully!")



# Inference:

def inference(input_csv="input.csv"):
    """Classify the messages in ``input_csv`` and write ``output.csv``.

    Loads the fitted pipeline from ``PIPELINE_FILE``, predicts a label for
    every row of the ``message`` column and stores the result in a new
    ``predicted_label`` column (``"Spam"`` / ``"Ham"``). The augmented
    table is written to ``output.csv``.

    Args:
        input_csv: Path of a CSV file containing a ``message`` column.

    Raises:
        FileNotFoundError: If ``input_csv`` or ``PIPELINE_FILE`` is missing.
        KeyError: If ``input_csv`` has no ``message`` column.
    """
    if not os.path.exists(input_csv):
        # Report the path that was actually requested, not a fixed name.
        raise FileNotFoundError(f"{input_csv} not found!")
    if not os.path.exists(PIPELINE_FILE):
        raise FileNotFoundError(f"{PIPELINE_FILE} not found! Train the model first.")

    # NOTE: joblib.load unpickles arbitrary objects; only load pipeline
    # files that come from a trusted source.
    pipeline = joblib.load(PIPELINE_FILE)
    data = pd.read_csv(input_csv)

    if "message" not in data.columns:
        raise KeyError(f"{input_csv} has no 'message' column")

    # Empty cells are read back as NaN and numeric-looking text as numbers;
    # the vectorizer accepts only strings.
    messages = data["message"].fillna("").astype(str)

    if len(messages) > 0:
        predictions = pipeline.predict(messages)
        # Build the Series on data's own index so the assignment aligns
        # row-for-row regardless of how the frame is indexed.
        data["predicted_label"] = pd.Series(predictions, index=data.index).map({
            1: "Spam",
            0: "Ham"
        })
    else:
        # Header-only input: nothing to predict, still emit the column.
        data["predicted_label"] = pd.Series(dtype=object)

    data.to_csv("output.csv", index=False)
    print("✅ Inference completed. Output saved to output.csv")



# Entry Point:

if __name__ == "__main__":
    # inference() loads PIPELINE_FILE, so checking MODEL_FILE alone is not
    # enough: retrain whenever either saved artifact is missing.
    artifacts_ready = os.path.exists(MODEL_FILE) and os.path.exists(PIPELINE_FILE)
    if not artifacts_ready:
        train()
    else:
        inference()

# Output: ✅ Inference completed. Output saved to output.csv
