## **5. Predict on Validation Data**

5.1 Import the required libraries and define the file paths

In [1]:
import os, csv, joblib, pandas as pd
from typing import Optional, List, Tuple
# Serialized artifacts that load_artifacts() reads with joblib.
MODEL_PATH = "svm_tfidf.pkl"
TFIDF_PATH = "tfidf_vectorizer.pkl"

# Input candidates: VAL_READY is preferred when it exists, VAL_RAW is the fallback.
VAL_RAW = "validation_data.csv"
VAL_READY = "validation_ready.csv"

# Destination of the CSV that carries the predicted labels.
OUT_PATH = "validation_data_predicted.csv"

# Column names probed, in priority order, when looking for the raw text column.
CAND_TEXT_COLS: List[str] = [
    "text",
    "content",
    "article",
    "body",
    "full_text",
    "title",
]

5.2 Define the helper functions and run the prediction on the validation data

In [2]:
# ---------- helpers ----------
def sniff_delimiter(path: str, default: str = ",") -> str:
    """Guess the field delimiter of a delimited text file.

    The first 2048 characters of ``path`` (decoded as UTF-8) are handed to
    ``csv.Sniffer``. Detection is best-effort: any expected failure falls
    back to ``default`` instead of raising.

    Args:
        path: Path of the file to inspect.
        default: Delimiter returned when detection is not possible.

    Returns:
        The detected delimiter, or ``default`` when the file cannot be read
        or decoded, is empty, or the sniffer cannot decide.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            sample = f.read(2048)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: covers
        # UnicodeDecodeError for content that is not valid UTF-8.
        return default

    if not sample:
        # Nothing to analyse; avoid relying on the sniffer's error path.
        return default

    try:
        return csv.Sniffer().sniff(sample).delimiter
    except csv.Error:
        # Raised by the sniffer when no consistent delimiter is found.
        return default

def find_text_column(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    """Return the first name in ``candidates`` that is a column of ``df``.

    Candidates are checked in the order given, so earlier entries win.
    Returns ``None`` when none of them is present.
    """
    available = df.columns
    return next((name for name in candidates if name in available), None)

def ensure_text_clean(df_in: pd.DataFrame, text_col: Optional[str]) -> Tuple[pd.DataFrame, str]:
    """Return a copy of ``df_in`` that is guaranteed to have a ``text_clean`` column.

    Resolution order:
      1. If ``text_clean`` already exists, the copy is returned untouched.
      2. If a callable named ``text_preprocessing`` exists in this module's
         globals, it is called as ``text_preprocessing(df, text_col=...)``
         and must produce ``text_clean``.
      3. Otherwise a minimal built-in cleaner is applied: lowercase, strip
         URLs, replace punctuation with spaces, collapse whitespace.

    Args:
        df_in: Input frame; it is never modified.
        text_col: Name of the raw text column. When falsy, the first match
            from ``CAND_TEXT_COLS`` is used.

    Returns:
        ``(frame, "text_clean")``: the processed copy and the name of the
        cleaned column.

    Raises:
        ValueError: No text column was given and none could be found.
        KeyError: ``text_preprocessing`` ran but did not create ``text_clean``.
    """
    df = df_in.copy()
    if "text_clean" in df.columns:
        return df, "text_clean"

    chosen = text_col or find_text_column(df, CAND_TEXT_COLS)
    if chosen is None:
        raise ValueError(
            f"Couldn't find a text column. Looked for {CAND_TEXT_COLS}. "
            f"Columns in your file: {list(df.columns)}"
        )

    # Prefer the full pipeline when it is available in the running session.
    preprocess = globals().get("text_preprocessing")
    if callable(preprocess):
        df = preprocess(df, text_col=chosen)
        if "text_clean" not in df.columns:
            raise KeyError("text_preprocessing() did not create 'text_clean'.")
        return df, "text_clean"

    # Minimal fallback cleaner (only if your pipeline isn't in memory)
    import re
    import string

    # Compile once instead of once per row.
    url_re = re.compile(r"http\S+|www\.\S+")
    punct_re = re.compile(f"[{re.escape(string.punctuation)}]")
    space_re = re.compile(r"\s+")

    def _basic_clean_txt(s):
        # Missing cells show up as None/NaN/pd.NA; str(NaN) would otherwise
        # inject the literal token "nan" into the text fed to the model.
        if s is None or (pd.api.types.is_scalar(s) and pd.isna(s)):
            return ""
        s = str(s).lower()
        s = url_re.sub(" ", s)
        s = punct_re.sub(" ", s)
        return space_re.sub(" ", s).strip()

    df["text_clean"] = df[chosen].map(_basic_clean_txt)
    return df, "text_clean"

def load_artifacts(tfidf_path: str, model_path: str):
    """Load the two serialized artifacts with ``joblib``.

    Args:
        tfidf_path: File holding the object later used as ``tfidf``.
        model_path: File holding the object later used as ``model``.

    Returns:
        A ``(tfidf, model)`` tuple; the vectorizer file is read first.
    """
    return joblib.load(tfidf_path), joblib.load(model_path)

def predict_labels(df_proc: pd.DataFrame, tfidf, model, clean_col: str = "text_clean") -> pd.Series:
    """Vectorize the cleaned text and return the model's predictions.

    Missing values in ``clean_col`` are replaced by empty strings before
    ``tfidf.transform`` is called. The result is a Series named
    ``label_pred`` that shares the index of ``df_proc``.
    """
    texts = df_proc[clean_col].fillna("")
    features = tfidf.transform(texts)
    return pd.Series(model.predict(features), index=df_proc.index, name="label_pred")

def apply_predictions_replace_twos(df_orig: pd.DataFrame, preds: pd.Series) -> pd.DataFrame:
    """
    Build a copy of ``df_orig`` whose ``label`` column carries the predictions.

    - When ``label`` exists, only the rows where it equals 2 are overwritten
      with the matching entries of ``preds``; every other label is kept.
    - When ``label`` is absent, it is created from ``preds`` positionally.

    ``df_orig`` itself is left unchanged.
    """
    result = df_orig.copy()
    if "label" in result.columns:
        needs_prediction = result["label"].eq(2)
        result.loc[needs_prediction, "label"] = preds.loc[needs_prediction].values
    else:
        result["label"] = preds.values
    return result

def write_like_input(df_out: pd.DataFrame, df_input: pd.DataFrame, input_path: str, out_path: str):
    """Write ``df_out`` as CSV using the input file's columns and delimiter.

    Only the columns of ``df_input`` are written, in their original order,
    with ``label`` appended if the input did not have it. The delimiter is
    sniffed from ``input_path`` (comma as fallback). The parent directory of
    ``out_path`` is created when the path contains one.

    Returns:
        ``out_path``, unchanged.
    """
    columns = list(df_input.columns)
    if "label" not in columns:
        columns = columns + ["label"]

    parent_dir = os.path.dirname(out_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    separator = sniff_delimiter(input_path, default=",")
    selected = df_out[columns]
    selected.to_csv(
        out_path,
        sep=separator,
        index=False,
        lineterminator="\n",
        quoting=csv.QUOTE_MINIMAL,
    )
    return out_path

# ---------- main ----------
def predict_and_replace_on_validation():
    """Run the whole validation flow: load, clean, predict, write, report.

    Steps:
      1. Load the vectorizer and model from ``TFIDF_PATH`` / ``MODEL_PATH``.
      2. Read ``VAL_READY`` if it exists, otherwise ``VAL_RAW``.
      3. Ensure a ``text_clean`` column and predict a label for every row.
      4. Replace labels equal to 2 (or create ``label``) with the predictions.
      5. Write the result to ``OUT_PATH`` (or over the source file when
         ``OUT_PATH`` is empty) and print the label distribution.

    Raises:
        FileNotFoundError: Neither validation file exists.
    """
    tfidf, model = load_artifacts(TFIDF_PATH, MODEL_PATH)

    # First existing candidate wins; the ready file is checked before the raw one.
    src_path = next((p for p in (VAL_READY, VAL_RAW) if os.path.exists(p)), None)
    if src_path is None:
        raise FileNotFoundError(f"Neither {VAL_READY} nor {VAL_RAW} was found.")

    # sep=None asks pandas to detect the delimiter, which needs the python engine.
    df_in = pd.read_csv(src_path, sep=None, engine="python")

    # Clean (or reuse) the text, then score every row.
    df_proc, clean_col = ensure_text_clean(df_in, text_col=None)
    preds = predict_labels(df_proc, tfidf, model, clean_col=clean_col)

    # Only the placeholder labels (2) are overwritten; the rest are kept.
    df_out = apply_predictions_replace_twos(df_in, preds)

    # Persist with the same columns and delimiter as the source file.
    out_path = write_like_input(df_out, df_in, src_path, OUT_PATH or src_path)

    # Small sanity report
    print(f"[OK] Saved predictions -> {out_path}")
    if "label" in df_out.columns:
        label_counts = df_out["label"].value_counts().sort_index().to_dict()
        print("Label counts:", label_counts)
# Execute the pipeline when this cell runs: it loads the artifacts, predicts on
# the validation file, writes the output CSV and prints the label counts.
predict_and_replace_on_validation()

[OK] Saved predictions -> validation_data_predicted.csv
Label counts: {0: 3465, 1: 1491}


Labels 0 and 1 are the two classes that my model predicts: 0 = Fake news and 1 = Real news.
- 3,465 rows were predicted as class 0
- 1,491 rows were predicted as class 1