In [3]:
import os, re, json, glob, warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, accuracy_score
from joblib import dump, load

warnings.filterwarnings("ignore")
np.random.seed(42)

ARTIFACT_DIR = "./artifacts"
BACKUP_DIR   = "./backups"
EDA_DIR      = "./eda_exports"
os.makedirs(ARTIFACT_DIR, exist_ok=True)
os.makedirs(BACKUP_DIR, exist_ok=True)
os.makedirs(EDA_DIR, exist_ok=True)

print("✅ Setup ready")


✅ Setup ready


In [4]:
CSV_PATH  = os.path.expanduser("~/Desktop/Spicejet ALL FAQ.csv")
XLSX_PATH = os.path.expanduser("~/Desktop/Spicejet ALL FAQ.xlsx")

if   os.path.exists(CSV_PATH):  file_path = CSV_PATH
elif os.path.exists(XLSX_PATH): file_path = XLSX_PATH
else:
    # fuzzy search fallback
    matches = glob.glob(os.path.expanduser("~/Desktop/Spicejet*FAQ.*"))
    file_path = matches[0] if matches else None

assert file_path is not None, "❌ File not found on Desktop."
file_path


'/Users/akshitaantil/Desktop/Spicejet ALL FAQ.csv'

In [5]:
ext = os.path.splitext(file_path)[1].lower()
if ext == ".csv":
    df_raw = pd.read_csv(file_path, encoding="utf-8-sig")
elif ext in (".xlsx", ".xls"):
    df_raw = pd.read_excel(file_path)
else:
    raise ValueError(f"Unsupported type: {ext}")

backup_path = os.path.join(BACKUP_DIR, "raw_backup.csv")
df_raw.to_csv(backup_path, index=False, encoding="utf-8-sig")
print("✅ Raw backup saved:", backup_path, "Shape:", df_raw.shape)
df_raw.head(10)


✅ Raw backup saved: ./backups/raw_backup.csv Shape: (195, 5)


Unnamed: 0,Category,Intent,Domestic/international,Question,Answer
0,General,general_faq,Domestic,What is it like to fly with SpiceJet ?,"Flying with SpiceJet is FUN, AFFORDABLE & EXCI..."
1,General,general_faq,Domestic,Does SpiceJet provide any food or drinks onboard?,"Variety of hot meals, sandwiches, chef's choic..."
2,General,general_faq,Domestic,Where does SpiceJet fly to?,SpiceJet connects you to wide network of desti...
3,General,general_faq,Domestic,What type of aircraft does SpiceJet fly?,"SpiceJet operates a fleet of Boeing (B737-700,..."
4,General,general_faq,Domestic,When will SpiceJet be flying to new destinations?,SpiceJetâ€™s aim is to provide affordable effi...
5,General,general_faq,Domestic,What if I just want to obtain flights schedule...,Passengers may view latest flight schedule fro...
6,General,general_faq,Domestic,Do I need any documents to enter the airport?,"Yes, you would need your flight itinerary and ..."
7,General,general_faq,Domestic,How do I check-in for the flight?,For a swift airport experience and to avoid po...
8,General,general_faq,Domestic,What valid ID proof would be accepted at the a...,All Foreign Nationals/Non-Indian Nationals/NRI...
9,General,general_faq,Domestic,Does SpiceJet offer connections to other airli...,"No, SpiceJet does not offer connections to oth..."


In [6]:
df = df_raw.copy()

cands = {
    "question": ["question","Question","questions","query","Query","Q"],
    "answer":   ["answer","Answer","answers","response","Response","A","Ans"],
    "intent":   ["intent","Intent","label","Label","class","Class","CATEGORY","Category"]
}
def pick(name_list, cols): 
    for c in name_list:
        if c in cols: return c

cols = set(df.columns)
q_col = pick(cands["question"], cols)
a_col = pick(cands["answer"], cols)
i_col = pick(cands["intent"], cols)  # may be None

assert q_col and a_col, "❌ Need at least question & answer columns."

keep = [q_col, a_col] + ([i_col] if i_col else [])
df = df[keep].copy()
df.columns = ["question","answer"] + (["intent"] if i_col else [])
print("✅ Using columns:", df.columns.tolist(), "Rows:", len(df))
df.head(5)


✅ Using columns: ['question', 'answer', 'intent'] Rows: 195


Unnamed: 0,question,answer,intent
0,What is it like to fly with SpiceJet ?,"Flying with SpiceJet is FUN, AFFORDABLE & EXCI...",general_faq
1,Does SpiceJet provide any food or drinks onboard?,"Variety of hot meals, sandwiches, chef's choic...",general_faq
2,Where does SpiceJet fly to?,SpiceJet connects you to wide network of desti...,general_faq
3,What type of aircraft does SpiceJet fly?,"SpiceJet operates a fleet of Boeing (B737-700,...",general_faq
4,When will SpiceJet be flying to new destinations?,SpiceJetâ€™s aim is to provide affordable effi...,general_faq


In [7]:
eda = {
    "rows": len(df),
    "columns": df.columns.tolist(),
    "null_counts": df.isnull().sum().to_dict(),
    "question_len": {
        "min": int(df["question"].astype(str).str.len().min()),
        "max": int(df["question"].astype(str).str.len().max()),
        "mean": float(df["question"].astype(str).str.len().mean())
    },
    "answer_len": {
        "min": int(df["answer"].astype(str).str.len().min()),
        "max": int(df["answer"].astype(str).str.len().max()),
        "mean": float(df["answer"].astype(str).str.len().mean())
    }
}
if "intent" in df.columns:
    eda["n_classes"] = int(df["intent"].nunique())
    eda["top_classes"] = df["intent"].astype(str).value_counts().head(10).to_dict()

eda_path = os.path.join(EDA_DIR, "eda_summary.json")
with open(eda_path, "w", encoding="utf-8") as f:
    json.dump(eda, f, indent=2, ensure_ascii=False)

print("✅ EDA saved to", eda_path)
eda


✅ EDA saved to ./eda_exports/eda_summary.json


{'rows': 195,
 'columns': ['question', 'answer', 'intent'],
 'null_counts': {'question': 0, 'answer': 0, 'intent': 104},
 'question_len': {'min': 17, 'max': 181, 'mean': 55.0974358974359},
 'answer_len': {'min': 4, 'max': 2151, 'mean': 445.05128205128204},
 'n_classes': 3,
 'top_classes': {'nan': 104,
  'booking_faq': 43,
  'general_faq': 31,
  'specialAssistance_faq': 17}}

In [8]:
import re

def normalize_text(s):
    if not isinstance(s, str): return ""
    s = s.strip().lower()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^\w\s./%₹$-]", " ", s)  # keep useful symbols
    s = re.sub(r"\s+", " ", s).strip()
    return s

df["question_clean"] = df["question"].map(normalize_text)
df["answer_clean"]   = df["answer"].map(normalize_text)

print("✅ Clean columns added.")
df[["question","question_clean","answer","answer_clean"]].head(5)


✅ Clean columns added.


Unnamed: 0,question,question_clean,answer,answer_clean
0,What is it like to fly with SpiceJet ?,what is it like to fly with spicejet,"Flying with SpiceJet is FUN, AFFORDABLE & EXCI...",flying with spicejet is fun affordable excitin...
1,Does SpiceJet provide any food or drinks onboard?,does spicejet provide any food or drinks onboard,"Variety of hot meals, sandwiches, chef's choic...",variety of hot meals sandwiches chef s choices...
2,Where does SpiceJet fly to?,where does spicejet fly to,SpiceJet connects you to wide network of desti...,spicejet connects you to wide network of desti...
3,What type of aircraft does SpiceJet fly?,what type of aircraft does spicejet fly,"SpiceJet operates a fleet of Boeing (B737-700,...",spicejet operates a fleet of boeing b737-700 b...
4,When will SpiceJet be flying to new destinations?,when will spicejet be flying to new destinations,SpiceJetâ€™s aim is to provide affordable effi...,spicejetâ s aim is to provide affordable effic...


In [9]:
if "intent" not in df.columns:
    def canon_answer(s):
        s = normalize_text(s)
        s = re.sub(r"\b(spicejet|hello|dear customer|thanks|regards)\b", " ", s)
        s = re.sub(r"\s+", " ", s).strip()
        return s
    canon = df["answer_clean"].map(canon_answer)
    mapping = {k:i for i,k in enumerate(sorted(canon.unique()))}
    df["intent"] = canon.map(lambda x: f"intent_{mapping[x]:03d}")

print("✅ Intents ready. Classes:", df["intent"].nunique())
df["intent"].value_counts().head(10)


✅ Intents ready. Classes: 3


intent
booking_faq              43
general_faq              31
specialAssistance_faq    17
Name: count, dtype: int64

In [10]:
# --- PATCH: normalize/repair intents so they are all non-null strings ---

import re
def normalize_text(s):
    if not isinstance(s, str): return ""
    s = s.strip().lower()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^\w\s./%₹$-]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def canon_answer(s):
    s = normalize_text(s)
    s = re.sub(r"\b(spicejet|hello|dear customer|thanks|regards)\b", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Ensure we have clean helper cols (won't overwrite originals)
if "answer_clean" not in df.columns:
    df["answer_clean"] = df["answer"].map(normalize_text)

# 1) Find missing/empty intents
if "intent" not in df.columns:
    df["intent"] = np.nan

missing_mask = df["intent"].isna() | (df["intent"].astype(str).str.strip().isin(["", "nan", "None", "NaN"]))

# 2) For only the missing ones, create pseudo-intents from their answers
if missing_mask.any():
    canon = df.loc[missing_mask, "answer_clean"].map(canon_answer)
    # stable mapping only for the missing subset
    mapping = {k:i for i,k in enumerate(sorted(canon.unique()))}
    df.loc[missing_mask, "intent"] = canon.map(lambda x: f"intent_from_answer_{mapping[x]:03d}")

# 3) Finally, force everything to string
df["intent"] = df["intent"].astype(str)

# 4) Safety checks
assert not df["intent"].isna().any(), "There are still NaNs in intent — patch didn’t cover something."
print("✅ Intents repaired. Total classes:", df["intent"].nunique())
print("Sample intents:", list(df["intent"].unique())[:10])
# ----------------------------
# 📊 Cell 8 — 5-Fold Stratified Cross-Validation
# ----------------------------
# What/Why: 
# - Split into 5 equal folds (stratified = keeps intent balance in each fold)
# - Train on 4 folds, test on 1, repeat 5 times
# - Gives stable accuracy/F1 scores for small datasets like your 196-row SpiceJet data
# ----------------------------

from sklearn.model_selection import StratifiedKFold

# Make sure intents are strings (prevents float/string mix errors)
df["intent"] = df["intent"].astype(str)

X_all = df["question_clean"].values
y_all = df["intent"].values

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_stats = []
for fold, (tr_idx, te_idx) in enumerate(skf.split(X_all, y_all), start=1):
    X_tr, X_te = X_all[tr_idx], X_all[te_idx]
    y_tr, y_te = y_all[tr_idx], y_all[te_idx]
    
    # Calculate balanced class weights for this fold
    from sklearn.utils.class_weight import compute_class_weight
    classes = np.unique(y_tr)
    class_weights = compute_class_weight("balanced", classes=classes, y=y_tr)
    cw = {c: w for c, w in zip(classes, class_weights)}
    
    # Train the pipeline
    clf_pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=1, max_df=0.95)),
        ("clf", LogisticRegression(max_iter=300, C=2.0, class_weight=cw))
    ])
    clf_pipeline.fit(X_tr, y_tr)
    
    # Predict and score
    y_pred = clf_pipeline.predict(X_te)
    acc  = accuracy_score(y_te, y_pred)
    f1_m = f1_score(y_te, y_pred, average="macro")
    f1_w = f1_score(y_te, y_pred, average="weighted")
    
    fold_stats.append({
        "fold": fold,
        "accuracy": acc,
        "f1_macro": f1_m,
        "f1_weighted": f1_w
    })
    
    print(f"[Fold {fold}]  Acc={acc:.3f}  Macro-F1={f1_m:.3f}  Weighted-F1={f1_w:.3f}")

# Summarize across folds
cv_summary = {
    "accuracy_mean":  np.mean([d["accuracy"] for d in fold_stats]),
    "accuracy_std":   np.std([d["accuracy"]  for d in fold_stats]),
    "f1_macro_mean":  np.mean([d["f1_macro"] for d in fold_stats]),
    "f1_macro_std":   np.std([d["f1_macro"]  for d in fold_stats]),
    "f1_weighted_mean": np.mean([d["f1_weighted"] for d in fold_stats]),
    "f1_weighted_std":  np.std([d["f1_weighted"]  for d in fold_stats]),
    "per_fold": fold_stats
}
print("\nCV Summary:", json.dumps(cv_summary, indent=2))

# Save for later
with open(os.path.join(ARTIFACT_DIR, "cv_summary.json"), "w") as f:
    json.dump(cv_summary, f, indent=2)
print("✅ CV summary saved to artifacts/cv_summary.json")


✅ Intents repaired. Total classes: 96
Sample intents: ['general_faq', 'booking_faq', 'specialAssistance_faq', 'intent_from_answer_020', 'intent_from_answer_019', 'intent_from_answer_022', 'intent_from_answer_030', 'intent_from_answer_041', 'intent_from_answer_086', 'intent_from_answer_092']
[Fold 1]  Acc=0.410  Macro-F1=0.147  Weighted-F1=0.458
[Fold 2]  Acc=0.385  Macro-F1=0.127  Weighted-F1=0.427
[Fold 3]  Acc=0.410  Macro-F1=0.177  Weighted-F1=0.451
[Fold 4]  Acc=0.308  Macro-F1=0.131  Weighted-F1=0.370
[Fold 5]  Acc=0.282  Macro-F1=0.126  Weighted-F1=0.350

CV Summary: {
  "accuracy_mean": 0.35897435897435903,
  "accuracy_std": 0.05378506913693084,
  "f1_macro_mean": 0.1416353562479818,
  "f1_macro_std": 0.01928764179596957,
  "f1_weighted_mean": 0.4113547588660711,
  "f1_weighted_std": 0.04342863473040766,
  "per_fold": [
    {
      "fold": 1,
      "accuracy": 0.41025641025641024,
      "f1_macro": 0.14679487179487177,
      "f1_weighted": 0.45825115055884286
    },
    {
      

In [11]:
from sklearn.utils.class_weight import compute_class_weight

y_all = df["intent"].values
classes = np.unique(y_all)
class_weights = compute_class_weight("balanced", classes=classes, y=y_all)
cw = {c:w for c,w in zip(classes, class_weights)}

clf_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=1, max_df=0.95)),
    ("clf",   LogisticRegression(max_iter=300, C=2.0, class_weight=cw))
])

print("✅ Pipeline defined")


✅ Pipeline defined


In [12]:
# ----------------------------
# 📊 Cell 8 — 5-Fold Stratified Cross-Validation
# ----------------------------
# What/Why: 
# - Split into 5 equal folds (stratified = keeps intent balance in each fold)
# - Train on 4 folds, test on 1, repeat 5 times
# - Gives stable accuracy/F1 scores for small datasets like your 196-row SpiceJet data
# ----------------------------

from sklearn.model_selection import StratifiedKFold

# Make sure intents are strings (prevents float/string mix errors)
df["intent"] = df["intent"].astype(str)

X_all = df["question_clean"].values
y_all = df["intent"].values

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_stats = []
for fold, (tr_idx, te_idx) in enumerate(skf.split(X_all, y_all), start=1):
    X_tr, X_te = X_all[tr_idx], X_all[te_idx]
    y_tr, y_te = y_all[tr_idx], y_all[te_idx]
    
    # Calculate balanced class weights for this fold
    from sklearn.utils.class_weight import compute_class_weight
    classes = np.unique(y_tr)
    class_weights = compute_class_weight("balanced", classes=classes, y=y_tr)
    cw = {c: w for c, w in zip(classes, class_weights)}
    
    # Train the pipeline
    clf_pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=1, max_df=0.95)),
        ("clf", LogisticRegression(max_iter=300, C=2.0, class_weight=cw))
    ])
    clf_pipeline.fit(X_tr, y_tr)
    
    # Predict and score
    y_pred = clf_pipeline.predict(X_te)
    acc  = accuracy_score(y_te, y_pred)
    f1_m = f1_score(y_te, y_pred, average="macro")
    f1_w = f1_score(y_te, y_pred, average="weighted")
    
    fold_stats.append({
        "fold": fold,
        "accuracy": acc,
        "f1_macro": f1_m,
        "f1_weighted": f1_w
    })
    
    print(f"[Fold {fold}]  Acc={acc:.3f}  Macro-F1={f1_m:.3f}  Weighted-F1={f1_w:.3f}")

# Summarize across folds
cv_summary = {
    "accuracy_mean":  np.mean([d["accuracy"] for d in fold_stats]),
    "accuracy_std":   np.std([d["accuracy"]  for d in fold_stats]),
    "f1_macro_mean":  np.mean([d["f1_macro"] for d in fold_stats]),
    "f1_macro_std":   np.std([d["f1_macro"]  for d in fold_stats]),
    "f1_weighted_mean": np.mean([d["f1_weighted"] for d in fold_stats]),
    "f1_weighted_std":  np.std([d["f1_weighted"]  for d in fold_stats]),
    "per_fold": fold_stats
}
print("\nCV Summary:", json.dumps(cv_summary, indent=2))

# Save for later
with open(os.path.join(ARTIFACT_DIR, "cv_summary.json"), "w") as f:
    json.dump(cv_summary, f, indent=2)
print("✅ CV summary saved to artifacts/cv_summary.json")


[Fold 1]  Acc=0.410  Macro-F1=0.147  Weighted-F1=0.458
[Fold 2]  Acc=0.385  Macro-F1=0.127  Weighted-F1=0.427
[Fold 3]  Acc=0.410  Macro-F1=0.177  Weighted-F1=0.451
[Fold 4]  Acc=0.308  Macro-F1=0.131  Weighted-F1=0.370
[Fold 5]  Acc=0.282  Macro-F1=0.126  Weighted-F1=0.350

CV Summary: {
  "accuracy_mean": 0.35897435897435903,
  "accuracy_std": 0.05378506913693084,
  "f1_macro_mean": 0.1416353562479818,
  "f1_macro_std": 0.01928764179596957,
  "f1_weighted_mean": 0.4113547588660711,
  "f1_weighted_std": 0.04342863473040766,
  "per_fold": [
    {
      "fold": 1,
      "accuracy": 0.41025641025641024,
      "f1_macro": 0.14679487179487177,
      "f1_weighted": 0.45825115055884286
    },
    {
      "fold": 2,
      "accuracy": 0.38461538461538464,
      "f1_macro": 0.12745098039215688,
      "f1_weighted": 0.426596279537456
    },
    {
      "fold": 3,
      "accuracy": 0.41025641025641024,
      "f1_macro": 0.1772357723577236,
      "f1_weighted": 0.45128205128205134
    },
    {
   

In [13]:
# ----------------------------
# ✅ Cell 9 — Train FINAL model on ALL data
# ----------------------------
# What/Why:
# - After 5-fold CV, we train one last time on the entire dataset (no holdout)
# - Uses balanced class weights so rare intents matter
# - Produces `clf_pipeline` that later cells (retrieval/router/saving) will use
# ----------------------------

from sklearn.utils.class_weight import compute_class_weight

# Safety: keep intents as strings
df["intent"] = df["intent"].astype(str)

X_all = df["question_clean"].values
y_all = df["intent"].values

# Balanced class weights on full data
classes_full = np.unique(y_all)
class_weights_full = compute_class_weight(class_weight="balanced",
                                          classes=classes_full, y=y_all)
cw_full = {c:w for c, w in zip(classes_full, class_weights_full)}

# Define & fit the final pipeline
clf_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=1, max_df=0.95)),
    ("clf",   LogisticRegression(max_iter=300, C=2.0, class_weight=cw_full))
])

clf_pipeline.fit(X_all, y_all)

print("✅ Final model trained on ALL data.")
print("• Classes learned:", len(clf_pipeline.named_steps['clf'].classes_))
print("• Example check ->",
      clf_pipeline.predict(["how much luggage can I carry?"])[0])


✅ Final model trained on ALL data.
• Classes learned: 96
• Example check -> intent_from_answer_037


In [20]:
# --- Cell 10 (Corrected) ---
# Char n-gram retrieval for better matching on small queries, typos, and variants

import re
def _norm(s):
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^\w\s./%₹$-]", " ", s)
    return re.sub(r"\s+", " ", s).strip()

# Use cleaned questions if present
KB_Q = df["question_clean"].tolist() if "question_clean" in df.columns else [_norm(x) for x in df["question"].astype(str)]
KB_A = df["answer"].tolist()
KB_I = df["intent"].astype(str).tolist()

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 🔁 Char n-grams (3,5) for robust matching
retr_vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=1)
KB_TFIDF = retr_vectorizer.fit_transform(KB_Q)

def cosine_topk(query: str, k: int = 3):
    qv = retr_vectorizer.transform([_norm(query)])
    sims = (qv @ KB_TFIDF.T).toarray().ravel()
    top = np.argsort(-sims)[:k]
    return sims[top], top

print(f"✅ Retrieval index ready (char n-grams): KB size={len(KB_Q)}, TFIDF shape={KB_TFIDF.shape}")

# --- Helper: retrieve_best ---
def retrieve_best(query: str, k: int = 3):
    sims, top_idx = cosine_topk(query, k)
    hits = []
    for idx, score in zip(top_idx, sims):
        hits.append({
            "question": KB_Q[idx],
            "answer": KB_A[idx],
            "intent": KB_I[idx],
            "score": float(score)
        })
    # Return: best_answer, best_intent, best_score, full_hits_list
    return hits[0]["answer"], hits[0]["intent"], sims[0], hits



✅ Retrieval index ready (char n-grams): KB size=195, TFIDF shape=(195, 3443)


In [21]:
# --- Cell 11 (Corrected) ---
INTENT_THRESHOLD = 0.35   # lowered from 0.50
RETR_THRESHOLD   = 0.25   # lowered from 0.35
TOPK_DEFAULT     = 3

import re

def route_query(question: str,
                intent_threshold: float = INTENT_THRESHOLD,
                retrieval_threshold: float = RETR_THRESHOLD,
                method: str = "hybrid",
                topk: int = TOPK_DEFAULT):

    q = (question or "").strip().lower()
    route, answer, pred_intent_for_eval = "abstain", None, None

    # 1) Intent
    try:
        intent_label, intent_score = predict_intent(q)
    except Exception:
        intent_label, intent_score = None, 0.0

    # 2) Retrieval
    try:
        best_ans, best_int, retrieval_score, ret_hits = retrieve_best(q, k=topk)
    except Exception:
        best_ans, best_int, retrieval_score, ret_hits = None, None, 0.0, []

    # Helper: keyword overlap
    def keyword_overlap(qtext, kbtext):
        stop = {"the","a","an","is","are","am","for","to","of","in","on","with","and","or",
                "can","i","me","my","what","how","do","does"}
        q_words  = [w for w in re.findall(r"[a-z0-9]+", qtext) if w not in stop]
        kb_words = [w for w in re.findall(r"[a-z0-9]+", kbtext) if w not in stop]
        return len(set(q_words) & set(kb_words))

    # Synonym expansion
    def expand_query(qtext: str) -> str:
        expansions = {
            "discount": ["offer","promotion","promo","deal","concession","rebate","sale","fare cut"],
            "discounts": ["offers","promotions","deals"],
            "baggage": ["luggage","bag","bags"],
            "luggage": ["baggage","bag","bags"],
            "extra": ["additional","excess"],
        }
        words = qtext.split()
        extra = []
        for w in words:
            extra += expansions.get(w, [])
        return (qtext + " " + " ".join(extra)).strip()

    # Expanded retrieval
    expanded_q = expand_query(q)
    if expanded_q != q:
        _, _, expanded_score, expanded_hits = retrieve_best(expanded_q, k=topk)
    else:
        expanded_score, expanded_hits = retrieval_score, ret_hits

    # 3) Routing
    if method == "intent":
        if intent_score >= intent_threshold:
            route = "intent"
            pred_intent_for_eval = intent_label
            subset = df[df["intent"] == intent_label]
            answer = subset["answer"].mode().iloc[0] if len(subset) else None

    elif method == "retrieval":
        if expanded_score >= retrieval_threshold:
            route = "retrieval"
            pred_intent_for_eval = expanded_hits[0]["intent"] if expanded_hits else best_int
            answer = expanded_hits[0]["answer"] if expanded_hits else best_ans

    else:  # hybrid
        if intent_score >= intent_threshold and intent_score >= expanded_score:
            route = "intent"
            pred_intent_for_eval = intent_label
            subset = df[df["intent"] == intent_label]
            answer = subset["answer"].mode().iloc[0] if len(subset) else None
        elif expanded_score >= retrieval_threshold:
            route = "retrieval"
            pred_intent_for_eval = expanded_hits[0]["intent"] if expanded_hits else best_int
            answer = expanded_hits[0]["answer"] if expanded_hits else best_ans
        else:
            # Low-confidence fallback
            top1 = expanded_hits[0] if expanded_hits else (ret_hits[0] if ret_hits else None)
            candidate = top1 or {}
            candidate_q = candidate.get("question", "")
            overlap = keyword_overlap(q, candidate_q)
            weak_score = candidate.get("score", 0.0)

            if overlap >= 1 and weak_score >= 0.15:
                route = "retrieval_low_conf"
                pred_intent_for_eval = candidate.get("intent")
                answer = "(Low confidence) " + candidate.get("answer", "")

    confidence = float(max(intent_score, expanded_score))
    return {
        "question": question,
        "route": route,
        "intent": {"label": intent_label, "score": float(intent_score)},
        "retrieval": {"best_score": float(expanded_score), "topk": expanded_hits},
        "pred_intent_for_eval": pred_intent_for_eval,
        "answer": answer,
        "confidence": confidence
    }


In [22]:
# ----------------------------
# 💾 Cell 12 — Save Artifacts
# ----------------------------
# What/Why:
# - Persist final model & assets to ./artifacts so you can load them next time
# - Includes: classifier pipeline, retrieval vectorizer, KB (Q/A/intent), metadata
# ----------------------------

import json
from joblib import dump

os.makedirs(ARTIFACT_DIR, exist_ok=True)

# 1) Final intent classifier pipeline
clf_path = os.path.join(ARTIFACT_DIR, "intent_clf_tfidf_lr.joblib")
dump(clf_pipeline, clf_path)

# 2) Retrieval vectorizer (fit on ALL KB_Q)
retr_path = os.path.join(ARTIFACT_DIR, "retr_vectorizer.joblib")
dump(retr_vectorizer, retr_path)

# 3) Knowledge base (original answers, cleaned questions, intents)
kb_obj = {"KB_Q": KB_Q, "KB_A": KB_A, "KB_I": KB_I}
kb_path = os.path.join(ARTIFACT_DIR, "kb_qa.joblib")
dump(kb_obj, kb_path)

# 4) Metadata: thresholds, classes, counts
meta = {
    "intent_threshold": float(INTENT_THRESHOLD),
    "retrieval_threshold": float(RETR_THRESHOLD),
    "topk_default": int(TOPK_DEFAULT),
    "n_rows": int(len(df)),
    "n_classes": int(df["intent"].nunique()),
    "classes": list(clf_pipeline.named_steps["clf"].classes_)
}
meta_path = os.path.join(ARTIFACT_DIR, "meta.json")
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2, ensure_ascii=False)

# 5) (Optional) Keep a copy of the working dataframe for reference
df.to_csv(os.path.join(ARTIFACT_DIR, "working_full.csv"), index=False, encoding="utf-8-sig")

print("✅ Saved:")
print(" •", clf_path)
print(" •", retr_path)
print(" •", kb_path)
print(" •", meta_path)
print(" •", os.path.join(ARTIFACT_DIR, 'working_full.csv'))


✅ Saved:
 • ./artifacts/intent_clf_tfidf_lr.joblib
 • ./artifacts/retr_vectorizer.joblib
 • ./artifacts/kb_qa.joblib
 • ./artifacts/meta.json
 • ./artifacts/working_full.csv


In [23]:
# ----------------------------
# 📦 Cell 13 — Reload Artifacts
# ----------------------------
# What/Why:
# - Bring back the saved model, vectorizer, and KB from ./artifacts
# - Recreate the KB_TFIDF matrix for retrieval
# - Restore thresholds from meta.json
# ----------------------------

import json
from joblib import load

def load_artifacts(path=ARTIFACT_DIR):
    clf = load(os.path.join(path, "intent_clf_tfidf_lr.joblib"))
    retr_vec = load(os.path.join(path, "retr_vectorizer.joblib"))
    kb = load(os.path.join(path, "kb_qa.joblib"))  # {"KB_Q", "KB_A", "KB_I"}
    with open(os.path.join(path, "meta.json"), "r", encoding="utf-8") as f:
        meta = json.load(f)
    return clf, retr_vec, kb, meta

# Example: run this at the top of a NEW session (or now to verify)
clf_pipeline, retr_vectorizer, kb_loaded, meta = load_artifacts()

# Restore globals used by router/retrieval for convenience
KB_Q = kb_loaded["KB_Q"]
KB_A = kb_loaded["KB_A"]
KB_I = kb_loaded["KB_I"]
KB_TFIDF = retr_vectorizer.fit_transform(KB_Q)  # safe: fits identically on same KB_Q

# Restore thresholds (you can still override later)
INTENT_THRESHOLD = float(meta.get("intent_threshold", 0.50))
RETR_THRESHOLD   = float(meta.get("retrieval_threshold", 0.35))
TOPK_DEFAULT     = int(meta.get("topk_default", 3))

print("✅ Reloaded artifacts.")
print("• Rows:", meta.get("n_rows"), "• Classes:", meta.get("n_classes"))
print("• Thresholds:", {"intent": INTENT_THRESHOLD, "retrieval": RETR_THRESHOLD, "topk": TOPK_DEFAULT})


✅ Reloaded artifacts.
• Rows: 195 • Classes: 96
• Thresholds: {'intent': 0.35, 'retrieval': 0.25, 'topk': 3}


In [24]:
# ----------------------------
# 💬 Cell 14 — Chat Loop + Self-Check
# ----------------------------
# What/Why:
# - `self_check()` confirms classifier + retrieval + router run end-to-end
# - `chat()` lets you talk to the bot in the notebook
# ----------------------------

def self_check():
    issues = []
    try:
        _ = clf_pipeline.predict(["test"])
    except Exception as e:
        issues.append(("Classifier", str(e)))
    try:
        _ = retr_vectorizer.transform(["test"])
        _ = KB_TFIDF.shape
    except Exception as e:
        issues.append(("Retrieval index", str(e)))
    try:
        _ = route_query("test question", method="hybrid", topk=3)
    except Exception as e:
        issues.append(("Router", str(e)))
    return "OK ✅" if not issues else issues

print("Self-check:", self_check())

def chat(intent_threshold=INTENT_THRESHOLD, retrieval_threshold=RETR_THRESHOLD, method="hybrid", topk=TOPK_DEFAULT):
    print("Type 'exit' to quit.")
    while True:
        try:
            q = input("You: ").strip()
        except EOFError:
            break
        if not q or q.lower() in ("exit","quit","q"):
            break
        out = route_query(q, intent_threshold, retrieval_threshold, method, topk)
        print("\n---")
        print("Route:", out["route"])
        print("Intent:", out["intent"])
        print("Confidence:", round(out["confidence"], 3))
        print("Answer:", out["answer"] if out["answer"] else "[No confident match — please rephrase]")
        print("Top matches (scores):", [ (h['score'], h['question']) for h in out['retrieval']['topk'] ])
        print("---\n")

# Run if you want to chat now:
# chat()


Self-check: OK ✅


In [25]:
chat()

Type 'exit' to quit.


You:  can I carry extra luggage?



---
Route: retrieval
Intent: {'label': None, 'score': 0.0}
Confidence: 0.355
Answer: Airport and pre-booking charges for excess check-in baggage are listedhere.
Top matches (scores): [(0.35521286328708845, 'what are the charges for excess check-in baggage'), (0.35521286328708845, 'what are the charges for excess check-in baggage'), (0.34880199386570054, 'can excess baggage be pre-purchased')]
---



You:  liquor?



---
Route: retrieval
Intent: {'label': None, 'score': 0.0}
Confidence: 0.795
Answer: Maximum of 5 liters of alcoholic beverages with alcohol content between 24% and 70% by volume is permitted in retail packaging as checked-in baggage.The permissible limit of 5 lt. includes the duty paid alcohol that passenger may buy from the airport. Bills for such purchases must always be kept handy.On a domestic flight, alcoholic beverages are only permitted to be carried as checked-in baggage, unless they are purchased from the duty paid outlets. These duty paid alcoholic beverages can then be carried as cabin baggage.Alcoholic beverages carried on-board by passengers are not permitted to be opened and/or consumed inside an aircraft.Carrying alcoholic beverages from/into the states of Bihar, Gujarat, Tripura, Mizoram, Nagaland, UT Lakshadweep or any state considered as a ‘dry state’ is not permitted.
Top matches (scores): [(0.7951537859970181, 'can i carry my liquor'), (0.7951537859970181, 'can i 

You:  travel certificate?



---
Route: retrieval
Intent: {'label': None, 'score': 0.0}
Confidence: 0.941
Answer: A travel certificate is a document that serves as proof of travel that you have completed. This document can be helpful in regards to reimbursements, LTC and GST claims, etc.
Top matches (scores): [(0.9406339030935356, 'what is a travel certificate'), (0.9406339030935356, 'what is a travel certificate'), (0.9406339030935356, 'what is a travel certificate')]
---



You:  food?



---
Route: retrieval
Intent: {'label': None, 'score': 0.0}
Confidence: 0.376
Answer: Variety of hot meals, sandwiches, chef's choices, and special meals such as meals for diabetics, jain meal, low-calorie salads etc., are available for pre-booking at discounted rates. In addition to these, passengers can also pre-book Cake (Mocha, 500 g) to surprise their loved ones. All meals can be pre-booked up to 6 hours prior to departure, apart from special meals and cake which are available for pre-booking up to 24 hrs prior and 48 hours prior to departure, respectively.Our on-board menu offers a variety of ready-to-eat products, munchies and beverages to choose from.The images of F&B products featured across our platforms, including packaging, are for illustrative purposes only.
Top matches (scores): [(0.3764424282478896, 'does spicejet provide any food or drinks onboard'), (0.3764424282478896, 'does spicejet provide any food or drinks onboard'), (0.01815058936035957, 'can i get a refund for t

You:  bye



---
Route: abstain
Intent: {'label': None, 'score': 0.0}
Confidence: 0.111
Answer: [No confident match — please rephrase]
Top matches (scores): [(0.11129293183050866, 'which credit/debit cards are accepted by spicejet for payment'), (0.10030067881232725, 'i have recently recovered from covid-19. can i travel by air'), (0.0, 'what is it like to fly with spicejet')]
---



You:  exit


In [28]:
# ----------------------------
# ✅ Bot accuracy on your current dataset (single cell)
# - Uses the router's predicted intent label (or "abstain")
# - Works with the globals you already have: INTENT_THRESHOLD, RETR_THRESHOLD, TOPK_DEFAULT
# ----------------------------
from sklearn.metrics import accuracy_score, f1_score, classification_report

# 1) Inputs
X_test = df["question_clean"].astype(str).tolist()
y_true = df["intent"].astype(str).tolist()

# 2) Predict with the actual bot logic
y_pred = []
for q in X_test:
    out = route_query(
        q,
        intent_threshold=INTENT_THRESHOLD,
        retrieval_threshold=RETR_THRESHOLD,
        method="hybrid",
        topk=TOPK_DEFAULT
    )
    # Use the router's final intent choice for evaluation
    # (falls back to "abstain" if the bot abstains)
    pred_label = out.get("pred_intent_for_eval") or "abstain"
    y_pred.append(str(pred_label))

# 3) Metrics (treat "abstain" as a class so it's counted)
labels_full = sorted(list(set(y_true) | {"abstain"}))

acc   = accuracy_score(y_true, y_pred)
f1_m  = f1_score(y_true, y_pred, average="macro", labels=labels_full)
f1_w  = f1_score(y_true, y_pred, average="weighted", labels=labels_full)

print(f"Accuracy:   {acc:.3f}")
print(f"Macro F1:   {f1_m:.3f}")
print(f"Weighted F1:{f1_w:.3f}\n")

print("Per-class report:")
print(classification_report(y_true, y_pred, labels=labels_full, zero_division=0))


Accuracy:   0.872
Macro F1:   0.695
Weighted F1:0.839

Per-class report:
                        precision    recall  f1-score   support

               abstain       0.00      0.00      0.00         0
           booking_faq       0.98      0.95      0.96        43
           general_faq       1.00      0.97      0.98        31
intent_from_answer_000       0.50      1.00      0.67         2
intent_from_answer_001       1.00      1.00      1.00         1
intent_from_answer_002       0.00      0.00      0.00         1
intent_from_answer_003       1.00      1.00      1.00         1
intent_from_answer_004       1.00      1.00      1.00         1
intent_from_answer_005       0.00      0.00      0.00         1
intent_from_answer_006       0.50      1.00      0.67         1
intent_from_answer_007       1.00      1.00      1.00         1
intent_from_answer_008       1.00      1.00      1.00         1
intent_from_answer_009       1.00      1.00      1.00         1
intent_from_answer_010       1