# Import

In [None]:
import pickle

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Load data

In [None]:
# Read the labelled sentences into a DataFrame. The raw-string prefix keeps the
# Windows backslashes literal.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the CSV.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\leege\Documents\Capstone\elderly_topical_conversational_sentences.csv"
)

# Number of rows per label (quick class-balance check).
df['label'].value_counts()

# Feature Extraction

In [None]:
import pandas as pd
import re

# Keyword vocabularies, one list per topic. They are passed to
# count_category_words to build the three per-sentence count features.
# The inline group comments are only a reading aid; the lists are flat.

HEALTHCARE_KEYWORDS = [
    # places and people
    "clinic", "polyclinic", "hospital", "doctor", "nurse",
    # medication
    "medicine", "tablet", "capsule", "injection",
    # complaints
    "pain", "ache", "symptom",
    # care and follow-up
    "treatment", "checkup", "appointment",
    # conditions
    "blood pressure", "diabetes", "cholesterol",
    # services and procedures
    "physiotherapy", "rehabilitation", "pharmacy", "scan", "surgery",
    "therapy", "consultation", "vaccination", "prescription",
]

LONGTERM_KEYWORDS = [
    # family
    "grandson", "granddaughter", "children", "family", "siblings", "parents",
    # hobbies
    "hobby", "gardening", "reading", "knitting", "painting", "cooking",
    # home and possessions
    "house", "flat", "HDB", "condo", "car",
    # pets and garden
    "pet", "cat", "dog", "garden",
    # social circle
    "relatives", "friends", "community", "club", "volunteer",
    # finances
    "retirement", "savings", "pension",
]

SHORTTERM_KEYWORDS = [
    # time references
    "today", "tomorrow", "later", "now", "yesterday", "tonight",
    # transport
    "MRT", "bus", "train",
    # food places and shops
    "hawker", "hawker centre", "kopitiam", "wet market", "supermarket",
    "shop", "shopping",
    # meals
    "meal", "breakfast", "lunch", "dinner", "snack",
    # errands
    "queue", "appointment", "errand",
    # weather
    "weather", "rain", "sunny", "hot", "cold",
    # events
    "event", "celebration", "CNY", "angbao", "getai", "festival",
    # promotions
    "promotion", "sale",
    # traffic
    "traffic", "delay", "jam",
]

def count_category_words(text, category_words):
    """Count keyword occurrences in `text`, ignoring case.

    Both `text` and every keyword are lower-cased and split into word tokens
    with the same regex, so upper-case keywords such as "MRT" and multi-word
    keywords such as "blood pressure" are matched as well.

    Args:
        text: The sentence to scan.
        category_words: Iterable of keyword strings; each may be a single
            word or a space-separated phrase.

    Returns:
        The number of tokens equal to a single-word keyword, plus the number
        of positions where a multi-word keyword appears as consecutive
        tokens. Overlapping keywords are counted independently, so
        "hawker centre" scores 2 when both "hawker" and "hawker centre" are
        in `category_words`.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())

    # Normalise the keywords exactly like the text so the comparison is fair.
    single_words = set()
    phrases = set()
    for keyword in category_words:
        parts = tuple(re.findall(r'\b\w+\b', keyword.lower()))
        if len(parts) == 1:
            single_words.add(parts[0])
        elif parts:
            phrases.add(parts)

    hits = sum(1 for token in tokens if token in single_words)

    # Slide a window of the phrase's length over the tokens to find phrases.
    for phrase in phrases:
        width = len(phrase)
        hits += sum(
            1
            for start in range(len(tokens) - width + 1)
            if tuple(tokens[start:start + width]) == phrase
        )
    return hits

def add_category_word_counts(df):
    """Return a copy of `df` with one keyword-count column per topic.

    Reads the 'text' column and adds 'healthcare_count', 'longterm_count'
    and 'shortterm_count', each produced by count_category_words with the
    corresponding module-level keyword list. The input frame is not modified.
    """
    keyword_sets = {
        'healthcare_count': HEALTHCARE_KEYWORDS,
        'longterm_count': LONGTERM_KEYWORDS,
        'shortterm_count': SHORTTERM_KEYWORDS,
    }

    result = df.copy()  # work on a copy so the caller's frame stays untouched
    for column, keywords in keyword_sets.items():
        result[column] = result['text'].apply(count_category_words, args=(keywords,))
    return result

# Attach the three keyword-count columns. Re-running this cell is safe: the
# same columns are simply recomputed.
df = add_category_word_counts(df)

# Preview a few rows instead of rendering the whole frame.
df.head()

In [None]:
# Hybrid feature extraction: TF-IDF + SBERT embeddings + keyword counts
X_text = df['text'].values
y_text = df['label'].values

# Map the string labels onto consecutive integer ids for XGBoost.
le = LabelEncoder()
y = le.fit_transform(y_text)
print("Label mapping:", {label: idx for idx, label in enumerate(le.classes_)})

# Lexical features: unigram/bigram TF-IDF with the vocabulary capped at 500
# terms, converted to a dense array so it can be stacked with the other blocks.
# NOTE(review): the vectorizer is fitted on every row, including rows that the
# Training cell later holds out for validation and test.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=500)
X_tfidf = tfidf_vectorizer.fit_transform(X_text).toarray()

# Semantic features: sentence embeddings.
# Candidate checkpoints: all-MiniLM-L6-v2 or all-mpnet-base-v2
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
X_sbert = sbert_model.encode(X_text, show_progress_bar=True)

# Keyword-count features (one column per topic).
category_features = df[['healthcare_count', 'longterm_count', 'shortterm_count']].values

# Column-wise stack, in this order: TF-IDF | SBERT | keyword counts.
# predict_text must build its matrix in the same order.
X_hybrid = np.concatenate([X_tfidf, X_sbert, category_features], axis=1)
print("Hybrid feature shape:", X_hybrid.shape)

# Training

In [None]:
# Train/validation/test split, roughly 60/20/20: hold out 20% for test, then
# take 25% of the remaining 80% for validation.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_hybrid, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42
)

# NOTE(review): the hyperparameter values below look like the output of a
# tuning run (presumably the randomized search at the end of the notebook);
# confirm the source.
xgb_model = XGBClassifier(
    # task definition
    objective='multi:softprob',  # multi-class, outputs per-class probabilities
    num_class=3,
    eval_metric='mlogloss',  # multi-class logloss
    random_state=42,
    # boosting schedule
    n_estimators=427,
    learning_rate=0.033,
    # tree complexity
    max_depth=9,
    min_child_weight=1,
    gamma=0.179,
    # row / column subsampling
    subsample=0.748,
    colsample_bytree=0.63,
    # regularisation
    reg_alpha=0.225,
    reg_lambda=0.69,
)

# The validation split is passed as eval_set and its metric is logged each round.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)


# Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the held-out test split once; every metric below uses these predictions.
y_pred = xgb_model.predict(X_test)

# Raw counts, and the same matrix normalised over the true labels (rows).
cm = confusion_matrix(y_test, y_pred)
cm_norm = confusion_matrix(y_test, y_pred, normalize='true')

# Overall accuracy.
print("Test accuracy:", accuracy_score(y_test, y_pred))

# Per-class precision/recall/F1, labelled with the original class names.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, target_names=le.classes_),
)

print("\nConfusion Matrix:\n", cm)
print("\nNormalized Confusion Matrix:\n", cm_norm)


In [None]:
import numpy as np
import re

# Category keywords for inference. These MUST be identical to the lists in the
# Feature Extraction cell: the model was trained on counts computed from those
# lists, and the save cell pickles whatever these names hold at that point.
HEALTHCARE_KEYWORDS = [
    "clinic","polyclinic","hospital","doctor","nurse","medicine",
    "tablet","capsule","injection","pain","ache","symptom",
    "treatment","checkup","appointment","blood pressure","diabetes",
    "cholesterol","physiotherapy","rehabilitation","pharmacy","scan",
    "surgery","therapy","consultation","vaccination","prescription"
]

LONGTERM_KEYWORDS = [
    "grandson","granddaughter","children","family","siblings","parents",
    "hobby","gardening","reading","knitting","painting","cooking",
    "house","flat","HDB","condo","car","pet","cat","dog",
    "garden","relatives","friends","community","club","volunteer",
    "retirement","savings","pension"
]

# Same 40 entries as the training-time list: includes "later" and does not
# include "morning"/"afternoon"/"evening", which the model never saw counted.
SHORTTERM_KEYWORDS = [
    "today","tomorrow","later","now","yesterday","tonight",
    "MRT","bus","train","hawker","hawker centre","kopitiam",
    "wet market","supermarket","shop","shopping","meal","breakfast",
    "lunch","dinner","snack","queue","appointment","errand","weather",
    "rain","sunny","hot","cold","event","celebration","CNY","angbao",
    "getai","festival","promotion","sale","traffic","delay","jam"
]

def count_category_words(text, category_words):
    """Count keyword occurrences in `text`, ignoring case.

    Both `text` and every keyword are lower-cased and split into word tokens
    with the same regex, so upper-case keywords such as "MRT" and multi-word
    keywords such as "blood pressure" are matched as well.

    Keep this definition identical to the one used when the training
    features were built; otherwise inference-time counts will not mean the
    same thing as the counts the model was trained on.

    Args:
        text: The sentence to scan.
        category_words: Iterable of keyword strings; each may be a single
            word or a space-separated phrase.

    Returns:
        The number of tokens equal to a single-word keyword, plus the number
        of positions where a multi-word keyword appears as consecutive
        tokens. Overlapping keywords are counted independently, so
        "hawker centre" scores 2 when both "hawker" and "hawker centre" are
        in `category_words`.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())

    # Normalise the keywords exactly like the text so the comparison is fair.
    single_words = set()
    phrases = set()
    for keyword in category_words:
        parts = tuple(re.findall(r'\b\w+\b', keyword.lower()))
        if len(parts) == 1:
            single_words.add(parts[0])
        elif parts:
            phrases.add(parts)

    hits = sum(1 for token in tokens if token in single_words)

    # Slide a window of the phrase's length over the tokens to find phrases.
    for phrase in phrases:
        width = len(phrase)
        hits += sum(
            1
            for start in range(len(tokens) - width + 1)
            if tuple(tokens[start:start + width]) == phrase
        )
    return hits
 
def predict_text(new_texts, le):
    """Predict the topic label of each sentence in `new_texts`.

    The feature matrix is built the same way as the training matrix:
    TF-IDF columns, then SBERT embedding columns, then the three keyword
    counts. Uses the module-level `tfidf_vectorizer`, `sbert_model` and
    `xgb_model`, which must already be fitted/loaded.

    Args:
        new_texts: List of strings. A single string is also accepted and is
            treated as a one-element list.
        le: Trained LabelEncoder used to map integer predictions back to labels.

    Returns:
        Array of predicted labels as original text
        (e.g., 'healthcare', 'long-term', 'short-term').
    """
    # A bare string would otherwise be rejected by the vectorizer or iterated
    # character by character; materialising also lets us traverse the input
    # more than once if a generator was passed.
    if isinstance(new_texts, str):
        new_texts = [new_texts]
    else:
        new_texts = list(new_texts)

    # TF-IDF and SBERT
    X_tfidf_new = tfidf_vectorizer.transform(new_texts).toarray()
    X_sbert_new = sbert_model.encode(new_texts, show_progress_bar=False)

    # Category word counts, in the training column order:
    # healthcare, long-term, short-term.
    category_features_new = np.array([
        [
            count_category_words(t, HEALTHCARE_KEYWORDS),
            count_category_words(t, LONGTERM_KEYWORDS),
            count_category_words(t, SHORTTERM_KEYWORDS)
        ]
        for t in new_texts
    ])

    # Combine all features (column order must match the training matrix)
    X_new_hybrid = np.hstack([X_tfidf_new, X_sbert_new, category_features_new])

    # Predict integer class ids, then convert back to the original labels
    preds_int = xgb_model.predict(X_new_hybrid)
    preds_text = le.inverse_transform(preds_int)
    return preds_text

# Example usage
predict_text(["Forget eat medicine again"], le)

# Save model (Change directory accordingly)

In [None]:
# Save models
# Everything is pickled into the current working directory, one file per
# artifact. For SBERT only the checkpoint name is stored, not the weights.
artifacts = [
    ("topic_xgb_hybrid_model.pkl", xgb_model),
    ("topic_tfidf_vectorizer.pkl", tfidf_vectorizer),
    ("topic_sbert_model_name.pkl", 'all-MiniLM-L6-v2'),
    ('topic_label_encoder.pkl', le),
    (
        'topic_category_keywords.pkl',
        {
            'healthcare': HEALTHCARE_KEYWORDS,
            'longterm': LONGTERM_KEYWORDS,
            'shortterm': SHORTTERM_KEYWORDS
        },
    ),
]

for filename, payload in artifacts:
    with open(filename, "wb") as f:
        pickle.dump(payload, f)

print("Models and artifacts saved.")


# Randomized Search - Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale],
# and randint(low, high) samples integers from [low, high).
param_dist = {
    'n_estimators': randint(100, 600),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),
    'min_child_weight': randint(1, 6),
    'gamma': uniform(0, 0.5),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 0.5),
    'reg_lambda': uniform(0.5, 2)
}

# Use a dedicated name for the untrained template. Rebinding `xgb_model` here
# would replace the fitted model that predict_text and the save cell rely on
# with an unfitted estimator.
search_estimator = XGBClassifier(
    objective='multi:softprob',  # multi-class
    num_class=3,
    eval_metric='mlogloss',
    random_state=42
)

random_search = RandomizedSearchCV(
    estimator=search_estimator,
    param_distributions=param_dist,
    n_iter=15,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit (3-fold CV on the training split only; validation/test stay untouched)
random_search.fit(X_train, y_train)

# Score
print("Best parameters found:", random_search.best_params_)
print("Best CV accuracy:", random_search.best_score_)

# Eval: best_estimator_ is the best configuration refit on all of X_train
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))