# Import

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
from xgboost import XGBClassifier

# Load data

In [None]:
# NOTE(review): machine-specific absolute path — adjust to where the CSV lives.
df = pd.read_csv(r"C:\Users\leege\Documents\Capstone\elderly_conversational_sentences.csv")
# Collapse the raw labels to binary: 1 stays 1, every other value becomes 0.
df['binary_label'] = (df['label'] == 1).astype(int)

# Remove '?' to generalize model: strip the trailing '?' from a random half of
# the sentences that end with one (presumably so the classifier cannot rely on
# punctuation alone).
# A seeded generator keeps the sampled subset -- and therefore every metric
# computed downstream -- reproducible, in line with the random_state=42 used
# for the splits and the model.
rng = np.random.default_rng(42)
# na=False treats missing text as "does not end with '?'" rather than putting
# NaN into the mask, which would make the boolean indexing fail.
question_idx = df.index[df['text'].str.endswith('?', na=False)]
remove_idx = rng.choice(
    question_idx.to_numpy(), size=len(question_idx) // 2, replace=False
)
df.loc[remove_idx, 'text'] = df.loc[remove_idx, 'text'].str.rstrip('?')

# Notebook display: class balance after binarisation.
df['binary_label'].value_counts()

# Feature Extraction

In [None]:
# Raw model inputs: binary targets and the sentences they belong to.
y = df['binary_label'].values
X_text = df['text'].values

# Lexical view: up to 500 unigram/bigram TF-IDF columns, converted to a dense
# array so it can be stacked column-wise with the other feature matrices.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split performed later in the notebook, so test-set vocabulary and
# IDF statistics leak into training -- consider fitting on the training
# portion only.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=500)
X_tfidf = tfidf_vectorizer.fit_transform(X_text).toarray()

# Semantic view: one SBERT sentence embedding per text.
# Checkpoint options: all-MiniLM-L6-v2 or all-mpnet-base-v2
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
X_sbert = sbert_model.encode(X_text, show_progress_bar=True)

# Simple NLP features
question_words = ['who','what','where','when','why','how','which']


def extract_simple_nlp_features(text):
    """Return two hand-crafted binary question cues for one sentence.

    Args:
        text: The sentence to inspect.

    Returns:
        A length-2 integer NumPy array laid out as
        ``[ends_with_question_mark, starts_with_question_word]``.
        The first-word check is case-insensitive and looks the word up in the
        module-level ``question_words`` list; an empty or whitespace-only
        string yields 0 for it.
    """
    lowered = text.lower()
    tokens = lowered.split()
    ends_with_qmark = lowered.endswith('?')
    # Short-circuit on an empty token list so tokens[0] is never evaluated.
    starts_with_wh = bool(tokens) and tokens[0] in question_words
    return np.array([int(ends_with_qmark), int(starts_with_wh)])
# One row of hand-crafted cues per sentence, in the same order as X_text.
X_nlp = np.array(list(map(extract_simple_nlp_features, X_text)))

# Combine all features column-wise: TF-IDF, then SBERT, then the simple cues.
# predict_text must stack its features in this same order.
X_hybrid = np.hstack((X_tfidf, X_sbert, X_nlp))
print("Hybrid feature shape:", X_hybrid.shape)

# Training

In [None]:
# Train/Validation/Test Split: 20% held out for test, then 25% of the
# remainder for validation (i.e. 60% / 20% / 20% overall).
# stratify keeps the class ratio the same in every split; without it an
# imbalanced dataset can yield splits whose class proportions differ, which
# distorts both scale_pos_weight and the reported metrics.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_hybrid, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42, stratify=y_trainval
)

# Calculate scale_pos_weight for imbalanced dataset (negatives per positive).
# minlength=2 guarantees two counts even if one class is absent, so the
# failure below is explicit instead of a cryptic unpacking error.
neg, pos = np.bincount(y_train, minlength=2)
if pos == 0:
    raise ValueError(
        "y_train contains no positive samples; cannot compute scale_pos_weight"
    )
scale_pos_weight = neg / pos
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

# Hyperparameters are fixed here; presumably they were copied from a run of
# the randomized search cell further down -- TODO confirm.
xgb_model = XGBClassifier(
    n_estimators=427,
    max_depth=9,
    learning_rate=0.03317381190502595,
    min_child_weight=1,
    gamma=0.1792328642721363,
    subsample=0.7483273008793065,
    colsample_bytree=0.6296178606936361,
    reg_alpha=0.2247253370691017,
    reg_lambda=0.6908202329808226,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

# The validation set is passed only as eval_set; no early stopping is
# configured, so all n_estimators rounds are trained.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)]
)

# Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the trained classifier on the held-out test split.
y_pred = xgb_model.predict(X_test)

# Overall accuracy, then the per-class breakdown and the confusion matrix.
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# With tf-idf
def predict_text(new_texts):
    """Predict binary labels (0=answer, 1=question) for new sentences.

    Builds the same feature layout used for training -- TF-IDF columns, then
    SBERT embedding columns, then the two hand-crafted cues -- and passes it
    to the classifier.

    Relies on the module-level ``tfidf_vectorizer``, ``sbert_model``,
    ``xgb_model`` and ``extract_simple_nlp_features`` defined in earlier
    cells.

    Args:
        new_texts: A list (or other iterable) of strings. A single bare
            string is treated as a one-element list.

    Returns:
        Array of predicted binary labels, one per input text. An empty input
        returns an empty integer array without calling any model.
    """
    # A lone string would otherwise be iterated character by character.
    texts = [new_texts] if isinstance(new_texts, str) else list(new_texts)
    if not texts:
        # Nothing to score; the vectorizer and classifier reject zero rows.
        return np.array([], dtype=int)

    X_tfidf_new = tfidf_vectorizer.transform(texts).toarray()
    X_sbert_new = sbert_model.encode(texts, show_progress_bar=False)
    # Reuse the training-time helper instead of a local copy so training and
    # inference features cannot drift apart.
    X_nlp_new = np.array([extract_simple_nlp_features(t) for t in texts])

    # Column order must match the training matrix: TF-IDF, SBERT, cues.
    X_new_hybrid = np.hstack([X_tfidf_new, X_sbert_new, X_nlp_new])
    return xgb_model.predict(X_new_hybrid)

# Smoke test: a question phrased without a trailing '?'.
predict_text(["When is time now"])
# Debug aid: uncomment to inspect the hand-crafted cues for a sample sentence.
# extract_simple_nlp_features("Where is medicine")

In [None]:
import pandas as pd

# Extended test data with 20 samples.
# Most questions are written without a trailing '?' so the check exercises
# the model rather than the punctuation feature.
data = {
    "text": [
        "Did you take your kopi from the kopitiam today",
        "I remember when we used to walk to the hawker centre after school.",
        "Can you help me top up my EZ-Link card",
        "My knees have been hurting after walking at the park connector.",
        "What time is the doctor’s appointment at NUH tomorrow",
        "I enjoy watering the plants in my HDB balcony every morning.",
        "Have you seen the news about the MRT delay this morning",
        "I baked some pineapple tarts yesterday for CNY.",
        "Do you know how to set the aircon timer properly",
        "I like listening to old getai songs during the festive season.",
        "Why did the bus arrive late at the bus stop near Bedok?",
        "I feel tired after walking around the wet market.",
        "Could you remind me of your birthday again so I can send angbao",
        "I watched a really interesting documentary on Singapore’s history.",
        "When are we visiting the grandchildren at Changi next",
        "Did you manage to book a slot at the community centre for exercise",
        "I miss the old hawker uncle at our neighbourhood market.",
        "Can you teach me how to use WhatsApp video call",
        "I went for a morning stroll along East Coast Park today.",
        "Have you checked the weather before heading to the pasar malam tonight"
    ],
    "label": [  # 1 = question, 0 = statement
        1, 0, 1, 0, 1,
        0, 1, 0, 1, 0,
        1, 0, 1, 0, 1,
        1, 0, 1, 0, 1
    ]
}

df_test = pd.DataFrame(data)

# Requires predict_text from the earlier cell.
# Score every sentence in a single batched call: one model invocation instead
# of one per row, and each 'pred' cell holds a scalar label (directly
# comparable with 'label') rather than a 1-element array.
df_test['pred'] = predict_text(df_test['text'].tolist())
pd.set_option('display.max_colwidth', None)
df_test


# Save model (Change directory accordingly)

In [None]:
# Save models: each artifact is pickled to its own file in the current
# working directory. Only the SBERT checkpoint *name* is stored, not the
# model weights.
artifacts = {
    "qa_xgb_hybrid_model.pkl": xgb_model,
    "qa_tfidf_vectorizer.pkl": tfidf_vectorizer,
    "qa_sbert_model_name.pkl": 'all-MiniLM-L6-v2',
}

for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)

print("Models and artifacts saved.")


# Randomized Search - Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# 1. Parameter distributions (you can expand ranges).
#    scipy's uniform(loc, scale) samples from [loc, loc + scale];
#    randint(low, high) samples integers from [low, high).
param_dist = {
    'n_estimators': randint(100, 600),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),    # 0.01 .. 0.21
    'min_child_weight': randint(1, 6),
    'gamma': uniform(0, 0.5),               # 0.0 .. 0.5
    'subsample': uniform(0.6, 0.4),         # 0.6 .. 1.0
    'colsample_bytree': uniform(0.6, 0.4),  # 0.6 .. 1.0
    'reg_alpha': uniform(0, 0.5),           # 0.0 .. 0.5
    'reg_lambda': uniform(0.5, 2)           # 0.5 .. 2.5
}

# 2. Base XGBoost classifier.
#    Kept under its own name: the search fits clones of this estimator, so
#    the object itself stays unfitted. Binding it to `xgb_model` would replace
#    the trained model that predict_text and the save cell depend on.
base_xgb = XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

# 3. Randomized search
random_search = RandomizedSearchCV(
    estimator=base_xgb,
    param_distributions=param_dist,
    n_iter=15,  # number of random combinations to try
    scoring='accuracy',
    cv=3,       # 3-fold cross-validation
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# 4. Fit to training data
random_search.fit(X_train, y_train)

# 5. Best parameters and score
print("Best parameters found:", random_search.best_params_)
print("Best CV accuracy:", random_search.best_score_)

# 6. Evaluate on test set using the estimator refitted with the best
#    parameters.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
from sklearn.metrics import accuracy_score
print("Test accuracy:", accuracy_score(y_test, y_pred))
