In [20]:
# Export XGBoost model and artifacts for smart triage UI

# 1) Imports and setup
import os
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import xgboost as xgb

from sklearn.feature_extraction.text import TfidfVectorizer
from model_utils import save_model_artifacts
from feature_engineering import TextFeatureExtractor

# Directory (relative path) that receives the exported model artifacts.
OUTPUT_DIR = 'model_artifacts'
# Seed shared by the train/test split, SMOTE and the XGBoost classifier below.
RANDOM_STATE = 42


In [21]:
# 2) Load the processed issues and rebuild the targets the way training did

df = pd.read_csv('github_issues_processed.csv')

# Category indicator columns: for each row, the name of the column holding the
# row-wise maximum becomes that row's single class label.
cat_targets = [
    'is_bug_cat',
    'is_feature_cat',
    'is_doc_cat',
    'is_help_cat',
    'is_priority_cat',
    'is_status_cat',
]
# NOTE(review): idxmax returns the first column on ties, so a row in which no
# indicator stands out is labelled 'is_bug_cat' — confirm this is intended.
df['category'] = df[cat_targets].idxmax(axis=1)

# Columns prefixed with 'has_' are collected so they can be kept out of X.
label_targets = [name for name in df if name.startswith('has_')]

y_cat = df['category']

# Feature matrix: drop every target column plus 'n_labels' and the derived 'category'.
exclude = [*cat_targets, *label_targets, 'n_labels', 'category']
X = df.drop(columns=exclude)


In [22]:
# 3) Hold out 20% of the rows, stratified on the category label and seeded
#    with RANDOM_STATE so the split is repeatable (same split strategy).
(X_train, X_test,
 y_cat_train, y_cat_test) = train_test_split(
    X,
    y_cat,
    stratify=y_cat,
    random_state=RANDOM_STATE,
    test_size=0.2,
)


In [23]:
# 4) Fit the encoders and the vectorizer that the inference path relies on

# Category names -> integer ids. Fitted on the training split only, then reused
# to encode the hold-out labels (transform raises on a label it has not seen).
label_encoder = LabelEncoder()
y_cat_train_encoded = label_encoder.fit_transform(y_cat_train)
y_cat_test_encoded = label_encoder.transform(y_cat_test)

# Fit repository encoder from raw data (processed data only has repo_encoded)
repo_encoder = LabelEncoder()
raw_df = pd.read_csv('github_issues.csv', usecols=['title', 'body', 'repo_name'])
# Fill missing repositories BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which fillna() has nothing left to replace and
# 'unknown_repo' would never become an encoder class.
# NOTE(review): repo_encoded in the processed CSV was produced elsewhere; confirm
# it used the same missing-value handling so the integer codes line up.
repo_series = raw_df['repo_name'].fillna('unknown_repo').astype(str)
repo_encoder.fit(repo_series)

# Fit TF-IDF on combined text from raw title+body to mirror inference.
# For export, we only need the fitted vectorizer to match inference; training here uses processed X.
tfidf = TfidfVectorizer(max_features=250, stop_words='english', ngram_range=(1, 2))
# Vectorized "title body" concatenation (missing fields become empty strings);
# produces the same strings as a row-wise ' '.join without a Python-level loop.
text_columns = raw_df[['title', 'body']].fillna('').astype(str)
combined_text = text_columns['title'] + ' ' + text_columns['body']
tfidf.fit(combined_text)

# Feature extractor (used only for UI/inference, not training here)
feature_extractor = TextFeatureExtractor(tfidf)


In [24]:
# 5) XGBoost hyper-parameters (intended to mirror the training notebook)
xgb_config = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    objective='multi:softprob',
    # Number of classes = number of distinct encoded labels in the training split.
    num_class=len(np.unique(y_cat_train_encoded)),
)


In [25]:
# 6) Train XGBoost with SMOTE on the processed feature matrix (as in training)
# SMOTE and Pipeline are already imported in the notebook's import cell, so they
# are not re-imported here.

# The imblearn Pipeline applies the sampler during fit only, so predict() and
# predict_proba() below score the untouched hold-out rows.
# NOTE(review): k_neighbors=2 presumably accommodates the very small classes
# (SMOTE needs more samples per class than neighbours) — confirm.
xgb_pipeline = Pipeline([
    ('smote', SMOTE(random_state=RANDOM_STATE, k_neighbors=2)),
    ('xgb', xgb.XGBClassifier(**xgb_config))
])

xgb_pipeline.fit(X_train, y_cat_train_encoded)

# Encoded class predictions and per-class probabilities for the hold-out set.
y_pred_encoded = xgb_pipeline.predict(X_test)
y_proba = xgb_pipeline.predict_proba(X_test)

# For sanity: decode back to category names for a quick classification report.
# zero_division=0 reports 0 instead of warning for classes that are never predicted.
y_pred = label_encoder.inverse_transform(y_pred_encoded)
print("XGBoost with SMOTE — quick eval on holdout")
print(classification_report(y_cat_test, y_pred, zero_division=0))


XGBoost with SMOTE — quick eval on holdout
                 precision    recall  f1-score   support

     is_bug_cat       0.96      0.99      0.97      2961
     is_doc_cat       0.52      0.35      0.42        79
 is_feature_cat       0.64      0.35      0.45       119
    is_help_cat       0.00      0.00      0.00         3
is_priority_cat       0.00      0.00      0.00         1
  is_status_cat       0.90      1.00      0.95        28

       accuracy                           0.94      3191
      macro avg       0.50      0.45      0.47      3191
   weighted avg       0.93      0.94      0.94      3191



In [26]:
# 7) Write the model and its companion artifacts to disk for the UI
# Saved together: the fitted XGBoost classifier, the TF-IDF vectorizer,
# the category label encoder and the repository encoder.

# Only the classifier step is exported; the SMOTE step stays behind in the pipeline.
trained_model = xgb_pipeline.named_steps['xgb']

save_model_artifacts(
    output_dir=OUTPUT_DIR,
    model=trained_model,
    label_encoder=label_encoder,
    repo_encoder=repo_encoder,
    tfidf_vectorizer=tfidf,
)

print(f"Saved artifacts to: {OUTPUT_DIR}")


Saved artifacts to: model_artifacts


In [27]:
# 8) Re-import the local modules so this kernel picks up the updated feature schema
import importlib
import feature_engineering
import smart_triage

# Reload order is kept: feature_engineering first, then smart_triage.
# NOTE(review): presumably smart_triage imports feature_engineering — confirm.
importlib.reload(feature_engineering)
importlib.reload(smart_triage)

# Bind the class from the freshly reloaded module.
from smart_triage import SmartIssueTriage


In [28]:
# 9) Smoke-test the exported artifacts through the UI-facing API (reloaded modules)
triage = SmartIssueTriage(model_dir=OUTPUT_DIR)

sample_title = "Error in login flow: users cannot reset password"
sample_body = "Users report password reset links error out with 500. Happens since v2.3."
# First class known to the fitted repo encoder, so the repo value is one it can map.
sample_repo = repo_encoder.classes_[0]

# Build the feature row directly so its columns can be compared with the model's.
features = triage.feature_extractor.extract_all_features(
    repo_encoder=repo_encoder,
    repo=sample_repo,
    text=f"{sample_title}\n{sample_body}",
)

# Debug aid: the two counts should agree; the column preview helps spot ordering issues.
model_feature_names = triage.model.get_booster().feature_names
first_10 = list(features.columns[:10])
print("Model features (count):", len(model_feature_names))
print("Input features (count):", features.shape[1])
print("First 10 input cols:", first_10)

# End-to-end prediction through the same entry point the UI calls.
result = triage.predict(
    threshold=0.35,
    repo=sample_repo,
    body=sample_body,
    title=sample_title,
)

# Truncate the pretty-printed JSON so an unexpectedly large payload cannot flood the output.
print(json.dumps(result, indent=2)[:2000])


Model features (count): 659
Input features (count): 659
First 10 input cols: ['created_hour', 'created_day_of_week', 'created_month', 'n_days_to_resolution', 'title_length', 'body_length', 'title_word_count', 'body_word_count', 'code_block_count', 'url_count']
{
  "primary_category": {
    "category": "is_bug_cat",
    "confidence": 0.9812637567520142,
    "action_needed": true
  },
  "secondary_suggestions": [],
  "triage_recommendations": [
    {
      "type": "high_confidence_bug",
      "message": "High confidence bug report - Immediate review recommended",
      "priority": "high"
    }
  ],
  "repo_context": {
    "repository": "2dust/v2rayN",
    "typical_response_time": "2-3 days",
    "similar_issues_count": 5
  }
}


In [29]:
# 10) Quick verification snippet for UI-like prediction
# NOTE(review): this cell repeats the step-9 verification (same sample inputs and
# threshold) without the feature-name debug output; consider dropping one of them.
from smart_triage import SmartIssueTriage

# Load the triage helper from the exported artifact directory.
triage = SmartIssueTriage(model_dir=OUTPUT_DIR)

sample_title = "Error in login flow: users cannot reset password"
sample_body = "Users report password reset links error out with 500. Happens since v2.3."
# pick a repo from the fitted encoder classes for a valid mapping
sample_repo = repo_encoder.classes_[0]

# Run the prediction through the triage object's predict() entry point.
result = triage.predict(
    title=sample_title,
    body=sample_body,
    repo=sample_repo,
    threshold=0.35
)

# Pretty-print the result, truncated to the first 2000 characters.
print(json.dumps(result, indent=2)[:2000])


{
  "primary_category": {
    "category": "is_bug_cat",
    "confidence": 0.9812637567520142,
    "action_needed": true
  },
  "secondary_suggestions": [],
  "triage_recommendations": [
    {
      "type": "high_confidence_bug",
      "message": "High confidence bug report - Immediate review recommended",
      "priority": "high"
    }
  ],
  "repo_context": {
    "repository": "2dust/v2rayN",
    "typical_response_time": "2-3 days",
    "similar_issues_count": 5
  }
}
