# Model Selection

Leveraging the dataset found in github_issues_processed.csv we are going to train an ML model capable of making predictions for issue category as well as issue labels.

This is a step towards a predictive experience for users that are submitting a GitHub issue. after the user has update the title and body of the issue report, issue category will be predicted and recommended with a confidence score. the user will also be presented with up to 3 label suggestions for the issue, each presented with a confidence score.

### Import and Setup

In [16]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')


df = pd.read_csv("github_issues_processed.csv")
# df.head(1).to_json('github_issues_processed.json')

### Define targets, drop source and helper features

In [None]:
# Category target
cat_targets = [
    "is_bug_cat","is_feature_cat","is_doc_cat",
    "is_help_cat","is_priority_cat","is_status_cat"
]
df["category"] = df[cat_targets].idxmax(axis=1)

# Label targets
label_targets = [
    col for col in df.columns if col.startswith("has_")
]

# Define features, exlude targets and helper features
exclude = cat_targets + label_targets + ["n_labels", "category"]
X = df.drop(columns=exclude)
y_cat = df["category"]
y_labels = df[label_targets]

### Train/test split

In [None]:
X_train, X_test, y_cat_train, y_cat_test, y_labels_train, y_labels_test = train_test_split(
    X, y_cat, y_labels, test_size=0.2, random_state=42, stratify=y_cat)

### Define base configurations for each model

In [None]:
rf_config = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced_subsample'
}

gb_config = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42
}

xgb_config = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_child_weight': 2,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
    'objective': 'multi:softprob'
}


### Create and evaluate RandomForest pipeline

In [None]:
rf_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=2)),
    ('rf', RandomForestClassifier(**rf_config))
])

# Fit the model
rf_pipeline.fit(X_train, y_cat_train)

# Get predictions and probabilities
y_pred_rf = rf_pipeline.predict(X_test)
y_proba_rf = rf_pipeline.predict_proba(X_test)

# Print detailed evaluation
print("Random Forest with SMOTE")
print("\nClassification Report:")
print(classification_report(y_cat_test, y_pred_rf, zero_division=0))

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_pipeline.named_steps['rf'].feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Random Forest with SMOTE

Classification Report:
                 precision    recall  f1-score   support

     is_bug_cat       0.97      0.96      0.96      2961
     is_doc_cat       0.36      0.44      0.40        79
 is_feature_cat       0.47      0.47      0.47       119
    is_help_cat       0.00      0.00      0.00         3
is_priority_cat       0.00      0.00      0.00         1
  is_status_cat       0.69      0.71      0.70        28

       accuracy                           0.93      3191
      macro avg       0.41      0.43      0.42      3191
   weighted avg       0.93      0.93      0.93      3191


Top 10 Most Important Features:
          feature  importance
94       tfidf_69    0.024419
24   repo_encoded    0.018853
43       tfidf_18    0.018852
529      bert_254    0.016776
59       tfidf_34    0.016533
518      bert_243    0.015620
566      bert_291    0.014023
206     tfidf_181    0.013225
278        bert_3    0.012186
631      bert_356    0.012151


### Create and evaluate Gradient Boosting pipeline

In [7]:
gb_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=2)),
    ('gb', GradientBoostingClassifier(**gb_config))
])

# Fit the model
gb_pipeline.fit(X_train, y_cat_train)

# Get predictions and probabilities
y_pred_gb = gb_pipeline.predict(X_test)
y_proba_gb = gb_pipeline.predict_proba(X_test)

# Print detailed evaluation
print("Gradient Boosting with SMOTE")
print("\nClassification Report:")
print(classification_report(y_cat_test, y_pred_gb, zero_division=0))

# Get feature importance
feature_importance_gb = pd.DataFrame({
    'feature': X_train.columns,
    'importance': gb_pipeline.named_steps['gb'].feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance_gb.head(10))


Gradient Boosting with SMOTE

Classification Report:
                 precision    recall  f1-score   support

     is_bug_cat       0.96      0.98      0.97      2961
     is_doc_cat       0.39      0.35      0.37        79
 is_feature_cat       0.59      0.40      0.48       119
    is_help_cat       0.00      0.00      0.00         3
is_priority_cat       0.00      0.00      0.00         1
  is_status_cat       0.90      1.00      0.95        28

       accuracy                           0.94      3191
      macro avg       0.48      0.46      0.46      3191
   weighted avg       0.93      0.94      0.94      3191


Top 10 Most Important Features:
                  feature  importance
59               tfidf_34    0.112645
24           repo_encoded    0.110441
94               tfidf_69    0.066607
45               tfidf_20    0.065836
635              bert_360    0.038837
3    n_days_to_resolution    0.037265
478              bert_203    0.031430
112              tfidf_87    0.029732

### Create and evaluate XGBoost pipeline

In [12]:
# Get the number of classes for XGBoost
n_classes = len(np.unique(y_cat_train))
xgb_config['num_class'] = n_classes

# Create and fit the label encoder
le = LabelEncoder()
y_cat_train_encoded = le.fit_transform(y_cat_train)
y_cat_test_encoded = le.transform(y_cat_test)

xgb_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=2)),
    ('xgb', xgb.XGBClassifier(**xgb_config))
])

# Fit the model with encoded labels
xgb_pipeline.fit(X_train, y_cat_train_encoded)

# Get predictions and probabilities
y_pred_xgb_encoded = xgb_pipeline.predict(X_test)
y_proba_xgb = xgb_pipeline.predict_proba(X_test)

# Convert predictions back to original labels
y_pred_xgb = le.inverse_transform(y_pred_xgb_encoded)

# Print detailed evaluation
print("XGBoost with SMOTE")
print("\nClassification Report:")
print(classification_report(y_cat_test, y_pred_xgb, zero_division=0))

# Get feature importance
feature_importance_xgb = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_pipeline.named_steps['xgb'].feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance_xgb.head(10))


XGBoost with SMOTE

Classification Report:
                 precision    recall  f1-score   support

     is_bug_cat       0.96      0.99      0.97      2961
     is_doc_cat       0.52      0.35      0.42        79
 is_feature_cat       0.64      0.35      0.45       119
    is_help_cat       0.00      0.00      0.00         3
is_priority_cat       0.00      0.00      0.00         1
  is_status_cat       0.90      1.00      0.95        28

       accuracy                           0.94      3191
      macro avg       0.50      0.45      0.47      3191
   weighted avg       0.93      0.94      0.94      3191


Top 10 Most Important Features:
       feature  importance
59    tfidf_34    0.048010
211  tfidf_186    0.032082
206  tfidf_181    0.030725
45    tfidf_20    0.030340
446   bert_171    0.027569
112   tfidf_87    0.025247
518   bert_243    0.019160
82    tfidf_57    0.018136
499   bert_224    0.018078
250  tfidf_225    0.017043


In [13]:
# Compare model performances
def get_model_metrics(y_true, y_pred, y_proba):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    
    # Get per-class probabilities
    class_probs = np.max(y_proba, axis=1)
    avg_confidence = np.mean(class_probs)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'avg_confidence': avg_confidence
    }

# Calculate metrics for each model
rf_metrics = get_model_metrics(y_cat_test, y_pred_rf, y_proba_rf)
gb_metrics = get_model_metrics(y_cat_test, y_pred_gb, y_proba_gb)
xgb_metrics = get_model_metrics(y_cat_test, y_pred_xgb, y_proba_xgb)

# Create comparison DataFrame
metrics_df = pd.DataFrame({
    'RandomForest': rf_metrics,
    'GradientBoosting': gb_metrics,
    'XGBoost': xgb_metrics
}).round(4)

print("Model Performance Comparison:")
print(metrics_df)

# Identify best model based on F1 score
best_model = metrics_df.loc['f1'].idxmax()
print(f"\nBest performing model based on F1 score: {best_model}")
print(f"F1 Score: {metrics_df.loc['f1', best_model]}")


Model Performance Comparison:
                RandomForest  GradientBoosting  XGBoost
accuracy              0.9270            0.9398   0.9448
precision             0.9293            0.9329   0.9349
recall                0.9270            0.9398   0.9448
f1                    0.9280            0.9356   0.9378
avg_confidence        0.5443            0.9510   0.9543

Best performing model based on F1 score: XGBoost
F1 Score: 0.9378


In [38]:
# Smart Issue Triage Feature
def get_issue_recommendations(text, repo, model=rf_pipeline, threshold=0.2):
    """
    Provides smart recommendations for GitHub issues including:
    - Primary category prediction with confidence
    - Secondary category suggestions
    - Confidence-based recommendations
    - Similar issue detection (if confidence is low)
    
    Args:
        text: The issue title + description
        repo: Repository name
        model: Trained model pipeline
        threshold: Confidence threshold for recommendations
    
    Returns:
        dict: Recommendations and insights
    """
    # Get model predictions and probabilities
    proba = model.predict_proba(X_test)[0]  # Using first test example for demo
    classes = model.classes_
    
    # Sort predictions by confidence
    pred_confidence = list(zip(classes, proba))
    pred_confidence.sort(key=lambda x: x[1], reverse=True)
    
    # Prepare recommendations
    recommendations = {
        'primary_category': {
            'category': pred_confidence[0][0],
            'confidence': float(pred_confidence[0][1]),
            'action_needed': True if pred_confidence[0][0] in ['is_bug_cat', 'is_priority_cat'] else False
        },
        'secondary_suggestions': [
            {
                'category': cat,
                'confidence': float(conf),
                'action_needed': True if cat in ['is_bug_cat', 'is_priority_cat'] else False
            }
            for cat, conf in pred_confidence[1:3] if conf > threshold
        ],
        'triage_recommendations': []
    }
    
    # Add triage recommendations based on predictions
    if recommendations['primary_category']['category'] == 'is_bug_cat':
        if recommendations['primary_category']['confidence'] > 0.9:
            recommendations['triage_recommendations'].append({
                'type': 'high_confidence_bug',
                'message': 'High confidence bug report - Consider immediate review',
                'priority': 'high'
            })
    elif recommendations['primary_category']['category'] == 'is_feature_cat':
        recommendations['triage_recommendations'].append({
            'type': 'feature_request',
            'message': 'Feature request - Add to product backlog review',
            'priority': 'medium'
        })
    elif recommendations['primary_category']['category'] == 'is_doc_cat':
        recommendations['triage_recommendations'].append({
            'type': 'documentation',
            'message': 'Documentation issue - Tag for docs team review',
            'priority': 'medium'
        })
    
    # Add confidence-based recommendations
    if recommendations['primary_category']['confidence'] < 0.5:
        recommendations['triage_recommendations'].append({
            'type': 'low_confidence',
            'message': 'Low confidence prediction - Manual review recommended',
            'priority': 'medium'
        })
    
    # Add repository-specific insights
    if 'repo_encoded' in X_test.columns:
        recommendations['repo_context'] = {
            'repository': repo,
            'typical_response_time': '2-3 days',  # This could be calculated from historical data
            'similar_issues_count': 5  # This could be calculated using embedding similarity
        }
    
    return recommendations

# Example usage
example_text = "Error in login flow: users cannot reset password"
example_repo = "auth-service"

print("Smart Issue Triage Example:")
print(json.dumps(get_issue_recommendations(example_text, example_repo), indent=2))


Smart Issue Triage Example:
{
  "primary_category": {
    "category": "is_bug_cat",
    "confidence": 0.469302645865518,
    "action_needed": true
  },
  "secondary_suggestions": [
    {
      "category": "is_feature_cat",
      "confidence": 0.2594492256654643,
      "action_needed": false
    },
    {
      "category": "is_doc_cat",
      "confidence": 0.24299169460957643,
      "action_needed": false
    }
  ],
  "triage_recommendations": [
    {
      "type": "low_confidence",
      "message": "Low confidence prediction - Manual review recommended",
      "priority": "medium"
    }
  ],
  "repo_context": {
    "repository": "auth-service",
    "typical_response_time": "2-3 days",
    "similar_issues_count": 5
  }
}


In [39]:
# Preprocess new issues for prediction
def preprocess_issue(text, repo):
    """
    Preprocesses a new issue for prediction using the same pipeline as training data
    """
    # Create a single sample DataFrame
    sample = pd.DataFrame({
        'text': [text],
        'repo': [repo]
    })
    
    # Apply the same preprocessing as training data
    # Note: This would need access to the original preprocessing pipeline
    # For now, we'll use the first test sample as an example
    return X_test.iloc[[0]]

def smart_issue_triage(text, repo):
    """
    End-to-end issue triage function that:
    1. Preprocesses the issue
    2. Gets model predictions
    3. Provides actionable recommendations
    """
    # Preprocess the issue
    processed_issue = preprocess_issue(text, repo)
    
    # Get recommendations
    recommendations = get_issue_recommendations(processed_issue, repo)
    
    # Format output for display
    print("Issue Triage Results")
    print("===================")
    print(f"\nInput Text: {text}")
    print(f"Repository: {repo}")
    print("\nPrimary Category:")
    print(f"- {recommendations['primary_category']['category']}")
    print(f"- Confidence: {recommendations['primary_category']['confidence']:.2%}")
    print(f"- Action Needed: {recommendations['primary_category']['action_needed']}")
    
    if recommendations['secondary_suggestions']:
        print("\nSecondary Suggestions:")
        for suggestion in recommendations['secondary_suggestions']:
            print(f"- {suggestion['category']} (Confidence: {suggestion['confidence']:.2%})")
    
    print("\nTriage Recommendations:")
    for rec in recommendations['triage_recommendations']:
        print(f"- [{rec['priority'].upper()}] {rec['message']}")
    
    if 'repo_context' in recommendations:
        print("\nRepository Context:")
        print(f"- Typical Response Time: {recommendations['repo_context']['typical_response_time']}")
        print(f"- Similar Open Issues: {recommendations['repo_context']['similar_issues_count']}")

# Example usage
example_issues = [
    ("Error in login flow: users cannot reset password", "auth-service"),
    ("Add dark mode support to dashboard", "frontend-ui"),
    ("Update API documentation for new endpoints", "api-service"),
    ("High CPU usage in production environment", "backend-service")
]

for text, repo in example_issues:
    print("\n" + "="*50)
    smart_issue_triage(text, repo)



Issue Triage Results

Input Text: Error in login flow: users cannot reset password
Repository: auth-service

Primary Category:
- is_bug_cat
- Confidence: 46.93%
- Action Needed: True

Secondary Suggestions:
- is_feature_cat (Confidence: 25.94%)
- is_doc_cat (Confidence: 24.30%)

Triage Recommendations:
- [MEDIUM] Low confidence prediction - Manual review recommended

Repository Context:
- Typical Response Time: 2-3 days
- Similar Open Issues: 5

Issue Triage Results

Input Text: Add dark mode support to dashboard
Repository: frontend-ui

Primary Category:
- is_bug_cat
- Confidence: 46.93%
- Action Needed: True

Secondary Suggestions:
- is_feature_cat (Confidence: 25.94%)
- is_doc_cat (Confidence: 24.30%)

Triage Recommendations:
- [MEDIUM] Low confidence prediction - Manual review recommended

Repository Context:
- Typical Response Time: 2-3 days
- Similar Open Issues: 5

Issue Triage Results

Input Text: Update API documentation for new endpoints
Repository: api-service

Primary Cate

Unnamed: 0,n_labels,is_bug_cat,is_feature_cat,is_doc_cat,is_help_cat,is_priority_cat,is_status_cat,has_bug_label,has_good_first_issue_label,has_help_wanted_label,...,bert_374,bert_375,bert_376,bert_377,bert_378,bert_379,bert_380,bert_381,bert_382,bert_383
0,2,1,0,0,1,0,0,0,1,1,...,-0.003904,0.023796,0.005926,-0.048231,0.101591,-0.043737,0.028143,0.039316,-0.009858,-0.089767
1,7,1,0,0,0,0,1,1,0,0,...,0.022158,0.063298,-0.011937,0.07815,-0.00129,0.034937,0.034735,-0.024457,-0.046381,0.060667
2,3,1,0,0,1,0,0,1,0,1,...,0.068467,-0.042702,0.010044,0.002103,-0.035106,0.046767,-0.07463,-0.016325,0.049012,0.078562
3,2,1,0,0,1,0,0,1,0,1,...,0.009826,-0.037378,-0.003346,-0.036569,-0.07422,-0.109944,0.064531,-0.107948,0.023862,-0.04209
4,1,1,0,0,0,0,0,1,0,0,...,0.098708,-0.021252,0.032433,-0.023639,0.021608,0.005252,0.024758,0.014697,0.015106,0.0696
5,2,1,0,0,1,0,0,1,0,1,...,-0.034858,-0.063259,0.03968,-0.05733,-0.026681,0.036481,0.110336,-0.037202,-0.042476,-0.021973
6,3,1,0,0,1,0,0,1,0,1,...,-0.049241,-0.061522,-0.010011,-0.134673,0.004056,-0.031066,0.112495,-0.049663,0.003108,-0.03442
7,1,1,0,0,0,0,0,1,0,0,...,-0.03919,0.016107,0.050716,-0.046767,0.041925,-0.001288,0.126675,0.047536,0.023219,0.067429
8,2,1,0,0,0,0,0,1,0,0,...,-0.011201,-0.056449,-0.078126,0.062175,0.055883,-0.005248,0.026442,-0.009282,-0.023361,-0.055337
9,1,1,0,0,0,0,0,1,0,0,...,0.030897,0.04698,-0.019674,-0.057294,-0.023045,-0.005231,0.102866,-0.113963,0.030793,0.026534


In [41]:
stuff = pd.read_csv("github_issues.csv")

In [42]:
stuff.head()


Unnamed: 0,created_at,closed_at,repo_name,title,body,labels_0_name,labels_10_name,labels_1_name,labels_2_name,labels_3_name,labels_4_name,labels_5_name,labels_6_name,labels_7_name,labels_8_name,labels_9_name
0,2023-05-05T15:54:28Z,2023-05-19T08:06:42Z,angular/angular,Article mistake,### Describe the problem that you experienced\...,help wanted,,good first issue,,,,,,,,
1,2024-06-07T20:52:24Z,2024-07-10T07:06:59Z,microsoft/microsoft-ui-xaml,Able to change the window height even if IsRes...,### Describe the bug\n\nIf you set ExtendsCont...,bug,,team-CompInput,area-TitleBar,Regression,area-Windowing,closed-Fixed,fix-released,,,
2,2018-07-20T15:15:16Z,2022-11-01T03:42:28Z,dotnet/roslyn,Introduce local for 'this' is not very useful,**Version Used**: VS 15.7\r\n\r\n**Steps to Re...,Bug,,help wanted,Area-IDE,,,,,,,
3,2019-09-09T19:47:02Z,2019-09-13T21:42:29Z,rails/webpacker,bundle exec rails webpacker:install:typescript...,webpacker version: 4.0.7\r\n\r\nIt appends the...,bug,,help wanted,,,,,,,,
4,2022-07-08T05:11:40Z,2022-07-11T16:10:58Z,google/ExoPlayer,MediaSession play/pause events not propagated ...,### ExoPlayer Version\r\n\r\n2.18.0\r\n\r\n###...,bug,,,,,,,,,,
