<a href="https://colab.research.google.com/gist/bodhichristian/1040e60dc4d16ea7092f13caf69fecf9/model-experimentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Experimentation

In this step, we'll experiment with various models for modeling the GitHub Issues dataset to establish benchmarks and find the most performmative models for the data.



### Imports

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, classification_report, precision_score, recall_score, accuracy_score, hamming_loss
from sklearn.multiclass import OneVsRestClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN

### Load dataframe

In [2]:
df = pd.read_csv('/content/drive/MyDrive/capstone/github_issues_numeric.csv')
df.head()

Unnamed: 0,has_bug_label,has_help_wanted_label,has_enhancement_label,has_documentation_label,has_regression_label,has_question_label,has_type__bug_label,has_docs_label,has_type_bug_label,n_days_to_resolution,...,bert_374,bert_375,bert_376,bert_377,bert_378,bert_379,bert_380,bert_381,bert_382,bert_383
0,0,1,0,0,0,0,0,0,0,13.68,...,-0.003904,0.023796,0.005926,-0.048231,0.101591,-0.043737,0.028143,0.039316,-0.009858,-0.089767
1,1,0,0,0,1,0,0,0,0,32.43,...,0.022158,0.063298,-0.011937,0.07815,-0.00129,0.034937,0.034735,-0.024457,-0.046381,0.060667
2,1,1,0,0,0,0,0,0,0,1564.52,...,0.068467,-0.042702,0.010044,0.002103,-0.035106,0.046767,-0.07463,-0.016325,0.049012,0.078562
3,1,1,0,0,0,0,0,0,0,4.08,...,0.009826,-0.037378,-0.003346,-0.036569,-0.07422,-0.109944,0.064531,-0.107948,0.023862,-0.04209
4,1,0,0,0,0,0,0,0,0,3.46,...,0.098708,-0.021252,0.032433,-0.023639,0.021608,0.005252,0.024758,0.014697,0.015106,0.0696


### Process data

In [3]:
# Combine related labels
df['has_bug_label'] = ((df['has_bug_label'] == 1) | (df['has_type__bug_label'] == 1)).astype(int)
df['has_documentation_label'] = ((df['has_documentation_label'] == 1) | (df['has_docs_label'] == 1)).astype(int)
df['has_help_wanted_label'] = ((df['has_help_wanted_label'] == 1) | (df['has_question_label'] == 1)).astype(int)

# Drop unused columns
cols_to_drop = ['has_type__bug_label', 'has_type_bug_label', 'has_docs_label', 'has_question_label']
df.drop(columns=cols_to_drop, inplace=True)
df.head()

Unnamed: 0,has_bug_label,has_help_wanted_label,has_enhancement_label,has_documentation_label,has_regression_label,n_days_to_resolution,title_length,body_length,title_word_count,body_word_count,...,bert_374,bert_375,bert_376,bert_377,bert_378,bert_379,bert_380,bert_381,bert_382,bert_383
0,0,1,0,0,0,13.68,15,1107,2,166,...,-0.003904,0.023796,0.005926,-0.048231,0.101591,-0.043737,0.028143,0.039316,-0.009858,-0.089767
1,1,0,0,0,1,32.43,100,618,15,94,...,0.022158,0.063298,-0.011937,0.07815,-0.00129,0.034937,0.034735,-0.024457,-0.046381,0.060667
2,1,1,0,0,0,1564.52,45,511,8,88,...,0.068467,-0.042702,0.010044,0.002103,-0.035106,0.046767,-0.07463,-0.016325,0.049012,0.078562
3,1,1,0,0,0,4.08,104,268,10,25,...,0.009826,-0.037378,-0.003346,-0.036569,-0.07422,-0.109944,0.064531,-0.107948,0.023862,-0.04209
4,1,0,0,0,0,3.46,70,1464,7,175,...,0.098708,-0.021252,0.032433,-0.023639,0.021608,0.005252,0.024758,0.014697,0.015106,0.0696


In [4]:
# Export for reuse
# df.to_csv('github_issues_ready')

### Explore imbalance

The dataset is quite unbalanced with the majority class has_bug_label constituting 85% of the samples. We'll explore a baseline assessment without any balancing, along with several balancing techniques such as Random Oversampling, SMOTE, and more.

In [5]:
label_cols = [col for col in df.columns if col.startswith("has_") and col.endswith("_label")]

print(df[label_cols].sum().sort_values(ascending=False))

has_bug_label              13380
has_help_wanted_label        775
has_documentation_label      684
has_enhancement_label        636
has_regression_label         134
dtype: int64


In [6]:
targets = [
    "has_bug_label",
    "has_help_wanted_label",
    "has_documentation_label",
    "has_enhancement_label",
    "has_regression_label"
]

X = df.drop(columns=targets)
y = df[targets]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Model training comparison

In [7]:
# Random Forest
rf = MultiOutputClassifier(RandomForestClassifier(random_state=42))
rf.fit(X_train, y_train)
rf_preds = np.array(rf.predict(X_test))

print("=== Random Forest ===")
print("Accuracy:", accuracy_score(np.array(y_test), rf_preds))
print(classification_report(np.array(y_test), rf_preds, zero_division=0))

# XGBoost
xgb = MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
xgb.fit(X_train, y_train)
xgb_preds = np.array(xgb.predict(X_test))

print("=== XGBoost ===")
print("Accuracy:", accuracy_score(np.array(y_test), xgb_preds))
print(classification_report(np.array(y_test), xgb_preds, zero_division=0))

=== Random Forest ===
Accuracy: 0.7862738953306173
              precision    recall  f1-score   support

           0       0.84      1.00      0.91      2649
           1       1.00      0.01      0.01       172
           2       1.00      0.02      0.04       143
           3       1.00      0.02      0.03       128
           4       0.00      0.00      0.00        24

   micro avg       0.84      0.85      0.85      3116
   macro avg       0.77      0.21      0.20      3116
weighted avg       0.86      0.85      0.78      3116
 samples avg       0.83      0.81      0.82      3116



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== XGBoost ===
Accuracy: 0.818238796615481
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      2649
           1       0.58      0.04      0.08       172
           2       0.73      0.27      0.39       143
           3       0.57      0.24      0.34       128
           4       0.00      0.00      0.00        24

   micro avg       0.90      0.85      0.87      3116
   macro avg       0.56      0.30      0.35      3116
weighted avg       0.86      0.85      0.83      3116
 samples avg       0.83      0.81      0.81      3116



## Explore balancing the data

In [8]:
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
multi_rf = MultiOutputClassifier(rf)
multi_rf.fit(X_train, y_train)

In [9]:
y_pred = multi_rf.predict(X_test)

In [10]:
for i, col in enumerate(y_test.columns):
    print(f"--- {col} ---")
    print(classification_report(y_test[col], y_pred[:, i]))

--- has_bug_label ---
              precision    recall  f1-score   support

           0       0.92      0.04      0.08       542
           1       0.84      1.00      0.91      2649

    accuracy                           0.84      3191
   macro avg       0.88      0.52      0.50      3191
weighted avg       0.85      0.84      0.77      3191

--- has_help_wanted_label ---
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3019
           1       0.00      0.00      0.00       172

    accuracy                           0.95      3191
   macro avg       0.47      0.50      0.49      3191
weighted avg       0.90      0.95      0.92      3191

--- has_documentation_label ---
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3048
           1       1.00      0.03      0.07       143

    accuracy                           0.96      3191
   macro avg       0.98      0.52      0.52 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
brf = MultiOutputClassifier(BalancedRandomForestClassifier(n_estimators=100, random_state=42))
brf.fit(X_train, y_train)

### Hyperparameter tuning

In [14]:
rf_base = RandomForestClassifier(class_weight='balanced', random_state=42)
multi_rf = MultiOutputClassifier(rf_base)

param_dist = {
    'estimator__n_estimators': [50, 100],          # smaller number of trees
    'estimator__max_depth': [None, 10, 20],       # fewer depths
    'estimator__min_samples_split': [2, 5],       # fewer options
    'estimator__min_samples_leaf': [1, 2],        # fewer options
    'estimator__max_features': ['sqrt', 'log2']   # remove None for now
}

random_search = RandomizedSearchCV(
    multi_rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
    random_state=42
)

random_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [15]:
print("Best hyperparameters:", random_search.best_params_)

best_model = random_search.best_estimator_
y_pred = np.array(best_model.predict(X_test))

print("Accuracy:", accuracy_score(y_test, y_pred))
for i, col in enumerate(y_test.columns):
    print(f"--- {col} ---")
    print(classification_report(y_test[col], y_pred[:, i], zero_division=0))

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_micro = f1_score(y_test, y_pred, average='micro')
hl = hamming_loss(y_test, y_pred)

print(f"F1 macro: {f1_macro:.4f}")
print(f"F1 micro: {f1_micro:.4f}")
print(f"Hamming loss: {hl:.4f}")

Best hyperparameters: {'estimator__n_estimators': 100, 'estimator__min_samples_split': 5, 'estimator__min_samples_leaf': 1, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 10}
Accuracy: 0.795675336884989
--- has_bug_label ---
              precision    recall  f1-score   support

           0       0.74      0.41      0.53       542
           1       0.89      0.97      0.93      2649

    accuracy                           0.88      3191
   macro avg       0.81      0.69      0.73      3191
weighted avg       0.86      0.88      0.86      3191

--- has_help_wanted_label ---
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3019
           1       0.67      0.01      0.02       172

    accuracy                           0.95      3191
   macro avg       0.81      0.51      0.50      3191
weighted avg       0.93      0.95      0.92      3191

--- has_documentation_label ---
              precision    recall  f1-score   s