# Model Selection

Leveraging the dataset in `github_issues_processed.csv`, we are going to train an ML model capable of predicting the issue category as well as the issue labels.

This is a step towards a predictive experience for users who are submitting a GitHub issue. After the user has updated the title and body of the issue report, the issue category will be predicted and recommended with a confidence score. The user will also be presented with up to 3 label suggestions for the issue, each with its own confidence score.

### Import and Setup

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

# Load the pre-processed issues table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="github_issues_processed.csv")

### Define targets, drop source and helper features

In [35]:
# --- Category target -------------------------------------------------------
# Indicator columns, one per issue category.
cat_targets = [
    "is_bug_cat",
    "is_feature_cat",
    "is_doc_cat",
    "is_help_cat",
    "is_priority_cat",
    "is_status_cat",
]
# Collapse the indicators into one categorical column: for each row, the name
# of the column holding the largest value.
# NOTE(review): rows where several indicators tie (including all-zero rows)
# resolve to the first column in `cat_targets` -- confirm that is intended.
df["category"] = df[cat_targets].idxmax(axis="columns")

# --- Label targets ---------------------------------------------------------
# Every column whose name starts with "has_" is treated as a label target.
label_targets = list(filter(lambda name: name.startswith("has_"), df.columns))

# --- Feature matrix and targets --------------------------------------------
# Features are everything except the targets and the helper columns
# ("n_labels" and the derived "category").
exclude = [*cat_targets, *label_targets, "n_labels", "category"]
X = df.drop(columns=exclude)
y_cat = df["category"]
y_labels = df[label_targets]

### Split data for training and testing

In [36]:
# Hold out 20% of the rows for testing. Features, category target and label
# targets are split together so their rows stay aligned; the split is
# stratified on the category target and seeded for reproducibility.
(
    X_train,
    X_test,
    y_cat_train,
    y_cat_test,
    y_labels_train,
    y_labels_test,
) = train_test_split(
    X,
    y_cat,
    y_labels,
    test_size=0.2,
    random_state=42,
    stratify=y_cat,
)

### Category Prediction

In [None]:
# --- Logistic Regression with RandomOverSampler ---
# Seeded sampler, also reused by the random-forest pipeline in this cell.
# Because it is a step of an imblearn Pipeline, resampling is applied while
# fitting only; predictions on X_test are made on the untouched test rows.
ros = RandomOverSampler(random_state=42)

cat_lr_ros = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),  # safe for sparse/high-dim embeddings
    ('oversample', ros),
    ('logreg', LogisticRegression(
        max_iter=5000,
        solver="saga",
        # NOTE(review): with the default RandomOverSampler strategy the classes
        # reaching the classifier should already be balanced, so this weighting
        # is likely a no-op -- kept to preserve the original configuration.
        class_weight="balanced",
        # saga shuffles the training data; seed it like the split, the sampler
        # and the forest so that the report below is reproducible run to run.
        random_state=42,
        n_jobs=-1
    ))
])

# Fit on the training split, then score the held-out test split.
cat_lr_ros.fit(X_train, y_cat_train)
y_pred_lr = cat_lr_ros.predict(X_test)

print("Logistic Regression with RandomOverSampler")
# zero_division=0 reports 0 (instead of warning) for classes never predicted.
print(classification_report(y_cat_test, y_pred_lr, zero_division=0))

# --- Random Forest with RandomOverSampler ---
# Tree ensemble: no scaling step, only oversampling (via the shared `ros`
# sampler defined earlier in this cell) followed by the forest.
forest = RandomForestClassifier(
    n_estimators=500,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42,
)
cat_rf_ros = Pipeline(steps=[
    ('oversample', ros),
    ('rf', forest),
])

# fit() returns the fitted pipeline, so predictions can be chained directly.
y_pred_rf = cat_rf_ros.fit(X_train, y_cat_train).predict(X_test)

print("Random Forest with RandomOverSampler")
rf_report = classification_report(y_cat_test, y_pred_rf, zero_division=0)
print(rf_report)