In [81]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

In [82]:
# Dataset locations, relative to the notebook's working directory.
# DATA_DIR is the single place to edit if the dataset folder is moved; the two
# derived constants keep exactly the same string values as before.
DATA_DIR = "final_datasets"
TRAIN_VAL_DATASET = f"{DATA_DIR}/train_val/final.csv"  # pool that is later split into train/validation
TEST_DATASET = f"{DATA_DIR}/test/final.csv"            # separate file used for the test evaluation

In [83]:
# Read both CSV files into DataFrames: the train/validation pool first,
# then the test file, in the same order as the original two statements.
df_train_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_VAL_DATASET, TEST_DATASET)
)

In [84]:
# Leakage check: with no explicit join keys, the inner merge joins on every
# column the two frames share, so only rows present in both frames survive.
common = pd.merge(
    left=df_train_val,
    right=df_test,
    how='inner',
)
print(common)
print("Number of identical rows:", len(common))

Empty DataFrame
Columns: [author_acceptance_rate, core_contributor_flag, previous_prs, year_created, time_to_close, comments_burstiness, num_comments, num_reviewers, avg_max_args, avg_multi_comments, max_max_cc, min_call_count, pr_time_label]
Index: []
Number of identical rows: 0


In [85]:
# Preview the train/validation frame (last expression -> rich display).
# Note: no rounding is applied to the data. `DataFrame.round()` returns a new
# frame and leaves its receiver untouched, so a bare `df_train_val.round()`
# whose result is discarded only costs a copy; that dead call was removed.
df_train_val

Unnamed: 0,author_acceptance_rate,core_contributor_flag,previous_prs,year_created,time_to_close,comments_burstiness,num_comments,num_reviewers,avg_max_args,avg_multi_comments,max_max_cc,min_call_count,pr_time_label
0,0.0,0,0,2025,0.067222,0.000000,2,2,9.000000,135.000000,10.0,88.0,rejected
1,0.0,0,1,2025,42.694444,0.000000,1,1,9.000000,135.000000,10.0,88.0,rejected
2,0.0,0,0,2024,4293.194444,0.000000,1,1,2.000000,58.000000,1.0,236.0,accepted
3,0.0,0,0,2024,4.407222,0.586390,3,3,3.500000,76.500000,6.0,63.0,accepted
4,0.0,0,0,2020,0.921667,0.000000,0,0,17.000000,0.000000,17.0,251.0,rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.0,0,0,2024,932.561667,0.000000,0,0,2.000000,12.500000,13.0,0.0,rejected
5996,0.0,0,0,2020,1.350833,0.998579,3,1,5.000000,20.000000,21.0,206.0,rejected
5997,0.0,0,1,2021,28.285000,2.063491,8,1,2.500000,22.500000,8.0,64.0,rejected
5998,0.0,0,2,2021,0.135833,0.000000,0,0,4.307692,88.923077,27.0,1.0,rejected


In [86]:
# Show the column names of the train/validation frame as a plain Python list.
print(list(df_train_val.columns))

['author_acceptance_rate', 'core_contributor_flag', 'previous_prs', 'year_created', 'time_to_close', 'comments_burstiness', 'num_comments', 'num_reviewers', 'avg_max_args', 'avg_multi_comments', 'max_max_cc', 'min_call_count', 'pr_time_label']


In [87]:
# Select the test columns in exactly the order used by the train/validation
# frame; a column missing from df_test raises KeyError here.
df_test = df_test[df_train_val.columns]
# Preview only. No rounding is applied: `DataFrame.round()` returns a new frame,
# so calling it without using the result has no effect; that dead call was removed.
df_test

Unnamed: 0,author_acceptance_rate,core_contributor_flag,previous_prs,year_created,time_to_close,comments_burstiness,num_comments,num_reviewers,avg_max_args,avg_multi_comments,max_max_cc,min_call_count,pr_time_label
0,0.0,0,0,2025,14.806111,0.000000,1,1,11.0000,1467.000000,36.0,250.0,rejected
1,0.0,0,0,2018,1173.121389,0.674944,10,5,4.0625,44.437500,30.0,79.0,rejected
2,0.0,0,1,2018,0.214722,0.000000,0,0,5.0000,1067.000000,15.0,287.0,rejected
3,0.0,0,0,2025,0.958889,0.000000,0,0,3.0000,6.500000,13.0,26.0,rejected
4,0.0,0,1,2025,1308.575556,0.000000,2,2,11.0000,187.000000,56.0,142.0,rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6951,0.0,0,0,2025,1.478611,0.000000,2,2,5.0000,715.000000,25.0,95.0,accepted
6952,1.0,0,1,2025,18.141667,0.000000,1,1,6.0000,380.666667,15.0,337.0,accepted
6953,0.0,0,0,2024,174.446389,0.000000,1,1,5.0000,11.000000,8.0,4097.0,accepted
6954,0.0,0,0,2017,1241.131389,2.558994,12,3,11.5000,369.000000,39.0,243.0,rejected


In [88]:
# Repeat the overlap check now that df_test has the same column order as
# df_train_val: the inner merge keeps only rows found in both frames.
common = pd.merge(
    left=df_train_val,
    right=df_test,
    how='inner',
)
print(common)
print("Number of identical rows:", len(common))

Empty DataFrame
Columns: [author_acceptance_rate, core_contributor_flag, previous_prs, year_created, time_to_close, comments_burstiness, num_comments, num_reviewers, avg_max_args, avg_multi_comments, max_max_cc, min_call_count, pr_time_label]
Index: []
Number of identical rows: 0


In [89]:
# Distinct target labels in reverse-sorted order; each label is then numbered
# by its position in that list (first label -> 0, second -> 1, ...).
label_values = sorted(df_train_val['pr_time_label'].unique(), reverse=True)
label_map = dict(zip(label_values, range(len(label_values))))
print("LABEL MAP =", label_map)

LABEL MAP = {'rejected': 0, 'accepted': 1}


In [90]:
# Target: the label column encoded to integers through label_map.
y = df_train_val['pr_time_label'].map(label_map).values
# Features: every remaining column of the train/validation frame.
X = df_train_val.drop(columns="pr_time_label")
print("Encoded labels:", np.unique(y))

Encoded labels: [0 1]


In [91]:
# 80/20 train/validation split, stratified on the encoded label so both parts
# keep the same class proportions; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

# Report the split sizes and the per-class counts of each part.
print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}")
print("Distribution train: ", np.unique(y_train, return_counts=True))
print("Distribution val: ", np.unique(y_val, return_counts=True))

Train size: 4800, Validation size: 1200
Distribution train:  (array([0, 1]), array([2009, 2791]))
Distribution val:  (array([0, 1]), array([502, 698]))


In [92]:
# Per-class weights from scikit-learn's 'balanced' mode, computed on the
# training labels only. The class with fewer training samples gets the larger weight.
present_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=y_train,
)
# Key every weight by its actual class label rather than by its position in
# the array (the two only coincide when labels are exactly 0..n-1), and store
# plain Python numbers so the dict prints cleanly and can be passed on as-is.
class_weights_dict = {
    int(label): float(weight)
    for label, weight in zip(present_classes, class_weights)
}
print("Class weights:", class_weights_dict)

Class weights: {0: np.float64(1.1946241911398705), 1: np.float64(0.8599068434252956)}


In [93]:
# Test features: every column except the target.
X_test = df_test.drop("pr_time_label", axis=1)

# Encode the test labels with the map built from the train/validation labels.
# `Series.map` yields NaN for any label that is not a key of label_map, which
# would silently turn y_test into a float array containing NaN; fail loudly instead.
y_test_encoded = df_test['pr_time_label'].map(label_map)
unmapped_mask = y_test_encoded.isna()
if unmapped_mask.any():
    unknown_labels = sorted(
        df_test.loc[unmapped_mask, 'pr_time_label'].astype(str).unique()
    )
    raise ValueError(
        f"Test set contains labels missing from label_map: {unknown_labels}"
    )
y_test = y_test_encoded.values

print("Encoded labels:", np.unique(y_test))
print("Distribution test: ", np.unique(y_test, return_counts=True))

Encoded labels: [0 1]
Distribution test:  (array([0, 1]), array([3910, 3046]))


In [94]:
# Random forest baseline: 1000 bootstrapped trees, depth capped at 15, at least
# 50 samples required both to split a node and to form a leaf, sqrt-sized
# feature subsets per split, all CPU cores, fixed seed.
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=15,
    max_features='sqrt',
    min_samples_split=50,
    min_samples_leaf=50,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)

# Hard class predictions for the three splits (same names as before so later
# cells can keep using them).
rf_pred_train, rf_pred_val, rf_pred_test = (
    rf.predict(features) for features in (X_train, X_val, X_test)
)

# Accuracy plus the per-class precision/recall/F1 table for each split.
for accuracy_header, split_y, split_pred in (
    ("Training Accuracy:", y_train, rf_pred_train),
    ("Validation Accuracy:", y_val, rf_pred_val),
    ("Testing Accuracy:", y_test, rf_pred_test),
):
    print(accuracy_header, accuracy_score(split_y, split_pred))
    print(classification_report(split_y, split_pred, digits=4))

Training Accuracy: 0.8260416666666667
              precision    recall  f1-score   support

           0     0.7817    0.8109    0.7960      2009
           1     0.8601    0.8370    0.8484      2791

    accuracy                         0.8260      4800
   macro avg     0.8209    0.8239    0.8222      4800
weighted avg     0.8273    0.8260    0.8265      4800

Validation Accuracy: 0.8083333333333333
              precision    recall  f1-score   support

           0     0.7636    0.7849    0.7741       502
           1     0.8421    0.8252    0.8336       698

    accuracy                         0.8083      1200
   macro avg     0.8028    0.8050    0.8038      1200
weighted avg     0.8092    0.8083    0.8087      1200

Testing Accuracy: 0.8225991949396205
              precision    recall  f1-score   support

           0     0.8202    0.8765    0.8474      3910
           1     0.8261    0.7534    0.7881      3046

    accuracy                         0.8226      6956
   macro avg 

In [95]:
# Gradient-boosted trees for the binary label; log-loss is both the training
# objective's natural metric and the metric reported on the evaluation set.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    n_estimators=330,       # number of boosting rounds
    max_depth=2,            # maximum depth of each tree
    learning_rate=0.118,    # shrinkage applied to every round
    subsample=0.8,          # fraction of rows sampled per tree
    colsample_bytree=0.9,   # fraction of features sampled per tree
    gamma=0.1,              # minimum loss reduction required to split
    reg_alpha=0.05,         # L1 regularization strength
    reg_lambda=1.0,         # L2 regularization strength
    n_jobs=-1,
    random_state=42,
)

# The validation split is only monitored here (logged every 50 rounds); no
# early-stopping argument is passed, so all 330 rounds are trained.
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=50,
)

# Hard class predictions for the three splits (names kept for later cells).
xgb_pred_train = xgb.predict(X_train)
xgb_pred_val = xgb.predict(X_val)
xgb_pred_test = xgb.predict(X_test)

# Accuracy plus the per-class precision/recall/F1 table for each split.
for accuracy_header, split_y, split_pred in (
    ("Training Accuracy:", y_train, xgb_pred_train),
    ("Validation Accuracy:", y_val, xgb_pred_val),
    ("Testing Accuracy:", y_test, xgb_pred_test),
):
    print(accuracy_header, accuracy_score(split_y, split_pred))
    print(classification_report(split_y, split_pred, digits=4))

[0]	validation_0-logloss:0.64281
[50]	validation_0-logloss:0.42712
[100]	validation_0-logloss:0.41077
[150]	validation_0-logloss:0.40601
[200]	validation_0-logloss:0.40404
[250]	validation_0-logloss:0.40151
[300]	validation_0-logloss:0.40081
[329]	validation_0-logloss:0.40156
Training Accuracy: 0.8541666666666666
              precision    recall  f1-score   support

           0     0.8386    0.8069    0.8224      2009
           1     0.8647    0.8882    0.8763      2791

    accuracy                         0.8542      4800
   macro avg     0.8516    0.8475    0.8494      4800
weighted avg     0.8538    0.8542    0.8537      4800

Validation Accuracy: 0.8166666666666667
              precision    recall  f1-score   support

           0     0.8000    0.7490    0.7737       502
           1     0.8274    0.8653    0.8459       698

    accuracy                         0.8167      1200
   macro avg     0.8137    0.8072    0.8098      1200
weighted avg     0.8159    0.8167    0.8157   

In [96]:
# Re-score the test split with a custom decision threshold instead of the
# classifier's default cut-off: a row is predicted as class 1 only when its
# class-1 probability reaches `threshold`.
# NOTE(review): this cell only evaluates 0.58 on the test split; confirm the
# value was selected on the validation split rather than tuned against test.
threshold = 0.58
test_probabilities = xgb.predict_proba(X_test)
y_proba = test_probabilities[:, 1]   # column 1 = probability of class 1
meets_threshold = y_proba >= threshold
y_pred = meets_threshold.astype(int)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7952    0.8847    0.8375      3910
           1     0.8269    0.7075    0.7626      3046

    accuracy                         0.8071      6956
   macro avg     0.8111    0.7961    0.8000      6956
weighted avg     0.8091    0.8071    0.8047      6956



In [97]:
# CatBoost with depth-1 trees: many iterations at a small learning rate,
# accuracy as the reported metric, progress logged every 50 iterations.
cat = CatBoostClassifier(
    depth=1,
    iterations=10000,
    learning_rate=0.01,
    eval_metric='Accuracy',
    verbose=50,
    random_seed=42,
)

# The validation split is passed as eval_set; it is what the training log
# reports in its "test:" column.
cat.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
)

# Hard class predictions for the three splits (names kept for later cells).
cat_pred_train, cat_pred_val, cat_pred_test = (
    cat.predict(features) for features in (X_train, X_val, X_test)
)

# Accuracy plus the per-class precision/recall/F1 table for each split.
for accuracy_header, split_y, split_pred in (
    ("Training Accuracy:", y_train, cat_pred_train),
    ("Validation Accuracy:", y_val, cat_pred_val),
    ("Testing Accuracy:", y_test, cat_pred_test),
):
    print(accuracy_header, accuracy_score(split_y, split_pred))
    print(classification_report(split_y, split_pred, digits=4))

0:	learn: 0.7714583	test: 0.7791667	best: 0.7791667 (0)	total: 1.3ms	remaining: 13s
50:	learn: 0.7704167	test: 0.7800000	best: 0.7808333 (25)	total: 83.2ms	remaining: 16.2s
100:	learn: 0.7706250	test: 0.7800000	best: 0.7808333 (25)	total: 161ms	remaining: 15.8s
150:	learn: 0.7725000	test: 0.7800000	best: 0.7808333 (25)	total: 237ms	remaining: 15.5s
200:	learn: 0.7737500	test: 0.7808333	best: 0.7808333 (25)	total: 311ms	remaining: 15.2s
250:	learn: 0.7739583	test: 0.7808333	best: 0.7808333 (25)	total: 385ms	remaining: 14.9s
300:	learn: 0.7752083	test: 0.7800000	best: 0.7808333 (25)	total: 463ms	remaining: 14.9s
350:	learn: 0.7756250	test: 0.7808333	best: 0.7808333 (25)	total: 540ms	remaining: 14.8s
400:	learn: 0.7758333	test: 0.7816667	best: 0.7816667 (352)	total: 622ms	remaining: 14.9s
450:	learn: 0.7775000	test: 0.7808333	best: 0.7816667 (352)	total: 705ms	remaining: 14.9s
500:	learn: 0.7775000	test: 0.7816667	best: 0.7816667 (352)	total: 782ms	remaining: 14.8s
550:	learn: 0.7781250	t

In [99]:
def _features_as_ndarray(features):
    """Return `features` as a NumPy array, leaving non-DataFrame input as is.

    The feature variables are rebound in place below, so after the first run
    they are already ndarrays. Calling `.to_numpy()` unconditionally made a
    second execution of this cell fail with AttributeError; the type check
    makes the cell safe to re-run.
    """
    if isinstance(features, pd.DataFrame):
        return features.to_numpy()
    return features


# The TabNet cell that follows is fed plain arrays, so convert all three splits.
X_train = _features_as_ndarray(X_train)
X_val = _features_as_ndarray(X_val)
X_test = _features_as_ndarray(X_test)

In [100]:
# TabNet classifier: 128-wide decision/attention layers, 20 decision steps,
# entmax masks, AdamW optimizer with a cosine-annealed learning rate.
clf = TabNetClassifier(
    n_d=128,
    n_a=128,
    n_steps=20,
    gamma=1.5,
    lambda_sparse=1e-4,
    mask_type="entmax",
    optimizer_fn=torch.optim.AdamW,
    optimizer_params={"lr": 0.009, "weight_decay": 1e-4},
    scheduler_fn=torch.optim.lr_scheduler.CosineAnnealingLR,
    scheduler_params=dict(T_max=50, eta_min=1e-5),
)

# Both the training and the validation split are scored after every epoch.
# NOTE(review): the second eval set is (X_val, y_val) but is logged under the
# name 'test', so "test_accuracy"/"test_auc" in the log are validation scores.
# NOTE(review): patience=50 is larger than max_epochs=20, so patience-based
# early stopping presumably cannot trigger within this run — confirm intent.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'test'],
    eval_metric=['accuracy', 'auc'],
    max_epochs=20,
    patience=50,
    batch_size=128,
    virtual_batch_size=64,
    drop_last=False,
)

# Hard class predictions for the three splits (names kept for later cells).
tab_pred_train = clf.predict(X_train)
tab_pred_val = clf.predict(X_val)
tab_pred_test = clf.predict(X_test)

# Accuracy plus the per-class precision/recall/F1 table for each split.
for accuracy_header, split_y, split_pred in (
    ("Training Accuracy:", y_train, tab_pred_train),
    ("Validation Accuracy:", y_val, tab_pred_val),
    ("Testing Accuracy:", y_test, tab_pred_test),
):
    print(accuracy_header, accuracy_score(split_y, split_pred))
    print(classification_report(split_y, split_pred, digits=4))



epoch 0  | loss: 3.33162 | train_accuracy: 0.57042 | train_auc: 0.52204 | test_accuracy: 0.58833 | test_auc: 0.54071 |  0:00:21s
epoch 1  | loss: 2.7854  | train_accuracy: 0.51021 | train_auc: 0.46045 | test_accuracy: 0.505   | test_auc: 0.45237 |  0:00:51s
epoch 2  | loss: 2.10385 | train_accuracy: 0.57542 | train_auc: 0.49964 | test_accuracy: 0.575   | test_auc: 0.49809 |  0:01:17s
epoch 3  | loss: 1.39215 | train_accuracy: 0.43854 | train_auc: 0.45936 | test_accuracy: 0.47333 | test_auc: 0.48599 |  0:01:44s
epoch 4  | loss: 1.62114 | train_accuracy: 0.57729 | train_auc: 0.49405 | test_accuracy: 0.57667 | test_auc: 0.49269 |  0:02:08s
epoch 5  | loss: 1.19028 | train_accuracy: 0.58208 | train_auc: 0.48855 | test_accuracy: 0.58083 | test_auc: 0.49047 |  0:02:34s
epoch 6  | loss: 3.4203  | train_accuracy: 0.61    | train_auc: 0.61405 | test_accuracy: 0.62083 | test_auc: 0.62441 |  0:03:07s
epoch 7  | loss: 0.79329 | train_accuracy: 0.60396 | train_auc: 0.59426 | test_accuracy: 0.60167 



Training Accuracy: 0.7675
              precision    recall  f1-score   support

           0     0.7211    0.7247    0.7229      2009
           1     0.8012    0.7983    0.7997      2791

    accuracy                         0.7675      4800
   macro avg     0.7611    0.7615    0.7613      4800
weighted avg     0.7677    0.7675    0.7676      4800

Validation Accuracy: 0.7808333333333334
              precision    recall  f1-score   support

           0     0.7414    0.7311    0.7362       502
           1     0.8085    0.8166    0.8125       698

    accuracy                         0.7808      1200
   macro avg     0.7750    0.7738    0.7744      1200
weighted avg     0.7804    0.7808    0.7806      1200

Testing Accuracy: 0.768688901667625
              precision    recall  f1-score   support

           0     0.8094    0.7698    0.7891      3910
           1     0.7220    0.7672    0.7439      3046

    accuracy                         0.7687      6956
   macro avg     0.7657   