In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, classification_report
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestRegressor

import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [2]:
def valid(submission, priority):
    coefficients_Dict = {"P1": 1, "P2": 1, "P3": 1, "ALL": 1}
    coefficients = coefficients_Dict[priority]
    priorityDays_dict = {"P1": 20, "P2": 60, "P3": 90}
    daysAllowedList = priorityDays_dict[priority]
    
    submission["prediction"] = pd.Series(submission["ClosedDay"] >= coefficients * daysAllowedList,
                                         index=submission.index)
    
    submission["truth"] = pd.Series(submission["Actual"] >= daysAllowedList, index=submission.index)
    # submission["issueType"] = test_df["issueType"]
    return submission

def evaluate_XGB(trainX, trainy, testX, testy, priority):

    pipeline = Pipeline([
        ('regressor', XGBRegressor(max_depth=5, n_estimators=100, learning_rate=0.05)),
    ])


    pipeline.fit(trainX, trainy)

    predictions = np.rint(pipeline.predict(testX)).astype(np.int64)


    submission = pd.DataFrame({'ClosedDay': predictions, 'Actual': testy['ResolvedDay'].tolist()})

    rmse = np.sqrt(mean_squared_error(predictions, testy))

    mae = mean_absolute_error(predictions, testy)

    valid(submission, priority)

    f1 = f1_score(submission["truth"], submission["prediction"], average='weighted')
    precision = precision_score(submission["truth"], submission["prediction"], average='weighted')
    recall = recall_score(submission["truth"], submission["prediction"], average='weighted')

    print(confusion_matrix(submission["truth"], submission["prediction"]))
    print(classification_report(submission["truth"], submission["prediction"]))
    return (f1, precision, recall)

def evaluate_RF(trainX, trainy, testX, testy, priority):

    pipeline = Pipeline([
        ('regressor', RandomForestRegressor(n_estimators = 100, random_state = 0)),
    ])


    pipeline.fit(trainX, trainy)

    predictions = np.rint(pipeline.predict(testX)).astype(np.int64)


    submission = pd.DataFrame({'ClosedDay': predictions, 'Actual': testy['ResolvedDay'].tolist()})

    rmse = np.sqrt(mean_squared_error(predictions, testy))

    mae = mean_absolute_error(predictions, testy)

    valid(submission, priority)

    f1 = f1_score(submission["truth"], submission["prediction"], average='weighted')
    precision = precision_score(submission["truth"], submission["prediction"], average='weighted')
    recall = recall_score(submission["truth"], submission["prediction"], average='weighted')
    print(f1, precision, recall)


    print('rmse:', rmse)
    print('mae:', mae)
    print(confusion_matrix(submission["truth"], submission["prediction"]))
    print(classification_report(submission["truth"], submission["prediction"]))
    return (f1, precision, recall)

# P1 Data 

In [3]:
print('########25% lifetime prediction point ########')
p1_25_train_url = "./data/p1_25_train.csv"
p1_25_test_url = "./data/p1_25_test.csv"
p1_train = pd.read_csv(p1_25_train_url)
p1_test = pd.read_csv(p1_25_test_url)

contextual_features = ['NCommentT', 'NActor', 'meanCommentSize', 
                      'nticketsByCreatorOpen', 'NCommentByActorsT', 'NticketsCreatedInProject',
                       'NticketsCreatedInProjectT', 'NticketsCreatedInProjectClosed',
                       'NticketsCreatedInProjectClosedT', 'NActivityInProject',
                       'NActivityInProjectT']

p1_trainX = p1_train[contextual_features]
p1_testX = p1_test[contextual_features]

y_train = p1_train[["ResolvedDay"]]
y_test = p1_test[["ResolvedDay"]]

print("Train:", len(p1_trainX), "Test:", len(p1_testX))

print()
print('-------------RF Result ------------')
p1_f1, p1_precision, p1_recall = evaluate_RF(p1_trainX, y_train, p1_testX, y_test, "P1")
print('-------------XGBoost Result ------------')
p1_f1, p1_precision, p1_recall = evaluate_XGB(p1_trainX, y_train, p1_testX, y_test, "P1")

########25% lifetime prediction point ########
Train: 2106 Test: 523

-------------RF Result ------------


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.6243644478704369 0.6559726324766718 0.627151051625239
rmse: 8.79107404084319
mae: 6.866156787762907
[[152 140]
 [ 55 176]]
              precision    recall  f1-score   support

       False       0.73      0.52      0.61       292
        True       0.56      0.76      0.64       231

    accuracy                           0.63       523
   macro avg       0.65      0.64      0.63       523
weighted avg       0.66      0.63      0.62       523

-------------XGBoost Result ------------
[[149 143]
 [ 59 172]]
              precision    recall  f1-score   support

       False       0.72      0.51      0.60       292
        True       0.55      0.74      0.63       231

    accuracy                           0.61       523
   macro avg       0.63      0.63      0.61       523
weighted avg       0.64      0.61      0.61       523



In [4]:
print('########50% lifetime prediction point ########')
p1_50_train_url = "./data/p1_50_train.csv"
p1_50_test_url = "./data/p1_50_test.csv"
p1_train = pd.read_csv(p1_25_train_url)
p1_test = pd.read_csv(p1_25_test_url)


p1_trainX = p1_train[contextual_features]
p1_testX = p1_test[contextual_features]

y_train = p1_train[["ResolvedDay"]]
y_test = p1_test[["ResolvedDay"]]

print("Train:", len(p1_trainX), "Test:", len(p1_testX))

print()
print('-------------RF Result ------------')
p1_f1, p1_precision, p1_recall = evaluate_RF(p1_trainX, y_train, p1_testX, y_test, "P1")
print('-------------XGBoost Result ------------')
p1_f1, p1_precision, p1_recall = evaluate_XGB(p1_trainX, y_train, p1_testX, y_test, "P1")

########50% lifetime prediction point ########
Train: 2094 Test: 535

-------------RF Result ------------


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.5863827862081331 0.619407754315436 0.5906542056074766
rmse: 9.45594803660786
mae: 7.497196261682243
[[142 156]
 [ 63 174]]
              precision    recall  f1-score   support

       False       0.69      0.48      0.56       298
        True       0.53      0.73      0.61       237

    accuracy                           0.59       535
   macro avg       0.61      0.61      0.59       535
weighted avg       0.62      0.59      0.59       535

-------------XGBoost Result ------------
[[136 162]
 [ 61 176]]
              precision    recall  f1-score   support

       False       0.69      0.46      0.55       298
        True       0.52      0.74      0.61       237

    accuracy                           0.58       535
   macro avg       0.61      0.60      0.58       535
weighted avg       0.62      0.58      0.58       535



In [5]:
print('########75% lifetime prediction point ########')
p2_75_train_url = "./data/p2_75_train.csv"
p2_75_test_url = "./data/p2_75_test.csv"
p2_train = pd.read_csv(p2_75_train_url)
p2_test = pd.read_csv(p2_75_test_url)


p2_trainX = p2_train[contextual_features]
p2_testX = p2_test[contextual_features]

y_train = p2_train[["ResolvedDay"]]
y_test = p2_test[["ResolvedDay"]]

print("Train:", len(p2_trainX), "Test:", len(p2_testX))

print()
print('-------------RF Result ------------')
p2_f1, p2_precision, p2_recall = evaluate_RF(p2_trainX, y_train, p2_testX, y_test, "P2")
print('-------------XGBoost Result ------------')
p2_f1, p2_precision, p2_recall = evaluate_XGB(p2_trainX, y_train, p2_testX, y_test, "P2")


########75% lifetime prediction point ########
Train: 2143 Test: 486

-------------RF Result ------------


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.5780703686758284 0.6297371565113501 0.5802469135802469
rmse: 8.963575124488314
mae: 6.938271604938271
[[135 153]
 [ 51 147]]
              precision    recall  f1-score   support

       False       0.73      0.47      0.57       288
        True       0.49      0.74      0.59       198

    accuracy                           0.58       486
   macro avg       0.61      0.61      0.58       486
weighted avg       0.63      0.58      0.58       486

-------------XGBoost Result ------------
[[145 143]
 [ 60 138]]
              precision    recall  f1-score   support

       False       0.71      0.50      0.59       288
        True       0.49      0.70      0.58       198

    accuracy                           0.58       486
   macro avg       0.60      0.60      0.58       486
weighted avg       0.62      0.58      0.58       486



# P2 bug reports

In [6]:
print('########25% lifetime prediction point ########')
p2_25_train_url = "./data/p2_25_train.csv"
p2_25_test_url = "./data/p2_25_test.csv"
p2_train = pd.read_csv(p2_75_train_url)
p2_test = pd.read_csv(p2_75_test_url)


p2_trainX = p2_train[contextual_features]
p2_testX = p2_test[contextual_features]

y_train = p2_train[["ResolvedDay"]]
y_test = p2_test[["ResolvedDay"]]

print("Train:", len(p2_trainX), "Test:", len(p2_testX))

print()
print('-------------RF Result ------------')
p2_f1, p2_precision, p2_recall = evaluate_RF(p2_trainX, y_train, p2_testX, y_test, "P2")
print('-------------XGBoost Result ------------')
p2_f1, p2_precision, p2_recall = evaluate_XGB(p2_trainX, y_train, p2_testX, y_test, "P2")

########25% lifetime prediction point ########
Train: 3181 Test: 799

-------------RF Result ------------


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.7279642835066158 0.7254104805808194 0.7309136420525657
rmse: 19.62824074626893
mae: 13.97872340425532
[[493 101]
 [114  91]]
              precision    recall  f1-score   support

       False       0.81      0.83      0.82       594
        True       0.47      0.44      0.46       205

    accuracy                           0.73       799
   macro avg       0.64      0.64      0.64       799
weighted avg       0.73      0.73      0.73       799

-------------XGBoost Result ------------
[[508  86]
 [129  76]]
              precision    recall  f1-score   support

       False       0.80      0.86      0.83       594
        True       0.47      0.37      0.41       205

    accuracy                           0.73       799
   macro avg       0.63      0.61      0.62       799
weighted avg       0.71      0.73      0.72       799



In [7]:
print('########50% lifetime prediction point ########')
p2_50_train_url = "./data/p2_50_train.csv"
p2_50_test_url = "./data/p2_50_test.csv"
p2_train = pd.read_csv(p2_75_train_url)
p2_test = pd.read_csv(p2_75_test_url)


p2_trainX = p2_train[contextual_features]
p2_testX = p2_test[contextual_features]

y_train = p2_train[["ResolvedDay"]]
y_test = p2_test[["ResolvedDay"]]

print("Train:", len(p2_trainX), "Test:", len(p2_testX))

print()
print('-------------RF Result ------------')
p2_f1, p2_precision, p2_recall = evaluate_RF(p2_trainX, y_train, p2_testX, y_test, "P2")
print('-------------XGBoost Result ------------')
p2_f1, p2_precision, p2_recall = evaluate_XGB(p2_trainX, y_train, p2_testX, y_test, "P2")

########50% lifetime prediction point ########
Train: 3222 Test: 758

-------------RF Result ------------


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.7269374734764401 0.7343249144943469 0.7216358839050132
rmse: 18.163142243228656
mae: 13.216358839050132
[[430 120]
 [ 91 117]]
              precision    recall  f1-score   support

       False       0.83      0.78      0.80       550
        True       0.49      0.56      0.53       208

    accuracy                           0.72       758
   macro avg       0.66      0.67      0.66       758
weighted avg       0.73      0.72      0.73       758

-------------XGBoost Result ------------
[[458  92]
 [105 103]]
              precision    recall  f1-score   support

       False       0.81      0.83      0.82       550
        True       0.53      0.50      0.51       208

    accuracy                           0.74       758
   macro avg       0.67      0.66      0.67       758
weighted avg       0.74      0.74      0.74       758



In [8]:
print('########75% lifetime prediction point ########')
p2_75_train_url = "./data/p2_75_train.csv"
p2_75_test_url = "./data/p2_75_test.csv"
p2_train = pd.read_csv(p2_75_train_url)
p2_test = pd.read_csv(p2_75_test_url)


p2_trainX = p2_train[contextual_features]
p2_testX = p2_test[contextual_features]

y_train = p2_train[["ResolvedDay"]]
y_test = p2_test[["ResolvedDay"]]

print("Train:", len(p2_trainX), "Test:", len(p2_testX))

print()
print('-------------RF Result ------------')
p2_f1, p2_precision, p2_recall = evaluate_RF(p2_trainX, y_train, p2_testX, y_test, "P2")
print('-------------XGBoost Result ------------')
p2_f1, p2_precision, p2_recall = evaluate_XGB(p2_trainX, y_train, p2_testX, y_test, "P2")

########75% lifetime prediction point ########
Train: 3161 Test: 819

-------------RF Result ------------


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.7164604942225136 0.7133235556082955 0.7203907203907204
rmse: 20.62775493126369
mae: 14.327228327228328
[[488 105]
 [124 102]]
              precision    recall  f1-score   support

       False       0.80      0.82      0.81       593
        True       0.49      0.45      0.47       226

    accuracy                           0.72       819
   macro avg       0.65      0.64      0.64       819
weighted avg       0.71      0.72      0.72       819

-------------XGBoost Result ------------
[[500  93]
 [129  97]]
              precision    recall  f1-score   support

       False       0.79      0.84      0.82       593
        True       0.51      0.43      0.47       226

    accuracy                           0.73       819
   macro avg       0.65      0.64      0.64       819
weighted avg       0.72      0.73      0.72       819



# P3 bug reports

In [9]:
print('########25% lifetime prediction point ########')
p3_25_train_url = "./data/p3_25_train.csv"
p3_25_test_url = "./data/p3_25_test.csv"

p3_train = pd.read_csv(p3_25_train_url)
p3_test = pd.read_csv(p3_25_test_url)

p3_trainX = p3_train[contextual_features]
p3_testX = p3_test[contextual_features]

p3_y_train = p3_train[["ResolvedDay"]]
p3_y_test = p3_test[["ResolvedDay"]]

print("Train:", len(p3_trainX), "Test:", len(p3_testX))

print()
print('-------------RF Result ------------')
p3_f1, p3_precision, p3_recall = evaluate_RF(p3_trainX, p3_y_train, p3_testX, p3_y_test, "P3")
print('-------------XGBoost Result ------------')
p3_f1, p3_precision, p3_recall = evaluate_XGB(p3_trainX, p3_y_train, p3_testX, p3_y_test, "P3")

########25% lifetime prediction point ########
Train: 1330 Test: 342

-------------RF Result ------------


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.6173609378897479 0.613133971291866 0.6228070175438597
rmse: 25.885939645781242
mae: 18.953216374269005
[[172  59]
 [ 70  41]]
              precision    recall  f1-score   support

       False       0.71      0.74      0.73       231
        True       0.41      0.37      0.39       111

    accuracy                           0.62       342
   macro avg       0.56      0.56      0.56       342
weighted avg       0.61      0.62      0.62       342

-------------XGBoost Result ------------
[[193  38]
 [ 71  40]]
              precision    recall  f1-score   support

       False       0.73      0.84      0.78       231
        True       0.51      0.36      0.42       111

    accuracy                           0.68       342
   macro avg       0.62      0.60      0.60       342
weighted avg       0.66      0.68      0.66       342



In [10]:
print('########50% lifetime prediction point ########')
p3_50_train_url = "./data/p3_50_train.csv"
p3_50_test_url = "./data/p3_50_test.csv"

p3_train = pd.read_csv(p3_50_train_url)
p3_test = pd.read_csv(p3_50_test_url)

p3_trainX = p3_train[contextual_features]
p3_testX = p3_test[contextual_features]

p3_y_train = p3_train[["ResolvedDay"]]
p3_y_test = p3_test[["ResolvedDay"]]

print("Train:", len(p3_trainX), "Test:", len(p3_testX))

print()
print('-------------RF Result ------------')
p3_f1, p3_precision, p3_recall = evaluate_RF(p3_trainX, p3_y_train, p3_testX, p3_y_test, "P3")
print('-------------XGBoost Result ------------')
p3_f1, p3_precision, p3_recall = evaluate_XGB(p3_trainX, p3_y_train, p3_testX, p3_y_test, "P3")

########50% lifetime prediction point ########
Train: 1328 Test: 344

-------------RF Result ------------


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.6814075566135629 0.6763724388346157 0.6976744186046512
rmse: 29.077732629689695
mae: 20.40988372093023
[[202  37]
 [ 67  38]]
              precision    recall  f1-score   support

       False       0.75      0.85      0.80       239
        True       0.51      0.36      0.42       105

    accuracy                           0.70       344
   macro avg       0.63      0.60      0.61       344
weighted avg       0.68      0.70      0.68       344

-------------XGBoost Result ------------
[[199  40]
 [ 76  29]]
              precision    recall  f1-score   support

       False       0.72      0.83      0.77       239
        True       0.42      0.28      0.33       105

    accuracy                           0.66       344
   macro avg       0.57      0.55      0.55       344
weighted avg       0.63      0.66      0.64       344



In [11]:
print('########75% lifetime prediction point ########')
p3_75_train_url = "./data/p3_75_train.csv"
p3_75_test_url = "./data/p3_75_test.csv"

p3_train = pd.read_csv(p3_75_train_url)
p3_test = pd.read_csv(p3_75_test_url)

p3_trainX = p3_train[contextual_features]
p3_testX = p3_test[contextual_features]

p3_y_train = p3_train[["ResolvedDay"]]
p3_y_test = p3_test[["ResolvedDay"]]

print("Train:", len(p3_trainX), "Test:", len(p3_testX))

print()
print('-------------RF Result ------------')
p3_f1, p3_precision, p3_recall = evaluate_RF(p3_trainX, p3_y_train, p3_testX, p3_y_test, "P3")
print('-------------XGBoost Result ------------')
p3_f1, p3_precision, p3_recall = evaluate_XGB(p3_trainX, p3_y_train, p3_testX, p3_y_test, "P3")

########75% lifetime prediction point ########
Train: 1363 Test: 309

-------------RF Result ------------


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.6941478088490307 0.6889674439189003 0.7022653721682848
rmse: 26.67119702617529
mae: 18.93851132686084
[[181  39]
 [ 53  36]]
              precision    recall  f1-score   support

       False       0.77      0.82      0.80       220
        True       0.48      0.40      0.44        89

    accuracy                           0.70       309
   macro avg       0.63      0.61      0.62       309
weighted avg       0.69      0.70      0.69       309

-------------XGBoost Result ------------
[[181  39]
 [ 61  28]]
              precision    recall  f1-score   support

       False       0.75      0.82      0.78       220
        True       0.42      0.31      0.36        89

    accuracy                           0.68       309
   macro avg       0.58      0.57      0.57       309
weighted avg       0.65      0.68      0.66       309

