In [62]:
import json
import pandas as pd
import numpy as np
import networkx as nx
import jellyfish
import os
import shutil
import subprocess
import requests
from github import Github
from git import Repo
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from zipfile import ZipFile
from filecmp import dircmp
import configparser
import h2o

## Result Gathering

In [63]:
# Load every per-project result CSV and stack them into a single frame.
directory = "Effort_Estimation_Results_v3/"
project_frames = []
# sorted() makes the row order independent of the filesystem's listing order.
for filename in sorted(os.listdir(directory)):
    # Ignore anything that is not a result CSV (checkpoint folders, hidden files, ...).
    if not filename.endswith('.csv'):
        continue
    print(filename)
    project_df = pd.read_csv(os.path.join(directory, filename))
    # Tag each row with the project it came from (file name without extension).
    project_df['project_name'] = os.path.splitext(filename)[0]
    project_frames.append(project_df)

# Fail here with a clear message instead of leaving main_df unusable for later cells.
if not project_frames:
    raise FileNotFoundError("No .csv result files found in " + directory)

# Concatenate once, rather than re-copying the accumulated frame for every file.
# Row labels are kept as read from each file, exactly as before.
main_df = pd.concat(project_frames)

Alluxio.csv
Assertj-core.csv
Atmosphere.csv
AxonFramework.csv
Beam.csv
Byte-buddy.csv
Camel.csv
Cxf.csv
Dbeaver.csv
Hadoop.csv
Okhttp.csv
Redisson.csv


In [64]:
# Preview the first rows of the combined frame.
# NOTE(review): the recorded preview shows commit author names and e-mail
# addresses — consider clearing this output before sharing the notebook.
main_df.head()

Unnamed: 0,sha,name,email,date,login,message,parent_sha,parent_date,time_taken,contains_refactoring,...,num_dependency_max,num_dependency_std,num_line_affected_mean,num_line_affected_min,num_line_affected_max,num_line_affected_std,actual_num_of_classes_touched_mean,actual_num_of_classes_touched_min,actual_num_of_classes_touched_max,actual_num_of_classes_touched_std
0,6ad1e4fe77445e8689f6d3975b26e52165c9c3e6,Bin Fan,fanbin103@gmail.com,2021-08-04 02:49:16,apc999,Fix IndexOutOfBoundsException on async cache\n\n### What changes a...,df5dcab8bc308dfd2bf650a895865b13120a9866,2021-08-03 16:07:28,10.0,1,...,322.0,,764.0,764,764,,9.0,9,9,
1,8647c6162423b851dda8d10edf4686473d2e95cc,Zac Blanco,zac@alluxio.com,2021-07-15 21:54:27,ZacBlanco,Update and improve conformance of S3 API\n\n### What changes are p...,79a5e5c78b7dfcdbf8edbd928a2ff59c904d08f8,2021-07-15 18:38:50,3.0,1,...,15.0,,275.0,275,275,,6.0,6,6,
2,3ce52983e6f50bfb7880b5a2cb13a18e4272170b,Zhan Yuan,yuanzhanhku@gmail.com,2021-07-12 06:39:42,yuanzhanhku,Add CacheContext to URIStatus to enable per-read metrics\n\nAdd fu...,cf79c7837c57c83c1b99a1dab53fec25deb2069d,2021-07-10 20:58:13,33.0,1,...,371.0,0.0,115.0,115,115,0.0,4.5,3,6,2.12132
3,a4dc54f7dc0333da096aa779dbaa79060c90d1ad,kqhzz,kuangqinghuazz@163.com,2021-06-09 18:09:26,kuszz,Deprecate leader command\n\nFix #13512\n\npr-link: Alluxio/alluxio...,1356c0b35fd753d1081665b2c8f6b25da62bf2e8,2021-06-09 02:31:45,15.0,1,...,85.0,,61.0,61,61,,2.0,2,2,
4,7fb84094a6075bcef5a103b40adcac4b26b724ca,Jiacheng Liu,jiacheliu3@gmail.com,2021-05-29 00:37:05,jiacheliu3,Refactor MasterWorkerInfo\n\nCurrently all the worker metadata are...,220237085593d731756e24249bcf88a4d6ea5710,2021-05-28 23:28:46,1.0,1,...,177.0,93.543216,458.666667,24,1245,682.248,35.0,1,54,29.512709


In [65]:
# Total number of rows across all loaded project files.
len(main_df)

366

In [66]:
def get_final_time_taken(row):
    """Return the effort value for one row, adjusted by the row's ``mean``.

    If ``row['time_taken']`` is strictly greater than ``row['mean']`` the
    difference between the two is returned. In every other case (equal,
    smaller, or a comparison that is false because ``mean`` is NaN) the
    original ``row['time_taken']`` is returned unchanged.
    """
    elapsed = row['time_taken']
    baseline = row['mean']
    # A previous variant of the fallback subtracted row['min'] (returning 1
    # when that difference was zero); it is disabled and the raw value is used.
    return elapsed - baseline if elapsed > baseline else elapsed

In [67]:
# Derive the modelling target row by row; apply() hands each row straight to
# the helper, so no lambda wrapper is needed.
main_df['time_taken_final'] = main_df.apply(get_final_time_taken, axis=1)



In [68]:
#main_df = main_df[main_df['time_taken_final'] > 1]

In [69]:
# Frequency of each distinct adjusted effort value, most common first.
main_df['time_taken_final'].value_counts()

1.000000     68
2.000000     32
3.000000     31
4.000000     18
8.000000     10
             ..
24.870690     1
21.169082     1
0.186047      1
8.566667      1
24.196653     1
Name: time_taken_final, Length: 143, dtype: int64

In [70]:
# Replace every remaining NaN with 0 (the preview above shows NaN in the
# *_std columns). This runs after time_taken_final has been computed.
# NOTE(review): fillna(0) is applied to the whole frame, so missing values in
# text columns would also become 0 — confirm that is acceptable.
main_df = main_df.fillna(0)

## EDA (optional — run only if absolutely necessary, and only on a minimal set of features)

In [71]:
#from pandas_profiling import ProfileReport

#profile = ProfileReport(main_df, title="Pandas Profiling Report",explorative=True)
#profile

In [72]:
# Columns excluded from the modelling frame: commit identifiers and metadata,
# the raw/intermediate effort fields, and the
# actual_num_of_classes_touched_* aggregates.
column_to_skip = [
    'sha', 'name', 'email', 'login',
    'date', 'message',
    'parent_sha', 'parent_date',
    'time_taken', 'contains_refactoring',
    'kmean_label', 'mean', 'min', 'max',
    'project_name', 'commit_compared_with',
    'actual_num_of_classes_touched_mean',
    'actual_num_of_classes_touched_min',
    'actual_num_of_classes_touched_max',
    'actual_num_of_classes_touched_std',
]
# Keep every other column, in the frame's own column order.
final_col = [column for column in main_df.columns if column not in column_to_skip]

In [73]:
# Inspect the columns that survived the exclusion list.
final_col

['cbo_mean',
 'cbo_min',
 'cbo_max',
 'cbo_std',
 'wmc_mean',
 'wmc_min',
 'wmc_max',
 'wmc_std',
 'dit_mean',
 'dit_min',
 'dit_max',
 'dit_std',
 'rfc_mean',
 'rfc_min',
 'rfc_max',
 'rfc_std',
 'lcom_mean',
 'lcom_min',
 'lcom_max',
 'lcom_std',
 'totalMethods_mean',
 'totalMethods_min',
 'totalMethods_max',
 'totalMethods_std',
 'staticMethods_mean',
 'staticMethods_min',
 'staticMethods_max',
 'staticMethods_std',
 'publicMethods_mean',
 'publicMethods_min',
 'publicMethods_max',
 'publicMethods_std',
 'privateMethods_mean',
 'privateMethods_min',
 'privateMethods_max',
 'privateMethods_std',
 'protectedMethods_mean',
 'protectedMethods_min',
 'protectedMethods_max',
 'protectedMethods_std',
 'defaultMethods_mean',
 'defaultMethods_min',
 'defaultMethods_max',
 'defaultMethods_std',
 'abstractMethods_mean',
 'abstractMethods_min',
 'abstractMethods_max',
 'abstractMethods_std',
 'finalMethods_mean',
 'finalMethods_min',
 'finalMethods_max',
 'finalMethods_std',
 'synchronizedMetho

In [74]:
# Target column name (kept as a one-element list so y stays a DataFrame)
# and the feature column names (everything in final_col except the target).
Y_col = ['time_taken_final']
X_col = list(final_col)
X_col.remove(Y_col[0])  # raises ValueError if the target column is missing

# final_df holds features and target together; X / y hold them separately.
final_df = main_df[final_col]
X = main_df[X_col]
y = main_df[Y_col]

In [75]:
# Preview the feature matrix.
X.head()

Unnamed: 0,cbo_mean,cbo_min,cbo_max,cbo_std,wmc_mean,wmc_min,wmc_max,wmc_std,dit_mean,dit_min,...,modifiers_max,modifiers_std,num_dependency_mean,num_dependency_min,num_dependency_max,num_dependency_std,num_line_affected_mean,num_line_affected_min,num_line_affected_max,num_line_affected_std
0,38.0,38.0,38.0,0.0,57.0,57.0,57.0,0.0,1.0,1.0,...,17.0,0.0,322.0,322.0,322.0,0.0,764.0,764,764,0.0
1,12.0,12.0,12.0,0.0,17.0,17.0,17.0,0.0,1.0,1.0,...,1.0,0.0,15.0,15.0,15.0,0.0,275.0,275,275,0.0
2,4.0,4.0,4.0,0.0,15.0,15.0,15.0,0.0,1.0,1.0,...,1.0,0.0,371.0,371.0,371.0,0.0,115.0,115,115,0.0
3,11.0,11.0,11.0,0.0,8.0,8.0,8.0,0.0,2.0,2.0,...,17.0,0.0,85.0,85.0,85.0,0.0,61.0,61,61,0.0
4,27.666667,3.0,76.0,41.860881,50.0,1.0,140.0,78.044859,1.666667,1.0,...,1.0,0.0,69.333333,8.0,177.0,93.543216,458.666667,24,1245,682.248


In [76]:
# Preview the target column.
y.head()

Unnamed: 0,time_taken_final
0,10.0
1,3.0
2,33.0
3,15.0
4,1.0


In [77]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows, with a fixed random_state so the split is repeatable.
# NOTE(review): X_train / X_test / y_train / y_test are not referenced by any
# later cell visible here (the H2O cells build their own split from final_df) —
# confirm whether this cell is still needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=16)

In [78]:
# Connect to the H2O cluster; the recorded output shows it attaching to an
# instance that was already running at http://localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,5 hours 59 mins
H2O_cluster_timezone:,Asia/Singapore
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.7
H2O_cluster_version_age:,2 months and 18 days
H2O_cluster_name:,H2O_from_python_tanji_xsl51k
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.973 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [79]:
from h2o.automl import H2OAutoML

# Upload the modelling frame (features + target) to the H2O cluster.
full = h2o.H2OFrame(final_df)
# Seed the split: without it split_frame draws a different random partition
# on every run, so the leaderboard and predictions below are not repeatable.
# The 0.7 ratio is a target proportion; H2O's split is probabilistic, so the
# resulting frame sizes are approximate.
train,test = full.split_frame(ratios=[.7], seed=16)


# Predictor names are all columns except the response column.
x = train.columns
y = "time_taken_final"
x.remove(y)



Parse progress: |█████████████████████████████████████████████████████████| 100%


In [81]:
# Run AutoML for at most 10 models with a fixed seed; XGBoost is excluded
# from the search. Only a training frame is supplied.
# NOTE(review): H2O's AutoML documentation ties seed reproducibility to using
# max_models and excluding DeepLearning — confirm whether 'DeepLearning'
# should be added to exclude_algos.
aml = H2OAutoML(max_models=10, seed=1, exclude_algos=['XGBoost'])
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [82]:
# Show the full leaderboard by requesting as many rows as it contains.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
GBM_5_AutoML_20211120_201618,823.065,28.6891,823.065,18.0235,1.42495
GLM_1_AutoML_20211120_201618,835.71,28.9086,835.71,18.4398,
StackedEnsemble_AllModels_AutoML_20211120_201618,848.514,29.1293,848.514,18.494,1.44679
StackedEnsemble_BestOfFamily_AutoML_20211120_201618,852.932,29.205,852.932,18.4066,1.43833
DRF_1_AutoML_20211120_201618,856.799,29.2711,856.799,18.902,1.44146
XRT_1_AutoML_20211120_201618,911.586,30.1925,911.586,18.9933,1.45142
GBM_grid__1_AutoML_20211120_201618_model_1,922.142,30.3668,922.142,18.9794,1.46333
GBM_3_AutoML_20211120_201618,923.909,30.3959,923.909,19.2703,
GBM_1_AutoML_20211120_201618,929.438,30.4867,929.438,18.4048,1.42081
GBM_4_AutoML_20211120_201618,941.078,30.677,941.078,19.6341,




In [83]:
# Row 0 of the leaderboard is its top-ranked entry; fetch that model by id.
best_model_id = lb[0, 'model_id']
model = h2o.get_model(best_model_id)

In [84]:
# Display the selected model's summary, metrics and variable importances.
model

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_5_AutoML_20211120_201618


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,30.0,30.0,2507.0,1.0,1.0,1.0,2.0,2.0,2.0




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 751.1100173697076
RMSE: 27.406386433999423
MAE: 17.117785311440233
RMSLE: 1.3777320440717102
Mean Residual Deviance: 751.1100173697076

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 823.0654806061539
RMSE: 28.689117808084546
MAE: 18.02347976159421
RMSLE: 1.4249532391009074
Mean Residual Deviance: 823.0654806061539

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,18.03776,1.7666873,16.435223,17.860342,16.204998,19.855808,19.832422
1,mean_residual_deviance,824.82007,238.95428,822.90375,666.7841,540.86163,1156.9045,936.64636
2,mse,824.82007,238.95428,822.90375,666.7841,540.86163,1156.9045,936.64636
3,r2,0.011332163,0.045982882,0.07267657,0.028098386,-0.051368017,0.017106667,-0.00985279
4,residual_deviance,824.82007,238.95428,822.90375,666.7841,540.86163,1156.9045,936.64636
5,rmse,28.476574,4.1690497,28.686298,25.822163,23.256433,34.013298,30.60468
6,rmsle,1.4221816,0.09815485,1.2678881,1.4971406,1.5127703,1.4370788,1.39603



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2021-11-20 20:16:25,0.164 sec,0.0,29.020642,18.44456,842.197678
1,,2021-11-20 20:16:25,0.173 sec,5.0,28.461431,18.039844,810.053053
2,,2021-11-20 20:16:25,0.181 sec,10.0,28.179901,17.841697,794.106811
3,,2021-11-20 20:16:25,0.190 sec,15.0,27.930612,17.603852,780.119106
4,,2021-11-20 20:16:25,0.199 sec,20.0,27.719773,17.500002,768.385839
5,,2021-11-20 20:16:25,0.207 sec,25.0,27.560207,17.281013,759.565012
6,,2021-11-20 20:16:25,0.216 sec,30.0,27.406386,17.117785,751.110017



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,wmc_max,28302.359375,1.0,0.244134
1,nosi_mean,15794.744141,0.558072,0.136245
2,loc_max,11730.114258,0.414457,0.101183
3,publicMethods_mean,10488.011719,0.37057,0.090469
4,totalFields_min,9722.749023,0.343531,0.083868
5,staticFields_max,7762.935547,0.274286,0.066963
6,nosi_min,6692.943359,0.23648,0.057733
7,staticFields_mean,5800.258789,0.204939,0.050033
8,cbo_max,5116.099121,0.180766,0.044131
9,privateFields_mean,4261.756836,0.15058,0.036762



See the whole table with table.as_data_frame()




In [85]:
# Score the held-out H2O test frame with the selected model.
# NOTE(review): the predictions are only displayed; no visible cell compares
# them with the actual time_taken_final values in `test` — consider adding an
# evaluation on the test frame (e.g. model.model_performance(test)).
preds = model.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [86]:
# Preview the predicted values.
preds

predict
23.9057
5.38745
10.4748
19.3419
26.8317
16.743
26.0237
22.9408
19.8998
12.1161




In [87]:
# Save the selected model under models/EffortEstimationModelv2 and keep the
# path returned by save_model in model_path. force=True is passed so an
# existing saved model at that location is overwritten.
# NOTE(review): the target folder is suffixed "v2" while the input data comes
# from "Effort_Estimation_Results_v3/" — confirm the version label is intended.
model_path = h2o.save_model(model=model, path="models/EffortEstimationModelv2", force=True)

In [64]:
#aml.explain(test)

In [65]:
#aml.explain_row(test, row_index=0)

In [66]:
#h2o.shutdown()