In [25]:
import configparser
import json
import os
import shutil
import subprocess
from filecmp import dircmp
from zipfile import ZipFile

import h2o
import jellyfish
import networkx as nx
import numpy as np
import pandas as pd
import requests
from git import Repo
from github import Github
from h2o.automl import H2OAutoML
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split

## Result Gathering

In [77]:
# Load every per-project result CSV into one frame, tagging each row with its project.
directory = "Effort_Estimation_Results_v3/"
frames = []
# Sort the listing: os.listdir returns entries in arbitrary order, and the row
# order of main_df should not depend on the filesystem.
for filename in sorted(os.listdir(directory)):
    # Skip anything that is not a CSV (e.g. checkpoint folders or OS metadata files).
    if not filename.lower().endswith('.csv'):
        continue
    print(filename)
    project_df = pd.read_csv(os.path.join(directory, filename))
    # The project name is the file name without its extension.
    project_df['project_name'] = os.path.splitext(filename)[0]
    frames.append(project_df)

# Concatenate once instead of growing main_df inside the loop, which re-copies
# all accumulated rows on every pass. ignore_index gives the combined frame
# unique row labels instead of repeating 0..n-1 for each file.
# main_df stays None when the directory holds no CSV files, as before.
main_df = pd.concat(frames, ignore_index=True) if frames else None

Alluxio.csv
Assertj-core.csv
Atmosphere.csv
AxonFramework.csv
Beam.csv
Byte-buddy.csv
Camel.csv
Cxf.csv
Dbeaver.csv
Hadoop.csv
Okhttp.csv
Redisson.csv


In [78]:
main_df.head()

Unnamed: 0,sha,name,email,date,login,message,parent_sha,parent_date,time_taken,contains_refactoring,...,num_dependency_max,num_dependency_std,num_line_affected_mean,num_line_affected_min,num_line_affected_max,num_line_affected_std,actual_num_of_classes_touched_mean,actual_num_of_classes_touched_min,actual_num_of_classes_touched_max,actual_num_of_classes_touched_std
0,6ad1e4fe77445e8689f6d3975b26e52165c9c3e6,Bin Fan,fanbin103@gmail.com,2021-08-04 02:49:16,apc999,Fix IndexOutOfBoundsException on async cache\n...,df5dcab8bc308dfd2bf650a895865b13120a9866,2021-08-03 16:07:28,10.0,1,...,322.0,,764.0,764,764,,9.0,9,9,
1,8647c6162423b851dda8d10edf4686473d2e95cc,Zac Blanco,zac@alluxio.com,2021-07-15 21:54:27,ZacBlanco,Update and improve conformance of S3 API\n\n##...,79a5e5c78b7dfcdbf8edbd928a2ff59c904d08f8,2021-07-15 18:38:50,3.0,1,...,15.0,,275.0,275,275,,6.0,6,6,
2,3ce52983e6f50bfb7880b5a2cb13a18e4272170b,Zhan Yuan,yuanzhanhku@gmail.com,2021-07-12 06:39:42,yuanzhanhku,Add CacheContext to URIStatus to enable per-re...,cf79c7837c57c83c1b99a1dab53fec25deb2069d,2021-07-10 20:58:13,33.0,1,...,371.0,0.0,115.0,115,115,0.0,4.5,3,6,2.12132
3,a4dc54f7dc0333da096aa779dbaa79060c90d1ad,kqhzz,kuangqinghuazz@163.com,2021-06-09 18:09:26,kuszz,Deprecate leader command\n\nFix #13512\n\npr-l...,1356c0b35fd753d1081665b2c8f6b25da62bf2e8,2021-06-09 02:31:45,15.0,1,...,85.0,,61.0,61,61,,2.0,2,2,
4,7fb84094a6075bcef5a103b40adcac4b26b724ca,Jiacheng Liu,jiacheliu3@gmail.com,2021-05-29 00:37:05,jiacheliu3,Refactor MasterWorkerInfo\n\nCurrently all the...,220237085593d731756e24249bcf88a4d6ea5710,2021-05-28 23:28:46,1.0,1,...,177.0,93.543216,458.666667,24,1245,682.248,35.0,1,54,29.512709


In [79]:
len(main_df)

366

In [80]:
def get_final_time_taken(row):
    """Return the adjusted time value for a single row.

    Reads ``row['time_taken']`` and ``row['mean']``. When ``time_taken`` is
    strictly greater than ``mean`` the difference ``time_taken - mean`` is
    returned; otherwise (including when the two are equal) ``time_taken`` is
    returned unchanged.
    """
    elapsed = row['time_taken']
    baseline = row['mean']
    return elapsed - baseline if elapsed > baseline else elapsed

In [81]:
# Derive the modelling target column by running get_final_time_taken on every row.
main_df['time_taken_final'] = main_df.apply(get_final_time_taken, axis=1)



In [82]:
main_df['time_taken_final'].value_counts()

1.000000     68
2.000000     32
3.000000     31
4.000000     18
8.000000     10
             ..
24.870690     1
21.169082     1
0.186047      1
8.566667      1
24.196653     1
Name: time_taken_final, Length: 143, dtype: int64

In [83]:
# Replace every NaN in the frame with 0. This applies to all columns (features
# and target alike), e.g. the *_std values that are empty in the preview above.
main_df = main_df.fillna(0)

## EDA (run only if absolutely necessary, on a minimal set of features)

In [84]:
#from pandas_profiling import ProfileReport

#profile = ProfileReport(main_df, title="Pandas Profiling Report",explorative=True)
#profile

In [85]:
# Columns excluded from the modelling set; everything else in main_df is kept.
column_to_skip = [
    'sha',
    'name', 'email', 'login',
    'date', 'message',
    'parent_sha', 'parent_date',
    'time_taken', 'contains_refactoring',
    'kmean_label', 'mean',
    'min', 'max',
    'project_name', 'commit_compared_with',
]
# Preserve the original column order of main_df.
final_col = [column for column in main_df.columns if column not in column_to_skip]

In [87]:
final_col

['cbo_mean',
 'cbo_min',
 'cbo_max',
 'cbo_std',
 'wmc_mean',
 'wmc_min',
 'wmc_max',
 'wmc_std',
 'dit_mean',
 'dit_min',
 'dit_max',
 'dit_std',
 'rfc_mean',
 'rfc_min',
 'rfc_max',
 'rfc_std',
 'lcom_mean',
 'lcom_min',
 'lcom_max',
 'lcom_std',
 'totalMethods_mean',
 'totalMethods_min',
 'totalMethods_max',
 'totalMethods_std',
 'staticMethods_mean',
 'staticMethods_min',
 'staticMethods_max',
 'staticMethods_std',
 'publicMethods_mean',
 'publicMethods_min',
 'publicMethods_max',
 'publicMethods_std',
 'privateMethods_mean',
 'privateMethods_min',
 'privateMethods_max',
 'privateMethods_std',
 'protectedMethods_mean',
 'protectedMethods_min',
 'protectedMethods_max',
 'protectedMethods_std',
 'defaultMethods_mean',
 'defaultMethods_min',
 'defaultMethods_max',
 'defaultMethods_std',
 'abstractMethods_mean',
 'abstractMethods_min',
 'abstractMethods_max',
 'abstractMethods_std',
 'finalMethods_mean',
 'finalMethods_min',
 'finalMethods_max',
 'finalMethods_std',
 'synchronizedMetho

In [88]:
# Target column name, kept as a one-element list so `y` is a DataFrame.
Y_col = ['time_taken_final']

# Feature columns: everything in final_col except the target.
X_col = list(final_col)
X_col.remove(Y_col[0])

X = main_df[X_col]
y = main_df[Y_col]
# Features and target together, in final_col order.
final_df = main_df[final_col]

In [89]:
X.head()

Unnamed: 0,cbo_mean,cbo_min,cbo_max,cbo_std,wmc_mean,wmc_min,wmc_max,wmc_std,dit_mean,dit_min,...,num_dependency_max,num_dependency_std,num_line_affected_mean,num_line_affected_min,num_line_affected_max,num_line_affected_std,actual_num_of_classes_touched_mean,actual_num_of_classes_touched_min,actual_num_of_classes_touched_max,actual_num_of_classes_touched_std
0,38.0,38.0,38.0,0.0,57.0,57.0,57.0,0.0,1.0,1.0,...,322.0,0.0,764.0,764,764,0.0,9.0,9,9,0.0
1,12.0,12.0,12.0,0.0,17.0,17.0,17.0,0.0,1.0,1.0,...,15.0,0.0,275.0,275,275,0.0,6.0,6,6,0.0
2,4.0,4.0,4.0,0.0,15.0,15.0,15.0,0.0,1.0,1.0,...,371.0,0.0,115.0,115,115,0.0,4.5,3,6,2.12132
3,11.0,11.0,11.0,0.0,8.0,8.0,8.0,0.0,2.0,2.0,...,85.0,0.0,61.0,61,61,0.0,2.0,2,2,0.0
4,27.666667,3.0,76.0,41.860881,50.0,1.0,140.0,78.044859,1.666667,1.0,...,177.0,93.543216,458.666667,24,1245,682.248,35.0,1,54,29.512709


In [90]:
y.head()

Unnamed: 0,time_taken_final
0,10.0
1,3.0
2,33.0
3,15.0
4,1.0


In [91]:
# Hold out 33% of the rows as a test set; the fixed random_state keeps the
# split repeatable. train_test_split is imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=16)

In [92]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,4 hours 9 mins
H2O_cluster_timezone:,Asia/Singapore
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.7
H2O_cluster_version_age:,2 months and 12 days
H2O_cluster_name:,H2O_from_python_tanji_zpsor2
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.875 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [93]:
# Convert the modelling columns (features + target) into an H2OFrame.
full = h2o.H2OFrame(final_df)
# Split with a 0.7 ratio for training. The seed makes the partition
# reproducible; without it every run trains and evaluates on different rows.
train, test = full.split_frame(ratios=[.7], seed=1)

# Target column name, and the predictors (every other column of the frame).
# NOTE: this rebinds `y`, which earlier cells use for the target DataFrame.
y = "time_taken_final"
x = [col for col in train.columns if col != y]



In [94]:
# Configure AutoML with max_models=10 and a fixed seed, then train on `train`
# using predictor columns `x` and target column `y`.
aml = H2OAutoML(max_models=10, seed=1)
aml.train(x=x, y=y, training_frame=train)

Failed polling AutoML progress log: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\tanji\\AppData\\Local\\Temp\\tmpbqg3_vkj.csv'

12:00:00.857: AutoML: XGBoost is not available; skipping it.



In [95]:
# Display the whole leaderboard: rows=lb.nrows requests every row instead of
# the default preview length.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_BestOfFamily_AutoML_20211115_120000,884.534,29.7411,884.534,19.7627,1.4677
GLM_1_AutoML_20211115_120000,900.905,30.0151,900.905,20.0301,
StackedEnsemble_AllModels_AutoML_20211115_120000,907.995,30.133,907.995,20.2176,
GBM_5_AutoML_20211115_120000,915.922,30.2642,915.922,20.0443,1.49926
GBM_3_AutoML_20211115_120000,917.988,30.2983,917.988,20.0157,
GBM_4_AutoML_20211115_120000,918.462,30.3061,918.462,20.3826,
XRT_1_AutoML_20211115_120000,924.058,30.3983,924.058,21.0509,1.5096
GBM_2_AutoML_20211115_120000,931.792,30.5253,931.792,20.4936,
DRF_1_AutoML_20211115_120000,951.437,30.8454,951.437,21.1808,1.5147
GBM_1_AutoML_20211115_120000,975.922,31.2397,975.922,20.507,1.4875




In [96]:
# Fetch the model in leaderboard row 1. Rows are 0-indexed, so this is the
# second-ranked entry (GLM_1 in the run shown above), not the leader.
# NOTE(review): the position is hard-coded; which model sits in row 1 depends
# on how the models rank in a given run.
model = h2o.get_model(lb[1,'model_id'])

In [97]:
model.varimp(use_pandas=True)

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,maxNestedBlocks_mean,0.268126,1.000000,0.020743
1,maxNestedBlocks_max,0.264675,0.987131,0.020476
2,anonymousClassesQty_min,0.234209,0.873504,0.018119
3,totalMethods_min,0.227206,0.847386,0.017577
4,anonymousClassesQty_mean,0.217246,0.810241,0.016807
...,...,...,...,...
158,variablesQty_max,0.007278,0.027143,0.000563
159,rfc_std,0.004277,0.015950,0.000331
160,parenthesizedExpsQty_mean,0.003448,0.012861,0.000267
161,parenthesizedExpsQty_min,0.001776,0.006622,0.000137


In [98]:
# Score the held-out `test` frame with the AutoML leader. Note this uses
# aml.leader, not the `model` object fetched from the leaderboard earlier.
preds = aml.leader.predict(test)

In [99]:
preds

predict
19.2018
17.5772
11.9613
17.1317
23.4177
12.2647
27.8566
13.5703
26.7723
17.758




In [64]:
#aml.explain(test)

In [65]:
#aml.explain_row(test, row_index=0)

In [66]:
#h2o.shutdown()