In [1]:
import json
import pandas as pd
import numpy as np
import networkx as nx
import jellyfish
import os
import shutil
import subprocess
import requests
from github import Github
from git import Repo
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from zipfile import ZipFile
from filecmp import dircmp
import configparser
import h2o



## Result Gathering

In [2]:
# Load every per-project CSV in `directory` into one combined frame, tagging
# each row with the project it came from (the file name without its extension).
directory = "Effort_Estimation_Results/"

# os.listdir() returns entries in arbitrary order and may include non-CSV
# entries (e.g. a .ipynb_checkpoints folder), so filter by extension and sort
# case-insensitively to make the row order of main_df reproducible.
csv_filenames = sorted(
    (name for name in os.listdir(directory) if name.lower().endswith('.csv')),
    key=str.lower,
)

project_frames = []
for filename in csv_filenames:
    print(filename)
    project_df = pd.read_csv(os.path.join(directory, filename))
    project_df['project_name'] = os.path.splitext(filename)[0]
    project_frames.append(project_df)

# Concatenate once at the end rather than re-copying the accumulated frame on
# every iteration. main_df is None when the directory holds no CSV files.
main_df = pd.concat(project_frames) if project_frames else None

Alluxio.csv
Assertj-core.csv
Atmosphere.csv
AxonFramework.csv
Beam.csv
Byte-buddy.csv
Camel.csv
Cassandra.csv
Cxf.csv
Dbeaver.csv
Hadoop.csv
karaf.csv
Okhttp.csv
Openapi-generator.csv
Orientdb.csv
Redisson.csv
Spotbugs.csv
Spring-framework.csv
Spring-security.csv
Storm.csv
Testcontainers-java.csv
Traccar.csv


In [3]:
# Preview the first rows of the combined frame.
main_df.head()

Unnamed: 0,latest_commit,commit_compared_with,num_dependency,num_line_affected,actual_num_of_classes_touched,latest_commit_date,commit_compared_with_date,project_name
0,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,6d501cabf03a0af22f13ca88a9c791ec21edaaeb,632.0,35,4,2021-08-04 22:21:34,2021-06-07 19:51:51,Alluxio
1,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,85724a0b2c9114d94e2b9c6ffe2112bb319f692b,25.0,44,3,2021-08-04 22:21:34,2021-06-08 15:53:29,Alluxio
2,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,a4dc54f7dc0333da096aa779dbaa79060c90d1ad,255.0,118,2,2021-08-04 22:21:34,2021-06-09 18:09:26,Alluxio
3,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,f9f22097ce44cafe15bfeb6ec56cbf1c7e8659b3,166.0,144,5,2021-08-04 22:21:34,2021-06-14 00:22:17,Alluxio
4,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,c9b564fe0e2e928b3b8583d8778b58031fe279fa,19589.0,288,37,2021-08-04 22:21:34,2021-06-14 00:23:51,Alluxio


In [4]:
# Total number of rows in the combined frame.
len(main_df)

15636

In [5]:
# Parse both timestamp columns so they can be subtracted from each other.
for date_col in ('latest_commit_date', 'commit_compared_with_date'):
    main_df[date_col] = pd.to_datetime(main_df[date_col])

# Elapsed time between the compared commit and the latest commit;
# .dt.days keeps only the whole-day component of that difference.
elapsed = main_df['latest_commit_date'] - main_df['commit_compared_with_date']
main_df['time_taken_days'] = elapsed.dt.days


In [6]:
# Preview up to 20 rows, now including the time_taken_days column.
main_df.head(20)

Unnamed: 0,latest_commit,commit_compared_with,num_dependency,num_line_affected,actual_num_of_classes_touched,latest_commit_date,commit_compared_with_date,project_name,time_taken_days
0,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,6d501cabf03a0af22f13ca88a9c791ec21edaaeb,632.0,35,4,2021-08-04 22:21:34,2021-06-07 19:51:51,Alluxio,58
1,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,85724a0b2c9114d94e2b9c6ffe2112bb319f692b,25.0,44,3,2021-08-04 22:21:34,2021-06-08 15:53:29,Alluxio,57
2,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,a4dc54f7dc0333da096aa779dbaa79060c90d1ad,255.0,118,2,2021-08-04 22:21:34,2021-06-09 18:09:26,Alluxio,56
3,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,f9f22097ce44cafe15bfeb6ec56cbf1c7e8659b3,166.0,144,5,2021-08-04 22:21:34,2021-06-14 00:22:17,Alluxio,51
4,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,c9b564fe0e2e928b3b8583d8778b58031fe279fa,19589.0,288,37,2021-08-04 22:21:34,2021-06-14 00:23:51,Alluxio,51
5,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,62265d1d451e652c828c41b5f4c17dee81180fa4,1088.0,277,29,2021-08-04 22:21:34,2021-06-14 23:27:24,Alluxio,50
6,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,0abc4df4926166a6f179a9c1d42c3c510fad558d,9043.0,713,48,2021-08-04 22:21:34,2021-06-14 23:43:17,Alluxio,50
7,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,e254405ef62fb364d8d4d2f795d5d685f260a12e,16.0,19,8,2021-08-04 22:21:34,2021-06-15 03:00:36,Alluxio,50
8,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,62e06ed79d09aaac54ec9b1be6d2e75536e26616,32524.0,862,95,2021-08-04 22:21:34,2021-06-16 05:23:04,Alluxio,49
9,4f1d962767c5f4c37d572b4ce3f1e07cc357474a,61afe3d6bb7f2249ad8f42bb42dd99984494c2f8,1192.0,14,2,2021-08-04 22:21:34,2021-06-17 20:30:20,Alluxio,48


In [12]:
# Aggregating all changes per release: one row per (latest_commit, project_name),
# summing the change-size measures and keeping the largest time_taken_days.

# Exclusive bounds on time_taken_days for a row to be kept.
MIN_DAYS_EXCLUSIVE = 0
MAX_DAYS_EXCLUSIVE = 120

release_totals = (
    main_df
    .groupby(['latest_commit', 'project_name'])
    .agg({'num_dependency': 'sum',
          'num_line_affected': 'sum',
          'actual_num_of_classes_touched': 'sum',
          'time_taken_days': 'max'})
    .reset_index()
)

# Keep only rows whose time span lies strictly between the two bounds.
# Filtering from a separately named aggregate keeps this cell re-runnable.
days = release_totals['time_taken_days']
test_df = release_totals[(days > MIN_DAYS_EXCLUSIVE) & (days < MAX_DAYS_EXCLUSIVE)]

In [13]:
# List the columns that remain after aggregation.
test_df.columns

Index(['latest_commit', 'project_name', 'num_dependency', 'num_line_affected',
       'actual_num_of_classes_touched', 'time_taken_days'],
      dtype='object')

In [14]:
# Number of rows left after aggregation and the time_taken_days filter.
len(test_df)

98

In [15]:
# Preview up to 20 rows of the aggregated, filtered frame.
test_df.head(20)

Unnamed: 0,latest_commit,project_name,num_dependency,num_line_affected,actual_num_of_classes_touched,time_taken_days
1,0372e776e7079407b932c4bb21dc2557ca194333,Dbeaver,37203.0,7608,312,16
3,073dfc26c7a065f5d5abf18be8cce8258a9aaa71,Dbeaver,39379.0,4157,287,23
6,08a06a7693a31bca7b66f396ba6555cbd9d63a49,Redisson,2491.0,551,48,26
7,08da840cb6d3392e69870dfdc2c866a6f86577fd,Dbeaver,18579.0,2250,85,10
8,094f8a7c3ddcc9e2a88f3993bad8c2d31f30388c,Redisson,15669.0,278,41,47
9,0ca7f6edd5d76f0a0b60ba534f0e53e81f2de80d,Assertj-core,2727.0,334,17,46
10,0ddefe0ee95d954ef87991f9d5fbfa531b046d14,Dbeaver,53301.0,10829,547,17
12,113a0a672f277a6e8181757a0c54f92d42f98ef9,Dbeaver,30980.0,5638,253,13
13,1541a742d1fa2633ee2251eeea72904f3447c8f5,Camel,207934.0,70598,3322,82
14,170dc5fd5c8cdc41ca89497e377fee71832fd38d,Dbeaver,16751.0,3114,219,16


## Exploratory Data Analysis (EDA)

In [16]:
from pandas_profiling import ProfileReport

# Build a profiling report for the aggregated frame. Leaving `profile` as the
# cell's last expression makes the notebook render the report.
profile = ProfileReport(
    test_df,
    title="Pandas Profiling Report",
    explorative=True,
)
profile

Summarize dataset:   0%|          | 0/20 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [17]:
# Preview the first rows of the aggregated frame.
test_df.head()

Unnamed: 0,latest_commit,project_name,num_dependency,num_line_affected,actual_num_of_classes_touched,time_taken_days
1,0372e776e7079407b932c4bb21dc2557ca194333,Dbeaver,37203.0,7608,312,16
3,073dfc26c7a065f5d5abf18be8cce8258a9aaa71,Dbeaver,39379.0,4157,287,23
6,08a06a7693a31bca7b66f396ba6555cbd9d63a49,Redisson,2491.0,551,48,26
7,08da840cb6d3392e69870dfdc2c866a6f86577fd,Dbeaver,18579.0,2250,85,10
8,094f8a7c3ddcc9e2a88f3993bad8c2d31f30388c,Redisson,15669.0,278,41,47


In [18]:
# Predictor and response column names; final_col is the two lists joined,
# predictors first.
X_col = ['num_dependency', 'num_line_affected', 'actual_num_of_classes_touched']
Y_col = ['time_taken_days']
final_col = X_col + Y_col

# Column subsets of the aggregated frame. Each indexer is a list, so every
# result (including y) is a DataFrame rather than a Series.
X, y, final_df = test_df[X_col], test_df[Y_col], test_df[final_col]

In [19]:
# Preview the predictor columns.
X.head()

Unnamed: 0,num_dependency,num_line_affected,actual_num_of_classes_touched
1,37203.0,7608,312
3,39379.0,4157,287
6,2491.0,551,48
7,18579.0,2250,85
8,15669.0,278,41


In [20]:
# Preview the response column.
y.head()

Unnamed: 0,time_taken_days
1,16
3,23
6,26
7,10
8,47


In [23]:
from sklearn.model_selection import train_test_split

# Hold out a 0.33 fraction of the rows for testing; the fixed random_state
# makes the split repeatable.
# NOTE(review): no cell visible here consumes these splits — the AutoML run
# below trains on the full final_df. Confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=16,
)

In [24]:
# Connect to an H2O instance (the output below reports the connection status).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,12 mins 41 secs
H2O_cluster_timezone:,Asia/Kuala_Lumpur
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.4
H2O_cluster_version_age:,6 months and 28 days !!!
H2O_cluster_name:,H2O_from_python_tanji_fplsq9
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.944 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [25]:
from h2o.automl import H2OAutoML

# Convert the modelling columns into an H2OFrame to use as the training frame.
train = h2o.H2OFrame(final_df)

# Keep the response name in its own variable: binding it to `y` would
# overwrite the `y` DataFrame built earlier and break a re-run of the
# train_test_split cell, which would then receive a string as its target.
target = "time_taken_days"

# Every column of the training frame except the response is a predictor.
x = [col for col in train.columns if col != target]

# Run AutoML with max_models=10 and a fixed seed.
aml = H2OAutoML(max_models=10, seed=1)
aml.train(x=x, y=target, training_frame=train)

Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |█
06:56:42.705: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████| 100%

06:56:48.771: Skipping training of model GBM_5_AutoML_20210830_065642 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20210830_065642.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 98.0.




In [26]:
# Fetch the AutoML leaderboard and display all of its rows
# (rows=lb.nrows asks head() for the full table rather than its default preview).
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
GBM_3_AutoML_20210830_065642,899.137,29.9856,899.137,23.1582,0.98307
DeepLearning_1_AutoML_20210830_065642,912.524,30.208,912.524,23.5553,0.94925
GBM_4_AutoML_20210830_065642,913.175,30.2188,913.175,23.2235,0.98961
GBM_2_AutoML_20210830_065642,936.412,30.6009,936.412,23.9592,1.00735
XRT_1_AutoML_20210830_065642,937.255,30.6146,937.255,22.8947,0.988558
StackedEnsemble_AllModels_AutoML_20210830_065642,957.452,30.9427,957.452,25.1154,1.01534
GBM_grid__1_AutoML_20210830_065642_model_1,982.948,31.352,982.948,23.3807,1.04396
StackedEnsemble_BestOfFamily_AutoML_20210830_065642,990.277,31.4687,990.277,25.1561,1.01572
GLM_1_AutoML_20210830_065642,1002.74,31.666,1002.74,25.9023,1.01542
DRF_1_AutoML_20210830_065642,1026.36,32.0369,1026.36,23.9315,1.03144


