In [5]:
import json
import pandas as pd
import numpy as np
import networkx as nx
import os
import shutil
import subprocess
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from zipfile import ZipFile
from filecmp import dircmp
import h2o

In [6]:
project_name = 'dropwizard'
version_name = 'v2.0.6'

n_cluster_divisible_arr = [5,7,10,15,20,25]
affinity_arr = ['euclidean', 'manhattan', 'cosine']
linkage_arr = ['complete', 'average', 'single']

n_cluster = 300
affinity = 'cosine'
linkage = 'single'

In [7]:
depends_dir = f'C:/Users/tanji/Desktop/SoftwareRemodularization/raw_depends/{project_name}/{project_name}_{version_name}.json'
rootdir = f'C:/Users/tanji/Desktop/SoftwareRemodularization/raw_sourcecode/{project_name}/{project_name}_{version_name}'

In [8]:
print(rootdir)

C:/Users/tanji/Desktop/SoftwareRemodularization/raw_sourcecode/dropwizard/dropwizard_v2.0.6


In [9]:
full_dir_arr = []
for root, dirs, files in os.walk(rootdir):
    #print(root)
    #print(dirs)
    for element in files:
        if '.java' in element:
            dir_string = root + '\\' + element
            full_dir_arr.append(dir_string)

cluster_dict = {}
cluster_tree = {}


for element in full_dir_arr:
    element = element.split('\\')
    child = element[-1]
    parent = element[-2]
    cluster_tree[child] = parent

In [10]:
cluster_tree

{'MavenWrapperDownloader.java': 'wrapper',
 '__name__Application.java': 'java',
 '__name__Configuration.java': 'java',
 'AssetsBundle.java': 'assets',
 'AssetsBundleTest.java': 'assets',
 'Auth.java': 'auth',
 'AuthDynamicFeature.java': 'auth',
 'AuthenticationException.java': 'auth',
 'Authenticator.java': 'auth',
 'AuthFilter.java': 'auth',
 'AuthorizationContext.java': 'auth',
 'Authorizer.java': 'auth',
 'AuthValueFactoryProvider.java': 'auth',
 'CachingAuthenticator.java': 'auth',
 'CachingAuthorizer.java': 'auth',
 'DefaultUnauthorizedHandler.java': 'auth',
 'JSONUnauthorizedHandler.java': 'auth',
 'OptionalPrincipalContainerRequestValueFactory.java': 'auth',
 'PermitAllAuthorizer.java': 'auth',
 'PolymorphicAuthDynamicFeature.java': 'auth',
 'PolymorphicAuthValueFactoryProvider.java': 'auth',
 'PrincipalContainerRequestValueFactory.java': 'auth',
 'PrincipalImpl.java': 'auth',
 'UnauthorizedHandler.java': 'auth',
 'WebApplicationExceptionCatchingFilter.java': 'auth',
 'BasicCred

In [11]:
len(cluster_tree)

815

In [12]:
try:
    os.mkdir('raw_depends/' + project_name)
except:
    pass

command = 'cd C:/Users/tanji/Desktop/SoftwareRemodularization/depends-0.9.2 & ' + f"java -jar depends.jar java C:/Users/tanji/Desktop/SoftwareRemodularization/raw_sourcecode/{project_name}/{project_name}_{version_name}" +  f' ../raw_depends/{project_name}/{project_name}_{version_name}' 
print(command)
os.system(command)

cd C:/Users/tanji/Desktop/SoftwareRemodularization/depends-0.9.2 & java -jar depends.jar java C:/Users/tanji/Desktop/SoftwareRemodularization/raw_sourcecode/dropwizard/dropwizard_v2.0.6 ../raw_depends/dropwizard/dropwizard_v2.0.6


0

In [13]:
with open(depends_dir) as f:
    depends_results = json.load(f)
    index  = 0
    var_array = []
    print(len(depends_results['variables']))
    for value in depends_results['variables']:
        var_array.append([index, value.split('\\')[-1]])
        #print(index, value)
        index += 1

847


In [14]:
with open(depends_dir) as f:
    depends_results = json.load(f)
    index  = 0
    var_array = []
    print(len(depends_results['variables']))
    print(len(depends_results['cells']))
    for value in depends_results['variables']:
        var_array.append([index, value.split('\\')[-1]])
        #print(index, value)
        index += 1

    var_df = pd.DataFrame(var_array)
    var_df.columns = ['index_val', 'name']

    feature_list = {}
    feature_index = 2
    for element in depends_results['cells']:
        #print(element)
        try:
            for a in element['values']:
                if a not in feature_list:
                    feature_list[a] = feature_index
                    feature_index += 1
                #print(a['Call'])


        except:
            pass

    feature_arr = []
    for element in depends_results['cells']:
        #print(array)
        array = [0] * (len(feature_list) + 2)
        values = dict(element['values'])
        #print(element)
        array[0] = element['src']
        array[1] = element['dest']
        for feature in feature_list:
            try:
                value = values[feature]
                array[feature_list[feature]] = value
            except:
                pass
        #print(array)
        feature_arr.append(array)

    feature_df = pd.DataFrame(feature_arr)
    col_names = ['src', 'dest']
    for element in feature_list:
        col_names.append(element)
    feature_df.columns = col_names

    feature_df['sum'] = feature_df.sum(axis=1) - feature_df['src'] - feature_df['dest']
    #G = nx.Graph()
    G = nx.path_graph(len(depends_results['variables']))
    for index, row in feature_df.iterrows():
        G.add_edge(row['src'], row['dest'], weight=row['sum'])

    adj_mat = nx.adjacency_matrix(G)
    adj_mat_df = pd.DataFrame(adj_mat.toarray())
    np.fill_diagonal(adj_mat_df.values, adj_mat_df.values.max())
    x = adj_mat_df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    adj_mat_df = pd.DataFrame(x_scaled)
    
    sum_dependency_df = nx.to_pandas_adjacency(G)
    sum_dependency_df['sum'] = sum_dependency_df.sum(axis=1)
    final_dependency_df = sum_dependency_df['sum']

847
2622


In [15]:
final_dependency_df

0       1.0
1       2.0
2       2.0
3      31.0
4      58.0
       ... 
842    23.0
843    41.0
844    29.0
845    42.0
846     5.0
Name: sum, Length: 847, dtype: float64

In [16]:
adj_mat_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,837,838,839,840,841,842,843,844,845,846
0,1.000000,0.000787,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000787,1.000000,0.000787,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000787,1.000000,0.000787,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000787,1.000000,0.011802,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.011802,1.000000,0.000787,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
842,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000787,1.000000,0.000787,0.000000,0.000000,0.000000
843,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.001574,0.004721,0.0,0.004721,0.000000,0.000787,1.000000,0.003934,0.006294,0.000000
844,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.001574,0.004721,0.0,0.004721,0.000000,0.000000,0.003934,1.000000,0.000787,0.000000
845,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.001574,0.000000,0.0,0.020456,0.003147,0.000000,0.006294,0.000787,1.000000,0.000787


In [17]:
feature_df

Unnamed: 0,src,dest,Call,Create,Contain,Use,Import,Parameter,Annotation,Extend,Return,Implement,Cast,Throw,sum
0,834,829,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,389,303,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
2,591,579,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,127,312,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
4,682,681,2.0,0.0,2.0,2.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2617,569,779,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2618,519,512,24.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0
2619,530,747,6.0,0.0,3.0,4.0,1.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,20.0
2620,519,513,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [20]:
print('Currently processing: ' + project_name + version_name)
command = f'java -jar ck-0.3.3-SNAPSHOT-jar-with-dependencies.jar raw_sourcecode/{project_name}/{project_name}_{version_name}'
print(command)
os.system(command)
os.rename('class.csv', project_name + '_' + version_name + '_class.csv')
os.rename('field.csv',  project_name + '_' + version_name + '_field.csv')
os.rename('method.csv', project_name + '_' + version_name + '_method.csv')

shutil.move(project_name + '_' + version_name + '_class.csv', 'ck_results_invi/' + project_name + '_' + version_name + '_class.csv')
shutil.move(project_name + '_' + version_name + '_field.csv', 'ck_results_invi/' + project_name + '_' + version_name + '_field.csv')
shutil.move(project_name + '_' + version_name + '_method.csv', 'ck_results_invi/' + project_name + '_' + version_name + '_method.csv')

          

Currently processing: dropwizardv2.0.6
java -jar ck-0.3.3-SNAPSHOT-jar-with-dependencies.jar raw_sourcecode/dropwizard/dropwizard_v2.0.6


'ck_results_invi/dropwizard_v2.0.6_method.csv'

In [21]:
cluster = AgglomerativeClustering(n_clusters=n_cluster, affinity=affinity, linkage=linkage)
cluster_result = cluster.fit_predict(adj_mat_df)

ck_metrics = pd.read_csv(f'ck_results_invi/{project_name}_{version_name}_class.csv')
ck_metrics['class_name'] = ck_metrics['file'].str.split('\\').str[-1]
filename_a = 'C:/Users/tanji/Desktop/SoftwareRemodularization/MoJo_1.2.1/' + project_name + '/' + project_name + '_' + version_name + '_' + str(n_cluster) + '_' +str(affinity) + '_' + str(linkage) + '_a.rsf'
filename_b = 'C:/Users/tanji/Desktop/SoftwareRemodularization/MoJo_1.2.1/' + project_name + '/' + project_name + '_' + version_name + '_' + str(n_cluster) + '_' +str(affinity) + '_' + str(linkage) + '_b.rsf'

with open(filename_a, 'w') as f:
    for key, value in cluster_tree.items():
        f.write('contain ' + str(value).replace(" ",'') + ' ' + str(key).replace(' ','') + '\n')

arr_a_rsf = []
arr_b_rsf = []
with open(filename_b, 'w') as f:
    len_b_rsf = 0
    duplicate_array = []
    refactor_arr = []
    main_data_arr = []
    clustering_result_dict = {}
    num_line_affected = 0
    num_dependency = 0
    actual_num_of_classes_touched = 0
    for i in range(len(cluster_result)):
        try:
            subject = var_df[var_df['index_val'] == i]['name'].values[0]
            #print(subject)
            #print(subject in cluster_tree)
            if subject in cluster_tree and subject not in duplicate_array:
                #print(element)
                duplicate_array.append(subject)
                string = "contain " + str(cluster_result[i]) + " " + subject + "\n"
                arr_b_rsf.append(subject)
                len_b_rsf += 1
                f.write(string)
                #print(subject)
                class_metrics = ck_metrics.loc[ck_metrics['class_name'] == subject].values.tolist()[0]
                
                #print(ck_metrics.loc[ck_metrics['class_name'] == subject]['file'].values[0])
                
                subjected_file = ck_metrics.loc[ck_metrics['class_name'] == subject]['file'].values[0]
                
                #print(subjected_file)
                
                num_dependency = final_dependency_df.iloc[cluster_result[i]]
                num_line_affected += sum(1 for line in open(subjected_file))
                
                
                class_metrics.append(num_dependency)
                class_metrics.append(num_line_affected)
                main_data_arr.append(class_metrics)
                
                
                ## Important
                ## This is assuming that during Depends, the file structure is already sorted
                ## I.e. going through the directory structure depth wise, not breath wise
                original_parent = cluster_tree[subject]
                try:
                    to_check_parent = clustering_result_dict[cluster_result[i]]
                    #print(original_parent, to_check_parent)
                    if original_parent != to_check_parent:
                        refactor_arr.append(f'Move {subject} to {str(cluster_result[i])}')
                        actual_num_of_classes_touched += 1
                    
                    clustering_result_dict[cluster_result[i]] = to_check_parent
                except:
                    clustering_result_dict[cluster_result[i]] = original_parent
                
            else:
                #print(subject)
                pass
        except Exception as e:
            #print(e)
            pass
f.close()

command = 'cd C:/Users/tanji/Desktop/SoftwareRemodularization/MoJo_1.2.1 & ' + 'java MoJo ' + filename_a + ' ' + filename_b + ' >> ' + project_name + '/' + project_name + '_results.txt'
#print(command)
os.system(command)

0

In [22]:
print(len(refactor_arr))
refactor_arr

489


['Move AuthFilter.java to 0',
 'Move AuthenticationException.java to 0',
 'Move Authenticator.java to 0',
 'Move AuthorizationContext.java to 0',
 'Move Authorizer.java to 0',
 'Move CachingAuthenticator.java to 0',
 'Move CachingAuthorizer.java to 0',
 'Move JSONUnauthorizedHandler.java to 0',
 'Move PermitAllAuthorizer.java to 0',
 'Move PrincipalImpl.java to 0',
 'Move UnauthorizedHandler.java to 0',
 'Move BasicCredentialAuthFilter.java to 0',
 'Move BasicCredentials.java to 0',
 'Move ChainedAuthFilter.java to 0',
 'Move OAuthCredentialAuthFilter.java to 0',
 'Move AbstractAuthResourceConfig.java to 0',
 'Move AuthBaseTest.java to 0',
 'Move AuthFilterTest.java to 0',
 'Move CachingAuthenticatorTest.java to 0',
 'Move CachingAuthorizerTest.java to 0',
 'Move BasicAuthProviderTest.java to 0',
 'Move BasicCredentialsTest.java to 0',
 'Move BasicCustomAuthProviderTest.java to 0',
 'Move ChainedAuthProviderTest.java to 0',
 'Move OAuthCustomProviderTest.java to 0',
 'Move OAuthProvide

In [23]:
temp_dataframe = pd.DataFrame(main_data_arr)
temp_columns = list(ck_metrics.columns)
temp_columns.append('num_dependency')
temp_columns.append('num_line_affected')
temp_dataframe.columns = temp_columns

## For the groupby operation below
temp_dataframe['placeholder_key'] = 'placeholder_key'

In [24]:
temp_dataframe.head()

Unnamed: 0,file,class,type,cbo,wmc,dit,rfc,lcom,totalMethods,staticMethods,...,maxNestedBlocks,anonymousClassesQty,subClassesQty,lambdasQty,uniqueWordsQty,modifiers,class_name,num_dependency,num_line_affected,placeholder_key
0,C:\Users\tanji\Desktop\SoftwareRemodularizatio...,MavenWrapperDownloader,class,1,9,1,17,1,2,2,...,4,0,0,0,82,1,MavenWrapperDownloader.java,37.0,110,placeholder_key
1,C:\Users\tanji\Desktop\SoftwareRemodularizatio...,$.$,class,2,7,1,0,6,4,1,...,0,0,0,0,12,1,__name__Application.java,8.0,139,placeholder_key
2,C:\Users\tanji\Desktop\SoftwareRemodularizatio...,io.dropwizard.assets.AssetsBundle,class,7,14,1,9,0,10,0,...,1,0,0,0,92,1,AssetsBundle.java,1.0,266,placeholder_key
3,C:\Users\tanji\Desktop\SoftwareRemodularizatio...,io.dropwizard.assets.AssetsBundleTest,class,12,9,1,25,4,9,0,...,0,0,0,0,51,1,AssetsBundleTest.java,1.0,423,placeholder_key
4,C:\Users\tanji\Desktop\SoftwareRemodularizatio...,io.dropwizard.auth.AuthDynamicFeature,class,11,13,1,9,0,4,0,...,4,0,0,0,84,1,AuthDynamicFeature.java,19.0,516,placeholder_key


In [29]:
project_refactoring_data_agg = temp_dataframe.groupby(['placeholder_key'], as_index=False).agg({
       'cbo': ['mean', 'min', 'max','std'], 
        'wmc': ['mean', 'min', 'max','std'], 
        'dit': ['mean', 'min', 'max','std'], 
        'rfc': ['mean', 'min', 'max','std'], 
        'lcom': ['mean', 'min', 'max','std'],
        'totalMethods': ['mean', 'min', 'max','std'], 
        'staticMethods': ['mean', 'min', 'max','std'],
        'publicMethods': ['mean', 'min', 'max','std'], 
        'privateMethods': ['mean', 'min', 'max','std'],
        'protectedMethods': ['mean', 'min', 'max','std'], 
        'defaultMethods': ['mean', 'min', 'max','std'], 
        'abstractMethods': ['mean', 'min', 'max','std'], 
        'finalMethods': ['mean', 'min', 'max','std'],
        'synchronizedMethods': ['mean', 'min', 'max','std'],
        'totalFields': ['mean', 'min', 'max','std'], 
        'staticFields': ['mean', 'min', 'max','std'], 
        'publicFields': ['mean', 'min', 'max','std'],
        'privateFields': ['mean', 'min', 'max','std'],
        'protectedFields': ['mean', 'min', 'max','std'],
        'defaultFields': ['mean', 'min', 'max','std'], 
        'finalFields': ['mean', 'min', 'max','std'],
        'synchronizedFields': ['mean', 'min', 'max','std'],
        'nosi': ['mean', 'min', 'max','std'], 
        'loc': ['mean', 'min', 'max','std'], 
        'returnQty': ['mean', 'min', 'max','std'],
        'loopQty': ['mean', 'min', 'max','std'],
        'comparisonsQty': ['mean', 'min', 'max','std'],
        'tryCatchQty': ['mean', 'min', 'max','std'],
        'parenthesizedExpsQty': ['mean', 'min', 'max','std'],
        'stringLiteralsQty': ['mean', 'min', 'max','std'],
        'numbersQty': ['mean', 'min', 'max','std'], 
        'assignmentsQty': ['mean', 'min', 'max','std'],
        'mathOperationsQty': ['mean', 'min', 'max','std'], 
        'variablesQty': ['mean', 'min', 'max','std'], 
        'maxNestedBlocks': ['mean', 'min', 'max','std'],
        'anonymousClassesQty': ['mean', 'min', 'max','std'],
        'subClassesQty': ['mean', 'min', 'max','std'],
        'lambdasQty': ['mean', 'min', 'max','std'],
        'uniqueWordsQty': ['mean', 'min', 'max','std'],
        'modifiers': ['mean', 'min', 'max','std'],
        'num_dependency': ['mean'],
        'num_line_affected': ['mean'],
})

In [30]:
project_refactoring_data_agg

Unnamed: 0_level_0,placeholder_key,cbo,cbo,cbo,cbo,wmc,wmc,wmc,wmc,dit,...,uniqueWordsQty,uniqueWordsQty,uniqueWordsQty,uniqueWordsQty,modifiers,modifiers,modifiers,modifiers,num_dependency,num_line_affected
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,std,mean,min,max,std,mean,...,mean,min,max,std,mean,min,max,std,mean,mean
0,placeholder_key,8.162231,0,63,7.323319,7.046895,0,118,10.127315,1.589354,...,44.91635,0,318,38.813338,44.100127,0,1025,205.074293,13.176172,35347.087452


In [31]:
columns_to_agg = [
    'cbo', 'wmc', 'dit', 'rfc', 'lcom',
       'totalMethods', 'staticMethods', 'publicMethods', 'privateMethods',
       'protectedMethods', 'defaultMethods', 'abstractMethods', 'finalMethods',
       'synchronizedMethods', 'totalFields', 'staticFields', 'publicFields',
       'privateFields', 'protectedFields', 'defaultFields', 'finalFields',
       'synchronizedFields', 'nosi', 'loc', 'returnQty', 'loopQty',
       'comparisonsQty', 'tryCatchQty', 'parenthesizedExpsQty',
       'stringLiteralsQty', 'numbersQty', 'assignmentsQty',
       'mathOperationsQty', 'variablesQty', 'maxNestedBlocks',
       'anonymousClassesQty', 'subClassesQty', 'lambdasQty', 'uniqueWordsQty',
       'modifiers'
]


project_refactoring_data_agg_column_names = ['placeholder_key']
for element in columns_to_agg:
    project_refactoring_data_agg_column_names.append(element + '_mean')
    project_refactoring_data_agg_column_names.append(element + '_min')
    project_refactoring_data_agg_column_names.append(element + '_max')
    project_refactoring_data_agg_column_names.append(element + '_std')
    
    
project_refactoring_data_agg_column_names.append('num_dependency_mean')
project_refactoring_data_agg_column_names.append('num_line_affected_mean')

In [32]:
project_refactoring_data_agg.columns = project_refactoring_data_agg_column_names

In [33]:
project_refactoring_data_agg

Unnamed: 0,placeholder_key,cbo_mean,cbo_min,cbo_max,cbo_std,wmc_mean,wmc_min,wmc_max,wmc_std,dit_mean,...,uniqueWordsQty_mean,uniqueWordsQty_min,uniqueWordsQty_max,uniqueWordsQty_std,modifiers_mean,modifiers_min,modifiers_max,modifiers_std,num_dependency_mean,num_line_affected_mean
0,placeholder_key,8.162231,0,63,7.323319,7.046895,0,118,10.127315,1.589354,...,44.91635,0,318,38.813338,44.100127,0,1025,205.074293,13.176172,35347.087452


In [34]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,2 hours 51 mins
H2O_cluster_timezone:,Asia/Kuala_Lumpur
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.7
H2O_cluster_version_age:,2 months and 22 days
H2O_cluster_name:,H2O_from_python_tanji_luj4gp
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.925 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [35]:
model_path = "models/EffortEstimationModelv2/GBM_grid__1_AutoML_20211125_154317_model_2"
saved_model = h2o.load_model(model_path)

In [36]:
project_refactoring_data_agg.drop(['placeholder_key'],axis=1, inplace=True)

In [37]:
project_refactoring_data_agg

Unnamed: 0,cbo_mean,cbo_min,cbo_max,cbo_std,wmc_mean,wmc_min,wmc_max,wmc_std,dit_mean,dit_min,...,uniqueWordsQty_mean,uniqueWordsQty_min,uniqueWordsQty_max,uniqueWordsQty_std,modifiers_mean,modifiers_min,modifiers_max,modifiers_std,num_dependency_mean,num_line_affected_mean
0,8.162231,0,63,7.323319,7.046895,0,118,10.127315,1.589354,1,...,44.91635,0,318,38.813338,44.100127,0,1025,205.074293,13.176172,35347.087452


In [38]:
prediction_df = h2o.H2OFrame(project_refactoring_data_agg)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [39]:
preds = saved_model.predict(prediction_df)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [40]:
preds

predict
3.36883




In [41]:
refactor_arr

['Move AuthFilter.java to 0',
 'Move AuthenticationException.java to 0',
 'Move Authenticator.java to 0',
 'Move AuthorizationContext.java to 0',
 'Move Authorizer.java to 0',
 'Move CachingAuthenticator.java to 0',
 'Move CachingAuthorizer.java to 0',
 'Move JSONUnauthorizedHandler.java to 0',
 'Move PermitAllAuthorizer.java to 0',
 'Move PrincipalImpl.java to 0',
 'Move UnauthorizedHandler.java to 0',
 'Move BasicCredentialAuthFilter.java to 0',
 'Move BasicCredentials.java to 0',
 'Move ChainedAuthFilter.java to 0',
 'Move OAuthCredentialAuthFilter.java to 0',
 'Move AbstractAuthResourceConfig.java to 0',
 'Move AuthBaseTest.java to 0',
 'Move AuthFilterTest.java to 0',
 'Move CachingAuthenticatorTest.java to 0',
 'Move CachingAuthorizerTest.java to 0',
 'Move BasicAuthProviderTest.java to 0',
 'Move BasicCredentialsTest.java to 0',
 'Move BasicCustomAuthProviderTest.java to 0',
 'Move ChainedAuthProviderTest.java to 0',
 'Move OAuthCustomProviderTest.java to 0',
 'Move OAuthProvide

In [None]:
final_df = prediction_df
final_df['time_taken_final'] = preds[0]
exm = saved_model.explain_row(prediction_df, row_index=0)

In [42]:
saved_model

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_grid__1_AutoML_20211125_154317_model_2


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,32.0,32.0,5708.0,4.0,7.0,5.09375,8.0,11.0,9.5625




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 405.3148218029351
RMSE: 20.132432088620966
MAE: 12.804343096794575
RMSLE: 1.0783716149070952
Mean Residual Deviance: 405.3148218029351

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 774.7178920351772
RMSE: 27.833754544350953
MAE: 17.73376221538167
RMSLE: 1.351159632768024
Mean Residual Deviance: 774.7178920351772

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,17.733246,2.62976,15.905257,20.52345,20.161343,14.473959,17.602222
1,mean_residual_deviance,774.1871,231.38574,676.3586,1001.3473,1036.5414,517.3253,639.3629
2,mse,774.1871,231.38574,676.3586,1001.3473,1036.5414,517.3253,639.3629
3,r2,0.0642258,0.08141673,0.18830448,0.03161784,-0.03426504,0.054874003,0.080597706
4,residual_deviance,774.1871,231.38574,676.3586,1001.3473,1036.5414,517.3253,639.3629
5,rmse,27.575348,4.1513925,26.006895,31.644072,32.195362,22.744787,25.285625
6,rmsle,1.3447012,0.15235972,1.1728806,1.5456296,1.2907313,1.2577902,1.4564738



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2021-11-25 15:43:30,0.452 sec,0.0,28.752018,18.874668,826.678565
1,,2021-11-25 15:43:30,0.465 sec,5.0,26.585931,17.322934,706.811735
2,,2021-11-25 15:43:30,0.476 sec,10.0,24.841521,15.936852,617.101188
3,,2021-11-25 15:43:30,0.488 sec,15.0,23.614126,14.970778,557.62695
4,,2021-11-25 15:43:30,0.500 sec,20.0,22.418451,14.205937,502.586924
5,,2021-11-25 15:43:30,0.511 sec,25.0,21.485113,13.598172,461.610088
6,,2021-11-25 15:43:30,0.523 sec,30.0,20.407646,13.01539,416.472033
7,,2021-11-25 15:43:30,0.529 sec,32.0,20.132432,12.804343,405.314822



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,totalFields_mean,36759.464844,1.0,0.061806
1,wmc_mean,33209.566406,0.903429,0.055837
2,maxNestedBlocks_max,33071.519531,0.899674,0.055605
3,uniqueWordsQty_mean,24833.539062,0.675569,0.041754
4,rfc_mean,23301.345703,0.633887,0.039178
5,returnQty_mean,23211.544922,0.631444,0.039027
6,totalMethods_std,20295.273438,0.55211,0.034124
7,totalMethods_mean,20284.013672,0.551804,0.034105
8,variablesQty_mean,17092.384766,0.464979,0.028739
9,num_dependency_mean,16773.087891,0.456293,0.028202



See the whole table with table.as_data_frame()




In [138]:
test_df = pd.read_csv('test_effort_model.csv')

In [139]:
test_df.head()

Unnamed: 0,sha,name,email,date,login,message,parent_sha,parent_date,time_taken,contains_refactoring,...,num_dependency_std,num_line_affected_mean,num_line_affected_min,num_line_affected_max,num_line_affected_std,actual_num_of_classes_touched_mean,actual_num_of_classes_touched_min,actual_num_of_classes_touched_max,actual_num_of_classes_touched_std,time_taken_final
0,6ad1e4fe77445e8689f6d3975b26e52165c9c3e6,Bin Fan,fanbin103@gmail.com,2021-08-04 02:49:16,apc999,Fix IndexOutOfBoundsException on async cache\n\n### What changes a...,df5dcab8bc308dfd2bf650a895865b13120a9866,2021-08-03 16:07:28,10.0,1,...,0.0,764.0,764,764,0.0,9.0,9,9,0.0,10.0
1,8647c6162423b851dda8d10edf4686473d2e95cc,Zac Blanco,zac@alluxio.com,2021-07-15 21:54:27,ZacBlanco,Update and improve conformance of S3 API\n\n### What changes are p...,79a5e5c78b7dfcdbf8edbd928a2ff59c904d08f8,2021-07-15 18:38:50,3.0,1,...,0.0,275.0,275,275,0.0,6.0,6,6,0.0,3.0
2,3ce52983e6f50bfb7880b5a2cb13a18e4272170b,Zhan Yuan,yuanzhanhku@gmail.com,2021-07-12 06:39:42,yuanzhanhku,Add CacheContext to URIStatus to enable per-read metrics\n\nAdd fu...,cf79c7837c57c83c1b99a1dab53fec25deb2069d,2021-07-10 20:58:13,33.0,1,...,0.0,115.0,115,115,0.0,4.5,3,6,2.12132,33.0
3,a4dc54f7dc0333da096aa779dbaa79060c90d1ad,kqhzz,kuangqinghuazz@163.com,2021-06-09 18:09:26,kuszz,Deprecate leader command\n\nFix #13512\n\npr-link: Alluxio/alluxio...,1356c0b35fd753d1081665b2c8f6b25da62bf2e8,2021-06-09 02:31:45,15.0,1,...,0.0,61.0,61,61,0.0,2.0,2,2,0.0,15.0
4,7fb84094a6075bcef5a103b40adcac4b26b724ca,Jiacheng Liu,jiacheliu3@gmail.com,2021-05-29 00:37:05,jiacheliu3,Refactor MasterWorkerInfo\n\nCurrently all the worker metadata are...,220237085593d731756e24249bcf88a4d6ea5710,2021-05-28 23:28:46,1.0,1,...,93.543216,458.666667,24,1245,682.248,35.0,1,54,29.512709,1.0


In [140]:
test_df.drop(columns=['sha',
'name','email','login',
'date','message',
'parent_sha','parent_date',
'time_taken','contains_refactoring',
'kmean_label','mean',
'min','max',
'project_name','commit_compared_with','actual_num_of_classes_touched_mean',
 'actual_num_of_classes_touched_min',
 'actual_num_of_classes_touched_max',
 'actual_num_of_classes_touched_std','time_taken_final']
             ,inplace=True)

In [136]:
test_df

Unnamed: 0,cbo_mean,cbo_min,cbo_max,cbo_std,wmc_mean,wmc_min,wmc_max,wmc_std,dit_mean,dit_min,...,modifiers_std,num_dependency_mean,num_dependency_min,num_dependency_max,num_dependency_std,num_line_affected_mean,num_line_affected_min,num_line_affected_max,num_line_affected_std,time_taken_final
0,38.000000,38.0,38.0,0.000000,57.00,57.0,57.0,0.000000,1.000000,1.0,...,0.000000,322.000000,322.0,322.0,0.000000,764.000000,764,764,0.000000,10.000000
1,12.000000,12.0,12.0,0.000000,17.00,17.0,17.0,0.000000,1.000000,1.0,...,0.000000,15.000000,15.0,15.0,0.000000,275.000000,275,275,0.000000,3.000000
2,4.000000,4.0,4.0,0.000000,15.00,15.0,15.0,0.000000,1.000000,1.0,...,0.000000,371.000000,371.0,371.0,0.000000,115.000000,115,115,0.000000,33.000000
3,11.000000,11.0,11.0,0.000000,8.00,8.0,8.0,0.000000,2.000000,2.0,...,0.000000,85.000000,85.0,85.0,0.000000,61.000000,61,61,0.000000,15.000000
4,27.666667,3.0,76.0,41.860881,50.00,1.0,140.0,78.044859,1.666667,1.0,...,0.000000,69.333333,8.0,177.0,93.543216,458.666667,24,1245,682.248000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,77.000000,77.0,77.0,0.000000,260.00,260.0,260.0,0.000000,4.000000,4.0,...,0.000000,11.000000,11.0,11.0,0.000000,3932.000000,3932,3932,0.000000,123.541667
362,11.250000,7.0,22.0,6.692213,24.00,4.0,54.0,22.309511,1.500000,1.0,...,8.552360,105.000000,4.0,321.0,137.772068,217.500000,67,566,217.340418,1.000000
363,8.500000,2.0,17.0,5.680376,13.25,7.0,19.0,4.753946,1.250000,1.0,...,0.447214,53.000000,5.0,143.0,58.052849,117.500000,31,198,61.085732,1.000000
364,7.000000,7.0,7.0,0.000000,2.00,2.0,2.0,0.000000,1.000000,1.0,...,0.000000,42.000000,42.0,42.0,0.000000,18.000000,18,18,0.000000,1.000000


In [150]:
test_df = test_df.head(1)

In [151]:
prediction_df = h2o.H2OFrame(test_df)
preds = saved_model.predict(prediction_df)

Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


In [152]:
preds

predict
12.5995




In [144]:
len(preds)

366

In [145]:
for element in preds:
    print(element)

predict
12.5995
10.325
15.769
23.9057
9.08408
17.9274
5.38745
11.6393
10.4748
12.5995



