In [1]:
import json
import pandas as pd
import numpy as np
import networkx as nx
import jellyfish
import os
import shutil
import subprocess
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering

In [2]:
def read_dataframe_from_file(filename, filesubstr):
    
    final_results = []
    initial_line = 2 - 1
    initial_line_result = 5 - 1
    
    num_class_f = open('Final_Results/' + filesubstr + '/num_class.txt')
    for line in num_class_f:
        num_class = line
        
    #print(num_class)

    f = open(filename, 'r')
    tmp_arr = []
    for i, line in enumerate(f):
        if i == initial_line:
            line = line.replace('MoJo', '')
            line = line.replace('.rsf', '')
            line = line.replace('(', '')
            line = line.replace(')', '')
            line = line.split('_')
            name = line[0].split('/')[0]
            version = line[0].split('-')[-1]
            n_cluster = line[1]
            affinity = line[2]
            linkage = line[3]
            #print(line)
            #print('Version', version)
            initial_line += 5
            tmp_arr.append(name)
            tmp_arr.append(version)
            tmp_arr.append(n_cluster)
            tmp_arr.append(affinity)
            tmp_arr.append(linkage)
        elif i == initial_line_result:
            n_MoJo = line.split(' ')[-1].split('\n')[0]
            
            #print(line)
            initial_line_result += 5
            tmp_arr.append(n_MoJo)
            tmp_arr.append((1 - (float(n_MoJo) / float(num_class))) * 100)
            final_results.append(tmp_arr)
            tmp_arr = []
            
    
    
    f.close()
    return final_results


def read_bunch_dataframe_from_file(filename, filesubstr):
    
    final_results = []
    initial_line = 2 - 1
    initial_line_result = 5 - 1
    
    num_class_f = open('Final_Results/' + filesubstr + '/num_class.txt')
    for line in num_class_f:
        num_class = line
        
    #print(num_class)

    f = open(filename, 'r')
    tmp_arr = []
    for i, line in enumerate(f):
        try:
            if i == initial_line:
                line = line.replace('MoJo', '')
                line = line.replace('.rsf', '')
                line = line.replace('(', '')
                line = line.replace(')', '')
                line = line.split('_')
                name = line[0].split('/')[0]
                version = line[0].split('-')[-1]
                bunch_algo = line[1]
                #print(line)
                #print('Version', version)
                initial_line += 5
                tmp_arr.append(name)
                tmp_arr.append(version)
                tmp_arr.append(bunch_algo)
            elif i == initial_line_result:
                n_MoJo = line.split(' ')[-1].split('\n')[0]

                #print(line)
                initial_line_result += 5
                tmp_arr.append(n_MoJo)
                tmp_arr.append((1 - (float(n_MoJo) / float(num_class))) * 100)
                final_results.append(tmp_arr)
                tmp_arr = []
        except:
            pass
    
    
    f.close()
    return final_results

In [3]:
# Getting the smallest MoJo Value
rootdir = 'Final_Results/'
project_names = []

f = open('project_list.txt', 'r')
for line in f:
    print(line)
    project_names.append(line.split('\n')[0])

#project_names = ['apache-maven', 'apache-storm']
dir_arr = []

#Main clustering arr to be transformed to pandas dataframe later
cluster_main_arr = []

#Main CK arr to be transformed to pandas dataframe later
ck_sum_arr = []
ck_max_arr = []
ck_std_arr = []
ck_mean_arr = []

# Only considering the below ck_metrics
ck_metrics = ['cbo', 'wmc', 'dit', 'rfc', 'lcom',
       'totalMethods', 'staticMethods', 'publicMethods', 'privateMethods',
       'protectedMethods', 'defaultMethods', 'abstractMethods', 'finalMethods',
       'synchronizedMethods', 'totalFields', 'staticFields', 'publicFields',
       'privateFields', 'protectedFields', 'defaultFields', 'finalFields',
       'synchronizedFields', 'nosi', 'loc', 'returnQty', 'loopQty',
       'comparisonsQty', 'tryCatchQty', 'parenthesizedExpsQty',
       'stringLiteralsQty', 'numbersQty', 'assignmentsQty',
       'mathOperationsQty', 'variablesQty', 'maxNestedBlocks',
       'anonymousClassesQty', 'subClassesQty', 'lambdasQty', 'uniqueWordsQty',
       'modifiers']

depth = 3
for project_name in project_names:
    rootdir += project_name
    #print(rootdir)
    for root, dirs, files in os.walk(rootdir):
        for file in files:
            
            dir_arr.append(rootdir + '/' + str(file))
            
    for element in dir_arr:
        ver = str(element).replace('_class.csv', '')
        ver = ver.split('-')[-1]
        
        if '_class.csv' in element:
            
            df_sum = pd.read_csv(element)
            current_element_sum = df_sum.sum()
            
            df_max = pd.read_csv(element)
            current_element_max = df_max.max()
            
            df_std = pd.read_csv(element)
            current_element_std = df_std.std()
            
            df_mean = pd.read_csv(element)
            current_element_mean = df_mean.mean()
            
            tmp_sum = []
            tmp_max = []
            tmp_std = []
            tmp_mean = []
            
            tmp_sum.append(project_name)
            tmp_sum.append(ver)
            
            tmp_max.append(project_name)
            tmp_max.append(ver)
            
            tmp_std.append(project_name)
            tmp_std.append(ver)
            
            tmp_mean.append(project_name)
            tmp_mean.append(ver)
            
            for element in ck_metrics:
                tmp_sum.append(current_element_sum[element])
                tmp_max.append(current_element_max[element])
                tmp_std.append(current_element_std[element])
                tmp_mean.append(current_element_mean[element])
        
            ck_sum_arr.append(tmp_sum)
            ck_max_arr.append(tmp_max)
            ck_std_arr.append(tmp_std)
            ck_mean_arr.append(tmp_mean)
        
            
        elif '_results.txt' in element:
            #print(project_name, element)
            spark_df = pd.DataFrame(read_dataframe_from_file(element, project_name))
            spark_df.columns = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo', 'mojoFM']
            #spark_df['mojoFM'] = (1 - spark_df['n_MoJo'].astype(int)/spark_count)*100
            #print(spark_df[spark_df.n_MoJo == spark_df.n_MoJo.min()].iloc[0])
            #print(spark_df.loc[spark_df.groupby('version').mojoFM.idxmax()].reset_index(drop=True))
            for element in spark_df.loc[spark_df.groupby('version').mojoFM.idxmax()].reset_index(drop=True).iterrows():
                tmp_arr2 = []
                tmp_arr2.append(element[1]['name'])
                tmp_arr2.append(element[1]['version'])
                tmp_arr2.append(element[1]['n_clusters'])
                tmp_arr2.append(element[1]['affinity'])
                tmp_arr2.append(element[1]['linkage'])
                tmp_arr2.append(element[1]['n_MoJo'])
                tmp_arr2.append(element[1]['mojoFM'])

                cluster_main_arr.append(tmp_arr2)
    #cluster_main_arr.append(spark_df.loc[spark_df.groupby('version').mojoFM.idxmax()].reset_index(drop=True))
            #cluster_main_arr.append(spark_df[spark_df.n_MoJo == spark_df.n_MoJo.min()].iloc[0])
    
    rootdir = 'Final_Results/'
    #dir_arr = []
            

apache-storm

apache-cassandra

airbnb-lottie-android

apache-isis

apache-jmeter

apache-log4j

apache-maven

apache-spark

apache-tomcat

rzwitserloot-lombok

apache-tika

alibaba-fastjson

activiti-activiti

bumptech-glide

codecentric-spring-boot-admin

dropwizard-dropwizard

dropwizard-metrics

facebook-facebook-android-sdk

google-dagger

google-error-prone

grpc-grpc-java

java-native-accessjna

jenkinsci-jenkins

jhy-jsoup

mockito-mockito

mybatis-mybatis-3

naver-pinpoint

pxb1988-dex2jar

ReactiveX-RxJava

redisson-redisson

swagger-api-swagger-core



### Getting Bunch Result

In [33]:
# Getting the smallest MoJo Value
rootdir = 'Final_Results_Bunch/'
project_names = []
dir_arr = []

f = open('project_list.txt', 'r')
for line in f:
    #print(line)
    project_names.append(line.split('\n')[0])

f.close()
cluster_main_arr_bunch = []

depth = 3
for project_name in project_names:
    rootdir += project_name
    #print(rootdir)
    for root, dirs, files in os.walk(rootdir):
        for file in files:
            
            dir_arr.append(rootdir + '/' + str(file))
    rootdir = 'Final_Results_Bunch/'
    
for element in dir_arr:

    #print(element)
    project_name = element.split('/')[1]
    if '_results_bunch.txt' in element:
        print(project_name, element)

        spark_df = pd.DataFrame(read_bunch_dataframe_from_file(element, project_name))
        spark_df.columns = ['name', 'version', 'bunch_algo', 'n_MoJo', 'mojoFM']
        #spark_df['mojoFM'] = (1 - spark_df['n_MoJo'].astype(int)/spark_count)*100
        #print(spark_df[spark_df.n_MoJo == spark_df.n_MoJo.min()].iloc[0])
        #print(spark_df.loc[spark_df.groupby('version').mojoFM.idxmax()].reset_index(drop=True))

        try:
            for element in spark_df.loc[spark_df.groupby(['name','version']).mojoFM.idxmax()].reset_index(drop=True).iterrows():
                tmp_arr2 = []
                tmp_arr2.append(element[1]['name'])
                tmp_arr2.append(element[1]['version'])
                tmp_arr2.append(element[1]['bunch_algo'])
                tmp_arr2.append(element[1]['n_MoJo'])
                tmp_arr2.append(element[1]['mojoFM'])

                cluster_main_arr_bunch.append(tmp_arr2)
        except:
            print(element)
    #cluster_main_arr.append(spark_df.loc[spark_df.groupby('version').mojoFM.idxmax()].reset_index(drop=True))
            #cluster_main_arr.append(spark_df[spark_df.n_MoJo == spark_df.n_MoJo.min()].iloc[0])
    
    
    #dir_arr = []



apache-storm Final_Results_Bunch/apache-storm/apache-storm_results_bunch.txt
apache-cassandra Final_Results_Bunch/apache-cassandra/apache-cassandra_results_bunch.txt
airbnb-lottie-android Final_Results_Bunch/airbnb-lottie-android/airbnb-lottie-android_results_bunch.txt
apache-isis Final_Results_Bunch/apache-isis/apache-isis_results_bunch.txt
apache-jmeter Final_Results_Bunch/apache-jmeter/apache-jmeter_results_bunch.txt
apache-log4j Final_Results_Bunch/apache-log4j/apache-log4j_results_bunch.txt
apache-maven Final_Results_Bunch/apache-maven/apache-maven_results_bunch.txt
apache-spark Final_Results_Bunch/apache-spark/apache-spark_results_bunch.txt
apache-tomcat Final_Results_Bunch/apache-tomcat/apache-tomcat_results_bunch.txt
rzwitserloot-lombok Final_Results_Bunch/rzwitserloot-lombok/rzwitserloot-lombok_results_bunch.txt
apache-tika Final_Results_Bunch/apache-tika/apache-tika_results_bunch.txt
alibaba-fastjson Final_Results_Bunch/alibaba-fastjson/alibaba-fastjson_results_bunch.txt
acti

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


 Final_Results_Bunch/hibernate-hibernate-orm/hibernate-hibernate-orm_results_bunch.txt
springfox-springfox Final_Results_Bunch/springfox-springfox/springfox-springfox_results_bunch.txt
spring-projects-spring-security Final_Results_Bunch/spring-projects-spring-security/spring-projects-spring-security_results_bunch.txt
jankotek-mapdb Final_Results_Bunch/jankotek-mapdb/jankotek-mapdb_results_bunch.txt
bytedeco-javacpp Final_Results_Bunch/bytedeco-javacpp/bytedeco-javacpp_results_bunch.txt
testcontainers-testcontainers-java Final_Results_Bunch/testcontainers-testcontainers-java/testcontainers-testcontainers-java_results_bunch.txt
sofastack-sofa-rpc Final_Results_Bunch/sofastack-sofa-rpc/sofastack-sofa-rpc_results_bunch.txt
knowm-XChange Final_Results_Bunch/knowm-XChange/knowm-XChange_results_bunch.txt
hazelcast-hazelcast Final_Results_Bunch/hazelcast-hazelcast/hazelcast-hazelcast_results_bunch.txt
cryptomator-cryptomator Final_Results_Bunch/cryptomator-cryptomator/cryptomator-cryptomator_r

In [35]:
#print(cluster_main_arr_bunch)
final_clustering_result_bunch = pd.DataFrame(cluster_main_arr_bunch)
final_clustering_result_bunch.columns = ['name', 'version', 'bunch_algo', 'n_MoJo', 'mojoFM']
#final_clustering_result_bunch = final_clustering_result_bunch.dropna()
final_clustering_result_bunch.head(30)

Unnamed: 0,name,version,bunch_algo,n_MoJo,mojoFM
0,apache-storm,1.1.0,ga,1632,20.583942
1,apache-storm,1.1.1,hillclimbing,1693,17.615572
2,apache-storm,1.1.2,hillclimbing,1224,40.437956
3,apache-storm,1.1.3,ga,1290,37.226277
4,apache-storm,1.1.4,ga,1495,27.250608
5,apache-storm,1.2.0,hillclimbing,1595,22.384428
6,apache-storm,1.2.1,hillclimbing,847,58.783455
7,apache-storm,1.2.2,exhaustive,1518,26.131387
8,apache-storm,1.2.3,hillclimbing,1195,41.849148
9,apache-storm,2.0.0,ga,2219,-7.980535


In [36]:
print(len(final_clustering_result_bunch['name'].unique()))
print(final_clustering_result_bunch['name'].unique())

63
['apache-storm' 'apache-cassandra' 'airbnb-lottie-android' 'apache-isis'
 'apache-jmeter' 'apache-log4j' 'apache-maven' 'apache-spark'
 'apache-tomcat' nan 'rzwitserloot-lombok' 'apache-tika'
 'alibaba-fastjson' 'activiti-activiti' 'bumptech-glide'
 'codecentric-spring-boot-admin' 'dropwizard-dropwizard'
 'dropwizard-metrics' 'facebook-facebook-android-sdk' 'google-dagger'
 'google-error-prone' 'grpc-grpc-java' 'java-native-accessjna'
 'jenkinsci-jenkins' 'jhy-jsoup' 'mockito-mockito' 'mybatis-mybatis-3'
 'naver-pinpoint' 'pxb1988-dex2jar' 'ReactiveX-RxJava' 'redisson-redisson'
 'swagger-api-swagger-core' 'hibernate-hibernate-orm'
 'springfox-springfox' 'spring-projects-spring-security' 'jankotek-mapdb'
 'bytedeco-javacpp' 'testcontainers-testcontainers-java'
 'sofastack-sofa-rpc' 'knowm-XChange' 'hazelcast-hazelcast'
 'cryptomator-cryptomator' 'btraceio-btrace' 'auth0-java-jwt' 'oblac-jodd'
 'javaparser-javaparser' 'iSoron-uhabits' 'oracle-opengrok'
 'lettuce-io-lettuce-core' 'quer

In [37]:
final_clustering_result_bunch.to_csv('bunch_clustering_result_63_ver.csv', index=False)

### For clustering

In [42]:
final_clustering_result = pd.DataFrame(cluster_main_arr)
final_clustering_result.columns = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo', 'mojoFM']

NameError: name 'cluster_main_arr' is not defined

In [32]:
final_clustering_result.head()

Unnamed: 0,name,version,n_clusters,affinity,linkage,n_MoJo,mojoFM
0,apache-storm,1.1.0,100,l2,single,1568,23.698297
1,apache-storm,1.1.1,50,euclidean,single,1574,23.406326
2,apache-storm,1.1.2,50,cosine,single,508,75.279805
3,apache-storm,1.1.3,50,cosine,single,511,75.13382
4,apache-storm,1.1.4,50,cosine,single,505,75.425791


In [33]:
print(len(final_clustering_result['name'].unique()))
final_clustering_result['name'].unique()


31


array(['apache-storm', 'apache-cassandra', 'airbnb-lottie-android',
       'apache-isis', 'apache-jmeter', 'apache-log4j', 'apache-maven',
       'apache-spark', 'apache-tomcat', 'rzwitserloot-lombok',
       'apache-tika', 'alibaba-fastjson', 'activiti-activiti',
       'bumptech-glide', 'codecentric-spring-boot-admin',
       'dropwizard-dropwizard', 'dropwizard-metrics',
       'facebook-facebook-android-sdk', 'google-dagger',
       'google-error-prone', 'grpc-grpc-java', 'java-native-accessjna',
       'jenkinsci-jenkins', 'jhy-jsoup', 'mockito-mockito',
       'mybatis-mybatis-3', 'naver-pinpoint', 'pxb1988-dex2jar',
       'ReactiveX-RxJava', 'redisson-redisson',
       'swagger-api-swagger-core'], dtype=object)

### For producing temp output files

Temp Output files
1. clustering_result_10_ver
2. ck_sum
3. ck_max
4. ck_std
5. ck_mean

In [34]:
final_clustering_result.to_csv('clustering_result_30_ver.csv', index=False)

### For CK Metrics

In [35]:
# For SUM CK Metric

ck_sum_df = pd.DataFrame(ck_sum_arr)
final_ck_columns = ['project_name', 'version']
for element in ck_metrics:
    final_ck_columns.append(element)
ck_sum_df.columns = final_ck_columns
ck_sum_df.head()

ck_sum_df.to_csv('ck_sum_30_ver.csv', index=False)

In [36]:
# For Max CK Metric

ck_max_df = pd.DataFrame(ck_max_arr)
final_ck_columns = ['project_name', 'version']
for element in ck_metrics:
    final_ck_columns.append(element)
ck_max_df.columns = final_ck_columns
ck_max_df.head()

ck_max_df.to_csv('ck_max_30_ver.csv', index=False)

In [37]:
# For SUM CK Metric

ck_std_df = pd.DataFrame(ck_std_arr)
final_ck_columns = ['project_name', 'version']
for element in ck_metrics:
    final_ck_columns.append(element)
ck_std_df.columns = final_ck_columns
ck_std_df.head()

ck_std_df.to_csv('ck_std_30_ver.csv', index=False)

In [38]:
# For SUM CK Metric

ck_mean_df = pd.DataFrame(ck_mean_arr)
final_ck_columns = ['project_name', 'version']
for element in ck_metrics:
    final_ck_columns.append(element)
ck_mean_df.columns = final_ck_columns
ck_mean_df.head()

ck_mean_df.to_csv('ck_mean_30_ver.csv', index=False)

In [41]:
final_clustering_result.head()

NameError: name 'final_clustering_result' is not defined

### To convert to Dr Aldeida's format

In [39]:
final_clustering_result['Instance'] = final_clustering_result['name'] + '_' + final_clustering_result['version']
final_clustering_result['n_clusters_str'] = final_clustering_result['n_clusters'].astype(str)


In [40]:
ck_sum_df['Instance'] = ck_sum_df['project_name'] + '_' + ck_sum_df['version']
ck_max_df['Instance'] = ck_max_df['project_name'] + '_' + ck_max_df['version']

#ck_sum_df.head()

temp1 = ck_sum_df.merge(ck_max_df, on='Instance', suffixes=('_sum', '_max') )

ck_std_df['Instance'] = ck_std_df['project_name'] + '_' + ck_std_df['version']
ck_mean_df['Instance'] = ck_mean_df['project_name'] + '_' + ck_mean_df['version']

#ck_std_df.head()

temp2 = ck_sum_df.merge(ck_max_df, on='Instance', suffixes=('_std', '_mean') )

temp1 = temp1.merge(temp2, on='Instance')
final_clustering_result = final_clustering_result.merge(temp1, on='Instance')

In [41]:
final_clustering_result['Label'] = final_clustering_result['n_clusters_str'] + '_' + final_clustering_result['affinity'] + '_' +final_clustering_result['linkage'] 

In [42]:
columns_to_avoid = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo', 'mojoFM']
total_cancer = []
for element in final_clustering_result.columns:
    #print(element)
    for cancer in columns_to_avoid:
        print(cancer)
        if cancer in element:
            total_cancer.append(element)
            
total_cancer

name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
version
n_clusters
affinity
linkage
n_MoJo
mojoFM
name
versi

['name',
 'version',
 'n_clusters',
 'affinity',
 'linkage',
 'n_MoJo',
 'mojoFM',
 'n_clusters_str',
 'project_name_sum',
 'version_sum',
 'project_name_max',
 'version_max',
 'project_name_std',
 'version_std',
 'project_name_mean',
 'version_mean']

In [43]:
final_clustering_result = final_clustering_result.drop(columns=total_cancer)

In [44]:
final_clustering_result.head()

Unnamed: 0,Instance,cbo_sum,wmc_sum,dit_sum,rfc_sum,lcom_sum,totalMethods_sum,staticMethods_sum,publicMethods_sum,privateMethods_sum,...,assignmentsQty_mean,mathOperationsQty_mean,variablesQty_mean,maxNestedBlocks_mean,anonymousClassesQty_mean,subClassesQty_mean,lambdasQty_mean,uniqueWordsQty_mean,modifiers_mean,Label
0,apache-storm_1.1.1,13675,27066,3097,23667,117111,13332,1528,10868,1235,...,392,57,333,7,45,593,0,1178,1025,50_euclidean_single
1,apache-storm_1.1.2,13930,27294,3120,24140,117459,13488,1530,10990,1248,...,392,57,333,7,45,593,0,1187,1025,50_cosine_single
2,apache-storm_1.1.3,13974,27385,3128,24221,118054,13522,1549,11006,1263,...,392,53,333,7,45,593,0,1191,1025,50_cosine_single
3,apache-storm_1.1.4,13975,27391,3128,24209,118196,13524,1551,11010,1262,...,392,53,333,7,45,593,0,1191,1025,50_cosine_single
4,apache-storm_1.2.0,14345,28032,3194,24958,119489,13964,1589,11303,1340,...,392,57,333,7,45,593,0,1209,1025,50_cosine_single


In [45]:
for element in final_clustering_result.columns:
    print(element)

Instance
cbo_sum
wmc_sum
dit_sum
rfc_sum
lcom_sum
totalMethods_sum
staticMethods_sum
publicMethods_sum
privateMethods_sum
protectedMethods_sum
defaultMethods_sum
abstractMethods_sum
finalMethods_sum
synchronizedMethods_sum
totalFields_sum
staticFields_sum
publicFields_sum
privateFields_sum
protectedFields_sum
defaultFields_sum
finalFields_sum
synchronizedFields_sum
nosi_sum
loc_sum
returnQty_sum
loopQty_sum
comparisonsQty_sum
tryCatchQty_sum
parenthesizedExpsQty_sum
stringLiteralsQty_sum
numbersQty_sum
assignmentsQty_sum
mathOperationsQty_sum
variablesQty_sum
maxNestedBlocks_sum
anonymousClassesQty_sum
subClassesQty_sum
lambdasQty_sum
uniqueWordsQty_sum
modifiers_sum
cbo_max
wmc_max
dit_max
rfc_max
lcom_max
totalMethods_max
staticMethods_max
publicMethods_max
privateMethods_max
protectedMethods_max
defaultMethods_max
abstractMethods_max
finalMethods_max
synchronizedMethods_max
totalFields_max
staticFields_max
publicFields_max
privateFields_max
protectedFields_max
defaultFields_max
fina

In [46]:
final_clustering_result.to_csv('remodularization_input_30.csv', index=False)

In [47]:
test1 = final_clustering_result.copy()
test1.head()

Unnamed: 0,Instance,cbo_sum,wmc_sum,dit_sum,rfc_sum,lcom_sum,totalMethods_sum,staticMethods_sum,publicMethods_sum,privateMethods_sum,...,assignmentsQty_mean,mathOperationsQty_mean,variablesQty_mean,maxNestedBlocks_mean,anonymousClassesQty_mean,subClassesQty_mean,lambdasQty_mean,uniqueWordsQty_mean,modifiers_mean,Label
0,apache-storm_1.1.1,13675,27066,3097,23667,117111,13332,1528,10868,1235,...,392,57,333,7,45,593,0,1178,1025,50_euclidean_single
1,apache-storm_1.1.2,13930,27294,3120,24140,117459,13488,1530,10990,1248,...,392,57,333,7,45,593,0,1187,1025,50_cosine_single
2,apache-storm_1.1.3,13974,27385,3128,24221,118054,13522,1549,11006,1263,...,392,53,333,7,45,593,0,1191,1025,50_cosine_single
3,apache-storm_1.1.4,13975,27391,3128,24209,118196,13524,1551,11010,1262,...,392,53,333,7,45,593,0,1191,1025,50_cosine_single
4,apache-storm_1.2.0,14345,28032,3194,24958,119489,13964,1589,11303,1340,...,392,57,333,7,45,593,0,1209,1025,50_cosine_single


In [48]:
test1 = test1.drop(columns=['Label'])

In [49]:
test1.head()

Unnamed: 0,Instance,cbo_sum,wmc_sum,dit_sum,rfc_sum,lcom_sum,totalMethods_sum,staticMethods_sum,publicMethods_sum,privateMethods_sum,...,numbersQty_mean,assignmentsQty_mean,mathOperationsQty_mean,variablesQty_mean,maxNestedBlocks_mean,anonymousClassesQty_mean,subClassesQty_mean,lambdasQty_mean,uniqueWordsQty_mean,modifiers_mean
0,apache-storm_1.1.1,13675,27066,3097,23667,117111,13332,1528,10868,1235,...,613,392,57,333,7,45,593,0,1178,1025
1,apache-storm_1.1.2,13930,27294,3120,24140,117459,13488,1530,10990,1248,...,613,392,57,333,7,45,593,0,1187,1025
2,apache-storm_1.1.3,13974,27385,3128,24221,118054,13522,1549,11006,1263,...,613,392,53,333,7,45,593,0,1191,1025
3,apache-storm_1.1.4,13975,27391,3128,24209,118196,13524,1551,11010,1262,...,613,392,53,333,7,45,593,0,1191,1025
4,apache-storm_1.2.0,14345,28032,3194,24958,119489,13964,1589,11303,1340,...,613,392,57,333,7,45,593,0,1209,1025


In [6]:
test_bunch = pd.DataFrame(final_results)
test_bunch.head()

Unnamed: 0,0,1,2,3,4,5,6
0,activiti-activiti,7.1.74,hillclimbing,"a,activiti-activiti/activiti-activiti-7.1.74",hillclimbing,1954,52.503646
1,activiti-activiti,7.1.74,ga,"a,activiti-activiti/activiti-activiti-7.1.74",ga,1704,58.580457
2,activiti-activiti,7.1.74,exhaustive,"a,activiti-activiti/activiti-activiti-7.1.74",exhaustive,2025,50.777832
3,activiti-activiti,7.1.75,hillclimbing,"a,activiti-activiti/activiti-activiti-7.1.75",hillclimbing,1987,51.701507
4,activiti-activiti,7.1.75,ga,"a,activiti-activiti/activiti-activiti-7.1.75",ga,1877,54.375304
