In [4]:
import json
import pandas as pd
import numpy as np
import networkx as nx
import jellyfish
import os
import shutil
import subprocess
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering



In [24]:
def read_dataframe_from_file(filename, filesubstr):
    """Parse the 'MoJo' lines of a results file into rows of strings.

    Lines that do not contain the substring 'MoJo' are skipped. For every
    other line the tokens 'MoJo', 'test', '(' and ')' are removed, '=' is
    turned into ',', the newline is dropped and the line is split on ','.
    The first comma-separated field is then split on '_' and on '/' to pull
    out the individual components, so it is expected to look like
    ``<a>_<b>/<filesubstr><version>_<x>_<y>_<z>...``.

    Args:
        filename: Path of the results file to read.
        filesubstr: Text removed from the version token (e.g. 'spark-').

    Returns:
        A list with one 6-element list of strings per 'MoJo' line:
        ``[text before the first '/', version, x, y, z, last field]``.
        The calling cells in this notebook label these columns
        name, version, n_clusters, affinity, linkage and n_MoJo.

    Raises:
        IndexError: if a 'MoJo' line does not have enough '_' or '/'
            separated parts.
        OSError: if the file cannot be opened.
    """
    final_results = []
    # The context manager closes the file even when a malformed line raises.
    with open(filename, 'r') as results_file:
        for line in results_file:
            if 'MoJo' not in line:
                continue
            # Same replacement order as before: strip tokens, then '=' -> ','.
            for token in ('MoJo', 'test', '(', ')'):
                line = line.replace(token, '')
            fields = line.replace('=', ',').replace('\n', '').split(',')

            descriptor = fields[0]
            details = descriptor.split('_')
            # Kept in its own local so the `filename` parameter is not clobbered.
            project_name = descriptor.split('/')[0]
            version = details[1].split('/')[1].replace(filesubstr, '')

            final_results.append(
                [project_name, version, details[2], details[3], details[4], fields[-1]]
            )
    return final_results

In [25]:
# Per-project totals used as the denominator when the cells below compute
# mojoFM = (1 - n_MoJo / <project>_count) * 100.
# Presumably the number of clustered entities per project -- TODO confirm.
cassandra_count = 2309
jmeter_count = 1296
log4j_count = 324
maven_count = 1813
spark_count = 13864
storm_count = 2031
# NOTE(review): the tika output in this notebook shows n_MoJo values (e.g. 929)
# larger than this count, which yields a negative mojoFM -- verify the value.
tika_count = 834
tomcat_count = 3406

In [26]:
# Load the parsed Spark MoJo rows into a frame and label its six columns.
spark_df = pd.DataFrame(
    read_dataframe_from_file(
        'MoJo_1.2.1/apache_spark/apache_spark_results.txt',
        'spark-',
    )
)
spark_df.columns = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo']
# n_MoJo arrives as text, so cast to int before deriving the percentage:
# mojoFM = (1 - n_MoJo / spark_count) * 100
spark_df['mojoFM'] = spark_df['n_MoJo'].astype(int).div(spark_count).rsub(1).mul(100)
spark_df.head()

Unnamed: 0,name,version,n_clusters,affinity,linkage,n_MoJo,mojoFM
0,apache_spark,2.1.3,50,euclidean,complete,597,95.693883
1,apache_spark,2.1.3,50,euclidean,complete,462,96.667628
2,apache_spark,2.1.3,50,euclidean,average,595,95.708309
3,apache_spark,2.1.3,50,euclidean,average,472,96.595499
4,apache_spark,2.1.3,50,euclidean,single,584,95.787651


In [30]:
# Load the parsed log4j MoJo rows into a frame and label its six columns.
log4j_df = pd.DataFrame(
    read_dataframe_from_file(
        'MoJo_1.2.1/apache_log4j/apache_log4j_results.txt',
        'log4j-',
    )
)
log4j_df.columns = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo']
# n_MoJo arrives as text, so cast to int before deriving the percentage:
# mojoFM = (1 - n_MoJo / log4j_count) * 100
log4j_df['mojoFM'] = log4j_df['n_MoJo'].astype(int).div(log4j_count).rsub(1).mul(100)
log4j_df.head()

Unnamed: 0,name,version,n_clusters,affinity,linkage,n_MoJo,mojoFM
0,apache_log4j,1.2.13,20,euclidean,complete,255,21.296296
1,apache_log4j,1.2.13,20,euclidean,complete,224,30.864198
2,apache_log4j,1.2.13,20,euclidean,average,257,20.679012
3,apache_log4j,1.2.13,20,euclidean,average,224,30.864198
4,apache_log4j,1.2.13,20,euclidean,single,252,22.222222


In [32]:
# Load the parsed Tomcat MoJo rows into a frame and label its six columns.
tomcat_df = pd.DataFrame(
    read_dataframe_from_file(
        'MoJo_1.2.1/apache_tomcat/apache_tomcat_results.txt',
        'tomcat-',
    )
)
tomcat_df.columns = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo']
# n_MoJo arrives as text, so cast to int before deriving the percentage:
# mojoFM = (1 - n_MoJo / tomcat_count) * 100
tomcat_df['mojoFM'] = tomcat_df['n_MoJo'].astype(int).div(tomcat_count).rsub(1).mul(100)
tomcat_df.head()

Unnamed: 0,name,version,n_clusters,affinity,linkage,n_MoJo,mojoFM
0,apache_tomcat,9.0.20,50,euclidean,complete,2104,38.226659
1,apache_tomcat,9.0.20,50,euclidean,complete,242,92.894891
2,apache_tomcat,9.0.20,50,euclidean,average,2092,38.578978
3,apache_tomcat,9.0.20,50,euclidean,average,327,90.399295
4,apache_tomcat,9.0.20,50,euclidean,single,2175,36.142102


In [34]:
# Load the parsed JMeter MoJo rows into a frame and label its six columns.
jmeter_df = pd.DataFrame(
    read_dataframe_from_file(
        'MoJo_1.2.1/apache_jmeter/apache_jmeter_results.txt',
        'jmeter-',
    )
)
jmeter_df.columns = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo']
# n_MoJo arrives as text, so cast to int before deriving the percentage:
# mojoFM = (1 - n_MoJo / jmeter_count) * 100
jmeter_df['mojoFM'] = jmeter_df['n_MoJo'].astype(int).div(jmeter_count).rsub(1).mul(100)
jmeter_df.head()

Unnamed: 0,name,version,n_clusters,affinity,linkage,n_MoJo,mojoFM
0,apache_jmeter,2.5,50,euclidean,complete,813,37.268519
1,apache_jmeter,2.5,50,euclidean,complete,394,69.598765
2,apache_jmeter,2.5,50,euclidean,average,803,38.040123
3,apache_jmeter,2.5,50,euclidean,average,386,70.216049
4,apache_jmeter,2.5,50,euclidean,single,807,37.731481


In [36]:
# Load the parsed Cassandra MoJo rows into a frame and label its six columns.
cassandra_df = pd.DataFrame(
    read_dataframe_from_file(
        'MoJo_1.2.1/apache_cassandra/apache_cassandra_results.txt',
        'cassandra-',
    )
)
cassandra_df.columns = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo']
# n_MoJo arrives as text, so cast to int before deriving the percentage:
# mojoFM = (1 - n_MoJo / cassandra_count) * 100
cassandra_df['mojoFM'] = (
    cassandra_df['n_MoJo'].astype(int).div(cassandra_count).rsub(1).mul(100)
)
cassandra_df.head()

Unnamed: 0,name,version,n_clusters,affinity,linkage,n_MoJo,mojoFM
0,apache_cassandra,3.11.3,50,euclidean,complete,2027,12.213079
1,apache_cassandra,3.11.3,50,euclidean,complete,683,70.420095
2,apache_cassandra,3.11.3,50,euclidean,average,2042,11.563447
3,apache_cassandra,3.11.3,50,euclidean,average,838,63.707233
4,apache_cassandra,3.11.3,50,euclidean,single,2036,11.8233


In [38]:
# Load the parsed Tika MoJo rows into a frame and label its six columns.
tika_df = pd.DataFrame(
    read_dataframe_from_file(
        'MoJo_1.2.1/apache_tika/apache_tika_results.txt',
        'tika-',
    )
)
tika_df.columns = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo']
# n_MoJo arrives as text, so cast to int before deriving the percentage:
# mojoFM = (1 - n_MoJo / tika_count) * 100
# NOTE(review): the displayed output has negative mojoFM values here, i.e.
# n_MoJo exceeds tika_count for some rows -- confirm tika_count is correct.
tika_df['mojoFM'] = tika_df['n_MoJo'].astype(int).div(tika_count).rsub(1).mul(100)
tika_df.head()

Unnamed: 0,name,version,n_clusters,affinity,linkage,n_MoJo,mojoFM
0,apache_tika,1.19.1,50,euclidean,complete,929,-11.390887
1,apache_tika,1.19.1,50,euclidean,complete,776,6.954436
2,apache_tika,1.19.1,50,euclidean,average,920,-10.311751
3,apache_tika,1.19.1,50,euclidean,average,789,5.395683
4,apache_tika,1.19.1,50,euclidean,single,903,-8.273381


In [40]:
# Load the parsed Maven MoJo rows into a frame and label its six columns.
maven_df = pd.DataFrame(
    read_dataframe_from_file(
        'MoJo_1.2.1/apache_maven/apache_maven_results.txt',
        'maven-',
    )
)
maven_df.columns = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo']
# n_MoJo arrives as text, so cast to int before deriving the percentage:
# mojoFM = (1 - n_MoJo / maven_count) * 100
maven_df['mojoFM'] = maven_df['n_MoJo'].astype(int).div(maven_count).rsub(1).mul(100)
maven_df.head()

Unnamed: 0,name,version,n_clusters,affinity,linkage,n_MoJo,mojoFM
0,apache_maven,3.3.4,50,euclidean,complete,705,61.114175
1,apache_maven,3.3.4,50,euclidean,complete,205,88.692774
2,apache_maven,3.3.4,50,euclidean,average,765,57.804744
3,apache_maven,3.3.4,50,euclidean,average,595,67.181467
4,apache_maven,3.3.4,50,euclidean,single,797,56.039713


In [41]:
# Load the parsed Storm MoJo rows into a frame and label its six columns.
storm_df = pd.DataFrame(
    read_dataframe_from_file(
        'MoJo_1.2.1/apache_storm/apache_storm_results.txt',
        'storm-',
    )
)
storm_df.columns = ['name', 'version', 'n_clusters', 'affinity', 'linkage', 'n_MoJo']
# n_MoJo arrives as text, so cast to int before deriving the percentage:
# mojoFM = (1 - n_MoJo / storm_count) * 100
storm_df['mojoFM'] = storm_df['n_MoJo'].astype(int).div(storm_count).rsub(1).mul(100)
storm_df.head()

Unnamed: 0,name,version,n_clusters,affinity,linkage,n_MoJo,mojoFM
0,apache_storm,1.1.0,50,euclidean,complete,1695,16.543575
1,apache_storm,1.1.0,50,euclidean,complete,1602,21.1226
2,apache_storm,1.1.0,50,euclidean,average,1692,16.691285
3,apache_storm,1.1.0,50,euclidean,average,1622,20.137863
4,apache_storm,1.1.0,50,euclidean,single,1692,16.691285


In [43]:
# Stack the per-project frames into one table. ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating each project's own
# row labels, so label-based lookups on the result are unambiguous.
final_df_results = pd.concat(
    [storm_df, maven_df, tika_df, cassandra_df, spark_df, jmeter_df, tomcat_df, log4j_df],
    ignore_index=True,
)
# .size is the total number of cells (rows * columns), not the row count.
print(final_df_results.size)
final_df_results.head()

101850


Unnamed: 0,name,version,n_clusters,affinity,linkage,n_MoJo,mojoFM
0,apache_storm,1.1.0,50,euclidean,complete,1695,16.543575
1,apache_storm,1.1.0,50,euclidean,complete,1602,21.1226
2,apache_storm,1.1.0,50,euclidean,average,1692,16.691285
3,apache_storm,1.1.0,50,euclidean,average,1622,20.137863
4,apache_storm,1.1.0,50,euclidean,single,1692,16.691285


In [44]:
# Persist the combined results; index=False leaves the row index out of the CSV.
final_df_results.to_csv(
    path_or_buf='3rd_iteration_complete_results.csv',
    index=False,
)