<a href="https://colab.research.google.com/github/adipai/data-decent/blob/main/src/automation/table_automation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import subprocess
import io

In [2]:
# Create the working directories the later cells write into: downloaded raw files go to
# datasets/, per-file ratio tables to results/ratios/, per-learner tables to results/combined/.
# -p makes the commands safe to re-run when the directories already exist.
!mkdir -p datasets
!mkdir -p results/ratios
!mkdir -p results/combined

In [46]:
# Experiment grid. Result files are named "<metric>_<learner>_scotty_knotty_<dataset>.txt",
# so the entries below must match the names used in the results repository exactly.
learners = [
    "lightGBM",
    "GB",
    "LR",
    "SVM",
    "DT",
]
metrics = [
    "ACC",
    "AUC_ROC",
    "F1",
    "PREC",
    "REC",
]
datasets = [
    "ambari_vuln",
    "defect_mylyn",
    "eclipse_JDT",
    "eclipse_PDE",
    "js_vuln",
    "moodle_vuln",
]

# Sampling algorithms (table rows) and sampling ratios (table columns).
# Ratios are kept as strings because they are matched against the last "_"-separated
# token of each algorithm label when the result files are parsed.
algos = [
    "svm_smote",
    "No_Sampling",
    "RRP",
    "gaussian_copula",
    "intelligent_pruning",
    "random_oversampling",
    "random_pruning",
    "smote",
]
ratios = [
    "0.0",
    "0.2",
    "0.4",
    "0.6",
    "0.8",
    "1.0",
]

In [47]:
# Aggregated results, indexed as combined[learner][metric][dataset][algo].
# Every leaf is its own two-element list [rank_zero_seen, best_value], starting at
# [False, -inf] so a later cell can fill the flag and the maximum in place.
combined = {
    learner_name: {
        metric_name: {
            dataset_name: {
                algo_name: [False, float('-inf')]
                for algo_name in algos
            }
            for dataset_name in datasets
        }
        for metric_name in metrics
    }
    for learner_name in learners
}
# Example lookup: combined['GB']['ACC']['ambari_vuln']

In [48]:
# Download every "<metric>_<learner>_scotty_knotty_<dataset>.txt" result file from the
# GitHub repository into datasets/ (one file per dataset x learner x metric).
for dataset in datasets:
  for learner in learners:
    for metric in metrics:
      file_name = f"{metric}_{learner}_scotty_knotty_{dataset}.txt"
      # Pad the name to a fixed width so the "Done" markers line up in the output.
      print(file_name, " " * (55 - len(file_name)), end = "")
      path = f"https://raw.githubusercontent.com/adipai/data-decent/main/results/{dataset}/sk/{file_name}"
      # Pass curl an argument list (no shell string to quote or inject into).
      # --fail makes curl exit non-zero on HTTP errors such as 404, so check=True raises
      # CalledProcessError instead of silently saving the error page as a result file.
      # --silent/--show-error drop the progress meter but keep real error messages.
      subprocess.run(
          ["curl", "--fail", "--silent", "--show-error", "-o", "datasets/" + file_name, path],
          check=True,
      )
      print("Done")

ACC_lightGBM_scotty_knotty_ambari_vuln.txt              Done
AUC_ROC_lightGBM_scotty_knotty_ambari_vuln.txt          Done
F1_lightGBM_scotty_knotty_ambari_vuln.txt               Done
PREC_lightGBM_scotty_knotty_ambari_vuln.txt             Done
REC_lightGBM_scotty_knotty_ambari_vuln.txt              Done
ACC_GB_scotty_knotty_ambari_vuln.txt                    Done
AUC_ROC_GB_scotty_knotty_ambari_vuln.txt                Done
F1_GB_scotty_knotty_ambari_vuln.txt                     Done
PREC_GB_scotty_knotty_ambari_vuln.txt                   Done
REC_GB_scotty_knotty_ambari_vuln.txt                    Done
ACC_LR_scotty_knotty_ambari_vuln.txt                    Done
AUC_ROC_LR_scotty_knotty_ambari_vuln.txt                Done
F1_LR_scotty_knotty_ambari_vuln.txt                     Done
PREC_LR_scotty_knotty_ambari_vuln.txt                   Done
REC_LR_scotty_knotty_ambari_vuln.txt                    Done
ACC_SVM_scotty_knotty_ambari_vuln.txt                   Done
AUC_ROC_SVM_scotty_knott

In [49]:
def process_file(file_name, algo_names=None, ratio_names=None):
  """Summarise one result file as an algorithm-by-ratio table.

  Each data line is read as three comma-separated fields: ``rank``, a label of
  ``_``-separated tokens, and a numeric value. The label's tokens that start with a
  letter form the algorithm name (any name containing "intelligent_pruning" is
  collapsed to exactly that), and the label's last token is the ratio.
  Lines that are blank or consist only of ``#`` are skipped.

  Args:
    file_name: Path of the text file to parse.
    algo_names: Algorithm names to use as rows. Defaults to the notebook-level ``algos``.
    ratio_names: Ratio labels to use as columns. Defaults to the notebook-level ``ratios``.

  Returns:
    A DataFrame with one column per ratio and one row per algorithm. Each cell is a list
    ``[rank_zero_seen, best_value]``: whether any line for that pair had rank ``'0'``
    (presumably the top Scott-Knott rank -- confirm against the file producer) and the
    largest value seen, or ``[False, -inf]`` when the pair never appears.

  Raises:
    KeyError: If a line names an algorithm or ratio that is not in the expected lists.
    IndexError: If a data line has fewer than three comma-separated fields.
    ValueError: If the value field is not a number.
  """
  if algo_names is None:
    algo_names = algos
  if ratio_names is None:
    ratio_names = ratios

  results = {ratio: {algo: [False, float('-inf')] for algo in algo_names} for ratio in ratio_names}
  with open(file_name, 'r') as file:
    for line in file:
      stripped = line.strip()
      # Skip separators and empty lines regardless of line ending (handles "#\r\n",
      # a final "#" with no newline, and blank lines that used to raise IndexError).
      if not stripped or stripped == '#':
        continue
      data = [x.strip() for x in stripped.split(",")]
      rank, label, val = data[0], data[1], data[2]
      tokens = label.split('_')
      # Keep only tokens that start with a letter; the `tok and` guard tolerates empty
      # tokens produced by doubled or trailing underscores.
      algo = "_".join(tok for tok in tokens if tok and tok[0].isalpha())
      if "intelligent_pruning" in algo:
        algo = "intelligent_pruning"
      ratio = tokens[-1]
      cell = results[ratio][algo]
      cell[0] = cell[0] or rank == '0'
      cell[1] = max(cell[1], float(val))
  return pd.DataFrame(results)

In [50]:
def _format_cell(cell):
  """Render a [rank_zero_seen, value] cell as text, parenthesised when the flag is set."""
  flag, value = cell
  return f"({value})" if flag else str(value)


# For every dataset/learner/metric file: parse it, record each algorithm's best result
# across all ratios in `combined`, and write the per-ratio table to results/ratios/.
for dataset in datasets:
  for learner in learners:
    for metric in metrics:
      file_name = f"{metric}_{learner}_scotty_knotty_{dataset}.txt"
      result_file_name = f"{metric}_{learner}_{dataset}.csv"
      print(file_name)
      results = process_file("datasets/" + file_name)

      for algo in algos:
        # Reset the accumulators for each algorithm. They used to be initialised once
        # per file, so every algorithm inherited the rank-0 flag and the maximum of all
        # algorithms processed before it.
        zero_rank = False
        max_val = float('-inf')
        for ratio in ratios:
          flag, value = results[ratio][algo]
          zero_rank = zero_rank or flag
          max_val = max(max_val, float(value))
        combined[learner][metric][dataset][algo][0] = zero_rank
        combined[learner][metric][dataset][algo][1] = max_val

      # Build the formatted table column by column instead of using chained assignment
      # (results[ratio][algo] = ...), which does not write through to the frame when
      # pandas Copy-on-Write is active.
      results = results.apply(lambda column: column.map(_format_cell))

      results.to_csv("results/ratios/" + result_file_name)

ACC_lightGBM_scotty_knotty_ambari_vuln.txt
AUC_ROC_lightGBM_scotty_knotty_ambari_vuln.txt
F1_lightGBM_scotty_knotty_ambari_vuln.txt
PREC_lightGBM_scotty_knotty_ambari_vuln.txt
REC_lightGBM_scotty_knotty_ambari_vuln.txt
ACC_GB_scotty_knotty_ambari_vuln.txt
AUC_ROC_GB_scotty_knotty_ambari_vuln.txt
F1_GB_scotty_knotty_ambari_vuln.txt
PREC_GB_scotty_knotty_ambari_vuln.txt
REC_GB_scotty_knotty_ambari_vuln.txt
ACC_LR_scotty_knotty_ambari_vuln.txt
AUC_ROC_LR_scotty_knotty_ambari_vuln.txt
F1_LR_scotty_knotty_ambari_vuln.txt
PREC_LR_scotty_knotty_ambari_vuln.txt
REC_LR_scotty_knotty_ambari_vuln.txt
ACC_SVM_scotty_knotty_ambari_vuln.txt
AUC_ROC_SVM_scotty_knotty_ambari_vuln.txt
F1_SVM_scotty_knotty_ambari_vuln.txt
PREC_SVM_scotty_knotty_ambari_vuln.txt
REC_SVM_scotty_knotty_ambari_vuln.txt
ACC_DT_scotty_knotty_ambari_vuln.txt
AUC_ROC_DT_scotty_knotty_ambari_vuln.txt
F1_DT_scotty_knotty_ambari_vuln.txt
PREC_DT_scotty_knotty_ambari_vuln.txt
REC_DT_scotty_knotty_ambari_vuln.txt
ACC_lightGBM_scotty_

In [51]:
# Write one CSV per learner/metric pair: rows are algorithms, columns are datasets, and
# each cell is the algorithm's best value, parenthesised when its rank-0 flag is set.
for learner in learners:
  for metric in metrics:
    # Cells start out as [rank_zero_seen, best_value] lists taken from `combined`.
    df = pd.DataFrame(combined[learner][metric])
    # Format via a column-wise map rather than chained assignment (df[dataset][algo] = ...),
    # which does not update `df` when pandas Copy-on-Write is active.
    df = df.apply(
        lambda column: column.map(
            lambda cell: f"({cell[1]})" if cell[0] else str(cell[1])
        )
    )
    file_name = f'{learner}_{metric}.csv'
    print(file_name)
    df.to_csv("results/combined/"+file_name)


lightGBM_ACC.csv
lightGBM_AUC_ROC.csv
lightGBM_F1.csv
lightGBM_PREC.csv
lightGBM_REC.csv
GB_ACC.csv
GB_AUC_ROC.csv
GB_F1.csv
GB_PREC.csv
GB_REC.csv
LR_ACC.csv
LR_AUC_ROC.csv
LR_F1.csv
LR_PREC.csv
LR_REC.csv
SVM_ACC.csv
SVM_AUC_ROC.csv
SVM_F1.csv
SVM_PREC.csv
SVM_REC.csv
DT_ACC.csv
DT_AUC_ROC.csv
DT_F1.csv
DT_PREC.csv
DT_REC.csv


In [52]:
# Bundle the whole results/ tree (recursively, -r) into results.zip.
!zip -r results.zip ./results/


updating: results/ (stored 0%)
updating: results/ratios/ (stored 0%)
updating: results/ratios/F1_LR_eclipse_PDE.csv (deflated 55%)
updating: results/ratios/ACC_SVM_moodle_vuln.csv (deflated 57%)
updating: results/ratios/AUC_ROC_GB_defect_mylyn.csv (deflated 56%)
updating: results/ratios/REC_lightGBM_js_vuln.csv (deflated 55%)
updating: results/ratios/F1_SVM_eclipse_JDT.csv (deflated 53%)
updating: results/ratios/AUC_ROC_SVM_moodle_vuln.csv (deflated 55%)
updating: results/ratios/PREC_LR_defect_mylyn.csv (deflated 52%)
updating: results/ratios/PREC_lightGBM_js_vuln.csv (deflated 52%)
updating: results/ratios/PREC_LR_eclipse_PDE.csv (deflated 54%)
updating: results/ratios/ACC_lightGBM_js_vuln.csv (deflated 58%)
updating: results/ratios/F1_GB_eclipse_PDE.csv (deflated 57%)
updating: results/ratios/PREC_LR_moodle_vuln.csv (deflated 55%)
updating: results/ratios/AUC_ROC_LR_js_vuln.csv (deflated 59%)
updating: results/ratios/REC_LR_moodle_vuln.csv (deflated 56%)
updating: results/ratios/AUC_

In [27]:
# Row order used when a ratio table is re-indexed for display.
# "No_Sampling" is not listed, so re-indexing with this list drops that row.
order = [
    "random_oversampling",
    "smote",
    "svm_smote",
    "gaussian_copula",
    "RRP",
    "random_pruning",
    "intelligent_pruning",
]

In [60]:
# Fetch one already-formatted ratio table (F1 / GB / defect_mylyn) from the repository
# and save it as test.csv for a quick look.
!curl -o test.csv https://raw.githubusercontent.com/adipai/data-decent/main/results/formatted_results/ratios/F1_GB_defect_mylyn.csv
# Sanity check: count the lines among the first two of the download.
!head test.csv -n 2 | wc -l

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   368  100   368    0     0   1476      0 --:--:-- --:--:-- --:--:--  1477
2


In [61]:
# Load the downloaded table with its first column as the row index, then arrange the
# rows in the sequence given by `order`; the bare name on the last line displays it.
df = pd.read_csv('test.csv', index_col=0).reindex(order)
df

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,0.0,0.2,0.4,0.6,0.8,1.0
random_oversampling,-inf,0.35,0.36,0.47,0.45,0.41
smote,-inf,0.34,0.39,0.47,0.43,0.46
svm_smote,-inf,0.44,0.44,0.43,0.46,0.44
gaussian_copula,-inf,0.34,0.36,0.35,0.36,0.38
RRP,-inf,0.38,0.38,0.38,0.41,0.42
random_pruning,-inf,0.36,0.4,0.42,0.46,0.44
intelligent_pruning,-inf,0.4,0.4,0.46,(0.49),(0.5)


In [53]:
# Print the current working directory (relative paths such as datasets/ and results/ resolve against it).
!pwd

/content
