<a href="https://colab.research.google.com/github/adipai/data-decent/blob/main/src/analyse_scott_knott.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import os
import re

# Define regex patterns for each algorithm
patterns = {
    'RRP': r'RRP_(\d+\.\d+),\s+(\d+\.\d+)',
    'intelligent_pruning': r'intelligent_pruning(\d+\.\d+_\d+\.\d+),\s+(\d+\.\d+),\s+(\d+\.\d+)',
    'random_pruning': r'random_pruning_(\d+\.\d+),\s+(\d+\.\d+)',
    'smote': r'smote_(\d+\.\d+),\s+(\d+\.\d+)',
    'svm_smote': r'svm_smote_(\d+\.\d+),\s+(\d+\.\d+)',
    'random_oversampling': r'random_oversampling_(\d+\.\d+),\s+(\d+\.\d+)',
    'gaussian_copula': r'gaussian_copula_(\d+\.\d+),\s+(\d+\.\d+)',
    'no_sampling': r'No_Sampling_(\d+\.\d+),\s+(\d+\.\d+)'
}

# Function to count occurrences of algorithms for rank 0 in a file and track top value
def count_algorithms(file_path, counts, top_values):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    for line in lines:
        if line.startswith(' 0,'):
            for algorithm, pattern in patterns.items():
                match = re.search(pattern, line)
                if match:
                    counts[algorithm] += 1
                    value = float(match.group(2))
                    if top_values[algorithm] is None or value > top_values[algorithm]:
                        top_values[algorithm] = value

    return counts, top_values

# Get a list of all .txt files in the data folder
data_folder = 'data/'
txt_files = [f for f in os.listdir(data_folder) if f.endswith('.txt')]

# Count occurrences of algorithms for rank 0 in each file
algo_counts_files = []
values_dataset = dict()
for txt_file in txt_files:
    file_path = os.path.join(data_folder, txt_file)
    # Initialize counts dictionary
    counts = {
        'RRP': 0,
        'intelligent_pruning': 0,
        'random_pruning': 0,
        'smote': 0,
        'svm_smote': 0,
        'random_oversampling': 0,
        'gaussian_copula': 0,
        'no_sampling':0
    }
    top_values = {
        'RRP': None,
        'intelligent_pruning': None,
        'random_pruning': None,
        'smote': None,
        'svm_smote': None,
        'random_oversampling': None,
        'gaussian_copula': None,
        'no_sampling':None
    }
    counts, top_values = count_algorithms(file_path, counts, top_values)
    algo_counts_files.append(counts)
    start = txt_file.index("knotty_") + len("knotty_")
    end = txt_file.index(".txt")
    dataset_name = txt_file[start:end]

    values_dataset[dataset_name] = top_values

avg_wins_algo = {
    'RRP': 0,
    'intelligent_pruning': 0,
    'random_pruning': 0,
    'smote': 0,
    'svm_smote': 0,
    'random_oversampling': 0,
    'gaussian_copula': 0,
    'no_sampling':0
}

for file_contents in algo_counts_files:
  for algo_name, algo_counts in file_contents.items():
    if(algo_counts == None):
      continue
    avg_wins_algo[algo_name] += algo_counts


for algo_name, algo_avg_wins in avg_wins_algo.items():
  avg_wins_algo[algo_name] = avg_wins_algo[algo_name] / len(algo_counts_files)


In [49]:
columns = ['breast_cancer', 'churn', 'js_vuln', 'ambari_vuln', 'eclipse_JDT','eclipse_PDE', 'Moodle_Vuln', 'defect_mylyn']

# Create a new dictionary with sorted keys
sorted_values_dataset = {key: values_dataset[key] for key in columns if key in values_dataset}

# print(sorted_values_dataset)
# Define the algorithm order
algorithm_order = ['random_oversampling', 'smote', 'svm_smote', 'gaussian_copula', 'RRP', 'no_sampling', 'random_pruning', 'intelligent_pruning']

# Sort the inner dictionaries based on the algorithm order
sorted_values_dataset = {key: {k: values_dataset[key][k] for k in algorithm_order if k in values_dataset[key]} for key in values_dataset}

print(sorted_values_dataset)


{'churn': {'random_oversampling': 0.27, 'smote': 0.26, 'svm_smote': 0.26, 'gaussian_copula': 0.26, 'RRP': 0.27, 'no_sampling': 0.26, 'random_pruning': 0.26, 'intelligent_pruning': 0.27}, 'js_vuln': {'random_oversampling': 0.29, 'smote': 0.29, 'svm_smote': 0.29, 'gaussian_copula': 0.29, 'RRP': 0.29, 'no_sampling': 0.29, 'random_pruning': 0.29, 'intelligent_pruning': 0.29}, 'Moodle_Vuln': {'random_oversampling': 0.06, 'smote': 0.06, 'svm_smote': None, 'gaussian_copula': 0.05, 'RRP': None, 'no_sampling': None, 'random_pruning': 0.05, 'intelligent_pruning': 0.05}, 'breast_cancer': {'random_oversampling': 0.35, 'smote': 0.38, 'svm_smote': 0.34, 'gaussian_copula': 0.34, 'RRP': None, 'no_sampling': 0.34, 'random_pruning': 0.35, 'intelligent_pruning': 0.44}, 'defect_mylyn': {'random_oversampling': 0.31, 'smote': 0.31, 'svm_smote': 0.31, 'gaussian_copula': 0.31, 'RRP': 0.3, 'no_sampling': 0.31, 'random_pruning': 0.31, 'intelligent_pruning': 0.31}, 'eclipse_JDT': {'random_oversampling': 0.47, 's

In [50]:
import pandas as pd
# Create the DataFrame
df = pd.DataFrame(sorted_values_dataset)

# Sort the columns of the DataFrame
df = df.reindex(columns=columns)

print(df)

                     breast_cancer  churn  js_vuln  ambari_vuln  eclipse_JDT  \
random_oversampling           0.35   0.27     0.29          NaN         0.47   
smote                         0.38   0.26     0.29          NaN         0.48   
svm_smote                     0.34   0.26     0.29          NaN         0.48   
gaussian_copula               0.34   0.26     0.29         0.08         0.47   
RRP                            NaN   0.27     0.29          NaN         0.47   
no_sampling                   0.34   0.26     0.29          NaN         0.45   
random_pruning                0.35   0.26     0.29         0.08         0.46   
intelligent_pruning           0.44   0.27     0.29         0.08         0.47   

                     eclipse_PDE  Moodle_Vuln  defect_mylyn  
random_oversampling         0.28         0.06          0.31  
smote                       0.28         0.06          0.31  
svm_smote                   0.28          NaN          0.31  
gaussian_copula             0.2

In [51]:
print(avg_wins_algo)

{'RRP': 2.5, 'intelligent_pruning': 18.0, 'random_pruning': 4.5, 'smote': 7.625, 'svm_smote': 3.75, 'random_oversampling': 3.875, 'gaussian_copula': 4.5, 'no_sampling': 0.75}
