<a href="https://colab.research.google.com/github/adipai/data-decent/blob/main/src/analyse_scott_knott.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re

# Define regex patterns for each algorithm
patterns = {
    'RRP': r'RRP_(\d+\.\d+),\s+(\d+\.\d+)',
    'intelligent_pruning': r'intelligent_pruning(\d+\.\d+_\d+\.\d+),\s+(\d+\.\d+),\s+(\d+\.\d+)',
    'random_pruning': r'random_pruning_(\d+\.\d+),\s+(\d+\.\d+)',
    'svm': r'svm_(?!smote)(\d+\.\d+),\s+(\d+\.\d+)',
    'svm_smote': r'svm_smote_(\d+\.\d+),\s+(\d+\.\d+)',
    'random_oversampling': r'random_oversampling_(\d+\.\d+),\s+(\d+\.\d+)',
    'gaussian_copula': r'gaussian_copula_(\d+\.\d+),\s+(\d+\.\d+)',
    'no_sampling': r'No_Sampling_(\d+\.\d+),\s+(\d+\.\d+)'
}

# Function to count occurrences of algorithms for rank 0 in a file and track top value
def count_algorithms(file_path, counts, top_values):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    for line in lines:
        if line.startswith(' 0,'):
            for algorithm, pattern in patterns.items():
                match = re.search(pattern, line)
                if match:
                    counts[algorithm] += 1
                    value = float(match.group(2))
                    if top_values[algorithm] is None or value > top_values[algorithm]:
                        top_values[algorithm] = value

    return counts, top_values

# Get a list of all .txt files in the data folder
data_folder = 'data/'
txt_files = [f for f in os.listdir(data_folder) if f.endswith('.txt')]

# Count occurrences of algorithms for rank 0 in each file
algo_counts_files = []
values_dataset = dict()
for txt_file in txt_files:
    file_path = os.path.join(data_folder, txt_file)
    # Initialize counts dictionary
    counts = {
        'RRP': 0,
        'intelligent_pruning': 0,
        'random_pruning': 0,
        'svm': 0,
        'svm_smote': 0,
        'random_oversampling': 0,
        'gaussian_copula': 0,
        'no_sampling':0
    }
    top_values = {
        'RRP': None,
        'intelligent_pruning': None,
        'random_pruning': None,
        'svm': None,
        'svm_smote': None,
        'random_oversampling': None,
        'gaussian_copula': None,
        'no_sampling':None
    }
    counts, top_values = count_algorithms(file_path, counts, top_values)
    algo_counts_files.append(counts)
    start = txt_file.index("knotty_") + len("knotty_")
    end = txt_file.index(".txt")
    dataset_name = txt_file[start:end]

    values_dataset[dataset_name] = top_values

avg_wins_algo = {
    'RRP': 0,
    'intelligent_pruning': 0,
    'random_pruning': 0,
    'svm': 0,
    'svm_smote': 0,
    'random_oversampling': 0,
    'gaussian_copula': 0,
    'no_sampling':0
}
for algo_counts_file in algo_counts_files:
  for algo_name, algo_counts in algo_counts_file.items():
    avg_wins_algo[algo_name] += algo_counts


for algo_name, algo_avg_wins in avg_wins_algo.items():
  avg_wins_algo[algo_name] = int(avg_wins_algo[algo_name] / len(avg_wins_algo))



In [2]:
print(values_dataset)

{'churn': {'RRP': 0.82, 'intelligent_pruning': 0.82, 'random_pruning': 0.87, 'svm': None, 'svm_smote': None, 'random_oversampling': 0.8, 'gaussian_copula': 0.84, 'no_sampling': None}, 'defect_mylyn': {'RRP': 0.58, 'intelligent_pruning': 0.58, 'random_pruning': None, 'svm': None, 'svm_smote': None, 'random_oversampling': 0.56, 'gaussian_copula': None, 'no_sampling': None}, 'js_vuln': {'RRP': None, 'intelligent_pruning': None, 'random_pruning': None, 'svm': None, 'svm_smote': 0.6, 'random_oversampling': 0.58, 'gaussian_copula': None, 'no_sampling': None}, 'Moodle_Vuln': {'RRP': None, 'intelligent_pruning': 0.43, 'random_pruning': 0.43, 'svm': None, 'svm_smote': None, 'random_oversampling': None, 'gaussian_copula': None, 'no_sampling': None}, 'eclipse_PDE': {'RRP': None, 'intelligent_pruning': 0.64, 'random_pruning': 0.59, 'svm': None, 'svm_smote': None, 'random_oversampling': None, 'gaussian_copula': None, 'no_sampling': None}, 'breast_cancer': {'RRP': None, 'intelligent_pruning': 0.61, 

In [3]:
columns = ['breast_cancer', 'churn', 'js_vuln', 'ambari_vuln', 'eclipse_JDT','eclipse_PDE', 'Moodle_Vuln', 'defect_mylyn']

# Create a new dictionary with sorted keys
sorted_values_dataset = {key: values_dataset[key] for key in columns if key in values_dataset}

# print(sorted_values_dataset)
# Define the algorithm order
algorithm_order = ['random_oversampling', 'svm', 'svm_smote', 'gaussian_copula', 'RRP', 'no_sampling', 'random_pruning', 'intelligent_pruning']

# Sort the inner dictionaries based on the algorithm order
sorted_values_dataset = {key: {k: values_dataset[key][k] for k in algorithm_order if k in values_dataset[key]} for key in values_dataset}

print(sorted_values_dataset)


{'churn': {'random_oversampling': 0.8, 'svm': None, 'svm_smote': None, 'gaussian_copula': 0.84, 'RRP': 0.82, 'no_sampling': None, 'random_pruning': 0.87, 'intelligent_pruning': 0.82}, 'defect_mylyn': {'random_oversampling': 0.56, 'svm': None, 'svm_smote': None, 'gaussian_copula': None, 'RRP': 0.58, 'no_sampling': None, 'random_pruning': None, 'intelligent_pruning': 0.58}, 'js_vuln': {'random_oversampling': 0.58, 'svm': None, 'svm_smote': 0.6, 'gaussian_copula': None, 'RRP': None, 'no_sampling': None, 'random_pruning': None, 'intelligent_pruning': None}, 'Moodle_Vuln': {'random_oversampling': None, 'svm': None, 'svm_smote': None, 'gaussian_copula': None, 'RRP': None, 'no_sampling': None, 'random_pruning': 0.43, 'intelligent_pruning': 0.43}, 'eclipse_PDE': {'random_oversampling': None, 'svm': None, 'svm_smote': None, 'gaussian_copula': None, 'RRP': None, 'no_sampling': None, 'random_pruning': 0.59, 'intelligent_pruning': 0.64}, 'breast_cancer': {'random_oversampling': None, 'svm': None, 

In [4]:
import pandas as pd
# Create the DataFrame
df = pd.DataFrame(sorted_values_dataset)

# Sort the columns of the DataFrame
df = df.reindex(columns=columns)

print(df)

                     breast_cancer  churn  js_vuln  ambari_vuln  eclipse_JDT  \
random_oversampling            NaN   0.80     0.58          NaN          NaN   
svm                            NaN    NaN      NaN          NaN          NaN   
svm_smote                      NaN    NaN     0.60          NaN          NaN   
gaussian_copula                NaN   0.84      NaN          NaN          NaN   
RRP                            NaN   0.82      NaN          NaN         0.70   
no_sampling                    NaN    NaN      NaN          NaN          NaN   
random_pruning                0.53   0.87      NaN          0.6         0.68   
intelligent_pruning           0.61   0.82      NaN          NaN         0.72   

                     eclipse_PDE  Moodle_Vuln  defect_mylyn  
random_oversampling          NaN          NaN          0.56  
svm                          NaN          NaN           NaN  
svm_smote                    NaN          NaN           NaN  
gaussian_copula              Na

In [5]:
print(avg_wins_algo)

{'RRP': 0, 'intelligent_pruning': 1, 'random_pruning': 0, 'svm': 0, 'svm_smote': 0, 'random_oversampling': 0, 'gaussian_copula': 0, 'no_sampling': 0}
