<a href="https://colab.research.google.com/github/adipai/data-decent/blob/main/src/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import glob
import sys
import random
from io import StringIO

In [2]:
# Give dataset identifier here
dataset_name = "Moodle_Vuln"

## Scott-Knott analysis

In [3]:
def of(s):
    "return `s` as a float when it parses as one, else return `s` unchanged"
    try:
        number = float(s)
    except ValueError:
        return s
    return number

def slurp(file):
  """Read a whitespace-separated text file into a list of SAMPLEs.

  Every token that `of()` turns into a float is collected; any other token
  is a label. A run of numbers becomes one SAMPLE named after the label
  that came before it (None when no label preceded the run).
  """
  samples, pending, label = [], [], None
  with open(file) as fp:
    tokens = [of(piece) for line in fp.readlines() for piece in line.split()]
    for token in tokens:
      if isinstance(token, float):
        pending.append(token)
        continue
      # a label closes the current run of numbers (if any) and opens a new one
      if pending:
        samples.append(SAMPLE(pending, label))
      pending, label = [], token
  # flush the numbers that followed the final label
  if pending:
    samples.append(SAMPLE(pending, label))
  return samples

class SAMPLE:
  """Stores mean, standard deviation, low, high, of a list of numbers.

  Attributes:
    has:    the raw values seen so far (sorted on demand by `ok()`).
    ready:  True while `has` is known to be sorted.
    txt:    label printed for this sample by `bar()`.
    rank:   rank of this sample (reassigned by `sk()`).
    n, mu, sd: running count, mean and sample standard deviation.
    m2:     running sum of squared deviations from the mean.
    lo, hi: smallest and largest value seen so far.
  """
  def __init__(self, lst=None, txt="", rank=0):
    "create a sample labelled `txt`, ranked `rank`, holding every number in `lst`"
    self.has, self.ready = [], False
    self.txt, self.rank = txt, rank  # keep the rank the caller asked for
    self.n, self.sd, self.m2, self.mu = 0, 0, 0, 0
    self.lo, self.hi = sys.maxsize, -sys.maxsize
    for x in ([] if lst is None else lst):
      self.add(x)

  def add(self, x):
    "add `x` to the sample, updating lo, hi, n, mu and sd incrementally"
    self.has.append(x)
    self.ready = False  # `has` may no longer be sorted
    self.lo = min(x, self.lo)
    self.hi = max(x, self.hi)
    self.n += 1
    # Welford's online update of the mean and the sum of squared deviations
    delta = x - self.mu
    self.mu += delta / self.n
    self.m2 += delta * (x - self.mu)
    self.sd = 0 if self.n < 2 else (self.m2 / (self.n - 1))**.5

  def ok(self):
    "return self, after making sure `has` is sorted"
    if not self.ready:
      self.has = sorted(self.has)
    self.ready = True
    return self

  def mid(self):
    "return the middle of the sorted values (the upper middle when n is even)"
    has = self.ok().has
    return has[len(has)//2]

  def bar(self, num, fmt="%8.3f", word="%10s", width=50):
    """Return one comma-separated report line describing the sample `num`.

    `self` supplies the scale: columns 0..width-1 run from self.lo to self.hi.
    The line holds num's rank, label, median, inter-quartile range (75th
    minus 25th percentile), a text plot, then self.lo and self.hi. In the
    plot "-" spans the 25th to 75th percentile, "|" marks the centre column
    and "*" marks the median.
    """
    out = [' '] * width
    pos = lambda x: int(width * (x - self.lo) / (self.hi - self.lo + 1E-30))
    has = num.ok().has
    b, c, d = [has[int(len(has)*x)] for x in [0.25, 0.5, 0.75]]
    nb, nc, nd = [pos(x) for x in [b, c, d]]
    for i in range(nb, nd): out[i] = "-"
    out[width//2] = "|"
    # pos() yields `width` (one past the last column) for a value equal to
    # self.hi, so pin the median marker to the final column instead of
    # indexing out of range.
    out[min(nc, width - 1)] = "*"
    return ', '.join(["%2d" % num.rank, word % num.txt, fmt % c, fmt % (d - b),
                      ''.join(out), fmt % self.lo, fmt % self.hi])

def different(x,y):
  "true only when both the effect size test and the significance test say x and y differ"
  if not _cliffsDelta(x,y):
    # effect too small: skip the (costlier) bootstrap entirely
    return False
  return _bootstrap(x,y)

def _cliffsDelta(x, y, effectSize=0.2):
  """non-parametric effect size. threshold is border between small=.11 and medium=.28
     from Table1 of  https://doi.org/10.3102/10769986025002101"""
  #if len(x) > 10*len(y) : return cliffsDelta(random.choices(x,10*len(y)),y)
  #if len(y) > 10*len(x) : return cliffsDelta(x, random.choices(y,10*len(x)))
  n,lt,gt = 0,0,0
  for x1 in x:
    for y1 in y:
      n += 1
      if x1 > y1: gt += 1
      if x1 < y1: lt += 1
  return abs(lt - gt)/n  > effectSize # true if different

def _bootstrap(y0,z0,confidence=.05,Experiments=512,):
  """non-parametric significance test From Introduction to Bootstrap,
     Efron and Tibshirani, 1993, chapter 20. https://doi.org/10.1201/9780429246593

     Returns True when fewer than `confidence` of `Experiments` resamplings
     show a gap between the two groups larger than the observed one."""
  def obs(a, b):
    "gap between two sample means, scaled by the combined standard error of the means"
    return abs(a.mu - b.mu) / ((a.sd**2/a.n + b.sd**2/b.n)**.5 + 1E-30)

  pooled, y, z = SAMPLE(y0+z0), SAMPLE(y0), SAMPLE(z0)
  observed = obs(y, z)
  # shift both groups so that they share the pooled mean
  yhat = [v - y.mu + pooled.mu for v in y0]
  zhat = [v - z.mu + pooled.mu for v in z0]
  # count the resamplings (drawn with replacement) whose gap beats the observed one
  larger = sum(
    obs(SAMPLE(random.choices(yhat, k=len(yhat))),
        SAMPLE(random.choices(zhat, k=len(zhat)))) > observed
    for _ in range(Experiments))
  return larger / Experiments < confidence # true if different

def sk(nums):
  "sort nums on median. give adjacent nums the same rank if they are statistically the same"
  def flatten(samples):
    "every raw value held by the given samples, as one flat list"
    return [x for sample in samples for x in sample.has]

  def split(samples, rank):
    "rank `samples` starting at `rank`; return the last rank handed out"
    whole = SAMPLE(flatten(samples))
    cut, best = None, -1
    # find the split point that moves the two halves' medians furthest from the overall median
    for i in range(1, len(samples)):
      left = SAMPLE(flatten(samples[:i]))
      right = SAMPLE(flatten(samples[i:]))
      gap = (left.n*abs(left.mid() - whole.mid()) + right.n*abs(right.mid() - whole.mid()))/whole.n
      if gap > best:
        best, cut = gap, i
    if not (cut and different(flatten(samples[:cut]), flatten(samples[cut:]))):
      # no worthwhile split: everything here shares one rank
      for sample in samples:
        sample.rank = rank
      return rank
    # the two halves differ: rank each side separately, the second side one rank later
    rank = split(samples[:cut], rank) + 1
    return split(samples[cut:], rank)

  ordered = sorted(nums, key=lambda num: num.mid(), reverse=True)
  split(ordered, 0)
  return ordered

def eg0(nums):
  "print one bar line per sample in Scott-Knott order, with a '#' line before each new rank"
  # the pooled sample fixes the common lo..hi scale used by every bar
  pooled = SAMPLE([x for num in nums for x in num.has])
  previous_rank = None
  for num in sk(nums):
    if previous_rank != num.rank:
      print("#")
    previous_rank = num.rank
    print(pooled.bar(num, width=40, word="%20s", fmt="%5.2f"))

## Read all experiment results

In [5]:
# Every CSV file in the data/ directory holds one set of experiment results
file_paths = glob.glob('data/*.csv')

# One DataFrame per CSV file, in the order the files were found
dfs = []

for file_path in file_paths:
    frame = pd.read_csv(file_path)

    if 'intelligent_pruning_results.csv' in file_path:
        # Fold the pruning ratio into the algorithm name (as text), then drop
        # the now-redundant "per_cluster_pruning_ratio" column
        ratio_text = frame['per_cluster_pruning_ratio'].astype(str)
        frame['algo'] = frame['algo'] + ratio_text
        frame = frame.drop(columns=['per_cluster_pruning_ratio'])

    dfs.append(frame)

# Stack all results into a single DataFrame with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# Display the combined DataFrame
df

Unnamed: 0,algo,ratio,seed,learner,acc,prec,rec,f1,auc_roc
0,svm_smote,0.2,82,LR,0.963592,0.000000,0.00,0.000000,0.643995
1,svm_smote,0.2,82,SVM,0.978155,0.000000,0.00,0.000000,0.555147
2,svm_smote,0.2,82,DT,0.963592,0.076923,0.25,0.117647,0.610294
3,svm_smote,0.4,82,LR,0.958738,0.000000,0.00,0.000000,0.648897
4,svm_smote,0.4,82,SVM,0.978155,0.000000,0.00,0.000000,0.474265
...,...,...,...,...,...,...,...,...,...
1525,No_Sampling,0.0,14,SVM,0.983010,0.000000,0.00,0.000000,0.499118
1526,No_Sampling,0.0,14,DT,0.978155,0.000000,0.00,0.000000,0.497531
1527,No_Sampling,0.0,87,LR,0.987864,0.000000,0.00,0.000000,0.775430
1528,No_Sampling,0.0,87,SVM,0.987864,0.000000,0.00,0.000000,0.352334


#### Data transformation

In [9]:
# One group per (learner, algo, ratio) combination found in df
grouped_data = df.groupby(by=['learner', 'algo', 'ratio'])


In [13]:
# Define a function to capture the standard output
def capture_output(func, *args, **kwargs):
    """Run `func(*args, **kwargs)` and return everything it wrote to sys.stdout.

    The return value of `func` itself is discarded. Any exception raised by
    `func` propagates to the caller, with sys.stdout put back first.
    """
    # Redirect the standard output to a StringIO object
    old_stdout = sys.stdout
    sys.stdout = result = StringIO()
    try:
        func(*args, **kwargs)
    finally:
        # Always restore stdout; otherwise a failing func would leave every
        # later print() writing into the abandoned buffer
        sys.stdout = old_stdout

    # Return the captured output
    return result.getvalue()

In [8]:
# Names of the score columns in df; the report loop below reads each one in turn
metrics = ['acc', 'prec', 'rec', 'f1', 'auc_roc']
# Values of the 'learner' column to report on; one output file is written per (metric, learner) pair
learners = ['DT', 'SVM', 'LR']

In [14]:
for metric in metrics:

    # learner -> {"<algo>_<ratio>": [values of `metric`]}, rebuilt for every metric
    acc_values_dict = {}

    for (learner_name, algo, ratio), group_data in grouped_data:
        # Only keep groups that were run with exactly 10 distinct seeds
        if len(group_data['seed'].unique()) != 10:
            continue
        acc_values = group_data[metric].tolist()
        acc_values_dict.setdefault(learner_name, {})[f"{algo}_{ratio}"] = acc_values

    for learner in learners:
        # One labelled SAMPLE per algo/ratio combination of this learner
        sample_list = [SAMPLE(value, str(key))
                       for key, value in acc_values_dict[learner].items()]

        # eg0 prints its report, so capture stdout to obtain it as a string
        output = capture_output(eg0, sample_list)

        # Write the captured report to a text file
        with open(f"{metric.upper()}_{learner}_scotty_knotty_{dataset_name}.txt", 'w') as f:
            f.write(output)
# print(acc_values_dict)