<a href="https://colab.research.google.com/github/adipai/statistical-data-pruning-analysis/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
import glob
import random
import sys

import pandas as pd

## Scott-Knott analysis

In [66]:
def of(s):
    """Coerce `s` to a float when `float` accepts it; otherwise return `s` unchanged."""
    try:
        number = float(s)
    except ValueError:
        # Not numeric text: hand the original value back.
        return s
    else:
        return number

def slurp(file):
  """Read whitespace-separated tokens from `file` and group the numbers into SAMPLEs.

  Each non-numeric token starts a new group and becomes its label; the
  numeric tokens that follow are collected until the next non-numeric
  token (or the end of the file). Groups with no numbers are dropped.
  Returns the list of SAMPLEs in file order.
  """
  samples, pending, label = [], [], None
  with open(file) as fp:
    for line in fp.readlines():
      for token in line.split():
        value = of(token)
        if isinstance(value, float):
          pending.append(value)
          continue
        # A label closes the group collected so far (if any) and opens a new one.
        if pending:
          samples.append(SAMPLE(pending, label))
        pending, label = [], value
  # Flush the final group, which has no trailing label to close it.
  if pending:
    samples.append(SAMPLE(pending, label))
  return samples

class SAMPLE:
  """Stores mean, standard deviation, low, high, of a list of numbers.

  Every added value is also kept in `has`, so medians and percentiles can
  be read from the sorted list. `txt` is a label and `rank` is an integer
  rank used when reporting.
  """
  def __init__(self,lst=(),txt="",rank=0):
    """Create a sample labelled `txt` with rank `rank`, then add every number in `lst`."""
    self.has, self.ready = [],False
    # Store the caller's rank (it used to be discarded and forced to 0).
    self.txt, self.rank = txt,rank
    self.n, self.sd, self.m2, self.mu = 0,0,0,0
    self.lo, self.hi = sys.maxsize, -sys.maxsize
    for x in lst: self.add(x)

  def add(self,x):
    """Add one number, updating lo, hi, n, mu and sd incrementally."""
    self.has += [x]; self.ready=False  # new value: `has` may no longer be sorted
    self.lo = min(x,self.lo)
    self.hi = max(x,self.hi)
    self.n += 1
    # Welford's online update of the mean and the sum of squared deviations.
    delta = x - self.mu
    self.mu += delta / self.n
    self.m2 += delta * (x -  self.mu)
    self.sd = 0 if self.n < 2 else (self.m2 / (self.n - 1))**.5

  def ok(self):
    """Ensure `has` is sorted (sorting only if values were added since the last sort); return self."""
    if not self.ready:
      self.has = sorted(self.has)
    self.ready=True
    return self

  def mid(self):
    """Return the middle element of the sorted values (index len//2).

    Raises IndexError when the sample is empty.
    """
    has=self.ok().has
    return has[len(has)//2]

  def bar(self, num, fmt="%8.3f", word="%10s", width=50):
    """Format `num` as one comma-separated report row, scaled to this sample's lo..hi.

    The row holds: num's rank, num's label, num's median, the spread between
    num's 75th and 25th percentile values, a `width`-character text plot
    ("-" from the 25th to the 75th percentile, "|" at the centre column,
    "*" at the median), and finally this sample's lo and hi.
    """
    out  = [' '] * width
    def pos(x):
      # Map a value to a plot column; yields `width` when x equals self.hi.
      return int(width * (x - self.lo) / (self.hi - self.lo + 1E-30))
    has = num.ok().has
    b, c, d = [has[int(len(has)*x)] for x in [0.25,0.5,0.75]]
    nb, nd = pos(b), pos(d)
    # Keep the median marker on the last column when the median equals
    # self.hi; unclamped, out[width] raised IndexError.
    nc = min(pos(c), width - 1)
    for i in range(nb,nd): out[i] = "-"
    out[width//2] = "|"
    out[nc] = "*"
    return ', '.join(["%2d" % num.rank, word % num.txt, fmt%c, fmt%(d-b),
                      ''.join(out), fmt%self.lo,      fmt%self.hi ])

def different(x,y):
  """Non-parametric effect size and significance test.

  Returns the effect-size result when it is falsy (the significance test
  is then skipped); otherwise returns the significance test's result.
  """
  effect = _cliffsDelta(x,y)
  return _bootstrap(x,y) if effect else effect

def _cliffsDelta(x, y, effectSize=0.2):
  """non-parametric effect size. threshold is border between small=.11 and medium=.28
     from Table1 of  https://doi.org/10.3102/10769986025002101"""
  #if len(x) > 10*len(y) : return cliffsDelta(random.choices(x,10*len(y)),y)
  #if len(y) > 10*len(x) : return cliffsDelta(x, random.choices(y,10*len(x)))
  n,lt,gt = 0,0,0
  for x1 in x:
    for y1 in y:
      n += 1
      if x1 > y1: gt += 1
      if x1 < y1: lt += 1
  return abs(lt - gt)/n  > effectSize # true if different

def _bootstrap(y0,z0,confidence=.05,Experiments=512,):
  """Non-parametric significance test From Introduction to Bootstrap,
     Efron and Tibshirani, 1993, chapter 20. https://doi.org/10.1201/9780429246593

  Both lists are shifted so each has the mean of the pooled data, then
  resampled with replacement `Experiments` times. Returns True when the
  fraction of resamples whose statistic exceeds the observed statistic is
  below `confidence`.
  """
  def obs(a, b):
    # Absolute difference of means over the combined standard error
    # (the tiny constant avoids division by zero).
    spread = (a.sd**2/a.n + b.sd**2/b.n)**.5
    return abs(a.mu - b.mu) / (spread + 1E-30)

  pooled, ys, zs = SAMPLE(y0+z0), SAMPLE(y0), SAMPLE(z0)
  observed = obs(ys, zs)
  # Re-centre each list on the pooled mean.
  y_shifted = [value - ys.mu + pooled.mu for value in y0]
  z_shifted = [value - zs.mu + pooled.mu for value in z0]
  exceeded = 0
  for _ in range(Experiments):
    y_resample = SAMPLE(random.choices(y_shifted, k=len(y_shifted)))
    z_resample = SAMPLE(random.choices(z_shifted, k=len(z_shifted)))
    exceeded += obs(y_resample, z_resample) > observed
  return exceeded / Experiments < confidence # true if different

def sk(nums):
  """Sort nums on median. Give adjacent nums the same rank if they are statistically the same.

  Returns the sorted list; each element's `rank` attribute is set in place.
  """
  def pooled(group):
    # Every raw value held by the samples in `group`, as one flat list.
    return [x for num in group for x in num.has]

  def sk1(nums, rank, lvl=1):
    overall = SAMPLE(pooled(nums))
    best, cut = -1, None
    # Find the split where the two sides' medians differ most from the
    # overall median, weighted by side size.
    for i in range(1, len(nums)):
      left = SAMPLE(pooled(nums[:i]))
      right = SAMPLE(pooled(nums[i:]))
      gain = (left.n*abs(left.mid() - overall.mid())
              + right.n*abs(right.mid() - overall.mid()))/overall.n
      if gain > best:
        best, cut = gain, i
    if cut and different(pooled(nums[:cut]), pooled(nums[cut:])):
      # The two sides differ: rank each side recursively, the right side
      # starting one rank above where the left side finished.
      rank = sk1(nums[:cut], rank, lvl+1) + 1
      return sk1(nums[cut:], rank, lvl+1)
    for num in nums:
      num.rank = rank
    return rank
  #------------
  ordered = sorted(nums, key=lambda num: num.mid())
  sk1(ordered, 0)
  return ordered

def eg0(nums):
  """Rank `nums` with sk() and print one bar row per sample, with a "#" line before each new rank."""
  # Pool every value so all rows are drawn on the same lo..hi scale.
  everything = SAMPLE([x for num in nums for x in num.has])
  previous = None
  for num in sk(nums):
    if num.rank != previous:
      print("#")
    previous = num.rank
    print(everything.bar(num, width=40, word="%20s", fmt="%5.2f"))

## Read all experiment results

In [67]:
# Collect every CSV file in the 'data' directory (edit the pattern if the results live elsewhere)
file_paths = glob.glob('data/*.csv')

# Read each CSV into its own DataFrame
dfs = []
for file_path in file_paths:
    dfs.append(pd.read_csv(file_path))

# Stack the per-file frames into one DataFrame with a fresh index
df = pd.concat(dfs, ignore_index=True)

# Display the combined DataFrame
df

Unnamed: 0,algo,ratio,seed,learner,acc,prec,rec,f1,auc_roc
0,random_pruning,0.2,82,LR,0.724138,0.583333,0.388889,0.466667,0.697222
1,random_pruning,0.2,82,SVM,0.741379,0.800000,0.222222,0.347826,0.569444
2,random_pruning,0.2,82,DT,0.568966,0.333333,0.388889,0.358974,0.555556
3,random_pruning,0.4,82,LR,0.689655,0.500000,0.333333,0.400000,0.694444
4,random_pruning,0.4,82,SVM,0.741379,0.800000,0.222222,0.347826,0.591667
...,...,...,...,...,...,...,...,...,...
895,smote,0.8,87,SVM,0.724138,0.545455,0.666667,0.600000,0.722222
896,smote,0.8,87,DT,0.706897,0.526316,0.555556,0.540541,0.681944
897,smote,1.0,87,LR,0.620690,0.416667,0.555556,0.476190,0.640278
898,smote,1.0,87,SVM,0.655172,0.461538,0.666667,0.545455,0.694444


#### Data transformation

In [68]:
# Split the rows into groups sharing the same 'learner', 'algo' and 'ratio'
grouped_data = df.groupby(['learner', 'algo', 'ratio'])

# Nested mapping: learner -> {"<algo>_<ratio>": list of 'acc' values}
acc_values_dict = {}

for group_name, group_data in grouped_data:
    learner, algo, ratio = group_name
    # Only keep groups that contain exactly 10 distinct seeds
    if len(group_data['seed'].unique()) != 10:
        continue
    key = f"{algo}_{ratio}"
    # Create the learner's inner dictionary on first use, then store the list
    acc_values_dict.setdefault(learner, {})[key] = group_data['acc'].tolist()

print(acc_values_dict)


{'DT': {'RRP_0.2': [0.603448275862069, 0.6551724137931034, 0.6551724137931034, 0.6379310344827587, 0.6724137931034483, 0.6724137931034483, 0.6551724137931034, 0.6896551724137931, 0.603448275862069, 0.7241379310344828], 'RRP_0.4': [0.6379310344827587, 0.5172413793103449, 0.6551724137931034, 0.6551724137931034, 0.6551724137931034, 0.6379310344827587, 0.6379310344827587, 0.7586206896551724, 0.6551724137931034, 0.6724137931034483], 'RRP_0.6': [0.6551724137931034, 0.7241379310344828, 0.7068965517241379, 0.5344827586206896, 0.6551724137931034, 0.6896551724137931, 0.6724137931034483, 0.7068965517241379, 0.6379310344827587, 0.7586206896551724], 'RRP_0.8': [0.603448275862069, 0.6551724137931034, 0.6896551724137931, 0.7068965517241379, 0.603448275862069, 0.6724137931034483, 0.6379310344827587, 0.6551724137931034, 0.6206896551724138, 0.7241379310344828], 'RRP_1.0': [0.5689655172413793, 0.6379310344827587, 0.6551724137931034, 0.6206896551724138, 0.6551724137931034, 0.7586206896551724, 0.5862068965

In [69]:
# Scott-Knott analysis of the 'acc' lists collected for the DT learner

# One labelled SAMPLE per "<algo>_<ratio>" key
sample_list = []
for key, value in acc_values_dict['DT'].items():
  sample_list += [SAMPLE(value, str(key))]

eg0(sample_list)

#
 0,   random_pruning_1.0,  0.55,  0.10,       --------*---  |                   ,  0.43,  0.78
#
 1,   random_pruning_0.6,  0.59,  0.09,               ----*-|---                ,  0.43,  0.78
 1,   random_pruning_0.8,  0.60,  0.03,                 ----*                   ,  0.43,  0.78
 1,  gaussian_copula_0.4,  0.62,  0.05,                     |-*---              ,  0.43,  0.78
 1,  gaussian_copula_0.8,  0.62,  0.07,                   --|-*---              ,  0.43,  0.78
 1,  gaussian_copula_1.0,  0.62,  0.07,               ------|-*                 ,  0.43,  0.78
 1, random_oversampling_0.6,  0.62,  0.10,                     |-*---------        ,  0.43,  0.78
 1,            smote_0.4,  0.62,  0.09,                     |-*-------          ,  0.43,  0.78
 1,            smote_0.6,  0.62,  0.12,               ------|-*-----            ,  0.43,  0.78
 1,  gaussian_copula_0.2,  0.64,  0.03,                     | --*-              ,  0.43,  0.78
#
 2,  gaussian_copula_0.6,  0.64,  0.05,  

In [70]:
# Scott-Knott analysis of the 'acc' lists collected for the SVM learner

# One labelled SAMPLE per "<algo>_<ratio>" key
sample_list = []
for key, value in acc_values_dict['SVM'].items():
  sample_list += [SAMPLE(value, str(key))]

eg0(sample_list)

#
 0,   random_pruning_1.0,  0.64,  0.07,          ------*-   |                   ,  0.50,  0.86
 0,            smote_1.0,  0.64,  0.09,          ------*--- |                   ,  0.50,  0.86
 0,              RRP_1.0,  0.66,  0.05,                --*--|                   ,  0.50,  0.86
 0,  gaussian_copula_0.8,  0.66,  0.05,                --*--|                   ,  0.50,  0.86
#
 1,  gaussian_copula_1.0,  0.66,  0.07,            ------*- |                   ,  0.50,  0.86
 1,            smote_0.8,  0.67,  0.10,              ------*|---                ,  0.50,  0.86
#
 2,  gaussian_copula_0.4,  0.69,  0.05,                    -*---                ,  0.50,  0.86
 2,  gaussian_copula_0.6,  0.69,  0.03,                  ---*                   ,  0.50,  0.86
 2, random_oversampling_1.0,  0.69,  0.03,                  ---*                   ,  0.50,  0.86
 2,   random_pruning_0.8,  0.69,  0.02,                    -*                   ,  0.50,  0.86
#
 3,        svm_smote_1.0,  0.69,  0.09,

In [71]:
# Scott-Knott analysis of the 'acc' lists collected for the LR learner

# One labelled SAMPLE per "<algo>_<ratio>" key
sample_list = []
for key, value in acc_values_dict['LR'].items():
  sample_list += [SAMPLE(value, str(key))]

eg0(sample_list)

#
 0,            smote_1.0,  0.62,  0.03,        -----*       |                   ,  0.53,  0.81
#
 1,              RRP_1.0,  0.64,  0.03,                *----|                   ,  0.53,  0.81
 1,   random_pruning_1.0,  0.64,  0.07,        --------*-   |                   ,  0.53,  0.81
 1, random_oversampling_1.0,  0.66,  0.12,        ----------*--|----               ,  0.53,  0.81
 1,            smote_0.8,  0.66,  0.05,             -----*--|                   ,  0.53,  0.81
#
 2,        svm_smote_0.8,  0.66,  0.09,                --*--|------             ,  0.53,  0.81
 2,        svm_smote_1.0,  0.66,  0.07,             -----*--|-                  ,  0.53,  0.81
 2,  gaussian_copula_1.0,  0.67,  0.02,                  ---*                   ,  0.53,  0.81
 2, random_oversampling_0.6,  0.67,  0.09,                -----*------             ,  0.53,  0.81
#
 3,   random_pruning_0.8,  0.67,  0.05,                -----*-                  ,  0.53,  0.81
 3,              RRP_0.6,  0.69,  0.