In [None]:
import numpy as np
import pandas as pd
import os

# Despite its name, current_dir holds the PARENT of the working directory:
# the last path component of os.getcwd() is stripped off.
current_dir = os.path.split(os.getcwd())[0]



## Loading CSVs

In [14]:
# Reviewers' assessment table (CSV). Columns whose name starts with "Unnamed" are discarded.
reviewers_file_path = os.path.join(current_dir, 'analysis/assessment-team-table.csv')
reviewers_pd = pd.read_csv(reviewers_file_path)
reviewers_keep_mask = ~reviewers_pd.columns.str.contains('^Unnamed')
reviewers_pd = reviewers_pd.loc[:, reviewers_keep_mask]

# Classifier output: the 'Probabilities' sheet of the grid-search workbook,
# cleaned of "Unnamed" columns in the same way.
ml_xlsx_file = os.path.join(current_dir, 'analysis/rq1_gridsearch_anovaf_fs-k1200.xlsx')
ml_pd = pd.read_excel(ml_xlsx_file, sheet_name='Probabilities', engine='openpyxl')
ml_keep_mask = ~ml_pd.columns.str.contains('^Unnamed')
ml_pd = ml_pd.loc[:, ml_keep_mask]


In [None]:
# Join the classifier probabilities with the reviewers' table.
# Matching on 'uuid' (the id of the bib entry) instead of on titles avoids mistakes caused by
# string mismatches between different .bib files. 'Was Selected?' is part of the join key too.
merged_df = ml_pd.merge(reviewers_pd, how='outer', on=['uuid', 'Was Selected?'])
print(merged_df.shape[0])

# Normalize ML proba to compare with reviewers
def normalize_ml_proba(value, exclude_max=0.5, maybe_max=0.6):
    """Map a classifier probability onto the discrete 0/1/2 scale.

    Args:
        value: Probability produced by the classifier for one entry.
        exclude_max: Largest probability that is still mapped to 0.
        maybe_max: Largest probability that is still mapped to 1.

    Returns:
        0 when ``value <= exclude_max``, 1 when ``value <= maybe_max``,
        2 otherwise. A missing value (NaN/None) is returned as NaN.
    """
    # A missing probability (e.g. a row that exists on only one side of an
    # outer merge) fails both comparisons below and would otherwise be
    # labelled 2 by accident, so it is propagated as missing instead.
    if pd.isna(value):
        return np.nan
    if value <= exclude_max:
        return 0
    if value <= maybe_max:
        return 1
    return 2
    

# Shorten the column names and keep only the title column coming from the classifier table
column_aliases = {'RFOREST_proba': 'RF', 'SVM_proba': 'SVM', 'Was Selected?': 'FR', 'Titles_x': 'Titles'}
merged_df = merged_df.rename(columns=column_aliases)
merged_df = merged_df.drop(columns=['Titles_y'])

# Turn both classifier probability columns into discrete labels
for proba_column in ('RF', 'SVM'):
    merged_df[proba_column] = merged_df[proba_column].apply(normalize_ml_proba)

# Put the final results (FR) on the same scale (0 - excluded | 2 - included)
merged_df['FR'] = 2 * merged_df['FR']

merged_df.head()

551


Unnamed: 0,uuid,Titles,FR,SVM,RF,R1,R2,R3
0,69094027-4a16-4b1a-b3c9-41b45283a8d4,A Large-Scale Empirical Study of Practitioners...,0,0,0,0,0,0
1,9e3430a7-9f4c-4966-aa54-6e81be0f693c,From User-Centered to Adoption-Centered Design...,0,2,0,0,0,0
2,ec2ceb27-5ff3-4109-b252-58a2ca13d5a1,"Fontys ICT, Partners in Education Program: Int...",0,2,2,0,0,0
3,532d077e-ea35-42fa-a22d-8f8432dc5649,Teaching Software Developers to Perform UX Tasks,0,0,0,0,0,0
4,6b70aaf1-1f6d-4903-8c37-2a8fcd9d10e4,Applying Options Thinking to Shape Generativit...,0,2,0,0,0,0


## Calculate the Euclidean Distance

In [None]:

def _distance_to_final(df, members, reference='FR'):
    """Euclidean distance between the row-wise average of `members` and `reference`.

    Args:
        df: DataFrame holding one column per name in `members` plus `reference`.
        members: Tuple of column names whose values are averaged row by row.
        reference: Column the averaged values are compared against.

    Returns:
        sqrt(sum((avg(members) - reference) ** 2)) over all rows.
    """
    combined = df[members[0]]
    for column in members[1:]:
        combined = combined + df[column]
    # A single column is compared as-is; only real averages are divided.
    if len(members) > 1:
        combined = combined / len(members)
    return np.sqrt(((combined - df[reference]) ** 2).sum())


def computeED(df, ml_algo):
    """Print and return Euclidean distances of raters/classifier to the final result.

    Distances to the 'FR' column are computed for:
      * each of 'R1', 'R2', 'R3' and the `ml_algo` column individually,
      * the average of every pair drawn from those four columns,
      * the average of every group of three drawn from those four columns.

    Args:
        df: DataFrame with the columns 'R1', 'R2', 'R3', 'FR' and `ml_algo`.
        ml_algo: Name of the classifier column (e.g. 'RF' or 'SVM'); it is
            also used in the printed labels and in the result keys.

    Returns:
        dict mapping keys such as 'distance_R1_FR', 'distance_pair_R1_R2' or
        'distance_group_R1_R2_<ml_algo>' to the corresponding distance.
        Previously the dict was built but discarded. When the call is the
        last expression of a notebook cell, end it with ';' to avoid echoing
        the dict below the printed report.

    Raises:
        KeyError: If one of the required columns is missing from `df`.
    """
    reviewers = ('R1', 'R2', 'R3')

    # Each entry: (result key, printed label, columns to average).
    # EuclideanDistance(i, FR)
    individual = [
        (f'distance_{name}_FR', f'{name} vs FR', (name,))
        for name in reviewers + (ml_algo,)
    ]

    # EuclideanDistance(avg(i,j), FR)
    pair_members = [('R1', 'R2'), ('R1', 'R3'), ('R2', 'R3')]
    pair_members += [(name, ml_algo) for name in reviewers]
    pairs = [
        ('distance_pair_' + '_'.join(cols), 'pair(' + ','.join(cols) + ') vs FR', cols)
        for cols in pair_members
    ]

    # EuclideanDistance(avg(i,j,k), FR)
    group_members = [
        reviewers,
        (ml_algo, 'R2', 'R3'),
        ('R1', ml_algo, 'R3'),
        ('R1', 'R2', ml_algo),
    ]
    groups = [
        ('distance_group_' + '_'.join(cols), 'group(' + ','.join(cols) + ') vs FR', cols)
        for cols in group_members
    ]

    sections = (individual, pairs, groups)

    # Compute everything before printing so a missing column fails before
    # any partial report is emitted.
    euclidean_distances = dict()
    for section in sections:
        for key, _label, cols in section:
            euclidean_distances[key] = _distance_to_final(df, cols)

    print(f'Euclidean Distances for {ml_algo}:')
    for section in sections:
        for key, label, _cols in section:
            print(f'\t{label} =  ', euclidean_distances[key])
        print(f'\t----------------------------')

    return euclidean_distances

### Random Forest (RF)

In [25]:
# Distances to the final results (FR) using the Random Forest labels.
# The trailing ';' keeps the cell output limited to the printed report.
computeED(df=merged_df, ml_algo='RF');

Euclidean Distances for RF:
	R1 vs FR =   12.0
	R2 vs FR =   9.9498743710662
	R3 vs FR =   11.180339887498949
	RF vs FR =   16.673332000533065
	----------------------------
	pair(R1,R2) vs FR =   8.902246907382429
	pair(R1,R3) vs FR =   9.233092656309694
	pair(R2,R3) vs FR =   8.972179222463181
	pair(R1,RF) vs FR =   11.575836902790225
	pair(R2,RF) vs FR =   11.478240283248997
	pair(R3,RF) vs FR =   11.800423721205947
	----------------------------
	group(R1,R2,R3) vs FR =   8.246211251235321
	group(RF,R2,R3) vs FR =   10.022197585581939
	group(R1,RF,R3) vs FR =   9.927515074556954
	group(R1,R2,RF) vs FR =   9.769567259835231
	----------------------------


### SVM

In [26]:
# Distances to the final results (FR) using the SVM labels.
# The trailing ';' keeps the cell output limited to the printed report.
computeED(df=merged_df, ml_algo='SVM');

Euclidean Distances for SVM:
	R1 vs FR =   12.0
	R2 vs FR =   9.9498743710662
	R3 vs FR =   11.180339887498949
	SVM vs FR =   29.614185789921695
	----------------------------
	pair(R1,R2) vs FR =   8.902246907382429
	pair(R1,R3) vs FR =   9.233092656309694
	pair(R2,R3) vs FR =   8.972179222463181
	pair(R1,SVM) vs FR =   17.48570845004571
	pair(R2,SVM) vs FR =   16.837458240482736
	pair(R3,SVM) vs FR =   17.24818831066034
	----------------------------
	group(R1,R2,R3) vs FR =   8.246211251235321
	group(SVM,R2,R3) vs FR =   13.102162671355698
	group(R1,SVM,R3) vs FR =   13.366625103842281
	group(R1,R2,SVM) vs FR =   13.140268896284683
	----------------------------
