In [1]:
import pandas as pd
import numpy as np

In [2]:
d = {'bow': '(VSM,-,X)', 
     'tfidf': '(VSM,+,X)', 
     'lsi': '(LSI,-,-)',
     'lsi_tfidf': '(LSI,+,-)',
     'lsi_linear_combined': '(LSI,-,+)',
     'lsi_tfidf_linear_combined': '(LSI,+,+)',
     'nmf': '(NMF,-,-)',
     'nmf_tfidf': '(NMF,+,-)',
     'nmf_linear_combined': '(NMF,-,+)',
     'nmf_tfidf_linear_combined': '(NMF,+,+)',
     'lda': '(LDA,X,-)',
     'lda_linear_combined': '(LDA,X,+)',
     'bert': '(BERT,X,X)'}

d_DR = {'som': 'SOM',
       'mds': 'MDS',
       'umap': 'UMAP',
       'tsne': 'TSNE'}

In [3]:
inverted_d = {value:key for key, value in d.items()}
inverted_d_DR = {value:key for key, value in d_DR.items()}
print(inverted_d)

{'(VSM,-,X)': 'bow', '(VSM,+,X)': 'tfidf', '(LSI,-,-)': 'lsi', '(LSI,+,-)': 'lsi_tfidf', '(LSI,-,+)': 'lsi_linear_combined', '(LSI,+,+)': 'lsi_tfidf_linear_combined', '(NMF,-,-)': 'nmf', '(NMF,+,-)': 'nmf_tfidf', '(NMF,-,+)': 'nmf_linear_combined', '(NMF,+,+)': 'nmf_tfidf_linear_combined', '(LDA,X,-)': 'lda', '(LDA,X,+)': 'lda_linear_combined', '(BERT,X,X)': 'bert'}


In [4]:
def compute_accuracy(df):
    trustworthiness = df['Trustworthiness'].tolist()
    continuity = df['Continuity'].tolist()
    neighborhood_hit = df['7-Neighborhood Hit'].tolist()
    shephard_diagram_correlation = df['Shephard Diagram Correlation'].tolist()
    
    result = []
    for i in range(len(trustworthiness)):
        result.append(round(trustworthiness[i]/6 + continuity[i]/6 + neighborhood_hit[i]/2 + 0.5*(shephard_diagram_correlation[i]+1)/6,2))
    
    return result

In [5]:
def compute_perception(df):
    distance_consistency = df['Distance consistency'].tolist()
    silhouette_coefficient = df['Silhouette coefficient'].tolist()
    calinski_harabasz = df['Calinski-Harabasz-Index Normalized'].tolist()
    davies_bouldin = df['Davies-Bouldin-Index Normalized']
    
    result = []
    for i in range(len(distance_consistency)):
        result.append(round(distance_consistency[i]/6 + 0.5*(silhouette_coefficient[i] + 1)/6 +
                            (calinski_harabasz[i])/3 + (1-davies_bouldin[i])/3,2))
    
    return result

In [6]:
# reading 20 Newsgroups
df_20newsgroup = pd.read_csv('data/full_res_20_newsgroups.csv')
df_20newsgroup['Calinski-Harabasz-Index Normalized'] = df_20newsgroup['Calinski-Harabasz-Index']/df_20newsgroup['Calinski-Harabasz-Index'].max()
df_20newsgroup['Davies-Bouldin-Index Normalized'] = df_20newsgroup['Davies-Bouldin-Index']/df_20newsgroup['Davies-Bouldin-Index'].max()

df_20newsgroup['accuracy'] = compute_accuracy(df_20newsgroup)
df_20newsgroup['perception'] = compute_perception(df_20newsgroup)
df_20newsgroup.head(10)

Unnamed: 0,Experiment,Trustworthiness,Continuity,Shephard Diagram Correlation,Normalized Stress,7-Neighborhood Hit,Calinski-Harabasz-Index,Silhouette coefficient,Davies-Bouldin-Index,SDBW validity index,Distance consistency,Complete List of Hyperparameters,DR,TM,Calinski-Harabasz-Index Normalized,Davies-Bouldin-Index Normalized,accuracy,perception
0,20_newsgroups_bert_n_categories_20_mds,0.50379,0.50751,-0.024379,0.388811,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 800}},mds,bert,0.000457,0.118467,0.34,0.38
1,20_newsgroups_bert_n_categories_20_mds,0.50379,0.50751,-0.024379,0.388811,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 340}},mds,bert,0.000457,0.118467,0.34,0.38
2,20_newsgroups_bert_n_categories_20_mds,0.531983,0.472036,-0.304442,1.097868,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 680}},mds,bert,0.000457,0.118467,0.32,0.38
3,20_newsgroups_bert_n_categories_20_mds,0.531983,0.472036,-0.304442,1.097868,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 320}},mds,bert,0.000457,0.118467,0.32,0.38
4,20_newsgroups_bert_n_categories_20_mds,0.50379,0.50751,-0.024379,0.388811,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 500}},mds,bert,0.000457,0.118467,0.34,0.38
5,20_newsgroups_bert_n_categories_20_mds,0.50379,0.50751,-0.024379,0.388811,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 740}},mds,bert,0.000457,0.118467,0.34,0.38
6,20_newsgroups_bert_n_categories_20_mds,0.50379,0.50751,-0.024379,0.388811,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 580}},mds,bert,0.000457,0.118467,0.34,0.38
7,20_newsgroups_bert_n_categories_20_mds,0.531983,0.472036,-0.304442,1.097868,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 440}},mds,bert,0.000457,0.118467,0.32,0.38
8,20_newsgroups_bert_n_categories_20_mds,0.512018,0.508179,0.001672,7.18281,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 760}},mds,bert,0.000457,0.118467,0.35,0.38
9,20_newsgroups_bert_n_categories_20_mds,0.506364,0.499894,-0.061213,0.293561,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 540}},mds,bert,0.000457,0.118467,0.34,0.38


In [7]:
df_20newsgroup['Dataset'] = '20 Newsgroup'
df_20newsgroup_short = df_20newsgroup.loc[:, ['Dataset', 'accuracy']]
new_names_DR = [d_DR[name] for name in df_20newsgroup['DR'].tolist()]
df_20newsgroup_short['DR'] = new_names_DR
new_names_TM = [d[name] for name in df_20newsgroup['TM'].tolist()]
df_20newsgroup_short['TM'] = new_names_TM
df_20newsgroup_short.head(10)

Unnamed: 0,Dataset,accuracy,DR,TM
0,20 Newsgroup,0.34,MDS,"(BERT,X,X)"
1,20 Newsgroup,0.34,MDS,"(BERT,X,X)"
2,20 Newsgroup,0.32,MDS,"(BERT,X,X)"
3,20 Newsgroup,0.32,MDS,"(BERT,X,X)"
4,20 Newsgroup,0.34,MDS,"(BERT,X,X)"
5,20 Newsgroup,0.34,MDS,"(BERT,X,X)"
6,20 Newsgroup,0.34,MDS,"(BERT,X,X)"
7,20 Newsgroup,0.32,MDS,"(BERT,X,X)"
8,20 Newsgroup,0.35,MDS,"(BERT,X,X)"
9,20 Newsgroup,0.34,MDS,"(BERT,X,X)"


In [8]:
df_20newsgroup_short_perception = df_20newsgroup.loc[:, ['Dataset', 'perception']]
new_names_DR = [d_DR[name] for name in df_20newsgroup['DR'].tolist()]
df_20newsgroup_short_perception['DR'] = new_names_DR
new_names_TM = [d[name] for name in df_20newsgroup['TM'].tolist()]
df_20newsgroup_short_perception['TM'] = new_names_TM
df_20newsgroup_short_perception.head(10)

Unnamed: 0,Dataset,perception,DR,TM
0,20 Newsgroup,0.38,MDS,"(BERT,X,X)"
1,20 Newsgroup,0.38,MDS,"(BERT,X,X)"
2,20 Newsgroup,0.38,MDS,"(BERT,X,X)"
3,20 Newsgroup,0.38,MDS,"(BERT,X,X)"
4,20 Newsgroup,0.38,MDS,"(BERT,X,X)"
5,20 Newsgroup,0.38,MDS,"(BERT,X,X)"
6,20 Newsgroup,0.38,MDS,"(BERT,X,X)"
7,20 Newsgroup,0.38,MDS,"(BERT,X,X)"
8,20 Newsgroup,0.38,MDS,"(BERT,X,X)"
9,20 Newsgroup,0.38,MDS,"(BERT,X,X)"


In [9]:
# reading emails
df_emails = pd.read_csv('data/full_res_emails.csv')
df_emails['Calinski-Harabasz-Index Normalized'] = df_emails['Calinski-Harabasz-Index']/df_emails['Calinski-Harabasz-Index'].max()
df_emails['Davies-Bouldin-Index Normalized'] = df_emails['Davies-Bouldin-Index']/df_emails['Davies-Bouldin-Index'].max()

df_emails['accuracy'] = compute_accuracy(df_emails)
df_emails['perception'] = compute_perception(df_emails)
df_emails.head(10)

Unnamed: 0,Experiment,Trustworthiness,Continuity,Shephard Diagram Correlation,Normalized Stress,7-Neighborhood Hit,Calinski-Harabasz-Index,Silhouette coefficient,Davies-Bouldin-Index,SDBW validity index,Distance consistency,Complete List of Hyperparameters,DR,TM,Calinski-Harabasz-Index Normalized,Davies-Bouldin-Index Normalized,accuracy,perception
0,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 720}},mds,bert,0.000371,0.220254,0.45,0.38
1,emails_bert_n_categories_4_mds,0.50828,0.509272,-0.012678,2.14449,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 580}},mds,bert,0.000371,0.220254,0.47,0.38
2,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 780}},mds,bert,0.000371,0.220254,0.45,0.38
3,emails_bert_n_categories_4_mds,0.50828,0.509272,-0.012678,2.14449,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 360}},mds,bert,0.000371,0.220254,0.47,0.38
4,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 640}},mds,bert,0.000371,0.220254,0.45,0.38
5,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 840}},mds,bert,0.000371,0.220254,0.45,0.38
6,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 680}},mds,bert,0.000371,0.220254,0.45,0.38
7,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 540}},mds,bert,0.000371,0.220254,0.45,0.38
8,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 440}},mds,bert,0.000371,0.220254,0.45,0.38
9,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 520}},mds,bert,0.000371,0.220254,0.45,0.38


In [10]:
df_emails['Dataset'] = 'Emails'
df_emails_short = df_emails.loc[:, ['Dataset', 'accuracy']]
new_names_DR = [d_DR[name] for name in df_emails['DR'].tolist()]
df_emails_short['DR'] = new_names_DR
new_names_TM = [d[name] for name in df_emails['TM'].tolist()]
df_emails_short['TM'] = new_names_TM
df_emails_short.head(10)

Unnamed: 0,Dataset,accuracy,DR,TM
0,Emails,0.45,MDS,"(BERT,X,X)"
1,Emails,0.47,MDS,"(BERT,X,X)"
2,Emails,0.45,MDS,"(BERT,X,X)"
3,Emails,0.47,MDS,"(BERT,X,X)"
4,Emails,0.45,MDS,"(BERT,X,X)"
5,Emails,0.45,MDS,"(BERT,X,X)"
6,Emails,0.45,MDS,"(BERT,X,X)"
7,Emails,0.45,MDS,"(BERT,X,X)"
8,Emails,0.45,MDS,"(BERT,X,X)"
9,Emails,0.45,MDS,"(BERT,X,X)"


In [11]:
df_emails_short_perception = df_emails.loc[:, ['Dataset', 'perception']]
new_names_DR = [d_DR[name] for name in df_emails['DR'].tolist()]
df_emails_short_perception['DR'] = new_names_DR
new_names_TM = [d[name] for name in df_emails['TM'].tolist()]
df_emails_short_perception['TM'] = new_names_TM
df_emails_short_perception.head(10)

Unnamed: 0,Dataset,perception,DR,TM
0,Emails,0.38,MDS,"(BERT,X,X)"
1,Emails,0.38,MDS,"(BERT,X,X)"
2,Emails,0.38,MDS,"(BERT,X,X)"
3,Emails,0.38,MDS,"(BERT,X,X)"
4,Emails,0.38,MDS,"(BERT,X,X)"
5,Emails,0.38,MDS,"(BERT,X,X)"
6,Emails,0.38,MDS,"(BERT,X,X)"
7,Emails,0.38,MDS,"(BERT,X,X)"
8,Emails,0.38,MDS,"(BERT,X,X)"
9,Emails,0.38,MDS,"(BERT,X,X)"


In [12]:
# reading github
df_github = pd.read_csv('data/full_res_github_projects.csv')
df_github['Calinski-Harabasz-Index Normalized'] = df_github['Calinski-Harabasz-Index']/df_github['Calinski-Harabasz-Index'].max()
df_github['Davies-Bouldin-Index Normalized'] = df_github['Davies-Bouldin-Index']/df_github['Davies-Bouldin-Index'].max()

df_github['accuracy'] = compute_accuracy(df_github)
df_github['perception'] = compute_perception(df_github)
df_github.head(10)

Unnamed: 0,Experiment,Trustworthiness,Continuity,Shephard Diagram Correlation,Normalized Stress,7-Neighborhood Hit,Calinski-Harabasz-Index,Silhouette coefficient,Davies-Bouldin-Index,SDBW validity index,Distance consistency,Complete List of Hyperparameters,DR,TM,Calinski-Harabasz-Index Normalized,Davies-Bouldin-Index Normalized,accuracy,perception
0,github_projects_bow_mds,0.481415,0.362911,-0.536036,0.653439,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 440}},mds,bow,0.001065,0.031336,0.3,0.42
1,github_projects_bow_mds,0.481415,0.362911,-0.536036,0.653439,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 620}},mds,bow,0.001065,0.031336,0.3,0.42
2,github_projects_bow_mds,0.491366,0.491266,-0.052139,0.915395,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 400}},mds,bow,0.001065,0.031336,0.36,0.42
3,github_projects_bow_mds,0.481415,0.362911,-0.536036,0.653439,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 460}},mds,bow,0.001065,0.031336,0.3,0.42
4,github_projects_bow_mds,0.491366,0.491266,-0.052139,0.915395,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 320}},mds,bow,0.001065,0.031336,0.36,0.42
5,github_projects_bow_mds,0.481415,0.362911,-0.536036,0.653439,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 840}},mds,bow,0.001065,0.031336,0.3,0.42
6,github_projects_bow_mds,0.491366,0.491266,-0.052139,0.915395,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 300}},mds,bow,0.001065,0.031336,0.36,0.42
7,github_projects_bow_mds,0.481415,0.362911,-0.536036,0.653439,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 340}},mds,bow,0.001065,0.031336,0.3,0.42
8,github_projects_bow_mds,0.481415,0.362911,-0.536036,0.653439,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 700}},mds,bow,0.001065,0.031336,0.3,0.42
9,github_projects_bow_mds,0.481415,0.362911,-0.536036,0.653439,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 820}},mds,bow,0.001065,0.031336,0.3,0.42


In [13]:
df_github['Dataset'] = 'GitHub'
df_github_short = df_github.loc[:, ['Dataset', 'accuracy']]
new_names_DR = [d_DR[name] for name in df_github['DR'].tolist()]
df_github_short['DR'] = new_names_DR
new_names_TM = [d[name] for name in df_github['TM'].tolist()]
df_github_short['TM'] = new_names_TM
df_github_short.head(10)

Unnamed: 0,Dataset,accuracy,DR,TM
0,GitHub,0.3,MDS,"(VSM,-,X)"
1,GitHub,0.3,MDS,"(VSM,-,X)"
2,GitHub,0.36,MDS,"(VSM,-,X)"
3,GitHub,0.3,MDS,"(VSM,-,X)"
4,GitHub,0.36,MDS,"(VSM,-,X)"
5,GitHub,0.3,MDS,"(VSM,-,X)"
6,GitHub,0.36,MDS,"(VSM,-,X)"
7,GitHub,0.3,MDS,"(VSM,-,X)"
8,GitHub,0.3,MDS,"(VSM,-,X)"
9,GitHub,0.3,MDS,"(VSM,-,X)"


In [14]:
df_github_short_perception = df_github.loc[:, ['Dataset', 'perception']]
new_names_DR = [d_DR[name] for name in df_github['DR'].tolist()]
df_github_short_perception['DR'] = new_names_DR
new_names_TM = [d[name] for name in df_github['TM'].tolist()]
df_github_short_perception['TM'] = new_names_TM
df_github_short_perception.head(10)

Unnamed: 0,Dataset,perception,DR,TM
0,GitHub,0.42,MDS,"(VSM,-,X)"
1,GitHub,0.42,MDS,"(VSM,-,X)"
2,GitHub,0.42,MDS,"(VSM,-,X)"
3,GitHub,0.42,MDS,"(VSM,-,X)"
4,GitHub,0.42,MDS,"(VSM,-,X)"
5,GitHub,0.42,MDS,"(VSM,-,X)"
6,GitHub,0.42,MDS,"(VSM,-,X)"
7,GitHub,0.42,MDS,"(VSM,-,X)"
8,GitHub,0.42,MDS,"(VSM,-,X)"
9,GitHub,0.42,MDS,"(VSM,-,X)"


In [15]:
# reading reuters
df_reuters = pd.read_csv('data/full_res_reuters.csv')
df_reuters['Calinski-Harabasz-Index Normalized'] = df_reuters['Calinski-Harabasz-Index']/df_reuters['Calinski-Harabasz-Index'].max()
df_reuters['Davies-Bouldin-Index Normalized'] = df_reuters['Davies-Bouldin-Index']/df_reuters['Davies-Bouldin-Index'].max()

df_reuters['accuracy'] = compute_accuracy(df_reuters)
df_reuters['perception'] = compute_perception(df_reuters)
df_reuters.head(10)

Unnamed: 0,Experiment,Trustworthiness,Continuity,Shephard Diagram Correlation,Normalized Stress,7-Neighborhood Hit,Calinski-Harabasz-Index,Silhouette coefficient,Davies-Bouldin-Index,SDBW validity index,Distance consistency,Complete List of Hyperparameters,DR,TM,Calinski-Harabasz-Index Normalized,Davies-Bouldin-Index Normalized,accuracy,perception
0,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.358584,0.802779,-0.670315,74.591222,1.468805,0.002741,{'mds': {'max_iter': 560}},mds,bert,0.349456,3.1e-05,0.43,0.48
1,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.361825,0.931937,-0.668009,72.290487,1.418416,0.00285,{'mds': {'max_iter': 760}},mds,bert,0.40568,3e-05,0.43,0.5
2,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.358834,0.920033,-0.688146,103.000758,1.415418,0.003508,{'mds': {'max_iter': 800}},mds,bert,0.400497,4.2e-05,0.43,0.49
3,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.36076,1.044194,-0.628188,87.678857,1.35749,0.003398,{'mds': {'max_iter': 660}},mds,bert,0.454546,3.6e-05,0.43,0.52
4,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.356845,1.018096,-0.668368,71.485715,1.503931,0.002302,{'mds': {'max_iter': 820}},mds,bert,0.443185,2.9e-05,0.43,0.51
5,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.361575,1.05332,-0.681794,91.809109,1.425284,0.003508,{'mds': {'max_iter': 700}},mds,bert,0.458518,3.8e-05,0.43,0.51
6,reuters_bert_n_categories_65_mds,0.55771,0.517704,-0.282974,0.415034,0.359476,1.170584,-0.650773,65.771922,1.522557,0.00296,{'mds': {'max_iter': 640}},mds,bert,0.509564,2.7e-05,0.42,0.53
7,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.360056,1.032439,-0.682262,53.780608,1.557365,0.003947,{'mds': {'max_iter': 620}},mds,bert,0.449429,2.2e-05,0.43,0.51
8,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.360854,0.987751,-0.690038,60.255208,1.455554,0.002631,{'mds': {'max_iter': 480}},mds,bert,0.429976,2.5e-05,0.43,0.5
9,reuters_bert_n_categories_65_mds,0.55771,0.517704,-0.282974,0.415034,0.35896,0.994616,-0.704999,60.477665,1.377222,0.003398,{'mds': {'max_iter': 360}},mds,bert,0.432964,2.5e-05,0.42,0.5


In [16]:
df_reuters['Dataset'] = 'Reuters'
df_reuters_short = df_reuters.loc[:, ['Dataset', 'accuracy']]
new_names_DR = [d_DR[name] for name in df_reuters['DR'].tolist()]
df_reuters_short['DR'] = new_names_DR
new_names_TM = [d[name] for name in df_reuters['TM'].tolist()]
df_reuters_short['TM'] = new_names_TM
df_reuters_short.head(10)

Unnamed: 0,Dataset,accuracy,DR,TM
0,Reuters,0.43,MDS,"(BERT,X,X)"
1,Reuters,0.43,MDS,"(BERT,X,X)"
2,Reuters,0.43,MDS,"(BERT,X,X)"
3,Reuters,0.43,MDS,"(BERT,X,X)"
4,Reuters,0.43,MDS,"(BERT,X,X)"
5,Reuters,0.43,MDS,"(BERT,X,X)"
6,Reuters,0.42,MDS,"(BERT,X,X)"
7,Reuters,0.43,MDS,"(BERT,X,X)"
8,Reuters,0.43,MDS,"(BERT,X,X)"
9,Reuters,0.42,MDS,"(BERT,X,X)"


In [17]:
df_reuters_short_perception = df_reuters.loc[:, ['Dataset', 'perception']]
new_names_DR = [d_DR[name] for name in df_reuters['DR'].tolist()]
df_reuters_short_perception['DR'] = new_names_DR
new_names_TM = [d[name] for name in df_reuters['TM'].tolist()]
df_reuters_short_perception['TM'] = new_names_TM
df_reuters_short_perception.head(10)

Unnamed: 0,Dataset,perception,DR,TM
0,Reuters,0.48,MDS,"(BERT,X,X)"
1,Reuters,0.5,MDS,"(BERT,X,X)"
2,Reuters,0.49,MDS,"(BERT,X,X)"
3,Reuters,0.52,MDS,"(BERT,X,X)"
4,Reuters,0.51,MDS,"(BERT,X,X)"
5,Reuters,0.51,MDS,"(BERT,X,X)"
6,Reuters,0.53,MDS,"(BERT,X,X)"
7,Reuters,0.51,MDS,"(BERT,X,X)"
8,Reuters,0.5,MDS,"(BERT,X,X)"
9,Reuters,0.5,MDS,"(BERT,X,X)"


In [18]:
# reading 7 Categories
df_7categories = pd.read_csv('data/full_res_seven_categories.csv')
df_7categories['Calinski-Harabasz-Index Normalized'] = df_7categories['Calinski-Harabasz-Index']/df_7categories['Calinski-Harabasz-Index'].max()
df_7categories['Davies-Bouldin-Index Normalized'] = df_7categories['Davies-Bouldin-Index']/df_7categories['Davies-Bouldin-Index'].max()

df_7categories['accuracy'] = compute_accuracy(df_7categories)
df_7categories['perception'] = compute_perception(df_7categories)
df_7categories.head(10)

Unnamed: 0,Experiment,Trustworthiness,Continuity,Shephard Diagram Correlation,Normalized Stress,7-Neighborhood Hit,Calinski-Harabasz-Index,Silhouette coefficient,Davies-Bouldin-Index,SDBW validity index,Distance consistency,Complete List of Hyperparameters,DR,TM,Calinski-Harabasz-Index Normalized,Davies-Bouldin-Index Normalized,accuracy,perception
0,seven_categories_bert_n_categories_7_mds,0.507452,0.511752,-0.035918,0.42165,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 420}},mds,bert,0.000421,0.347754,0.41,0.32
1,seven_categories_bert_n_categories_7_mds,0.514772,0.500909,-0.245096,0.635536,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 880}},mds,bert,0.000421,0.347754,0.39,0.32
2,seven_categories_bert_n_categories_7_mds,0.514772,0.500909,-0.245096,0.635536,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 800}},mds,bert,0.000421,0.347754,0.39,0.32
3,seven_categories_bert_n_categories_7_mds,0.514772,0.500909,-0.245096,0.635536,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 500}},mds,bert,0.000421,0.347754,0.39,0.32
4,seven_categories_bert_n_categories_7_mds,0.507452,0.511752,-0.035918,0.42165,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 700}},mds,bert,0.000421,0.347754,0.41,0.32
5,seven_categories_bert_n_categories_7_mds,0.514772,0.500909,-0.245096,0.635536,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 340}},mds,bert,0.000421,0.347754,0.39,0.32
6,seven_categories_bert_n_categories_7_mds,0.514772,0.500909,-0.245096,0.635536,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 440}},mds,bert,0.000421,0.347754,0.39,0.32
7,seven_categories_bert_n_categories_7_mds,0.507452,0.511752,-0.035918,0.42165,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 840}},mds,bert,0.000421,0.347754,0.41,0.32
8,seven_categories_bert_n_categories_7_mds,0.514772,0.500909,-0.245096,0.635536,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 360}},mds,bert,0.000421,0.347754,0.39,0.32
9,seven_categories_bert_n_categories_7_mds,0.514772,0.500909,-0.245096,0.635536,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 680}},mds,bert,0.000421,0.347754,0.39,0.32


In [19]:
df_7categories['Dataset'] = '7 Categories'
df_7categories_short = df_7categories.loc[:, ['Dataset', 'accuracy']]
new_names_DR = [d_DR[name] for name in df_7categories['DR'].tolist()]
df_7categories_short['DR'] = new_names_DR
new_names_TM = [d[name] for name in df_7categories['TM'].tolist()]
df_7categories_short['TM'] = new_names_TM
df_7categories_short.head(10)

Unnamed: 0,Dataset,accuracy,DR,TM
0,7 Categories,0.41,MDS,"(BERT,X,X)"
1,7 Categories,0.39,MDS,"(BERT,X,X)"
2,7 Categories,0.39,MDS,"(BERT,X,X)"
3,7 Categories,0.39,MDS,"(BERT,X,X)"
4,7 Categories,0.41,MDS,"(BERT,X,X)"
5,7 Categories,0.39,MDS,"(BERT,X,X)"
6,7 Categories,0.39,MDS,"(BERT,X,X)"
7,7 Categories,0.41,MDS,"(BERT,X,X)"
8,7 Categories,0.39,MDS,"(BERT,X,X)"
9,7 Categories,0.39,MDS,"(BERT,X,X)"


In [20]:
df_7categories_short_perception = df_7categories.loc[:, ['Dataset', 'perception']]
new_names_DR = [d_DR[name] for name in df_7categories['DR'].tolist()]
df_7categories_short_perception['DR'] = new_names_DR
new_names_TM = [d[name] for name in df_7categories['TM'].tolist()]
df_7categories_short_perception['TM'] = new_names_TM
df_7categories_short_perception.head(10)

Unnamed: 0,Dataset,perception,DR,TM
0,7 Categories,0.32,MDS,"(BERT,X,X)"
1,7 Categories,0.32,MDS,"(BERT,X,X)"
2,7 Categories,0.32,MDS,"(BERT,X,X)"
3,7 Categories,0.32,MDS,"(BERT,X,X)"
4,7 Categories,0.32,MDS,"(BERT,X,X)"
5,7 Categories,0.32,MDS,"(BERT,X,X)"
6,7 Categories,0.32,MDS,"(BERT,X,X)"
7,7 Categories,0.32,MDS,"(BERT,X,X)"
8,7 Categories,0.32,MDS,"(BERT,X,X)"
9,7 Categories,0.32,MDS,"(BERT,X,X)"


## Analysis of Accuracy

In [21]:
def default_accuracy(df, tm, dr):
    TM = inverted_d[tm]
    DR = inverted_d_DR[dr]
    if DR == "mds":
        default_hyp = "'mds': {'max_iter': 300}"
        df_selected = df[(df["TM"] == TM) & (df["DR"] == DR) & (df['Complete List of Hyperparameters'].str.contains(default_hyp))]
        default_accuracy = df_selected["accuracy"].tolist()[0]
        return default_accuracy
    elif DR == "tsne": 
        default_hyp = "'tsne': {'perplexity': 30.0, 'early_exaggeration': 12.0, 'learning_rate': 'auto', 'n_iter': 1000, 'angle': 0.5}"
        df_selected = df[(df["TM"] == TM) & (df["DR"] == DR) & (df['Complete List of Hyperparameters'].str.contains(default_hyp))]
        default_accuracy = df_selected["accuracy"].tolist()[0]
        return default_accuracy
    elif DR == "umap":
        default_hyp = "'umap': {'n_neighbors': 15, 'min_dist': 0.1, 'metric': 'cosine', 'spread': 1.0, 'set_op_mix_ratio': 1.0, 'local_connectivity': 1, 'repulsion_strength': 1.0, 'negative_sample_rate': 5}"
        df_selected = df[(df["TM"] == TM) & (df["DR"] == DR) & (df['Complete List of Hyperparameters'].str.contains(default_hyp))]
        default_accuracy = df_selected["accuracy"].tolist()[0]
        return default_accuracy

In [22]:
df_results_accuracy = pd.DataFrame(columns=["TM", "DR", "value"])
for tm in set(df_20newsgroup_short["TM"].tolist()):
    for dr in ["MDS", "UMAP", "TSNE"]:
        i = 0
        j = 0
        for df in [df_20newsgroup, df_emails, df_github, df_reuters, df_7categories]:
            greater_than_default = 0
            try:
                default_acc = default_accuracy(df, tm, dr)
                if i == 0:
                    df_short = df_20newsgroup_short
                if i == 1:
                    df_short = df_emails_short
                if i == 2:
                    df_short = df_github_short
                if i == 3:
                    df_short = df_reuters_short
                if i == 4:
                        df_short = df_7categories_short
                
                df_short_selected = df_short[(df_short["TM"] == tm) & (df_short["DR"] == dr)]
                list_accuracy = df_short_selected["accuracy"].tolist()
                greater = [i for i in list_accuracy if i > default_acc]
                greater_than_default += len(greater)/len(list_accuracy)
                i += 1
                j += 1

            except:
                #print(tm, dr, "XXX")
                i += 1
        
        greater_than_default = round(1 - (greater_than_default/j),2)
        new_row = {"TM": tm, "DR": dr, "value": greater_than_default}
        df_results_accuracy = df_results_accuracy.append(new_row, ignore_index = True)
        print(tm, dr, greater_than_default)

(LSI,+,+) MDS 1.0
(LSI,+,+) UMAP 0.9
(LSI,+,+) TSNE 0.92
(LSI,+,-) MDS 0.9
(LSI,+,-) UMAP 0.9
(LSI,+,-) TSNE 0.93
(VSM,-,X) MDS 1.0
(VSM,-,X) UMAP 0.99
(VSM,-,X) TSNE 0.94
(NMF,-,+) MDS 1.0
(NMF,-,+) UMAP 0.88
(NMF,-,+) TSNE 0.9
(LSI,-,-) MDS 1.0
(LSI,-,-) UMAP 0.9
(LSI,-,-) TSNE 0.93
(LDA,X,+) MDS 1.0
(LDA,X,+) UMAP 0.89
(LDA,X,+) TSNE 0.89
(NMF,-,-) MDS 1.0
(NMF,-,-) UMAP 0.89
(NMF,-,-) TSNE 0.87
(LSI,-,+) MDS 1.0
(LSI,-,+) UMAP 0.9
(LSI,-,+) TSNE 0.93
(BERT,X,X) MDS 1.0
(BERT,X,X) UMAP 0.88
(BERT,X,X) TSNE 0.92
(NMF,+,+) MDS 1.0
(NMF,+,+) UMAP 1.0
(NMF,+,+) TSNE 1.0
(NMF,+,-) MDS 1.0
(NMF,+,-) UMAP 1.0
(NMF,+,-) TSNE 1.0
(VSM,+,X) MDS 1.0
(VSM,+,X) UMAP 0.98
(VSM,+,X) TSNE 0.96
(LDA,X,-) MDS 1.0
(LDA,X,-) UMAP 0.88
(LDA,X,-) TSNE 0.88


In [23]:
df_results_accuracy.head(10)

Unnamed: 0,TM,DR,value
0,"(LSI,+,+)",MDS,1.0
1,"(LSI,+,+)",UMAP,0.9
2,"(LSI,+,+)",TSNE,0.92
3,"(LSI,+,-)",MDS,0.9
4,"(LSI,+,-)",UMAP,0.9
5,"(LSI,+,-)",TSNE,0.93
6,"(VSM,-,X)",MDS,1.0
7,"(VSM,-,X)",UMAP,0.99
8,"(VSM,-,X)",TSNE,0.94
9,"(NMF,-,+)",MDS,1.0


In [24]:
df_results_accuracy["value"] = 1 - df_results_accuracy["value"]
df_results_accuracy.head(10)

Unnamed: 0,TM,DR,value
0,"(LSI,+,+)",MDS,0.0
1,"(LSI,+,+)",UMAP,0.1
2,"(LSI,+,+)",TSNE,0.08
3,"(LSI,+,-)",MDS,0.1
4,"(LSI,+,-)",UMAP,0.1
5,"(LSI,+,-)",TSNE,0.07
6,"(VSM,-,X)",MDS,0.0
7,"(VSM,-,X)",UMAP,0.01
8,"(VSM,-,X)",TSNE,0.06
9,"(NMF,-,+)",MDS,0.0


In [25]:
df_results_accuracy.to_csv("Results_DefaultValues_Accuracy_Final.csv")

## Analysis of Perception

In [26]:
def default_perception(df, tm, dr):
    TM = inverted_d[tm]
    DR = inverted_d_DR[dr]
    if DR == "mds":
        default_hyp = "'mds': {'max_iter': 300}"
        df_selected = df[(df["TM"] == TM) & (df["DR"] == DR) & (df['Complete List of Hyperparameters'].str.contains(default_hyp))]
        default_perception = df_selected["perception"].tolist()[0]
        return default_perception
    elif DR == "tsne": 
        default_hyp = "'tsne': {'perplexity': 30.0, 'early_exaggeration': 12.0, 'learning_rate': 'auto', 'n_iter': 1000, 'angle': 0.5}"
        df_selected = df[(df["TM"] == TM) & (df["DR"] == DR) & (df['Complete List of Hyperparameters'].str.contains(default_hyp))]
        default_perception = df_selected["perception"].tolist()[0]
        return default_perception
    elif DR == "umap":
        default_hyp = "'umap': {'n_neighbors': 15, 'min_dist': 0.1, 'metric': 'cosine', 'spread': 1.0, 'set_op_mix_ratio': 1.0, 'local_connectivity': 1, 'repulsion_strength': 1.0, 'negative_sample_rate': 5}"
        df_selected = df[(df["TM"] == TM) & (df["DR"] == DR) & (df['Complete List of Hyperparameters'].str.contains(default_hyp))]
        default_perception = df_selected["perception"].tolist()[0]
        return default_perception

In [27]:
df_results_perception = pd.DataFrame(columns=["TM", "DR", "value"])
for tm in set(df_20newsgroup_short["TM"].tolist()):
    for dr in ["MDS", "UMAP", "TSNE"]:
        i = 0
        j = 0
        for df in [df_20newsgroup, df_emails, df_github, df_reuters, df_7categories]:
            greater_than_default = 0
            try:
                default_per = default_perception(df, tm, dr)
                if i == 0:
                    df_short = df_20newsgroup_short_perception
                if i == 1:
                    df_short = df_emails_short_perception
                if i == 2:
                    df_short = df_github_short_perception
                if i == 3:
                    df_short = df_reuters_short_perception
                if i == 4:
                        df_short = df_7categories_short_perception
                
                df_short_selected = df_short[(df_short["TM"] == tm) & (df_short["DR"] == dr)]
                list_perception = df_short_selected["perception"].tolist()
                greater = [i for i in list_perception if i > default_per]
                greater_than_default += len(greater)/len(list_perception)
                i += 1
                j += 1

            except:
                #print(tm, dr, "XXX")
                i += 1
        
        greater_than_default = round(1 - (greater_than_default/j),2)
        new_row = {"TM": tm, "DR": dr, "value": greater_than_default}
        df_results_perception = df_results_perception.append(new_row, ignore_index = True)
        print(tm, dr, greater_than_default)

(LSI,+,+) MDS 1.0
(LSI,+,+) UMAP 0.95
(LSI,+,+) TSNE 0.88
(LSI,+,-) MDS 1.0
(LSI,+,-) UMAP 0.84
(LSI,+,-) TSNE 0.88
(VSM,-,X) MDS 1.0
(VSM,-,X) UMAP 0.89
(VSM,-,X) TSNE 0.93
(NMF,-,+) MDS 1.0
(NMF,-,+) UMAP 1.0
(NMF,-,+) TSNE 0.92
(LSI,-,-) MDS 1.0
(LSI,-,-) UMAP 0.99
(LSI,-,-) TSNE 0.93
(LDA,X,+) MDS 1.0
(LDA,X,+) UMAP 0.96
(LDA,X,+) TSNE 0.88
(NMF,-,-) MDS 1.0
(NMF,-,-) UMAP 1.0
(NMF,-,-) TSNE 0.88
(LSI,-,+) MDS 1.0
(LSI,-,+) UMAP 1.0
(LSI,-,+) TSNE 0.93
(BERT,X,X) MDS 1.0
(BERT,X,X) UMAP 0.87
(BERT,X,X) TSNE 0.85
(NMF,+,+) MDS 1.0
(NMF,+,+) UMAP 0.98
(NMF,+,+) TSNE 0.91
(NMF,+,-) MDS 1.0
(NMF,+,-) UMAP 0.98
(NMF,+,-) TSNE 0.93
(VSM,+,X) MDS 1.0
(VSM,+,X) UMAP 0.88
(VSM,+,X) TSNE 0.92
(LDA,X,-) MDS 1.0
(LDA,X,-) UMAP 0.84
(LDA,X,-) TSNE 0.97


In [28]:
df_results_perception["value"] = 1 - df_results_perception["value"]
df_results_perception.head(10)

Unnamed: 0,TM,DR,value
0,"(LSI,+,+)",MDS,0.0
1,"(LSI,+,+)",UMAP,0.05
2,"(LSI,+,+)",TSNE,0.12
3,"(LSI,+,-)",MDS,0.0
4,"(LSI,+,-)",UMAP,0.16
5,"(LSI,+,-)",TSNE,0.12
6,"(VSM,-,X)",MDS,0.0
7,"(VSM,-,X)",UMAP,0.11
8,"(VSM,-,X)",TSNE,0.07
9,"(NMF,-,+)",MDS,0.0


In [29]:
df_results_perception.to_csv("Results_DefaultValue_Perception_Final.csv")