In [1]:
import pandas as pd
import numpy as np

In [2]:
def compute_accuracy(df):
    trustworthiness = df['Trustworthiness'].tolist()
    continuity = df['Continuity'].tolist()
    neighborhood_hit = df['7-Neighborhood Hit'].tolist()
    shephard_diagram_correlation = df['Shephard Diagram Correlation'].tolist()
    
    result = []
    for i in range(len(trustworthiness)):
        result.append(round(trustworthiness[i]/6 + continuity[i]/6 + neighborhood_hit[i]/2 + 0.5*(shephard_diagram_correlation[i]+1)/6,2))
    
    return result

In [3]:
def compute_perception(df):
    distance_consistency = df['Distance consistency'].tolist()
    silhouette_coefficient = df['Silhouette coefficient'].tolist()
    calinski_harabasz = df['Calinski-Harabasz-Index Normalized'].tolist()
    davies_bouldin = df['Davies-Bouldin-Index Normalized']
    
    result = []
    for i in range(len(distance_consistency)):
        result.append(round(distance_consistency[i]/6 + 0.5*(silhouette_coefficient[i] + 1)/6 +
                            (calinski_harabasz[i])/3 + (1-davies_bouldin[i])/3,2))
    
    return result

In [4]:
df_20newsgroup = pd.read_csv('data/full_res_20_newsgroups.csv')
df_20newsgroup['Calinski-Harabasz-Index Normalized'] = df_20newsgroup['Calinski-Harabasz-Index']/df_20newsgroup['Calinski-Harabasz-Index'].max()
df_20newsgroup['Davies-Bouldin-Index Normalized'] = df_20newsgroup['Davies-Bouldin-Index']/df_20newsgroup['Davies-Bouldin-Index'].max()

df_20newsgroup['accuracy'] = compute_accuracy(df_20newsgroup)
df_20newsgroup['perception'] = compute_perception(df_20newsgroup)
df_20newsgroup.head()

Unnamed: 0,Experiment,Trustworthiness,Continuity,Shephard Diagram Correlation,Normalized Stress,7-Neighborhood Hit,Calinski-Harabasz-Index,Silhouette coefficient,Davies-Bouldin-Index,SDBW validity index,Distance consistency,Complete List of Hyperparameters,DR,TM,Calinski-Harabasz-Index Normalized,Davies-Bouldin-Index Normalized,accuracy,perception
0,20_newsgroups_bert_n_categories_20_mds,0.50379,0.50751,-0.024379,0.388811,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 800}},mds,bert,0.000457,0.118467,0.34,0.38
1,20_newsgroups_bert_n_categories_20_mds,0.50379,0.50751,-0.024379,0.388811,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 340}},mds,bert,0.000457,0.118467,0.34,0.38
2,20_newsgroups_bert_n_categories_20_mds,0.531983,0.472036,-0.304442,1.097868,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 680}},mds,bert,0.000457,0.118467,0.32,0.38
3,20_newsgroups_bert_n_categories_20_mds,0.531983,0.472036,-0.304442,1.097868,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 320}},mds,bert,0.000457,0.118467,0.32,0.38
4,20_newsgroups_bert_n_categories_20_mds,0.50379,0.50751,-0.024379,0.388811,0.187202,1.172136,-0.065829,163.258344,1.847043,0.054357,{'mds': {'max_iter': 500}},mds,bert,0.000457,0.118467,0.34,0.38


In [5]:
# reading emails
df_emails = pd.read_csv('data/full_res_emails.csv')
df_emails['Calinski-Harabasz-Index Normalized'] = df_emails['Calinski-Harabasz-Index']/df_emails['Calinski-Harabasz-Index'].max()
df_emails['Davies-Bouldin-Index Normalized'] = df_emails['Davies-Bouldin-Index']/df_emails['Davies-Bouldin-Index'].max()

df_emails['accuracy'] = compute_accuracy(df_emails)
df_emails['perception'] = compute_perception(df_emails)
df_emails.head()

Unnamed: 0,Experiment,Trustworthiness,Continuity,Shephard Diagram Correlation,Normalized Stress,7-Neighborhood Hit,Calinski-Harabasz-Index,Silhouette coefficient,Davies-Bouldin-Index,SDBW validity index,Distance consistency,Complete List of Hyperparameters,DR,TM,Calinski-Harabasz-Index Normalized,Davies-Bouldin-Index Normalized,accuracy,perception
0,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 720}},mds,bert,0.000371,0.220254,0.45,0.38
1,emails_bert_n_categories_4_mds,0.50828,0.509272,-0.012678,2.14449,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 580}},mds,bert,0.000371,0.220254,0.47,0.38
2,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 780}},mds,bert,0.000371,0.220254,0.45,0.38
3,emails_bert_n_categories_4_mds,0.50828,0.509272,-0.012678,2.14449,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 360}},mds,bert,0.000371,0.220254,0.47,0.38
4,emails_bert_n_categories_4_mds,0.543397,0.498714,-0.313603,1.178286,0.429591,1.016994,-0.037617,268.961517,1.885956,0.221381,{'mds': {'max_iter': 640}},mds,bert,0.000371,0.220254,0.45,0.38


In [6]:
# reading github projects
df_github = pd.read_csv('data/full_res_github_projects.csv')
df_github['Calinski-Harabasz-Index Normalized'] = df_github['Calinski-Harabasz-Index']/df_github['Calinski-Harabasz-Index'].max()
df_github['Davies-Bouldin-Index Normalized'] = df_github['Davies-Bouldin-Index']/df_github['Davies-Bouldin-Index'].max()

df_github['accuracy'] = compute_accuracy(df_github)
df_github['perception'] = compute_perception(df_github)
df_github.head()

Unnamed: 0,Experiment,Trustworthiness,Continuity,Shephard Diagram Correlation,Normalized Stress,7-Neighborhood Hit,Calinski-Harabasz-Index,Silhouette coefficient,Davies-Bouldin-Index,SDBW validity index,Distance consistency,Complete List of Hyperparameters,DR,TM,Calinski-Harabasz-Index Normalized,Davies-Bouldin-Index Normalized,accuracy,perception
0,github_projects_bow_mds,0.481415,0.362911,-0.536036,0.653439,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 440}},mds,bow,0.001065,0.031336,0.3,0.42
1,github_projects_bow_mds,0.481415,0.362911,-0.536036,0.653439,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 620}},mds,bow,0.001065,0.031336,0.3,0.42
2,github_projects_bow_mds,0.491366,0.491266,-0.052139,0.915395,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 400}},mds,bow,0.001065,0.031336,0.36,0.42
3,github_projects_bow_mds,0.481415,0.362911,-0.536036,0.653439,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 460}},mds,bow,0.001065,0.031336,0.3,0.42
4,github_projects_bow_mds,0.491366,0.491266,-0.052139,0.915395,0.24021,0.570655,-0.130839,34.097321,2.232234,0.169985,{'mds': {'max_iter': 320}},mds,bow,0.001065,0.031336,0.36,0.42


In [7]:
# reading reuters
df_reuters = pd.read_csv('data/full_res_reuters.csv')
df_reuters['Calinski-Harabasz-Index Normalized'] = df_reuters['Calinski-Harabasz-Index']/df_reuters['Calinski-Harabasz-Index'].max()
df_reuters['Davies-Bouldin-Index Normalized'] = df_reuters['Davies-Bouldin-Index']/df_reuters['Davies-Bouldin-Index'].max()

df_reuters['accuracy'] = compute_accuracy(df_reuters)
df_reuters['perception'] = compute_perception(df_reuters)
df_reuters.head()

Unnamed: 0,Experiment,Trustworthiness,Continuity,Shephard Diagram Correlation,Normalized Stress,7-Neighborhood Hit,Calinski-Harabasz-Index,Silhouette coefficient,Davies-Bouldin-Index,SDBW validity index,Distance consistency,Complete List of Hyperparameters,DR,TM,Calinski-Harabasz-Index Normalized,Davies-Bouldin-Index Normalized,accuracy,perception
0,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.358584,0.802779,-0.670315,74.591222,1.468805,0.002741,{'mds': {'max_iter': 560}},mds,bert,0.349456,3.1e-05,0.43,0.48
1,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.361825,0.931937,-0.668009,72.290487,1.418416,0.00285,{'mds': {'max_iter': 760}},mds,bert,0.40568,3e-05,0.43,0.5
2,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.358834,0.920033,-0.688146,103.000758,1.415418,0.003508,{'mds': {'max_iter': 800}},mds,bert,0.400497,4.2e-05,0.43,0.49
3,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.36076,1.044194,-0.628188,87.678857,1.35749,0.003398,{'mds': {'max_iter': 660}},mds,bert,0.454546,3.6e-05,0.43,0.52
4,reuters_bert_n_categories_65_mds,0.506105,0.536519,-0.098968,0.263758,0.356845,1.018096,-0.668368,71.485715,1.503931,0.002302,{'mds': {'max_iter': 820}},mds,bert,0.443185,2.9e-05,0.43,0.51


In [8]:
# reading seven categories
df_7categories = pd.read_csv('data/full_res_seven_categories.csv')
df_7categories['Calinski-Harabasz-Index Normalized'] = df_7categories['Calinski-Harabasz-Index']/df_7categories['Calinski-Harabasz-Index'].max()
df_7categories['Davies-Bouldin-Index Normalized'] = df_7categories['Davies-Bouldin-Index']/df_7categories['Davies-Bouldin-Index'].max()

df_7categories['accuracy'] = compute_accuracy(df_7categories)
df_7categories['perception'] = compute_perception(df_7categories)
df_7categories.head()

Unnamed: 0,Experiment,Trustworthiness,Continuity,Shephard Diagram Correlation,Normalized Stress,7-Neighborhood Hit,Calinski-Harabasz-Index,Silhouette coefficient,Davies-Bouldin-Index,SDBW validity index,Distance consistency,Complete List of Hyperparameters,DR,TM,Calinski-Harabasz-Index Normalized,Davies-Bouldin-Index Normalized,accuracy,perception
0,seven_categories_bert_n_categories_7_mds,0.507452,0.511752,-0.035918,0.42165,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 420}},mds,bert,0.000421,0.347754,0.41,0.32
1,seven_categories_bert_n_categories_7_mds,0.514772,0.500909,-0.245096,0.635536,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 880}},mds,bert,0.000421,0.347754,0.39,0.32
2,seven_categories_bert_n_categories_7_mds,0.514772,0.500909,-0.245096,0.635536,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 800}},mds,bert,0.000421,0.347754,0.39,0.32
3,seven_categories_bert_n_categories_7_mds,0.514772,0.500909,-0.245096,0.635536,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 500}},mds,bert,0.000421,0.347754,0.39,0.32
4,seven_categories_bert_n_categories_7_mds,0.507452,0.511752,-0.035918,0.42165,0.319567,0.504025,-0.065309,193.244725,1.696309,0.135273,{'mds': {'max_iter': 700}},mds,bert,0.000421,0.347754,0.41,0.32


### Selecting the Data for Accuracy

In [9]:
d = {'bow_umap': '(VSM,-,UMAP,X)', 
     'bow_mds': '(VSM,-,MDS,X)',
     'bow_som': '(VSM,-,SOM,X)',
     'bow_tsne': '(VSM,-,TSNE,X)',
     'tfidf_umap': '(VSM,+,UMAP,X)',
     'tfidf_mds': '(VSM,+,MDS,X)',
     'tfidf_som': '(VSM,+,SOM,X)',
     'tfidf_tsne': '(VSM,+,TSNE,X)', 
     'lsi_umap': '(LSI,-,UMAP,-)',
     'lsi_mds': '(LSI,-,MDS,-)',
     'lsi_som': '(LSI,-,SOM,-)',
     'lsi_tsne': '(LSI,-,TSNE,-)',
     'lsi_tfidf_umap': '(LSI,+,UMAP,-)',
     'lsi_tfidf_mds': '(LSI,+,MDS,-)',
     'lsi_tfidf_som': '(LSI,+,SOM,-)',
     'lsi_tfidf_tsne': '(LSI,+,TSNE,-)',
     'lsi_linear_combined_umap': '(LSI,-,UMAP,+)',
     'lsi_linear_combined_mds': '(LSI,-,MDS,+)',
     'lsi_linear_combined_som': '(LSI,-,SOM,+)',
     'lsi_linear_combined_tsne': '(LSI,-,TSNE,+)',
     'lsi_tfidf_linear_combined_umap': '(LSI,+,UMAP,+)',
     'lsi_tfidf_linear_combined_mds': '(LSI,+,MDS,+)',
     'lsi_tfidf_linear_combined_som': '(LSI,+,SOM,+)',
     'lsi_tfidf_linear_combined_tsne': '(LSI,+,TSNE,+)',
     'nmf_umap': '(NMF,-,UMAP,-)',
     'nmf_mds': '(NMF,-,MDS,-)',
     'nmf_som': '(NMF,-,SOM,-)',
     'nmf_tsne': '(NMF,-,TSNE,-)',
     'nmf_tfidf_umap': '(NMF,+,UMAP,-)',
     'nmf_tfidf_mds': '(NMF,+,MDS,-)',
     'nmf_tfidf_som': '(NMF,+,SOM,-)',
     'nmf_tfidf_tsne': '(NMF,+,TSNE,-)',
     'nmf_linear_combined_umap': '(NMF,-,UMAP,+)',
     'nmf_linear_combined_mds': '(NMF,-,MDS,+)',
     'nmf_linear_combined_som': '(NMF,-,SOM,+)',
     'nmf_linear_combined_tsne': '(NMF,-,TSNE,+)',
     'nmf_tfidf_linear_combined_umap': '(NMF,+,UMAP,+)',
     'nmf_tfidf_linear_combined_mds': '(NMF,+,MDS,+)',
     'nmf_tfidf_linear_combined_som': '(NMF,+,SOM,+)',
     'nmf_tfidf_linear_combined_tsne': '(NMF,+,TSNE,+)',
     'lda_umap': '(LDA,X,UMAP,-)',
     'lda_mds': '(LDA,X,MDS,-)',
     'lda_som': '(LDA,X,SOM,-)',
     'lda_tsne': '(LDA,X,TSNE,-)',
     'lda_linear_combined_umap': '(LDA,X,UMAP,+)',
     'lda_linear_combined_mds': '(LDA,X,MDS,+)',
     'lda_linear_combined_som': '(LDA,X,SOM,+)',
     'lda_linear_combined_tsne': '(LDA,X,TSNE,+)',
     'bert_umap': '(BERT,X,UMAP,X)',
     'bert_mds': '(BERT,X,MDS,X)',
     'bert_som': '(BERT,X,SOM,X)',
     'bert_tsne': '(BERT,X,TSNE,X)',}

In [10]:
list_dataset = []
list_DR = []
list_TM = []
list_value_accuracy = []
list_names = []

In [11]:
df = df_20newsgroup
dataset = "20 Newsgroup"

df_DR = set(df["DR"].tolist())
df_TM = set(df["TM"].tolist())


for dimred in df_DR:
    for topicmod in df_TM:
        df_selected = df[(df["DR"] == dimred) & (df["TM"] == topicmod)]
        if len(df_selected.index) > 0:
            max_value = max(df_selected["accuracy"].tolist())
            # print(dimred, topicmod, max_value)
            list_dataset.append(dataset)
            list_DR.append(dimred)
            list_TM.append(topicmod)
            list_value_accuracy.append(max_value)
            list_names.append(d[topicmod + "_" + dimred])

In [12]:
df = df_emails
dataset = "Emails"

df.head()
df_DR = set(df["DR"].tolist())
df_TM = set(df["TM"].tolist())


for dimred in df_DR:
    for topicmod in df_TM:
        df_selected = df[(df["DR"] == dimred) & (df["TM"] == topicmod)]
        if len(df_selected.index) > 0:
            max_value = max(df_selected["accuracy"].tolist())
            # print(dimred, topicmod, max_value)
            list_dataset.append(dataset)
            list_DR.append(dimred)
            list_TM.append(topicmod)
            list_value_accuracy.append(max_value)
            list_names.append(d[topicmod + "_" + dimred])

In [13]:
df = df_github
dataset = "GitHub"

df.head()
df_DR = set(df["DR"].tolist())
df_TM = set(df["TM"].tolist())


for dimred in df_DR:
    for topicmod in df_TM:
        df_selected = df[(df["DR"] == dimred) & (df["TM"] == topicmod)]
        if len(df_selected.index) > 0:
            max_value = max(df_selected["accuracy"].tolist())
            # print(dimred, topicmod, max_value)
            list_dataset.append(dataset)
            list_DR.append(dimred)
            list_TM.append(topicmod)
            list_value_accuracy.append(max_value)
            list_names.append(d[topicmod + "_" + dimred])

In [14]:
df = df_reuters
dataset = "Reuters"

df.head()
df_DR = set(df["DR"].tolist())
df_TM = set(df["TM"].tolist())


for dimred in df_DR:
    for topicmod in df_TM:
        df_selected = df[(df["DR"] == dimred) & (df["TM"] == topicmod)]
        if len(df_selected.index) > 0:
            max_value = max(df_selected["accuracy"].tolist())
            # print(dimred, topicmod, max_value)
            list_dataset.append(dataset)
            list_DR.append(dimred)
            list_TM.append(topicmod)
            list_value_accuracy.append(max_value)
            list_names.append(d[topicmod + "_" + dimred])

In [15]:
df = df_7categories
dataset = "Seven Categories"

df.head()
df_DR = set(df["DR"].tolist())
df_TM = set(df["TM"].tolist())


for dimred in df_DR:
    for topicmod in df_TM:
        df_selected = df[(df["DR"] == dimred) & (df["TM"] == topicmod)]
        if len(df_selected.index) > 0:
            max_value = max(df_selected["accuracy"].tolist())
            # print(dimred, topicmod, max_value)
            list_dataset.append(dataset)
            list_DR.append(dimred)
            list_TM.append(topicmod)
            list_value_accuracy.append(max_value)
            list_names.append(d[topicmod + "_" + dimred])

In [16]:
df_heatmap_accuracy = pd.DataFrame({'Dataset': list_dataset, 'Layout': list_names, 'value': list_value_accuracy})
df_heatmap_accuracy

Unnamed: 0,Dataset,Layout,value
0,20 Newsgroup,"(VSM,+,MDS,X)",0.34
1,20 Newsgroup,"(LSI,-,MDS,-)",0.34
2,20 Newsgroup,"(BERT,X,MDS,X)",0.35
3,20 Newsgroup,"(LSI,+,MDS,-)",0.33
4,20 Newsgroup,"(NMF,+,MDS,-)",0.34
...,...,...,...
249,Seven Categories,"(NMF,-,TSNE,+)",0.77
250,Seven Categories,"(LDA,X,TSNE,+)",0.73
251,Seven Categories,"(NMF,+,TSNE,+)",0.77
252,Seven Categories,"(LSI,-,TSNE,+)",0.78


In [17]:
for dataset in ['20 Newsgroup', 'Emails', 'GitHub', 'Reuters', 'Seven Categories']:
    for layout in set(list_names):
        row = df_heatmap_accuracy[(df_heatmap_accuracy['Layout'] == layout) & (df_heatmap_accuracy['Dataset'] == dataset)]
        if len(row['value'].tolist()) == 0:
            print(dataset, layout)
            new_row = {'Dataset': dataset, 'Layout': layout, 'value': -1}
            df_heatmap_accuracy = df_heatmap_accuracy.append(new_row, ignore_index = True)

GitHub (NMF,+,SOM,+)
GitHub (BERT,X,UMAP,X)
GitHub (BERT,X,MDS,X)
GitHub (LDA,X,SOM,+)
GitHub (BERT,X,TSNE,X)
GitHub (BERT,X,SOM,X)


In [18]:
df_heatmap_accuracy.to_csv("Results_Heatmap_Accuracy_Final.csv")

### Selecting the Data for Perception

In [19]:
list_dataset = []
list_DR = []
list_TM = []
list_value_perception = []
list_names = []

In [20]:
df = df_20newsgroup
dataset = "20 Newsgroup"

df_DR = set(df["DR"].tolist())
df_TM = set(df["TM"].tolist())


for dimred in df_DR:
    for topicmod in df_TM:
        df_selected = df[(df["DR"] == dimred) & (df["TM"] == topicmod)]
        if len(df_selected.index) > 0:
            max_value = max(df_selected["perception"].tolist())
            # print(dimred, topicmod, max_value)
            list_dataset.append(dataset)
            list_DR.append(dimred)
            list_TM.append(topicmod)
            list_value_perception.append(max_value)
            list_names.append(d[topicmod + "_" + dimred])

In [21]:
df = df_emails
dataset = "Emails"

df.head()
df_DR = set(df["DR"].tolist())
df_TM = set(df["TM"].tolist())


for dimred in df_DR:
    for topicmod in df_TM:
        df_selected = df[(df["DR"] == dimred) & (df["TM"] == topicmod)]
        if len(df_selected.index) > 0:
            max_value = max(df_selected["perception"].tolist())
            # print(dimred, topicmod, max_value)
            list_dataset.append(dataset)
            list_DR.append(dimred)
            list_TM.append(topicmod)
            list_value_perception.append(max_value)
            list_names.append(d[topicmod + "_" + dimred])

In [22]:
df = df_github
dataset = "GitHub"

df.head()
df_DR = set(df["DR"].tolist())
df_TM = set(df["TM"].tolist())


for dimred in df_DR:
    for topicmod in df_TM:
        df_selected = df[(df["DR"] == dimred) & (df["TM"] == topicmod)]
        if len(df_selected.index) > 0:
            max_value = max(df_selected["perception"].tolist())
            # print(dimred, topicmod, max_value)
            list_dataset.append(dataset)
            list_DR.append(dimred)
            list_TM.append(topicmod)
            list_value_perception.append(max_value)
            list_names.append(d[topicmod + "_" + dimred])

In [23]:
df = df_reuters
dataset = "Reuters"

df.head()
df_DR = set(df["DR"].tolist())
df_TM = set(df["TM"].tolist())


for dimred in df_DR:
    for topicmod in df_TM:
        df_selected = df[(df["DR"] == dimred) & (df["TM"] == topicmod)]
        if len(df_selected.index) > 0:
            max_value = max(df_selected["perception"].tolist())
            # print(dimred, topicmod, max_value)
            list_dataset.append(dataset)
            list_DR.append(dimred)
            list_TM.append(topicmod)
            list_value_perception.append(max_value)
            list_names.append(d[topicmod + "_" + dimred])

In [24]:
df = df_7categories
dataset = "Seven Categories"

df.head()
df_DR = set(df["DR"].tolist())
df_TM = set(df["TM"].tolist())


for dimred in df_DR:
    for topicmod in df_TM:
        df_selected = df[(df["DR"] == dimred) & (df["TM"] == topicmod)]
        if len(df_selected.index) > 0:
            max_value = max(df_selected["perception"].tolist())
            # print(dimred, topicmod, max_value)
            list_dataset.append(dataset)
            list_DR.append(dimred)
            list_TM.append(topicmod)
            list_value_perception.append(max_value)
            list_names.append(d[topicmod + "_" + dimred])

In [25]:
df_heatmap_perception = pd.DataFrame({'Dataset': list_dataset, 'Layout': list_names, 'value': list_value_perception})
df_heatmap_perception

Unnamed: 0,Dataset,Layout,value
0,20 Newsgroup,"(VSM,+,MDS,X)",0.33
1,20 Newsgroup,"(LSI,-,MDS,-)",0.37
2,20 Newsgroup,"(BERT,X,MDS,X)",0.38
3,20 Newsgroup,"(LSI,+,MDS,-)",0.38
4,20 Newsgroup,"(NMF,+,MDS,-)",0.37
...,...,...,...
249,Seven Categories,"(NMF,-,TSNE,+)",0.71
250,Seven Categories,"(LDA,X,TSNE,+)",0.54
251,Seven Categories,"(NMF,+,TSNE,+)",0.65
252,Seven Categories,"(LSI,-,TSNE,+)",0.64


In [26]:
for dataset in ['20 Newsgroup', 'Emails', 'GitHub', 'Reuters', 'Seven Categories']:
    for layout in set(list_names):
        row = df_heatmap_perception[(df_heatmap_perception['Layout'] == layout) & (df_heatmap_perception['Dataset'] == dataset)]
        if len(row['value'].tolist()) == 0:
            print(dataset, layout)
            new_row = {'Dataset': dataset, 'Layout': layout, 'value': -1}
            df_heatmap_perception = df_heatmap_perception.append(new_row, ignore_index = True)

GitHub (NMF,+,SOM,+)
GitHub (BERT,X,UMAP,X)
GitHub (BERT,X,MDS,X)
GitHub (LDA,X,SOM,+)
GitHub (BERT,X,TSNE,X)
GitHub (BERT,X,SOM,X)


In [27]:
df_heatmap_perception.to_csv("Results_Heatmap_Perception_Final.csv")