In [3]:
import clustering_metrics as cm
import pandas as pd
import numpy as np
import pickle
import json
import nltk
import util
import imp
import os

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split

# Load data
- here we use the first extensive clustering in clustering.ipynb to later filter the best clustering algorith /configuration combinations
- for some metrics we need unclustered papers thats why we only use clusterings of papers until 2019

In [4]:
# Load anthology dataframe
df = pd.read_csv("data/anthology_conferences.csv", sep="|", keep_default_na=False,
                 converters={"semantic_scholar_keywords":lambda x: x.strip("[]").replace("'", "").split(", "),
                            "cso_syntactic":lambda x: x.strip("[]").replace("'", "").split(", "),
                            "cso_semantic":lambda x: x.strip("[]").replace("'", "").split(", "),
                            "cso_union":lambda x: x.strip("[]").replace("'", "").split(", "),
                            "cso_enhanced":lambda x: x.strip("[]").replace("'", "").split(", ")})
df[:2]

Unnamed: 0,url,publisher,address,year,month,editor,title,ENTRYTYPE,ID,pages,...,note,pdf,abstract,semantic_scholar,semantic_scholar_authorIds,semantic_scholar_keywords,cso_syntactic,cso_semantic,cso_union,cso_enhanced
0,https://www.aclweb.org/anthology/2020.acl-main.1,Association for Computational Linguistics,Online,2020,July,,Learning to Understand Child-directed and Adul...,inproceedings,gelderloos-etal-2020-learning,1--6,...,,2020.acl-main.1.pdf,Speech directed to children differs from adult...,2020.acl-main.1.json,"['7805500', '2756960', '103538973']","[1017215, 1588157]","[linguistics, acoustics, language acquisition,...","[speech signals, synthetic speech, linguistics...","[linguistics, automatic speech recognition, ac...","[speech recognition, signal processing, educat..."
1,https://www.aclweb.org/anthology/2020.acl-main.2,Association for Computational Linguistics,Online,2020,July,,Predicting Depression in Screening Interviews ...,inproceedings,rinaldi-etal-2020-predicting,7--18,...,,2020.acl-main.2.pdf,Despite the pervasiveness of clinical depressi...,2020.acl-main.2.json,"['19320780', '2457504', '37202877']",[8505],"[linguistics, pattern languages, psycholinguis...","[latent variable, latent factor, linguistics, ...","[latent factor, linguistics, dialogue, pattern...","[matrix factorizations, argumentation, speech ..."


In [5]:
# Load clustering results up to 2019
with open("data/clusters/paraphrase-mpnet-base-v2_titles_abstracts_2019.json") as jf:
    config2clusters_2019 = json.load(jf)

In [6]:
# Load embeddings
texts, embeddings = util.load_embeddings("paraphrase-mpnet-base-v2_titles_abstracts.pkl")

In [7]:
# Seperate dataframe in papers that were used for clustering and in those not used
last_year = 2019
df_clustered = df[df["year"] <= last_year].reset_index()
df_not_clustered = df[df["year"] > last_year].reset_index()

In [8]:
# Load mappping from semantic scholar topic ids to their topics
with open("data/semantic_scholar/topicId_mapping.json") as jf:
    id2topic_sem_scholar = json.load(jf)

In [9]:
# Load list of cso nlp classes
with open("data/cso_nlp_hierarchie.json") as jf:
    cso_nlp_hier = json.load(jf)

# Get only nlp topics
# !!! not used - can be used to show and use only nlp topics later
cso_nlp_classes = []
def get_nlp_topics_cso(dict_to_search):
    for cso_topic in dict_to_search:
        cso_topic_cleaned = cso_topic.replace("-", " ")
        cso_topic_cleaned = cso_topic_cleaned.replace("_", " ")
        
        if cso_topic_cleaned not in cso_nlp_classes:
            cso_nlp_classes.append(cso_topic_cleaned)
            
        get_nlp_topics_cso(dict_to_search[cso_topic])
        
get_nlp_topics_cso(cso_nlp_hier['nlp'])
cso_nlp_classes

['abstracting and indexing',
 'subject headings',
 'parse trees',
 'part of speech',
 'pos tagging',
 'pos taggers',
 'part of speech tagging',
 'natural language text',
 'natural language understanding',
 'lexical resources',
 'wordnet',
 'topic model',
 'hierarchical dirichlet process',
 'text processing',
 'word processing',
 'electronic document',
 'style sheets',
 'electronic documents',
 'text mining',
 'text mining techniques',
 'text document',
 'textual data',
 'text data',
 'text representation',
 'text summarization',
 'automatic text summarization',
 'automatic summarization',
 'sentence extraction',
 'term frequency',
 'inverse document frequency',
 'document frequency',
 'document classification',
 'textual entailment',
 'sentiment classification',
 'text clustering',
 'text classification',
 'text classifiers',
 'reuters 21578',
 'term weighting',
 'text classification methods',
 'document categorization',
 'text feature',
 'training documents',
 'text categorization',
 

# Evaluation with and without classification
In this section, we learn a classifier on the clusters and predict the cluster for new, unclustered embeddings (i.e. paper of not used years).

## 1. Select a good classifier
- Load one clustering result
- Try different sklearn classifiers
- Training set: 0.8 of clustered embeddings
- Testing set: 0.2 of clustered embeddings
- Try all classifiers

In [10]:
# Create classifiers
names = ["Nearest Neighbors 5", "Nearest Neighbors 25", "Nearest Neighbors 50", "Nearest Neighbors 75", "Nearest Neighbors 100", 
         "Nearest Neighbors 200", "Nearest Neighbors 300", "Nearest Neighbors 400", "Nearest Neighbors 500",
         "Linear SVM 0.015", "Linear SVM 0.02", "Linear SVM 0.025", "Linear SVM 0.055", "Linear SVM 0.075",
         "RBF SVM", "MLP 1", "MLP 2", "MLP 3", "MLP 4", "MLP 5", "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(5),
    KNeighborsClassifier(25),
    KNeighborsClassifier(50),
    KNeighborsClassifier(75),
    KNeighborsClassifier(100),
    KNeighborsClassifier(200),
    KNeighborsClassifier(300),
    KNeighborsClassifier(400),
    KNeighborsClassifier(500),
    SVC(kernel="linear", C=0.015),
    SVC(kernel="linear", C=0.02),
    SVC(kernel="linear", C=0.025),
    SVC(kernel="linear", C=0.05),
    SVC(kernel="linear", C=0.075),
    SVC(gamma=2, C=1),
    MLPClassifier(alpha=1, max_iter=1000),
    MLPClassifier(alpha=2, max_iter=1000),
    MLPClassifier(alpha=3, max_iter=1000),
    MLPClassifier(alpha=4, max_iter=1000),
    MLPClassifier(alpha=5, max_iter=1000),
    GaussianNB()]

In [11]:
# Select one clustering algorithm and a configuration
d = config2clusters_2019["paraphrase-mpnet-base-v2"]["title+abstract"]["kmeans"]["num_clusters"]["25"]["distance_thresholds"]["null"]["min_cluster_sizes"]["null"]["neighbors"]["null"]["components"]["null"]
cluster2indices = d["cluster2indices"]
labels = d["cluster2indices"]

In [12]:
# Create traning/test set
X = []
for i, row in df.iterrows():
    # only embeddings of papers with year <= last_year
    if row["year"] <= last_year:
        if -1 in cluster2indices and i in cluster2indices[-1]:
            continue
        X.append(embeddings[i])
        
y = [-2 for _ in range(len(X))]
for cluster in cluster2indices:
    if cluster != -1:
        indices = cluster2indices[cluster]
        for index in indices:
            y[index] = cluster
        
assert len(y) == len(X), "{} vs {}".format(len(y), len(X))
assert len([1 for x in y if x == -2]) == 0
assert -1 not in y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

In [13]:
best_score = 0
best_classifier_by_index = -1

for i, (classifier, name) in enumerate(zip(classifiers, names)):
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    
    print("Accuracy of classifier '" + name + "':", score)
    
    if score > best_score:
        best_score = score
        best_classifier_by_index = i

Accuracy of classifier 'Nearest Neighbors 5': 0.712852897473997
Accuracy of classifier 'Nearest Neighbors 25': 0.7734026745913819
Accuracy of classifier 'Nearest Neighbors 50': 0.7897473997028231
Accuracy of classifier 'Nearest Neighbors 75': 0.7956909361069836
Accuracy of classifier 'Nearest Neighbors 100': 0.7897473997028231
Accuracy of classifier 'Nearest Neighbors 200': 0.787890044576523
Accuracy of classifier 'Nearest Neighbors 300': 0.7800891530460624
Accuracy of classifier 'Nearest Neighbors 400': 0.7719167904903418
Accuracy of classifier 'Nearest Neighbors 500': 0.7618870728083209
Accuracy of classifier 'Linear SVM 0.015': 0.9075037147102526
Accuracy of classifier 'Linear SVM 0.02': 0.9097325408618128
Accuracy of classifier 'Linear SVM 0.025': 0.912704309063893
Accuracy of classifier 'Linear SVM 0.055': 0.9104754829123328
Accuracy of classifier 'Linear SVM 0.075': 0.9063893016344725
Accuracy of classifier 'RBF SVM': 0.08283803863298662
Accuracy of classifier 'MLP 1': 0.90676077

In [14]:
# Select best classifier for further evalution
print("Best classifier:", names[best_classifier_by_index])
clf = classifiers[best_classifier_by_index]

Best classifier: Linear SVM 0.025


## 2. Train and classify not clustered embeddings with best classifier and based on clustering data
- Do it for all clustering algorithms and configurations

In [15]:
pretrained_models = ["paraphrase-mpnet-base-v2"]
text_sets = ["title+abstract"]
cluster_algorithms = ["kmeans", "agglomerative", "topic"] #["kmeans", "agglomerative", "fast", "topic"]
num_clusters = {"kmeans":[10, 15, 20, 25, 30, 35, 40, 45, 50], "agglomerative": ["null", 10, 15, 20, 25, 30, 35, 40, 45, 50], "fast": ["null"], "topic":["null"]}
distance_thresholds = {"kmeans":["null"], "agglomerative": ["null", 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], "fast": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], "topic": ["null"]}
min_cluster_size = {"kmeans": ["null"], "agglomerative": ["null"], "fast": ["null"], "topic": [100, 150, 200, 250, 300, 350, 400, 500]}
neighbors = {"kmeans": ["null"], "agglomerative": ["null"], "fast": ["null"], "topic": [2, 5, 10, 20, 50, 75, 100, 200, 400]}
components = {"kmeans": ["null"], "agglomerative": ["null"], "fast": ["null"], "topic": [2, 5, 10, 15, 20, 30, 40, 60, 80, 100, int(len(embeddings[0])/4), int(len(embeddings[0])/2), len(embeddings[0])]}

In [16]:
config2data = dict()       # data: (cluster2indices, X_train, y_train, X_predict, y_predicted)

for pretrained_model in pretrained_models:   
    for text_set in text_sets:
        if text_set == "title+abstract":
            _, embeddings = load_embeddings(pretrained_model + "_titles_abstracts.pkl")
        else:
            _, embeddings = load_embeddings(pretrained_model + "_" + text_set + ".pkl")
            
        for algorithm in cluster_algorithms:
            for num_cluster in num_clusters[algorithm]:
                
                if algorithm == "agglomerative" and num_cluster != "null":
                    thresholds = ["null"]
                elif algorithm == "agglomerative" and num_cluster == "null":
                    thresholds = distance_thresholds[algorithm][1:]
                else:
                    thresholds = distance_thresholds[algorithm]
                
                for distance_threshold in thresholds:
                    for cluster_size in min_cluster_size[algorithm]:
                        for num_neighbors in neighbors[algorithm]:
                            for component in components[algorithm]:
                                
                                key = str(last_year) + " - " + pretrained_model + " - " + text_set + " - " + algorithm + " - " + str(num_cluster) + " - " + str(distance_threshold) + " - " + str(cluster_size) + " - " + str(num_neighbors) + " - " + str(component)
                                
                                #try:
                                cluster2indices, X_train, y_train, X_predict = util.get_classifier_data_sets(config2clusters_2019, df, 2019, embeddings, 
                                    pretrained_model, text_set, algorithm, num_cluster, distance_threshold, cluster_size, num_neighbors, component)
                                #except:
                                #    print("Error:", key)
                                
                                clf.fit(X_train, y_train)
                                y_predicted = clf.predict(X_predict)
                                
                                
                                config2data[key] = (cluster2indices, X_train, y_train, X_predict, y_predicted)

NameError: name 'load_embeddings' is not defined

## 3. Check with semantic scholar topics and cso topics if chosen clusters are appropriate
- All clustering algorithms and configurations are tested for current embeddings
- Store results of all evaluation metrics in a dataframe
- Filter appropriate configurations and store them

In [None]:
# create dataframe for _c_lustering _e_valuation with column 'config'
df_ce = pd.DataFrame([[c] for c in config2data], columns=["config"])
df_ce

In [None]:
def evaluate_metrics(df_ce, config2data):   
    df_ce["num_clusters"] = 0.0
    df_ce["min_members"] = 0.0
    df_ce["max_members"] = 0.0
    df_ce["mean_members"] = 0.0
    df_ce["median_members"] = 0.0

    top_n_shared_words = [5, 10, 15]
    for n in top_n_shared_words:
        df_ce["intra_similarity_sem_scholar_" + str(n)] = 0.0
        df_ce["inter_similarity_sem_scholar_" + str(n)] = 0.0
        df_ce["intra_similarity_cso_enhanced_" + str(n)] = 0.0
        df_ce["inter_similarity_cso_enhanced_" + str(n)] = 0.0

    df_ce["accuracy_new_papers_sem_scholar"] = 0.0
    df_ce["ranking_score_new_papers_sem_scholar"] = 0.0
    df_ce["absolute_score_new_papers_sem_scholar"] = 0.0
    df_ce["accuracy_new_papers_cso_enhanced"] = 0.0
    df_ce["ranking_score_new_papers_cso_enhanced"] = 0.0
    df_ce["absolute_score_new_papers_cso_enhanced"] = 0.0
    
    df_ce["papers_clustered"] = 0.0

    for i, row in df_ce.iterrows():
        c = row["config"]
        (cluster2indices, X_train, y_train, X_predict, y_predicted) = config2data[c]

        # simple metrics
        df_ce.at[i, "num_clusters"] = cm.get_num_clusters(cluster2indices)
        df_ce.at[i, "min_members"] = cm.get_min_cluster_size(cluster2indices)
        df_ce.at[i, "max_members"] = cm.get_max_cluster_size(cluster2indices)
        df_ce.at[i, "mean_members"] = cm.get_mean_cluster_size(cluster2indices)
        df_ce.at[i, "median_members"] = cm.get_median_cluster_size(cluster2indices)

        # semantic scholar keywords
        cluster2keywords_freq_dist = util.get_cluster2words_freq_dist(cluster2indices, "sem_scholar", df_clustered)
        # cso topics
        cluster2topics_freq_dist = util.get_cluster2words_freq_dist(cluster2indices, "cso", df_clustered)

        # intra cluster similarity keywords semantic scholar and topics cso: mean % of all clusters in a config
        n2mean_percentage_sem_scholar_keywords = cm.intra_cluster_similarity(cluster2indices, cluster2keywords_freq_dist, "sem_scholar", df_clustered, n=top_n_shared_words)
        for n in n2mean_percentage_sem_scholar_keywords:
            df_ce.at[i, "intra_similarity_sem_scholar_" + str(n)] = n2mean_percentage_sem_scholar_keywords[n]
        n2mean_percentage_cso_topics = cm.intra_cluster_similarity(cluster2indices, cluster2topics_freq_dist, "cso", df_clustered, n=top_n_shared_words)
        for n in n2mean_percentage_cso_topics:
            df_ce.at[i, "intra_similarity_cso_enhanced_" + str(n)] = n2mean_percentage_cso_topics[n]

        # inter cluster similarity key words semantic scholar and topics cso: mean of all cluster pairs
        if df_ce.loc[i, "num_clusters"] <= 500:
            n2mean_correlations_sem_scholar_keywords = cm.inter_cluster_similarity(cluster2indices, cluster2keywords_freq_dist, "sem_scholar", n=top_n_shared_words)
            for n in n2mean_correlations_sem_scholar_keywords:
                df_ce.at[i, "inter_similarity_sem_scholar_" + str(n)] = n2mean_correlations_sem_scholar_keywords[n]
        if df_ce.loc[i, "num_clusters"] <= 500:
            n2mean_correlations_cso_topics = cm.inter_cluster_similarity(cluster2indices, cluster2topics_freq_dist, "cso", n=top_n_shared_words)
            for n in n2mean_correlations_cso_topics:
                df_ce.at[i, "inter_similarity_cso_enhanced_" + str(n)] = n2mean_correlations_cso_topics[n]

        # new unclustered accuracy by semantic scholar keywords and cso topics
        # and score for place of correct cluster in list of clusters sorted by 
        # keyword similarity / topic similarity score for each paper
        accuracy, ranking_score, absolute_score = cm.classification_acc_ranking(y_predicted, cluster2keywords_freq_dist, "sem_scholar", cluster2indices, df_not_clustered)
        df_ce.at[i, "accuracy_new_papers_sem_scholar"] = accuracy
        df_ce.at[i, "ranking_score_new_papers_sem_scholar"] = ranking_score
        df_ce.at[i, "absolute_score_new_papers_sem_scholar"] = absolute_score
        accuracy, ranking_score, absolute_score = cm.classification_acc_ranking(y_predicted, cluster2topics_freq_dist, "cso", cluster2indices, df_not_clustered)
        df_ce.at[i, "accuracy_new_papers_cso_enhanced"] = accuracy
        df_ce.at[i, "ranking_score_new_papers_cso_enhanced"] = ranking_score
        df_ce.at[i, "absolute_score_new_papers_cso_enhanced"] = absolute_score
        
        num_papers = 0
        num_clustered = 0
        for cluster_index in cluster2indices:
            if cluster_index == -1 or cluster_index == "-1":
                num_papers += len(cluster2indices[cluster_index])
            else:
                num_papers += len(cluster2indices[cluster_index])
                num_clustered += len(cluster2indices[cluster_index])
                
        df_ce.at[i, "papers_clustered"] = num_clustered/num_papers

        #if (i+1) % 25 == 0:
        print(str(i+1) + "/" + str(len(df_ce)) + " configurations")

    return df_ce

In [None]:
df_ce = evaluate_metrics(df_ce, config2data)

In [40]:
# Store df_ce on disk
df_ce.to_csv("data/clusters/cluster_evaluation_" + pretrained_models[0] + "_" + text_sets[0] + ".csv", sep="|", index=False)

In [27]:
# Load df_ce from disk
df_ce = pd.read_csv("data/clusters/cluster_evaluation_" + pretrained_models[0] + "_" + text_sets[0] + ".csv", sep="|")
print(len(df_ce))

961


In [28]:
# Filter out configurations with too less or too many clusters but keep all kmeans and agglomerative clusterings
df_ce_filtered = df_ce[(df_ce["num_clusters"] >= 10.0)]
df_ce_filtered = df_ce_filtered[(df_ce_filtered["num_clusters"] <= 50.0)]
df_ce_filtered = df_ce_filtered[(df_ce_filtered["median_members"] >= 150.0) | (df_ce_filtered["config"].str.contains('.*kmeans.*|.*agglomerative.*'))]
df_ce_filtered = df_ce_filtered[(df_ce_filtered["inter_similarity_cso_enhanced_15"].abs() <= 0.03) | (df_ce_filtered["config"].str.contains('.*kmeans.*|.*agglomerative.*'))]
df_ce_filtered = df_ce_filtered[(df_ce_filtered["inter_similarity_sem_scholar_15"].abs() <= 0.03) | (df_ce_filtered["config"].str.contains('.*kmeans.*|.*agglomerative.*'))]
df_ce_filtered = df_ce_filtered[(df_ce_filtered["inter_similarity_cso_enhanced_10"].abs() <= 0.03) | (df_ce_filtered["config"].str.contains('.*kmeans.*|.*agglomerative.*'))]
df_ce_filtered = df_ce_filtered[(df_ce_filtered["inter_similarity_sem_scholar_10"].abs() <= 0.03) | (df_ce_filtered["config"].str.contains('.*kmeans.*|.*agglomerative.*'))]
df_ce_filtered = df_ce_filtered[(df_ce_filtered["ranking_score_new_papers_sem_scholar"].abs() >= 0.3) | (df_ce_filtered["config"].str.contains('.*kmeans.*|.*agglomerative.*'))]
df_ce_filtered = df_ce_filtered[(df_ce_filtered["ranking_score_new_papers_cso_enhanced"].abs() >= 0.3) | (df_ce_filtered["config"].str.contains('.*kmeans.*|.*agglomerative.*'))]

In [29]:
# print sorted values of a metric
# df_ce_filtered.sort_values(by=['inter_similarity_cso_enhanced_10'])['inter_similarity_cso_enhanced_10']

In [30]:
print("Number of remaining clusterings:", len(df_ce_filtered))
for c in list(df_ce_filtered["config"]):
    print(c)
df_ce_filtered

Number of remaining clusterings: 30
2019 - paraphrase-mpnet-base-v2 - title+abstract - kmeans - 10 - null - null - null - null
2019 - paraphrase-mpnet-base-v2 - title+abstract - kmeans - 15 - null - null - null - null
2019 - paraphrase-mpnet-base-v2 - title+abstract - kmeans - 20 - null - null - null - null
2019 - paraphrase-mpnet-base-v2 - title+abstract - kmeans - 25 - null - null - null - null
2019 - paraphrase-mpnet-base-v2 - title+abstract - kmeans - 30 - null - null - null - null
2019 - paraphrase-mpnet-base-v2 - title+abstract - kmeans - 35 - null - null - null - null
2019 - paraphrase-mpnet-base-v2 - title+abstract - kmeans - 40 - null - null - null - null
2019 - paraphrase-mpnet-base-v2 - title+abstract - kmeans - 45 - null - null - null - null
2019 - paraphrase-mpnet-base-v2 - title+abstract - kmeans - 50 - null - null - null - null
2019 - paraphrase-mpnet-base-v2 - title+abstract - agglomerative - null - 0.7 - null - null - null
2019 - paraphrase-mpnet-base-v2 - title+abstra

Unnamed: 0,config,num_clusters,min_members,max_members,mean_members,median_members,intra_similarity_sem_scholar_5,inter_similarity_sem_scholar_5,intra_similarity_cso_enhanced_5,inter_similarity_cso_enhanced_5,...,intra_similarity_cso_enhanced_10,inter_similarity_cso_enhanced_10,intra_similarity_sem_scholar_15,inter_similarity_sem_scholar_15,intra_similarity_cso_enhanced_15,inter_similarity_cso_enhanced_15,accuracy_new_papers_sem_scholar,ranking_score_new_papers_sem_scholar,accuracy_new_papers_cso_enhanced,ranking_score_new_papers_cso_enhanced
0,2019 - paraphrase-mpnet-base-v2 - title+abstra...,10.0,231.0,1756.0,1345.7,1495.0,0.671405,0.015556,0.835039,0.131111,...,0.916281,0.064512,0.842505,-0.011508,0.942581,0.057143,0.185727,0.364594,0.323582,0.532052
1,2019 - paraphrase-mpnet-base-v2 - title+abstra...,15.0,225.0,1333.0,897.133333,959.0,0.701527,-0.017143,0.856118,0.038095,...,0.922405,0.012063,0.859994,0.003946,0.949055,-0.002483,0.167553,0.318922,0.288564,0.478497
2,2019 - paraphrase-mpnet-base-v2 - title+abstra...,20.0,159.0,1083.0,672.85,729.0,0.720542,0.123684,0.870013,0.110526,...,0.926564,0.046252,0.870194,-0.000526,0.952981,0.022218,0.165337,0.291825,0.296543,0.458566
3,2019 - paraphrase-mpnet-base-v2 - title+abstra...,25.0,149.0,973.0,538.28,563.0,0.736495,0.051,0.878603,0.05,...,0.932463,0.057172,0.880146,0.006845,0.958013,0.032905,0.154699,0.275315,0.288564,0.447454
4,2019 - paraphrase-mpnet-base-v2 - title+abstra...,30.0,148.0,769.0,448.566667,407.5,0.754377,0.001839,0.878668,0.028276,...,0.938208,-0.002382,0.891048,-0.001314,0.960402,0.002397,0.137855,0.250003,0.274823,0.425155
5,2019 - paraphrase-mpnet-base-v2 - title+abstra...,35.0,147.0,700.0,384.485714,367.0,0.749197,0.037479,0.874138,0.042521,...,0.935251,0.013822,0.888284,0.002953,0.95769,0.005696,0.144947,0.242658,0.280585,0.422815
6,2019 - paraphrase-mpnet-base-v2 - title+abstra...,40.0,44.0,602.0,336.425,341.0,0.751642,0.008333,0.869797,0.018333,...,0.93191,0.023792,0.88937,-0.002445,0.959232,0.007285,0.12633,0.216895,0.261968,0.399544
7,2019 - paraphrase-mpnet-base-v2 - title+abstra...,45.0,89.0,518.0,299.044444,307.0,0.753758,0.01899,0.86667,0.052525,...,0.931871,0.021022,0.887684,-0.000263,0.955677,0.023921,0.120567,0.209862,0.252216,0.386457
8,2019 - paraphrase-mpnet-base-v2 - title+abstra...,50.0,69.0,493.0,269.14,273.5,0.765297,0.01102,0.86922,0.02751,...,0.936448,0.009395,0.891706,0.000945,0.960146,0.004157,0.12234,0.208268,0.241135,0.374593
14,2019 - paraphrase-mpnet-base-v2 - title+abstra...,15.0,1.0,13124.0,897.133333,3.0,0.828453,-0.010476,0.82287,0.017143,...,0.908463,0.029683,0.930672,-0.002843,0.958558,0.024591,0.062943,0.460138,0.009309,0.234509


In [18]:
best_configs = []

for i, row in df_ce_filtered.iterrows():
    s = row["config"].split(" - ")
    best_configs.append({"algorithm": s[3], "num_clusters": s[4], "distance_threshold": s[5], "min_cluster_size": s[6], "neighbors": s[7], "components": s[8]})

# Store best configs
with open("data/clusters/paraphrase-mpnet-base-v2_titles_abstracts_2019_best_configs.json", "w") as jf:
    json.dump(best_configs, jf)

## 4. Use list of appropriate configurations to cluster all embedding variants 
- See notebook clustering.ipynb
- Results in "data/clusters/all_embeddings_all_text_sets_best_configs_2019.json"

## 5. Check with semantic scholar topics and cso topics if chosen clusters are appropriate
- All clustering algorithms and appropriate configurations are tested for all embeddings
- Store results of all metrics in a dataframe
- Find best ~3 configurations

In [20]:
config2data = dict()       # (cluster2indices, X_train, y_train, X_predict, y_predicted)

# Load results
with open("data/clusters/all_embeddings_all_text_sets_best_configs_2019.json") as jf:
    config2clusters_2019 = json.load(jf)
    
for pretrained_model in config2clusters_2019:
    for text_set in config2clusters_2019[pretrained_model]:
        for filename in os.listdir("data/embeddings/"):
            if pretrained_model in filename:
                if text_set == "title" and "titles.pkl" in filename:
                    texts, embeddings = util.load_embeddings(filename)
                    break
                elif text_set == "title+abstract" and "titles_abstracts.pkl" in filename:
                    texts, embeddings = util.load_embeddings(filename)
                    break
                elif text_set == "title+abstract_sent_mean" and "titles_abstracts_sent_mean.pkl" in filename:
                    texts, embeddings = util.load_embeddings(filename)
                    break
        
        for config in config2clusters_2019[pretrained_model][text_set]:
            algorithm = config["algorithm"]
            num_clusters = config["num_clusters"]
            threshold = config["distance_threshold"]
            min_cluster_size = config["min_cluster_size"]
            num_neighbors = config["neighbors"]
            components = config["components"]
            key = str(last_year) + " - " + filename + " - " + pretrained_model + " - " + text_set + " - " + algorithm + " - " + str(num_clusters) + " - " + str(threshold) + " - " + str(min_cluster_size) + " - " + str(num_neighbors) + " - " + str(components)
            print(key)

            cluster2indices = config["cluster2indices"]

            try:
                X_train, y_train, X_predict = util.get_classifier_data_sets_2(cluster2indices, df, 2019, embeddings, 
                    pretrained_model, text_set, algorithm, num_clusters, threshold, min_cluster_size, num_neighbors, components)
            except:
                print("Error:", key)

            try:
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_predict)

                config2data[key] = (cluster2indices, X_train, y_train, X_predict, y_predicted)
            except:
                print(cluster2indices.keys())

2019 - allenai-specter_titles.pkl - allenai-specter - title - kmeans - 10 - null - null - null - null
2019 - allenai-specter_titles.pkl - allenai-specter - title - kmeans - 15 - null - null - null - null
2019 - allenai-specter_titles.pkl - allenai-specter - title - kmeans - 20 - null - null - null - null
2019 - allenai-specter_titles.pkl - allenai-specter - title - kmeans - 25 - null - null - null - null
2019 - allenai-specter_titles.pkl - allenai-specter - title - kmeans - 30 - null - null - null - null
2019 - allenai-specter_titles.pkl - allenai-specter - title - kmeans - 35 - null - null - null - null
2019 - allenai-specter_titles.pkl - allenai-specter - title - kmeans - 40 - null - null - null - null
2019 - allenai-specter_titles.pkl - allenai-specter - title - kmeans - 45 - null - null - null - null
2019 - allenai-specter_titles.pkl - allenai-specter - title - kmeans - 50 - null - null - null - null
2019 - allenai-specter_titles.pkl - allenai-specter - title - agglomerative - null

dict_keys(['0'])
2019 - allenai-specter_titles_abstracts_sent_mean.pkl - allenai-specter - title+abstract_sent_mean - agglomerative - 10 - null - null - null - null
2019 - allenai-specter_titles_abstracts_sent_mean.pkl - allenai-specter - title+abstract_sent_mean - agglomerative - 15 - null - null - null - null
2019 - allenai-specter_titles_abstracts_sent_mean.pkl - allenai-specter - title+abstract_sent_mean - agglomerative - 20 - null - null - null - null
2019 - allenai-specter_titles_abstracts_sent_mean.pkl - allenai-specter - title+abstract_sent_mean - agglomerative - 25 - null - null - null - null
2019 - allenai-specter_titles_abstracts_sent_mean.pkl - allenai-specter - title+abstract_sent_mean - agglomerative - 30 - null - null - null - null
2019 - allenai-specter_titles_abstracts_sent_mean.pkl - allenai-specter - title+abstract_sent_mean - agglomerative - 35 - null - null - null - null
2019 - allenai-specter_titles_abstracts_sent_mean.pkl - allenai-specter - title+abstract_sent_m

2019 - paraphrase-distilroberta-base-v2_titles_abstracts.pkl - paraphrase-distilroberta-base-v2 - title+abstract - kmeans - 50 - null - null - null - null
2019 - paraphrase-distilroberta-base-v2_titles_abstracts.pkl - paraphrase-distilroberta-base-v2 - title+abstract - agglomerative - null - 0.7 - null - null - null
2019 - paraphrase-distilroberta-base-v2_titles_abstracts.pkl - paraphrase-distilroberta-base-v2 - title+abstract - agglomerative - 10 - null - null - null - null
2019 - paraphrase-distilroberta-base-v2_titles_abstracts.pkl - paraphrase-distilroberta-base-v2 - title+abstract - agglomerative - 15 - null - null - null - null
2019 - paraphrase-distilroberta-base-v2_titles_abstracts.pkl - paraphrase-distilroberta-base-v2 - title+abstract - agglomerative - 20 - null - null - null - null
2019 - paraphrase-distilroberta-base-v2_titles_abstracts.pkl - paraphrase-distilroberta-base-v2 - title+abstract - agglomerative - 25 - null - null - null - null
2019 - paraphrase-distilroberta-ba

2019 - paraphrase-distilroberta-base-v2_titles_abstracts_sent_mean.pkl - paraphrase-distilroberta-base-v2 - title+abstract_sent_mean - topic - null - null - 200 - 20 - 30
2019 - paraphrase-distilroberta-base-v2_titles_abstracts_sent_mean.pkl - paraphrase-distilroberta-base-v2 - title+abstract_sent_mean - topic - null - null - 200 - 50 - 15
2019 - paraphrase-distilroberta-base-v2_titles_abstracts_sent_mean.pkl - paraphrase-distilroberta-base-v2 - title+abstract_sent_mean - topic - null - null - 200 - 100 - 2
2019 - paraphrase-mpnet-base-v2_titles.pkl - paraphrase-mpnet-base-v2 - title - kmeans - 10 - null - null - null - null
2019 - paraphrase-mpnet-base-v2_titles.pkl - paraphrase-mpnet-base-v2 - title - kmeans - 15 - null - null - null - null
2019 - paraphrase-mpnet-base-v2_titles.pkl - paraphrase-mpnet-base-v2 - title - kmeans - 20 - null - null - null - null
2019 - paraphrase-mpnet-base-v2_titles.pkl - paraphrase-mpnet-base-v2 - title - kmeans - 25 - null - null - null - null
2019 - 

2019 - paraphrase-mpnet-base-v2_titles_abstracts.pkl - paraphrase-mpnet-base-v2 - title+abstract - topic - null - null - 200 - 100 - 2
2019 - paraphrase-mpnet-base-v2_titles_abstracts_sent_mean.pkl - paraphrase-mpnet-base-v2 - title+abstract_sent_mean - kmeans - 10 - null - null - null - null
2019 - paraphrase-mpnet-base-v2_titles_abstracts_sent_mean.pkl - paraphrase-mpnet-base-v2 - title+abstract_sent_mean - kmeans - 15 - null - null - null - null
2019 - paraphrase-mpnet-base-v2_titles_abstracts_sent_mean.pkl - paraphrase-mpnet-base-v2 - title+abstract_sent_mean - kmeans - 20 - null - null - null - null
2019 - paraphrase-mpnet-base-v2_titles_abstracts_sent_mean.pkl - paraphrase-mpnet-base-v2 - title+abstract_sent_mean - kmeans - 25 - null - null - null - null
2019 - paraphrase-mpnet-base-v2_titles_abstracts_sent_mean.pkl - paraphrase-mpnet-base-v2 - title+abstract_sent_mean - kmeans - 30 - null - null - null - null
2019 - paraphrase-mpnet-base-v2_titles_abstracts_sent_mean.pkl - parap

2019 - paraphrase-TinyBERT-L6-v2_titles.pkl - paraphrase-TinyBERT-L6-v2 - title - topic - null - null - 200 - 20 - 30
2019 - paraphrase-TinyBERT-L6-v2_titles.pkl - paraphrase-TinyBERT-L6-v2 - title - topic - null - null - 200 - 50 - 15
2019 - paraphrase-TinyBERT-L6-v2_titles.pkl - paraphrase-TinyBERT-L6-v2 - title - topic - null - null - 200 - 100 - 2
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract - kmeans - 10 - null - null - null - null
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract - kmeans - 15 - null - null - null - null
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract - kmeans - 20 - null - null - null - null
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract - kmeans - 25 - null - null - null - null
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts.pkl - paraphrase-TinyBERT-L6-v2 

2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts_sent_mean.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract_sent_mean - topic - null - null - 150 - 5 - 20
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts_sent_mean.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract_sent_mean - topic - null - null - 200 - 10 - 20
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts_sent_mean.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract_sent_mean - topic - null - null - 200 - 10 - 80
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts_sent_mean.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract_sent_mean - topic - null - null - 200 - 10 - 384
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts_sent_mean.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract_sent_mean - topic - null - null - 200 - 20 - 15
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts_sent_mean.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract_sent_mean - topic - null - null - 200 - 20 - 30
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts_sent_mea

In [21]:
# Create dataframe for _c_lustering _e_valuation with column 'config'
df_ce = pd.DataFrame([[c] for c in config2data], columns=["config"])
df_ce

Unnamed: 0,config
0,2019 - allenai-specter_titles.pkl - allenai-sp...
1,2019 - allenai-specter_titles.pkl - allenai-sp...
2,2019 - allenai-specter_titles.pkl - allenai-sp...
3,2019 - allenai-specter_titles.pkl - allenai-sp...
4,2019 - allenai-specter_titles.pkl - allenai-sp...
...,...
352,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...
353,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...
354,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...
355,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...


In [27]:
# Evaluate metrics 
df_ce = evaluate_metrics(df_ce, config2data)

1/357 configurations
2/357 configurations
3/357 configurations
4/357 configurations
5/357 configurations
6/357 configurations
7/357 configurations
8/357 configurations
9/357 configurations
10/357 configurations
11/357 configurations
12/357 configurations
13/357 configurations
14/357 configurations
15/357 configurations
16/357 configurations
17/357 configurations
18/357 configurations
19/357 configurations
20/357 configurations
21/357 configurations
22/357 configurations
23/357 configurations
24/357 configurations
25/357 configurations
26/357 configurations
27/357 configurations
28/357 configurations
29/357 configurations
30/357 configurations
31/357 configurations
32/357 configurations
33/357 configurations
34/357 configurations
35/357 configurations
36/357 configurations
37/357 configurations
38/357 configurations
39/357 configurations
40/357 configurations
41/357 configurations
42/357 configurations
43/357 configurations
44/357 configurations
45/357 configurations
46/357 configuratio

In [28]:
print("Number of clusterings:", len(df_ce))
df_ce

357


Unnamed: 0,config,num_clusters,min_members,max_members,mean_members,median_members,intra_similarity_sem_scholar_5,inter_similarity_sem_scholar_5,intra_similarity_cso_enhanced_5,inter_similarity_cso_enhanced_5,...,inter_similarity_sem_scholar_15,intra_similarity_cso_enhanced_15,inter_similarity_cso_enhanced_15,accuracy_new_papers_sem_scholar,ranking_score_new_papers_sem_scholar,absolute_score_new_papers_sem_scholar,accuracy_new_papers_cso_enhanced,ranking_score_new_papers_cso_enhanced,absolute_score_new_papers_cso_enhanced,papers_clustered
0,2019 - allenai-specter_titles.pkl - allenai-sp...,10.0,834.0,1775.0,1345.700000,1392.0,0.685426,9.555556e-02,0.837812,0.028889,...,0.014127,0.943590,0.029206,0.204344,0.384592,0.131923,0.280142,0.479073,1.196837,1.000000
1,2019 - allenai-specter_titles.pkl - allenai-sp...,15.0,408.0,1551.0,897.133333,883.0,0.703512,2.952381e-02,0.853639,0.087619,...,-0.020714,0.953988,0.017959,0.162677,0.325285,0.153040,0.279699,0.452538,1.362130,1.000000
2,2019 - allenai-specter_titles.pkl - allenai-sp...,20.0,337.0,1102.0,672.850000,703.5,0.709529,9.578947e-02,0.855512,0.032632,...,0.001241,0.952921,0.033684,0.137855,0.276878,0.159520,0.253103,0.408462,1.384199,1.000000
3,2019 - allenai-specter_titles.pkl - allenai-sp...,25.0,145.0,856.0,538.280000,525.0,0.721358,1.866667e-01,0.861880,0.051333,...,0.027940,0.956241,0.046917,0.160461,0.271853,0.168357,0.241135,0.388858,1.410478,1.000000
4,2019 - allenai-specter_titles.pkl - allenai-sp...,30.0,131.0,758.0,448.566667,448.0,0.723910,8.804598e-02,0.864502,0.074713,...,0.033547,0.956618,0.033892,0.151596,0.249044,0.171595,0.225621,0.368594,1.427782,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...,8.0,293.0,8298.0,1682.125000,544.5,0.775986,8.095238e-02,0.910386,-0.114286,...,0.036565,0.974500,-0.039966,0.079344,0.435984,0.131612,0.188830,0.402729,1.236817,0.383369
353,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...,8.0,335.0,7859.0,1682.125000,561.5,0.770013,2.857143e-02,0.900655,-0.114286,...,0.066156,0.975629,-0.002381,0.088209,0.440594,0.133342,0.201241,0.409422,1.232118,0.415992
354,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...,7.0,332.0,6903.0,1922.428571,602.0,0.780477,3.533333e-01,0.900561,-0.073333,...,0.046905,0.971056,-0.104524,0.122340,0.471950,0.131913,0.211436,0.449288,1.226350,0.487033
355,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...,8.0,334.0,7699.0,1682.125000,615.0,0.772303,5.947623e-18,0.902216,-0.038095,...,0.109184,0.974839,0.015136,0.088652,0.441101,0.132390,0.196809,0.403238,1.221877,0.427881


In [29]:
# Store df_ce on disk
df_ce.to_csv("data/clusters/cluster_evaluation_all_models_all_text_sets.csv", sep="|", index=False)

In [40]:
# Load df_ce from disk
df_ce = pd.read_csv("data/clusters/cluster_evaluation_all_models_all_text_sets.csv", sep="|")

In [41]:
# Filter out configurations with too less or too many clusters
df_ce_filtered = df_ce[df_ce["num_clusters"] >= 10.0]
df_ce_filtered = df_ce_filtered[df_ce_filtered["num_clusters"] <= 50.0]
df_ce_filtered = df_ce_filtered[df_ce_filtered["median_members"] >= 200]
print(len(df_ce_filtered))

151


In [42]:
# Make absolute inter similarity score because of spearman correlation
df_ce_filtered['inter_similarity_sem_scholar_5'] = df_ce_filtered['inter_similarity_sem_scholar_5'].abs()
df_ce_filtered['inter_similarity_sem_scholar_10'] = df_ce_filtered['inter_similarity_sem_scholar_10'].abs()
df_ce_filtered['inter_similarity_sem_scholar_15'] = df_ce_filtered['inter_similarity_sem_scholar_15'].abs()
df_ce_filtered['inter_similarity_cso_enhanced_5'] = df_ce_filtered['inter_similarity_cso_enhanced_5'].abs()
df_ce_filtered['inter_similarity_cso_enhanced_10'] = df_ce_filtered['inter_similarity_cso_enhanced_10'].abs()
df_ce_filtered['inter_similarity_cso_enhanced_15'] = df_ce_filtered['inter_similarity_cso_enhanced_15'].abs()

In [43]:
print("Number of remaining clusterings:", len(df_ce_filtered))
df_ce_filtered

151


Unnamed: 0,config,num_clusters,min_members,max_members,mean_members,median_members,intra_similarity_sem_scholar_5,inter_similarity_sem_scholar_5,intra_similarity_cso_enhanced_5,inter_similarity_cso_enhanced_5,...,inter_similarity_sem_scholar_15,intra_similarity_cso_enhanced_15,inter_similarity_cso_enhanced_15,accuracy_new_papers_sem_scholar,ranking_score_new_papers_sem_scholar,absolute_score_new_papers_sem_scholar,accuracy_new_papers_cso_enhanced,ranking_score_new_papers_cso_enhanced,absolute_score_new_papers_cso_enhanced,papers_clustered
0,2019 - allenai-specter_titles.pkl - allenai-sp...,10.0,834.0,1775.0,1345.700000,1392.0,0.685426,0.095556,0.837812,0.028889,...,0.014127,0.943590,0.029206,0.204344,0.384592,0.131923,0.280142,0.479073,1.196837,1.0
1,2019 - allenai-specter_titles.pkl - allenai-sp...,15.0,408.0,1551.0,897.133333,883.0,0.703512,0.029524,0.853639,0.087619,...,0.020714,0.953988,0.017959,0.162677,0.325285,0.153040,0.279699,0.452538,1.362130,1.0
2,2019 - allenai-specter_titles.pkl - allenai-sp...,20.0,337.0,1102.0,672.850000,703.5,0.709529,0.095789,0.855512,0.032632,...,0.001241,0.952921,0.033684,0.137855,0.276878,0.159520,0.253103,0.408462,1.384199,1.0
3,2019 - allenai-specter_titles.pkl - allenai-sp...,25.0,145.0,856.0,538.280000,525.0,0.721358,0.186667,0.861880,0.051333,...,0.027940,0.956241,0.046917,0.160461,0.271853,0.168357,0.241135,0.388858,1.410478,1.0
4,2019 - allenai-specter_titles.pkl - allenai-sp...,30.0,131.0,758.0,448.566667,448.0,0.723910,0.088046,0.864502,0.074713,...,0.033547,0.956618,0.033892,0.151596,0.249044,0.171595,0.225621,0.368594,1.427782,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...,30.0,94.0,651.0,448.566667,486.0,0.706797,0.005517,0.855646,0.077471,...,0.014433,0.953812,0.024080,0.127660,0.231352,0.174889,0.273493,0.426184,1.519676,1.0
332,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...,35.0,138.0,639.0,384.485714,394.0,0.713623,0.007731,0.863148,0.072605,...,0.001555,0.956959,0.022923,0.107713,0.209076,0.180362,0.222961,0.381110,1.525443,1.0
333,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...,40.0,61.0,611.0,336.425000,329.5,0.721974,0.007051,0.859576,0.075513,...,0.004574,0.955896,0.023466,0.106383,0.195444,0.182329,0.272163,0.402407,1.553058,1.0
334,2019 - paraphrase-TinyBERT-L6-v2_titles_abstra...,45.0,115.0,621.0,299.044444,308.0,0.737014,0.019293,0.867625,0.021010,...,0.004708,0.959419,0.021670,0.107713,0.193764,0.188680,0.250887,0.385271,1.580437,1.0


In [97]:
# Show best config for each metric
print(df_ce_filtered.sort_values(by=['accuracy_new_papers_sem_scholar'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['accuracy_new_papers_cso_enhanced'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['ranking_score_new_papers_sem_scholar'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['ranking_score_new_papers_cso_enhanced'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['absolute_score_new_papers_sem_scholar'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['absolute_score_new_papers_cso_enhanced'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['intra_similarity_sem_scholar_5'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['intra_similarity_sem_scholar_10'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['intra_similarity_sem_scholar_15'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['intra_similarity_cso_enhanced_5'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['intra_similarity_cso_enhanced_10'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['intra_similarity_cso_enhanced_15'], ascending=False).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['inter_similarity_sem_scholar_5']).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['inter_similarity_sem_scholar_10']).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['inter_similarity_sem_scholar_15']).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['inter_similarity_cso_enhanced_5']).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['inter_similarity_cso_enhanced_10']).iloc[0]["config"])
print(df_ce_filtered.sort_values(by=['inter_similarity_cso_enhanced_15']).iloc[0]["config"])

2019 - paraphrase-mpnet-base-v2_titles.pkl - paraphrase-mpnet-base-v2 - title - kmeans - 10 - null - null - null - null
2019 - allenai-specter_titles_abstracts.pkl - allenai-specter - title+abstract - kmeans - 10 - null - null - null - null
2019 - paraphrase-mpnet-base-v2_titles.pkl - paraphrase-mpnet-base-v2 - title - topic - null - null - 200 - 20 - 30
2019 - allenai-specter_titles_abstracts.pkl - allenai-specter - title+abstract - kmeans - 10 - null - null - null - null
2019 - allenai-specter_titles_abstracts.pkl - allenai-specter - title+abstract - kmeans - 45 - null - null - null - null
2019 - paraphrase-distilroberta-base-v2_titles_abstracts.pkl - paraphrase-distilroberta-base-v2 - title+abstract - kmeans - 50 - null - null - null - null
2019 - paraphrase-TinyBERT-L6-v2_titles_abstracts.pkl - paraphrase-TinyBERT-L6-v2 - title+abstract - topic - null - null - 100 - 400 - 30
2019 - paraphrase-mpnet-base-v2_titles.pkl - paraphrase-mpnet-base-v2 - title - topic - null - null - 100 - 

In [44]:
# Manually check metrics, one metric should not filter too many configurations
df_ce_filtered = df_ce_filtered[df_ce_filtered["inter_similarity_sem_scholar_5"] <= 0.1]
df_ce_filtered = df_ce_filtered[df_ce_filtered["inter_similarity_sem_scholar_10"] <= 0.1]
df_ce_filtered = df_ce_filtered[df_ce_filtered["inter_similarity_sem_scholar_15"] <= 0.1]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["inter_similarity_cso_enhanced_5"] <= 0.1]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["inter_similarity_cso_enhanced_10"] <= 0.2]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["inter_similarity_cso_enhanced_15"] <= 0.2]

df_ce_filtered = df_ce_filtered[df_ce_filtered["intra_similarity_sem_scholar_5"] >= 0.72]
df_ce_filtered = df_ce_filtered[df_ce_filtered["intra_similarity_sem_scholar_10"] >= 0.8]
df_ce_filtered = df_ce_filtered[df_ce_filtered["intra_similarity_sem_scholar_15"] >= 0.8]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["intra_similarity_cso_enhanced_5"] >= 0.8]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["intra_similarity_cso_enhanced_10"] >= 0.9]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["intra_similarity_cso_enhanced_15"] >= 0.9]

df_ce_filtered = df_ce_filtered[df_ce_filtered["accuracy_new_papers_sem_scholar"] >= 0.15]
df_ce_filtered = df_ce_filtered[df_ce_filtered["ranking_score_new_papers_sem_scholar"] >= 0.3]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["absolute_score_new_papers_sem_scholar"] >= 0.1]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["accuracy_new_papers_cso_enhanced"] >= 0.1]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["ranking_score_new_papers_cso_enhanced"] >= 0.2]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["absolute_score_new_papers_cso_enhanced"] >= 1.3]
#df_ce_filtered = df_ce_filtered[df_ce_filtered["absolute_score_new_papers_cso_enhanced"] >= 1.0]

df_ce_filtered = df_ce_filtered[df_ce_filtered["papers_clustered"] >= 0.5]

In [45]:
print("Number of remaining clusterings:", len(df_ce_filtered))
print(list(df_ce_filtered["config"]))
df_ce_filtered

4
['2019 - allenai-specter_titles_abstracts.pkl - allenai-specter - title+abstract - kmeans - 15 - null - null - null - null', '2019 - paraphrase-distilroberta-base-v2_titles.pkl - paraphrase-distilroberta-base-v2 - title - kmeans - 20 - null - null - null - null', '2019 - paraphrase-mpnet-base-v2_titles.pkl - paraphrase-mpnet-base-v2 - title - kmeans - 15 - null - null - null - null', '2019 - paraphrase-TinyBERT-L6-v2_titles.pkl - paraphrase-TinyBERT-L6-v2 - title - kmeans - 20 - null - null - null - null']


Unnamed: 0,config,num_clusters,min_members,max_members,mean_members,median_members,intra_similarity_sem_scholar_5,inter_similarity_sem_scholar_5,intra_similarity_cso_enhanced_5,inter_similarity_cso_enhanced_5,...,inter_similarity_sem_scholar_15,intra_similarity_cso_enhanced_15,inter_similarity_cso_enhanced_15,accuracy_new_papers_sem_scholar,ranking_score_new_papers_sem_scholar,absolute_score_new_papers_sem_scholar,accuracy_new_papers_cso_enhanced,ranking_score_new_papers_cso_enhanced,absolute_score_new_papers_cso_enhanced,papers_clustered
30,2019 - allenai-specter_titles_abstracts.pkl - ...,15.0,469.0,1282.0,897.133333,930.0,0.723567,0.02,0.857161,0.051429,...,0.027857,0.958907,0.002075,0.176862,0.320137,0.176693,0.308954,0.496687,1.438828,1.0
89,2019 - paraphrase-distilroberta-base-v2_titles...,20.0,286.0,1070.0,672.85,695.0,0.727509,0.008947,0.875047,0.000526,...,0.013365,0.952762,0.009004,0.18617,0.312755,0.171167,0.302305,0.444324,1.482432,1.0
178,2019 - paraphrase-mpnet-base-v2_titles.pkl - p...,15.0,329.0,1397.0,897.133333,858.0,0.729748,0.064762,0.863383,0.07619,...,0.022211,0.949832,0.030646,0.164007,0.332899,0.146624,0.236702,0.405862,1.345371,1.0
269,2019 - paraphrase-TinyBERT-L6-v2_titles.pkl - ...,20.0,150.0,1078.0,672.85,747.0,0.751635,0.032105,0.875585,0.019474,...,0.015301,0.95704,0.002914,0.156915,0.306498,0.177697,0.258422,0.409958,1.445785,1.0


In [109]:
# Best model, text set, algorithm and config
# Select this one because all papers are classified, other metrics where very good
print(df_ce_filtered.loc[89]["config"])
df_ce_filtered.loc[89]

2019 - paraphrase-distilroberta-base-v2_titles.pkl - paraphrase-distilroberta-base-v2 - title - kmeans - 20 - null - null - null - null


config                                    2019 - paraphrase-distilroberta-base-v2_titles...
num_clusters                                                                           20.0
min_members                                                                           286.0
max_members                                                                          1070.0
mean_members                                                                         672.85
median_members                                                                        695.0
intra_similarity_sem_scholar_5                                                     0.727509
inter_similarity_sem_scholar_5                                                     0.008947
intra_similarity_cso_enhanced_5                                                    0.875047
inter_similarity_cso_enhanced_5                                                    0.000526
intra_similarity_sem_scholar_10                                                 