In [1]:
import clustering_algorithms as ca
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import json
import util
import os

In [2]:
# Load the anthology with all information.
# The columns listed below are run through a converter that strips the
# surrounding brackets and all single quotes and splits on ", ", so each cell
# ends up as a list of strings.
def _split_list_string(cell):
    """Turn a stringified list such as "['a', 'b']" into ['a', 'b']."""
    return cell.strip("[]").replace("'", "").split(", ")

list_columns = [
    "semantic_scholar_keywords",
    "cso_syntactic",
    "cso_semantic",
    "cso_union",
    "cso_enhanced",
]
df = pd.read_csv(
    "data/anthology_conferences.csv",
    sep="|",
    keep_default_na=False,
    converters={column: _split_list_string for column in list_columns},
)
df[:2]

Unnamed: 0,url,publisher,address,year,month,editor,title,ENTRYTYPE,ID,pages,...,note,pdf,abstract,semantic_scholar,semantic_scholar_authorIds,semantic_scholar_keywords,cso_syntactic,cso_semantic,cso_union,cso_enhanced
0,https://www.aclweb.org/anthology/2020.acl-main.1,Association for Computational Linguistics,Online,2020,July,,Learning to Understand Child-directed and Adul...,inproceedings,gelderloos-etal-2020-learning,1--6,...,,2020.acl-main.1.pdf,Speech directed to children differs from adult...,2020.acl-main.1.json,"['7805500', '2756960', '103538973']","[1017215, 1588157]","[linguistics, acoustics, language acquisition,...","[speech signals, synthetic speech, linguistics...","[linguistics, automatic speech recognition, ac...","[speech recognition, signal processing, educat..."
1,https://www.aclweb.org/anthology/2020.acl-main.2,Association for Computational Linguistics,Online,2020,July,,Predicting Depression in Screening Interviews ...,inproceedings,rinaldi-etal-2020-predicting,7--18,...,,2020.acl-main.2.pdf,Despite the pervasiveness of clinical depressi...,2020.acl-main.2.json,"['19320780', '2457504', '37202877']",[8505],"[linguistics, pattern languages, psycholinguis...","[latent variable, latent factor, linguistics, ...","[latent factor, linguistics, dialogue, pattern...","[matrix factorizations, argumentation, speech ..."


In [5]:
# Filters the embeddings to those whose paper's year is <= last_year
def filter_embeddings(embeddings, df, last_year=None):
    """Keep only the embeddings of papers published in or before ``last_year``.

    Parameters
    ----------
    embeddings : sequence
        Embeddings, looked up with the index labels of ``df`` (with the default
        integer index this is simply the row position).
    df : pandas.DataFrame
        Frame with a ``year`` column; each row corresponds to one embedding.
    last_year : int or None, optional
        Latest year to keep (inclusive). ``None`` disables filtering.

    Returns
    -------
    The ``embeddings`` object itself when ``last_year`` is ``None``; otherwise a
    new list holding the selected embeddings in row order.
    """
    if last_year is None:
        return embeddings

    # Iterate over the ``year`` column only; building a full row Series per
    # paper (as ``iterrows`` does) is unnecessary for a single-column test.
    return [
        embeddings[label]
        for label, year in df["year"].items()
        if year <= last_year
    ]

# Create clusters with the imported clustering algorithms
- Store them in a JSON file for further testing and evaluation.
- Only one embedding is used here: paraphrase-mpnet-base-v2 on titles + abstracts (paraphrase-mpnet-base-v2_titles_abstracts.pkl).

In [4]:
# Load the title+abstract embeddings (the first return value is discarded).
# They are needed by the parameter cell below, which derives candidate
# component counts from len(embeddings[0]).
_, embeddings = util.load_embeddings("paraphrase-mpnet-base-v2_titles_abstracts.pkl")

In [5]:
# Specify clustering parameters.
# Each parameter dictionary is keyed by algorithm name and lists the values to
# try; a list containing only None holds a single placeholder value.
# Note: "fast" has entries below but is not listed in cluster_algorithms.
pretrained_models = ["paraphrase-mpnet-base-v2"]
text_sets = ["title+abstract"]
cluster_algorithms = ["kmeans", "agglomerative", "topic"]

_cluster_counts = list(range(10, 51, 5))  # 10, 15, ..., 50
_thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
_embedding_dim = len(embeddings[0])

num_clusters = {
    "kmeans": list(_cluster_counts),
    "agglomerative": [None] + _cluster_counts,
    "fast": [None],
    "topic": [None],
}
distance_thresholds = {
    "kmeans": [None],
    "agglomerative": [None] + _thresholds,
    "fast": list(_thresholds),
    "topic": [None],
}
min_cluster_size = {
    "kmeans": [None],
    "agglomerative": [None],
    "fast": [None],
    "topic": [100, 150, 200, 250, 300, 350, 400, 500],
}
neighbors = {
    "kmeans": [None],
    "agglomerative": [None],
    "fast": [None],
    "topic": [2, 5, 10, 20, 50, 75, 100, 200, 400],
}
components = {
    "kmeans": [None],
    "agglomerative": [None],
    "fast": [None],
    # fixed sizes plus a quarter, a half and the full embedding dimension
    "topic": [2, 5, 10, 15, 20, 30, 40, 60, 80, 100,
              _embedding_dim // 4, _embedding_dim // 2, _embedding_dim],
}

In [6]:
# Execute the clustering algorithms based on the specified parameters
def _candidate_thresholds(algorithm, num_cluster):
    """Return the distance thresholds to try for one algorithm / cluster count."""
    if algorithm == "agglomerative":
        # Agglomerative clustering is driven either by a cluster count or by a
        # distance threshold: with a count no threshold is passed, without one
        # the leading None entry of the threshold list is skipped.
        if num_cluster is not None:
            return [None]
        return distance_thresholds[algorithm][1:]
    return distance_thresholds[algorithm]


def _cluster_once(algorithm, embeddings, num_cluster, distance_threshold,
                  cluster_size, num_neighbors, component):
    """Run a single clustering call and return ``(cluster2indices, labels)``."""
    if algorithm == "kmeans":
        cluster2indices, labels, _ = ca.kmeans(
            embeddings, num_clusters=num_cluster, random_state=42)
    elif algorithm == "agglomerative":
        cluster2indices, labels = ca.agglomerative_clustering(
            embeddings, num_clusters=num_cluster, distance_threshold=distance_threshold)
    elif algorithm == "fast":
        cluster2indices, labels, _ = ca.fast_clustering(
            embeddings, threshold=distance_threshold)
    elif algorithm == "topic":
        cluster2indices, labels = ca.topic_clustering(
            embeddings, n_neighbors=num_neighbors, n_components=component,
            min_cluster_size=cluster_size)
    else:
        # Fail loudly instead of continuing with unbound or stale results.
        raise ValueError("Unknown clustering algorithm: {}".format(algorithm))
    return cluster2indices, labels


def extensive_clustering(df, last_year=None):
    """Run every configured clustering and collect the results in a nested dict.

    The parameter grid is read from the module-level names ``pretrained_models``,
    ``text_sets``, ``cluster_algorithms``, ``num_clusters``,
    ``distance_thresholds``, ``min_cluster_size``, ``neighbors`` and
    ``components``, so the parameter cell must have been executed first.

    NOTE: a later cell of this notebook defines another function with the same
    name; this version has to be (re-)run before the cell that calls it.

    Parameters
    ----------
    df : pandas.DataFrame
        Anthology frame; passed to ``filter_embeddings`` together with
        ``last_year``.
    last_year : int or None, optional
        Only embeddings of papers with ``year <= last_year`` are clustered;
        ``None`` keeps all embeddings.

    Returns
    -------
    dict
        ``result[model][text_set][algorithm]["num_clusters"][n]``
        ``["distance_thresholds"][t]["min_cluster_sizes"][s]["neighbors"][k]``
        ``["components"][c]`` is a dict with the keys ``"cluster2indices"`` and
        ``"labels"``.

    Raises
    ------
    ValueError
        If a text set or an algorithm name is not supported.
    """
    # For each algorithm/config pair store the clustering results
    config2clusters = dict()

    # Use all pretrained models
    for pretrained_model in pretrained_models:
        model_results = dict()
        config2clusters[pretrained_model] = model_results

        # combined with all text variants
        for text_set in text_sets:
            text_results = dict()
            model_results[text_set] = text_results

            # Load embeddings corresponding to pretrained_model and text_set
            if text_set == "title":
                _, embeddings = util.load_embeddings(pretrained_model + "_titles.pkl")
            elif text_set == "title+abstract":
                _, embeddings = util.load_embeddings(pretrained_model + "_titles_abstracts.pkl")
            else:
                raise ValueError("Unknown text set: {}".format(text_set))

            # Filter embeddings
            embeddings = filter_embeddings(embeddings, df, last_year)

            # use all clustering algorithms
            for algorithm in cluster_algorithms:
                by_num_clusters = dict()
                text_results[algorithm] = {"num_clusters": by_num_clusters}

                # use all numbers of clusters
                for num_cluster in num_clusters[algorithm]:
                    by_threshold = dict()
                    by_num_clusters[num_cluster] = {"distance_thresholds": by_threshold}

                    # use all distance thresholds
                    for distance_threshold in _candidate_thresholds(algorithm, num_cluster):
                        by_cluster_size = dict()
                        by_threshold[distance_threshold] = {"min_cluster_sizes": by_cluster_size}

                        # use all cluster sizes
                        for cluster_size in min_cluster_size[algorithm]:
                            by_neighbors = dict()
                            by_cluster_size[cluster_size] = {"neighbors": by_neighbors}

                            # use all numbers of neighbors
                            for num_neighbors in neighbors[algorithm]:
                                by_components = dict()
                                by_neighbors[num_neighbors] = {"components": by_components}

                                # use all number of components
                                for component in components[algorithm]:
                                    cluster2indices, labels = _cluster_once(
                                        algorithm, embeddings, num_cluster,
                                        distance_threshold, cluster_size,
                                        num_neighbors, component)

                                    by_components[component] = {
                                        "cluster2indices": cluster2indices,
                                        "labels": labels,
                                    }

                                    print("Finished alg={}, num_clusters={}, threshold={}, cluster_size={}, num_neighbors={}, n_components={}".format(algorithm, num_cluster, distance_threshold, cluster_size, num_neighbors, component))

    return config2clusters

In [7]:
# Run the full parameter sweep once without a year limit and once each for the
# papers up to 2019 and up to 2020; every result is written to its own file.
sweep_outputs = [
    (None, "data/clusters/paraphrase-mpnet-base-v2_titles_abstracts_None.json"),
    (2019, "data/clusters/paraphrase-mpnet-base-v2_titles_abstracts_2019.json"),
    (2020, "data/clusters/paraphrase-mpnet-base-v2_titles_abstracts_2020.json"),
]
for year_limit, output_path in sweep_outputs:
    config2clusters = extensive_clustering(df, last_year=year_limit)
    with open(output_path, "w") as jf:
        json.dump(config2clusters, jf)

NameError: name 'pretrained_models' is not defined

# Create clusterings with the prefiltered configurations
- Store them in a JSON file for further testing and evaluation.
- Here we use all embeddings together with the configurations filtered in the notebook cluster_evaluation.ipynb.

In [8]:
# Execute the filtered clustering algorithms based on the filtered specified parameters
def _optional(value, cast):
    """Cast a config value; JSON null (None) and the string "null" map to None."""
    if value is None or value == "null":
        return None
    return cast(value)


def extensive_clustering(df, last_year=None):
    """Cluster every embedding file with each of the preselected configurations.

    The configurations are read from
    ``data/clusters/paraphrase-mpnet-base-v2_titles_abstracts_2019_best_configs.json``
    and applied to every recognised file in ``data/embeddings/``.

    NOTE: this definition replaces the function of the same name defined in an
    earlier cell of this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Anthology frame; passed to ``filter_embeddings`` together with
        ``last_year``.
    last_year : int or None, optional
        Only embeddings of papers with ``year <= last_year`` are clustered;
        ``None`` keeps all embeddings.

    Returns
    -------
    dict
        ``result[pretrained_model][text_set]`` is a list with one dict per
        configuration: a copy of the configuration extended by the keys
        ``"cluster2indices"``, ``"labels"`` and ``"centers"``.

    Raises
    ------
    ValueError
        If a configuration names an unsupported algorithm.
    """
    # For each algorithm/config pair store the clustering results
    config2clusters = dict()

    # Load configs with best evaluation result on clustering results with embeddings
    # from 'paraphrase-mpnet-base-v2' with titles and abstracts
    with open("data/clusters/paraphrase-mpnet-base-v2_titles_abstracts_2019_best_configs.json") as jf:
        best_configs = json.load(jf)

    # Use all available embeddings (sorted, because os.listdir order is arbitrary)
    for filename in sorted(os.listdir("data/embeddings/")):

        # Extract pretrained model and text set used for embedding creation.
        # Slices are compared so that short file names cannot raise IndexError.
        parts = filename.split("_")
        pretrained_model = parts[0]
        if parts[1:4] == ["titles", "abstracts", "sent"]:
            text_set = "title+abstract" + "_sent_mean"
        elif parts[1:3] == ["titles", "abstracts.pkl"]:
            text_set = "title+abstract"
        elif parts[1:2] == ["titles.pkl"]:
            text_set = "title"
        else:
            # Skip unrecognised files; continuing would reuse the text set of
            # the previous file and overwrite its results.
            print("Warning", parts)
            continue

        _, embeddings = util.load_embeddings(filename)

        # Filter embeddings
        embeddings = filter_embeddings(embeddings, df, last_year=last_year)

        # Add pretrained model and text set to the dictionary
        results = []
        config2clusters.setdefault(pretrained_model, dict())[text_set] = results

        # Now use all configs in filtered configs
        for config in best_configs:
            algorithm = config["algorithm"]
            n_clusters = _optional(config["num_clusters"], int)
            threshold = _optional(config["distance_threshold"], float)
            cluster_size = _optional(config["min_cluster_size"], int)
            n_neighbors = _optional(config["neighbors"], int)
            n_components = _optional(config["components"], int)

            # Execute clustering; only kmeans centers are stored.
            centers = []
            if algorithm == "kmeans":
                cluster2indices, labels, centers = ca.kmeans(embeddings, num_clusters=n_clusters, random_state=42)
            elif algorithm == "agglomerative":
                cluster2indices, labels = ca.agglomerative_clustering(embeddings, num_clusters=n_clusters, distance_threshold=threshold)
            elif algorithm == "fast":
                # The returned centers were discarded in the original code as
                # well; that behaviour is kept.
                cluster2indices, labels, _ = ca.fast_clustering(embeddings, threshold=threshold)
            elif algorithm == "topic":
                cluster2indices, labels = ca.topic_clustering(embeddings, n_neighbors=n_neighbors, n_components=n_components, min_cluster_size=cluster_size)
            else:
                # Fail loudly instead of storing the previous config's results.
                raise ValueError("Unknown clustering algorithm: {}".format(algorithm))

            entry = config.copy()
            entry.update({
                "cluster2indices": cluster2indices,
                "labels": labels,
                # nested lists of Python floats
                "centers": [[float(value) for value in center] for center in centers],
            })
            results.append(entry)

            print("Finished alg={}, num_clusters={}, threshold={}, cluster_size={}, num_neighbors={}, n_components={}".format(algorithm, n_clusters, threshold, cluster_size, n_neighbors, n_components))

    return config2clusters

In [None]:
# Cluster all embeddings with the preselected configurations, once for the
# papers up to 2019 and once for the papers up to 2020.
best_config_outputs = [
    (2019, "data/clusters/all_embeddings_all_text_sets_best_configs_2019.json"),
    (2020, "data/clusters/all_embeddings_all_text_sets_best_configs_2020.json"),
]
for year_limit, output_path in best_config_outputs:
    config2clusters = extensive_clustering(df, last_year=year_limit)
    with open(output_path, "w") as jf:
        json.dump(config2clusters, jf)

Finished alg=kmeans, num_clusters=10, threshold=None, cluster_size=None, num_neighbors=None, n_components=None


KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_fast._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 2, in where
KeyboardInterrupt: 


Finished alg=kmeans, num_clusters=15, threshold=None, cluster_size=None, num_neighbors=None, n_components=None
Finished alg=kmeans, num_clusters=20, threshold=None, cluster_size=None, num_neighbors=None, n_components=None
Finished alg=kmeans, num_clusters=25, threshold=None, cluster_size=None, num_neighbors=None, n_components=None


# Perform selected final best clustering 

In [6]:
# Selected final configuration
#   embeddings: paraphrase-distilroberta-base-v2_titles.pkl
#   algorithm:  kmeans
#   #clusters:  20
texts, embeddings = util.load_embeddings("paraphrase-distilroberta-base-v2_titles.pkl")
cluster2indices, labels, centers = ca.kmeans(embeddings, num_clusters=20, random_state=42)

# Centers are converted to nested lists of Python floats before dumping
# (presumably so that json can serialise them).
results = {
    "cluster2indices": cluster2indices,
    "labels": labels,
    "centers": [[float(value) for value in center] for center in centers],
}
with open("data/clusters/final_best_one_clustering.json", "w") as jf:
    json.dump(results, jf)