<a href="https://colab.research.google.com/github/branjbar/AIDA/blob/master/python-code/corpus-to-map/rixai_util_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [210]:
%%capture
# Install/upgrade sentence-transformers; %%capture hides pip's output for this cell.
!pip install -U sentence-transformers

In [211]:
# Colab-only setup: `drive` mounts Google Drive, `files` is used later by
# export_json_for_js to trigger a browser download of the generated JSON.
from google.colab import drive, files
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [212]:
import warnings, os
import json
import pandas as pd, numpy as np
import matplotlib.cm as cm
import seaborn as sns
import spacy
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [213]:
# Root folder on the mounted drive; the processing loop looks for one
# "project-NNN/data" sub-folder per project underneath it.
PROJECTS_PATH = "/content/drive/My Drive/Freelancing/rixai/projects"

In [215]:
# read input file
def read_data(data_folder_path):
    """Load ``data_raw.csv`` from ``data_folder_path`` into a DataFrame.

    The file is first parsed as comma-separated. It is re-read with ``sep=";"``
    when either the comma parser fails, or it "succeeds" but yields a single
    column whose header contains ``;`` (the usual symptom of a
    semicolon-delimited export being read with the wrong separator).

    Args:
        data_folder_path: folder that contains ``data_raw.csv``.

    Returns:
        pandas.DataFrame with the raw tabular data.

    Raises:
        FileNotFoundError: if ``data_raw.csv`` does not exist.
        pandas.errors.ParserError: if the file parses with neither separator.
    """
    csv_path = os.path.join(data_folder_path, "data_raw.csv")
    try:
        df = pd.read_csv(csv_path)
        # A ';'-delimited file rarely raises: it just collapses into one column.
        needs_semicolon = df.shape[1] == 1 and ";" in str(df.columns[0])
    except pd.errors.ParserError:
        needs_semicolon = True

    if needs_semicolon:
        df = pd.read_csv(csv_path, sep=";")

    print("%i rows with %i columns is read from %s" % (df.shape[0], df.shape[1], data_folder_path))
    return df

In [216]:
# standardize columns names and fill missing
def standardize_column_names(df):
    """Rename columns to lower snake_case and add a ``uid`` column (in place).

    Parentheses and dots are dropped; ``-``, ``/``, ``_`` and runs of
    whitespace all become a single ``_``. A warning is printed if two source
    columns collapse to the same name.

    ``uid`` is copied from ``publication_id`` when that column exists,
    otherwise it is synthesised as ``"pub.mde.<row index>"``.

    Args:
        df: raw tabular DataFrame; it is modified in place.

    Returns:
        The same DataFrame object, with renamed columns and a ``uid`` column.
    """
    new_c_list = []
    for c in df.columns:
        c = str(c).replace("(", "").replace(")", "").replace(".", "")
        # Treat every separator as whitespace so that split() collapses runs.
        c = c.replace("-", " ").replace("/", " ").replace("_", " ")
        new_c_list.append("_".join(c.lower().split()))
    df.columns = new_c_list
    if len(set(new_c_list)) != len(new_c_list):
        print("warning: duplicate column names are generated.")

    if "publication_id" not in df.columns:
        # Use the frame that was passed in, not a notebook-level global.
        df['uid'] = "pub.mde." + df.index.astype(str)
    else:
        df['uid'] = df["publication_id"]

    print("column names standardized")
    return df

In [217]:
# fix data
def fix_data(df_tabular):
    """Keep one row per ``uid`` among rows whose ``uid`` starts with ``'pub.'``.

    Rows with a missing ``uid`` are dropped as well (the comparison with
    ``True`` turns missing results into ``False``). The number of discarded
    rows is printed.

    Returns:
        A filtered copy; the input frame is not modified.
    """
    has_pub_prefix = df_tabular["uid"].str.startswith('pub.') == True
    candidates = df_tabular.loc[has_pub_prefix]
    df_tabular_fixed = candidates.drop_duplicates(subset=["uid"]).copy()

    removed_rows = df_tabular.shape[0] - df_tabular_fixed.shape[0]
    print("%i rows removed to fix data." % removed_rows)
    return df_tabular_fixed

In [218]:
# melt data
def melt_data(df):
    """Reshape the wide table into long ``(uid, feature, value)`` rows.

    Every column other than ``uid`` becomes a ``feature`` name, with the
    cell content stored in ``value``.
    """
    long_df = pd.melt(df, id_vars=["uid"], var_name="feature", value_name="value")
    print("data is melted")
    return long_df

In [219]:
def add_text_feature(df, df_1, df_2):
    """Append a ``text`` feature built as ``"<df_1 value>. <df_2 value>"``.

    Args:
        df: long-format frame with columns ``uid``, ``feature``, ``value``.
        df_1: long-format rows supplying the first half of the text.
        df_2: long-format rows supplying the second half of the text.

    Returns:
        ``df`` with one additional ``feature == "text"`` row per merged uid.
        The inputs are not modified.
    """
    df_1 = df_1.copy()
    df_2 = df_2.copy()

    df_1["value"] = df_1["value"].fillna("")
    df_2["value"] = df_2["value"].fillna("")
    df_3 = df_1.merge(df_2, on="uid", how='outer')

    # The outer merge re-introduces NaN for uids present on only one side;
    # blank them out so the concatenation below always yields a string.
    first_part = df_3["value_x"].fillna("")
    second_part = df_3["value_y"].fillna("")
    df_3["value"] = (first_part + ". " + second_part).str.strip()
    df_3["feature"] = "text"
    df_3 = df_3[["uid", "feature", "value"]].copy()

    # Shorten the very long LNCS series name to its plain title.
    replace_from = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)"
    replace_to = "Lecture Notes in Computer Science"
    # regex=False: the series name contains parentheses and must match literally.
    df_3["value"] = df_3["value"].str.replace(replace_from, replace_to, regex=False)

    return pd.concat([df, df_3])

In [220]:
# get basic statistics
def get_statistics(df):
    """Summarise a long-format frame: its shape and distinct uid/feature counts."""
    return dict(
        df_size=df.shape,
        df_num_uid=df["uid"].nunique(),
        df_num_features=df["feature"].nunique(),
    )

In [221]:
# add keywords
def get_keyword_embeddings(data_folder_path):
    """Load, or build and cache, sentence embeddings for keyword triples.

    If ``data_keyword_embedding.pkl`` can be read from ``data_folder_path`` it
    is returned as is. Otherwise ``data_keyword.csv`` (tab separated, with a
    ``keyword`` column) is read, every strictly increasing triple of keywords
    (including the empty keyword) is joined as ``"a - b - c"``, up to
    ``sample_size`` triples are sampled, embedded with the
    ``bert-base-nli-stsb-mean-tokens`` model and pickled for the next run.

    Args:
        data_folder_path: project data folder holding the keyword files.

    Returns:
        DataFrame with at least ``keyword`` and ``embedding`` columns, or an
        empty DataFrame when the embeddings can be neither loaded nor built.
    """
    keyword_embedding_file_path = os.path.join(data_folder_path, "data_keyword_embedding.pkl")
    keyword_file_path = os.path.join(data_folder_path, "data_keyword.csv")
    sample_size = 5000

    try:
        # NOTE: unpickling executes arbitrary code; only load trusted cache files.
        df_keyword = pd.read_pickle(keyword_embedding_file_path)
        print("keyword embedding loaded from file.")
        return df_keyword
    except Exception:
        # Missing or unreadable cache: fall through and rebuild it.
        pass

    try:
        df_keyword = pd.read_csv(keyword_file_path, sep="\t")
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        empty_keyword = pd.DataFrame([{"keyword": ''}])
        df_keyword = pd.concat([df_keyword, empty_keyword], ignore_index=True).reset_index()

        # Cross join the keyword list with itself twice via a constant key.
        df_keyword['key'] = 0
        df_keyword_triple = df_keyword.merge(df_keyword, how='outer', on="key").merge(df_keyword, how='outer', on="key")
        is_increasing = (
            (df_keyword_triple.keyword_x < df_keyword_triple.keyword_y)
            & (df_keyword_triple.keyword_y < df_keyword_triple.keyword)
        )
        df_keyword_triple = df_keyword_triple[is_increasing].copy()
        df_keyword_triple["keyword"] = (
            df_keyword_triple["keyword_x"] + " - " + df_keyword_triple["keyword_y"] + " - " + df_keyword_triple["keyword"]
        )
        # Small keyword lists yield fewer triples than sample_size.
        n_samples = min(sample_size, df_keyword_triple.shape[0])
        df_keyword = df_keyword_triple["keyword"].sample(n_samples, random_state=0).reset_index()

        print("generating keyword embeddings for the first time")
        model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')
        keyword_texts = df_keyword["keyword"].tolist()
        embedding_list = []
        n = 10
        step = int(len(keyword_texts) / n)
        print("total: %i, steps: %i" % (len(keyword_texts), n))
        for i in range(n):
            print(i * step, "-", (i + 1) * step - 1)
            batch = keyword_texts[i * step:(i + 1) * step]
            if batch:
                # extend() works whether encode() returns a list or a 2-D array.
                embedding_list.extend(model.encode(batch))
        print(n * step, "-", "end")
        remainder = keyword_texts[n * step:]
        if remainder:
            embedding_list.extend(model.encode(remainder))
        df_keyword["embedding"] = embedding_list
        df_keyword[["keyword", "embedding"]].to_pickle(keyword_embedding_file_path)
    except Exception as err:
        # Keyword embeddings are optional: report why and carry on without them.
        print("cannot import keyword embedding")
        print("reason: %r" % (err,))
        return pd.DataFrame()

    return df_keyword

In [222]:
# get text embeddings

def add_text_embeddings(df, data_folder_path):
    """Append one ``text_embedding`` row per uid to the long-format frame.

    Embeddings are read from ``data_text_embedding.pkl`` in
    ``data_folder_path`` when that cache can be loaded. Otherwise every
    ``feature == "text"`` value is encoded with the
    ``bert-base-nli-stsb-mean-tokens`` model and the cache is written.

    Args:
        df: long-format frame with ``uid``, ``feature``, ``value`` columns.
        data_folder_path: project data folder holding the embedding cache.

    Returns:
        ``df`` plus ``text_embedding`` rows, limited to uids present in
        ``df`` and de-duplicated on ``uid``.
    """
    text_embedding_file_path = os.path.join(data_folder_path, "data_text_embedding.pkl")
    # Copy so that adding the embedding column never writes into a slice of df.
    df_text = df[df.feature == 'text'][["uid", 'value']].copy()

    try:
        # NOTE: unpickling executes arbitrary code; only load trusted cache files.
        df_text_embedding = pd.read_pickle(text_embedding_file_path)
        print("text embedding loaded from file.")
        df_text_embedding.columns = ["uid", "value"]
        df_text_embedding["feature"] = "text_embedding"

    except Exception:
        print("generating embeddings for the first time")
        model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')
        texts = df_text["value"].tolist()
        embedding_list = []
        n = 10
        step = int(len(texts) / n)
        print("total: %i, steps: %i" % (len(texts), n))
        for i in range(n):
            print(i * step, "-", (i + 1) * step - 1)
            # Positional slicing: the index of df_text is not guaranteed to be 0..N-1.
            batch = texts[i * step:(i + 1) * step]
            if batch:
                # extend() works whether encode() returns a list or a 2-D array.
                embedding_list.extend(model.encode(batch))
        print(n * step, "-", "end")
        remainder = texts[n * step:]
        if remainder:
            embedding_list.extend(model.encode(remainder))
        df_text["embedding"] = embedding_list

        df_text_embedding = df_text[["uid", "embedding"]].copy()

        # Cache with the original column names, then rename for the long format.
        df_text_embedding.to_pickle(text_embedding_file_path)
        df_text_embedding.columns = ["uid", "value"]
        df_text_embedding["feature"] = "text_embedding"

    df_text_embedding = df_text_embedding[df_text_embedding.uid.isin(df.uid)]
    df_text_embedding = df_text_embedding.drop_duplicates(['uid'])
    return pd.concat([df, df_text_embedding])

In [223]:
# do pca transformation
def _load_or_fit_pca(model_file, n_components, embedding_matrix, label):
    """Return the PCA model stored at ``model_file``, fitting and storing it if absent."""
    if os.path.isfile(model_file):
        pca_model = joblib.load(model_file)
        print("Model for PCA %s is loaded from file." % label)
    else:
        pca_model = PCA(n_components=n_components, whiten=True, random_state=0)
        pca_model.fit(embedding_matrix)
        joblib.dump(pca_model, model_file)
        print("Model for PCA %s is stored." % label)
    return pca_model


def pca_fit_transform(df, data_folder_path):
    """Project text embeddings with PCA and append the coordinates as features.

    A 2-component and a 1-component whitened PCA are loaded from
    ``model_pca_2d.pkl`` / ``model_pca_1d.pkl`` in ``data_folder_path`` or,
    when a file is missing, fitted on the current embeddings and stored there.

    Args:
        df: long-format frame containing ``feature == "text_embedding"`` rows.
        data_folder_path: project data folder holding the PCA model files.

    Returns:
        ``df`` plus, per embedded uid, the rows ``x_1`` and ``y_1`` (first and
        second 2D component) and ``y_2`` (the single 1D component).
    """
    pca_2d_file = os.path.join(data_folder_path, "model_pca_2d.pkl")
    pca_1d_file = os.path.join(data_folder_path, "model_pca_1d.pkl")

    df_embedding = df[df.feature == "text_embedding"].copy()
    # Stack the per-row vectors once; both models use the same matrix.
    embedding_matrix = np.asarray([x for x in df_embedding["value"]])

    pca_2d = _load_or_fit_pca(pca_2d_file, 2, embedding_matrix, "2D")
    pca_2d_vectors = pca_2d.transform(embedding_matrix)

    pca_1d = _load_or_fit_pca(pca_1d_file, 1, embedding_matrix, "1D")
    pca_1d_vectors = pca_1d.transform(embedding_matrix)

    df_pca_2d_x = df_embedding[["uid"]].copy()
    df_pca_2d_x["feature"] = "x_1"
    df_pca_2d_x["value"] = pca_2d_vectors[:, 0]

    df_pca_2d_y = df_embedding[["uid"]].copy()
    df_pca_2d_y["feature"] = "y_1"
    df_pca_2d_y["value"] = pca_2d_vectors[:, 1]

    df_pca_1d = df_embedding[["uid"]].copy()
    df_pca_1d["feature"] = "y_2"
    df_pca_1d["value"] = pca_1d_vectors[:, 0]

    return pd.concat([df, df_pca_2d_x, df_pca_2d_y, df_pca_1d])


In [224]:
# do clustering
def calcualte_clusters(df, num_cluster):
    """Cluster the text embeddings with k-means and append ``cluster_1`` rows.

    Args:
        df: long-format frame containing ``feature == "text_embedding"`` rows.
        num_cluster: number of k-means clusters.

    Returns:
        ``df`` plus one ``cluster_1`` row per embedded uid holding its label.
    """
    embedding_rows = df[df.feature == "text_embedding"].copy()
    vectors = np.asarray(list(embedding_rows["value"]))

    model = KMeans(n_clusters=num_cluster, random_state=0)
    model.fit(vectors)

    df_cluster = embedding_rows[["uid"]].assign(feature="cluster_1", value=model.labels_)
    return pd.concat([df, df_cluster])


In [225]:
# train a classifier that reproduces the cluster labels
def train_classifier(df, data_folder_path=None):
    """Predict a ``cluster_2`` label per uid with a logistic-regression model.

    The model is loaded from ``model_lr.pkl`` in ``data_folder_path`` when the
    file exists. Otherwise it is trained on the text embeddings against the
    ``cluster_1`` labels and stored there.

    Args:
        df: long-format frame containing ``text_embedding`` rows (and
            ``cluster_1`` rows when the model still has to be trained).
        data_folder_path: project data folder holding ``model_lr.pkl``. When
            omitted, the notebook-level global of the same name is used, which
            keeps existing ``train_classifier(df)`` calls working.

    Returns:
        ``df`` plus one ``cluster_2`` row per embedded uid.

    Raises:
        NameError: if no folder is passed and no global one is defined.
    """
    if data_folder_path is None:
        try:
            data_folder_path = globals()["data_folder_path"]
        except KeyError:
            raise NameError("name 'data_folder_path' is not defined")

    classifier_lr_file = os.path.join(data_folder_path, "model_lr.pkl")

    df_embedding = df[df.feature == "text_embedding"].copy()
    embedding_matrix = np.asarray([x for x in df_embedding["value"]])

    if os.path.isfile(classifier_lr_file):
        classifier_lr = joblib.load(classifier_lr_file)
        print("Model for Logistic Regression is loaded from file.")
    else:
        # NOTE(review): labels are paired with embeddings by row order only;
        # this relies on cluster_1 rows being in the same uid order.
        df_label = df[df.feature == "cluster_1"].copy()
        classifier_lr = LogisticRegression(
            random_state=0,
            multi_class='multinomial',
            max_iter=1000
            )
        classifier_lr.fit(embedding_matrix, np.asarray([x for x in df_label["value"]]))
        joblib.dump(classifier_lr, classifier_lr_file)
        print("Model for Logistic Regression is stored.")

    df_class = df_embedding[["uid"]].copy()
    df_class["feature"] = "cluster_2"
    df_class["value"] = classifier_lr.predict(embedding_matrix)

    return pd.concat([df, df_class])


In [226]:
# do tsne transformation

In [227]:
# do trend transformation

In [228]:
# do classification

In [229]:
# extract extra from_ features

In [230]:
def add_word_feature(df, top_n):
    """Add ``from_word_<w>`` flags for the ``top_n`` highest tf-idf title words.

    Titles are tokenised, lower-cased, stripped of tokens of four characters
    or fewer and of English stop words, and lemmatised with WordNet. The words
    are ranked by mean tf-idf weight over all titles; for each of the best
    ``top_n`` words a row with value ``1`` is added for every uid whose
    cleaned title contains it.

    Args:
        df: long-format frame with ``feature == "title"`` rows.
        top_n: number of words to turn into features.

    Returns:
        ``df`` plus the ``from_word_*`` rows.
    """
    # Only the rule-based tokenizer is needed, so no trained model is loaded.
    from spacy.lang.en import English

    en_stop = set(nltk.corpus.stopwords.words('english'))
    parser = English()

    def tokenize(text):
        lda_tokens = []
        for token in parser(text):
            if token.orth_.isspace():
                continue
            elif token.like_url:
                lda_tokens.append('URL')
            elif token.orth_.startswith('@'):
                lda_tokens.append('SCREEN_NAME')
            else:
                lda_tokens.append(token.lower_)
        return lda_tokens

    def get_lemma(word):
        lemma = wn.morphy(word)
        return word if lemma is None else lemma

    def prepare_text_for_lda(text):
        tokens = [token for token in tokenize(text) if len(token) > 4]
        tokens = [token for token in tokens if token not in en_stop]
        return ' '.join(get_lemma(token) for token in tokens)

    df_title = df[df.feature == "title"].copy()
    if df_title.empty:
        # Nothing to vectorise; TfidfVectorizer would raise on an empty corpus.
        return pd.concat([df])
    df_title["value"] = df_title["value"].fillna("")
    df_title["clean_text"] = df_title["value"].apply(prepare_text_for_lda)

    vectorizer = TfidfVectorizer(min_df=.02, max_df=.8)
    tfidf = vectorizer.fit(df_title.clean_text)
    tfidf_matrix = tfidf.transform(df_title.clean_text)
    # vocabulary_ maps term -> column index; order the terms by that index so
    # each word is paired with its own column of the tf-idf matrix.
    vocab_in_column_order = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
    df_tfidf = pd.DataFrame({
        "vocab": vocab_in_column_order,
        "significance_sum": tfidf_matrix.mean(axis=0).tolist()[0]
    })
    df_tfidf = df_tfidf.set_index("vocab")

    list_df_tmp = []
    for w in df_tfidf.sort_values(by="significance_sum", ascending=False).index[:top_n]:
        df_tmp = df_title[df_title.clean_text.str.contains(w, regex=False)]
        df_tmp = df_tmp[["uid"]].copy()
        df_tmp["feature"] = "from_word_%s" % w
        df_tmp["value"] = 1
        list_df_tmp.append(df_tmp)
    return pd.concat([df] + list_df_tmp)

In [231]:
def add_x_trend(df, random_state=None):
    """Append ``x_2`` rows: the publication year plus a small random jitter.

    Each ``pubyear`` value is shifted by a uniform draw from [-0.4, 0.4).

    Args:
        df: long-format frame with numeric ``feature == "pubyear"`` values.
        random_state: optional integer seed. When given, the jitter is
            reproducible across runs; when ``None`` the global NumPy random
            state is used.

    Returns:
        ``df`` plus one ``x_2`` row per ``pubyear`` row.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    df_x_trend = df[df.feature == "pubyear"].copy()
    jitter = rng.uniform(low=-.4, high=.4, size=(df_x_trend.shape[0],))
    df_x_trend["value"] = df_x_trend["value"] + jitter
    df_x_trend["feature"] = "x_2"
    return pd.concat([df, df_x_trend])


In [246]:
def add_from_feature(df, feature_type, feature_name, top_n=10, do_split=False):
    """Turn the most frequent values of a feature into ``from_*`` flag rows.

    Values of ``feature_name`` are lower-cased and, with ``do_split``, split
    on the first of ``|``, ``;`` or ``,`` that occurs in more than 10 rows.
    For each of the ``top_n`` most frequent values longer than one character,
    a row ``feature = "from_<feature_type>_<value>"`` (spaces replaced by
    ``-``) with value ``1`` is added for every uid that has that value.

    Args:
        df: long-format frame with ``uid``, ``feature``, ``value`` columns.
        feature_type: label used in the generated feature name.
        feature_name: name of the existing feature to read values from.
        top_n: number of most frequent values to keep.
        do_split: whether a cell may hold several delimited values.

    Returns:
        ``df`` plus the generated rows; unchanged content when
        ``feature_name`` is absent.
    """
    df_feature = df[df.feature == feature_name].copy()
    # astype(str) keeps non-string cells (e.g. numbers) usable below.
    df_feature["value"] = df_feature["value"].fillna("").astype(str).str.lower()

    df_feature["value_list"] = df_feature["value"].apply(lambda x: [x])

    if do_split:
        # Delimiters in priority order; matched literally, not as regex.
        for delimiter in ("|", ";", ","):
            if df_feature["value"].str.contains(delimiter, regex=False).sum() > 10:
                df_feature["value_list"] = df_feature["value"].str.split(delimiter)
                break

    # De-duplicate within a cell so a uid is counted once per value.
    df_feature["value_list"] = df_feature["value_list"].apply(lambda x: list(set([i.strip() for i in x])))
    df_feature = df_feature.explode("value_list").reset_index(drop=True).copy()
    df_feature = df_feature[["uid", "feature", "value_list"]]
    df_feature.columns = ["uid", "feature", "value"]

    list_top_features = [f for f in df_feature.value.value_counts().index if len(f) > 1][:top_n]
    list_df = []
    for f in list_top_features:
        df_tmp = df_feature[df_feature.value == f].copy()
        df_tmp["feature"] = "from_" + feature_type + "_" + f.replace(" ", "-")
        df_tmp["value"] = 1
        list_df.append(df_tmp)
    return pd.concat([df] + list_df)

In [240]:
def fix_column_names(df):
    """Rename the ``year`` feature to ``pubyear`` when no ``pubyear`` exists.

    The frame is updated in place and also returned.
    """
    is_year = df.feature == "year"
    has_pubyear = (df.feature == "pubyear").any()
    if is_year.any() and not has_pubyear:
        df.loc[is_year, "feature"] = "pubyear"
    return df

In [241]:
def get_data_json(df):
    """Pivot the long-format frame into the wide table exported to the front end.

    The first row per ``(uid, feature)`` is kept and features become columns.
    The result holds a fixed set of base columns followed by every column
    whose name contains ``from_``; base columns that are missing are created
    as empty strings. Missing ``cluster_1`` values become ``-1``, missing
    ``from_*`` flags become ``0``, rows are filtered through ``fix_data`` and
    every remaining missing value becomes ``''``.
    """
    base_columns = [
        "uid",
        "pubyear",
        "source_title",
        "doi",
        "title",
        "dimensions_url",
        "x_1",
        "y_1",
        "x_2",
        "y_2",
        "cluster_1",
        "cluster_2",
    ]

    first_per_key = df.drop_duplicates(subset=["uid", "feature"], keep="first")
    df_tabular_final = first_per_key.reset_index().pivot(index='uid', columns='feature')['value'].reset_index()

    derived_columns = [c for c in df_tabular_final.columns if 'from_' in c]
    list_selecte_cols = base_columns + derived_columns

    # Only base columns can be absent; derived ones were read from the frame.
    absent_columns = [c for c in list_selecte_cols if c not in df_tabular_final.columns]
    for c in absent_columns:
        df_tabular_final[c] = ""

    df_data_json = df_tabular_final[list_selecte_cols].copy()

    df_data_json["cluster_1"] = df_data_json["cluster_1"].fillna(-1)
    flag_columns = [c for c in df_data_json.columns if c.startswith("from_")]
    for c in flag_columns:
        df_data_json[c] = df_data_json[c].fillna(0)

    df_data_json = fix_data(df_data_json)

    for c in df_data_json.columns:
        df_data_json[c] = df_data_json[c].fillna('')

    return df_data_json

In [242]:
def get_meta_data_json(df_data_json, project_number, project_name, cluster_names):
    """Build the ``meta-data`` dictionary that accompanies the exported table.

    Args:
        df_data_json: wide table with ``uid``, ``cluster_<k>`` and
            ``from_<type>_<value>`` columns.
        project_number: numeric project id.
        project_name: display name of the project.
        cluster_names: display names for the clusters; when empty, names of
            the form ``"Topic <label>"`` are generated from ``cluster_1``.

    Returns:
        dict with the project info and record count, plus:
        ``cluster_list`` and one ``list_cluster_<k>`` entry per cluster column
        (labels other than ``-1``), ``cluster_names``, ``feature_list`` and,
        per feature type, ``list_<type>`` (column names sorted by descending
        count) and ``listname_<type>`` (``"<column> (<count>)"`` labels).
    """
    columns = list(df_data_json.columns)
    df_meta_data_json = {
        "project_number": project_number,
        "project_name": project_name,
        "type": "rixai-network-complex",
        "num-records": df_data_json["uid"].nunique(),
    }

    # Cluster columns; sorted so the output does not depend on set ordering.
    df_meta_data_json["cluster_list"] = sorted({c.split("_")[1] for c in columns if c.startswith("cluster_")})
    for f in df_meta_data_json["cluster_list"]:
        df_meta_data_json["list_cluster_%s" % f] = [c for c in df_data_json["cluster_%s" % f].unique() if c != -1]

    if len(cluster_names) > 0:
        df_meta_data_json["cluster_names"] = cluster_names
    else:
        df_meta_data_json["cluster_names"] = ["Topic %i" % c for c in df_meta_data_json.get("list_cluster_1", [])]

    # Feature types derived from the from_<type>_<value> columns.
    df_meta_data_json["feature_list"] = sorted({c.split("_")[1] for c in columns if c.startswith("from_")})
    for f in df_meta_data_json["feature_list"]:
        # The trailing "_" stops type "org" from also matching "organization".
        prefix = "from_%s_" % f
        list_tuple = [(e, df_data_json[e].sum()) for e in columns if e.startswith(prefix)]
        list_tuple.sort(key=lambda tup: tup[1], reverse=True)
        df_meta_data_json["list_%s" % f] = [t[0] for t in list_tuple]
        df_meta_data_json["listname_%s" % f] = ["%s (%i)" % (t[0], t[1]) for t in list_tuple]

    return df_meta_data_json

In [243]:
def export_json_for_js(df_data_json, df_meta_data_json, data_folder_path, download_file=True):
    """Write the table and its meta data to ``data_for_js_p<N>.json``.

    Args:
        df_data_json: wide table; exported as a list of row records.
        df_meta_data_json: meta-data dict; must hold ``"project_number"``,
            which determines the file name.
        data_folder_path: folder the JSON file is written to.
        download_file: when true, also trigger a Colab browser download.

    Returns:
        The exported dict with keys ``"meta-data"`` and ``"data"``.

    Raises:
        TypeError: if a value cannot be converted to a JSON type.
    """
    json_file_name = os.path.join(data_folder_path, "data_for_js_p%i.json" % df_meta_data_json["project_number"])
    print(json_file_name)
    full_json = {
        "meta-data": df_meta_data_json,
        "data": df_data_json.to_dict(orient='records')
    }

    def convert(o):
        # json only knows built-in types; map NumPy scalars/arrays onto them.
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError("Object of type %s is not JSON serializable" % type(o).__name__)

    with open(json_file_name, 'w') as json_file:
        json.dump(full_json, json_file, default=convert)

    if download_file:
        files.download(json_file_name)
    return full_json

In [244]:
def export_json_config(json_config, data_folder_path):
    """Persist the project configuration as ``config.json`` in ``data_folder_path``."""
    config_path = os.path.join(data_folder_path, "config.json")
    with open(config_path, 'w') as config_handle:
        json.dump(json_config, config_handle)

In [248]:
# Number of most frequent values kept per derived "from_*" feature family.
TOP_N = 20
for project_number in range(4):
    print("\n")
    print("Processing Project: %i" % project_number)
    print("-" * 50)

    # Project folders are named project-000, project-001, ...
    data_folder_name = "project-%03d/data" % project_number
    data_folder_path = os.path.join(PROJECTS_PATH, data_folder_name)
    json_config_file = os.path.join(data_folder_path, "config.json")

    if os.path.isfile(json_config_file):
        # Context manager so the config file handle is closed right away.
        with open(json_config_file, 'r') as config_file:
            json_config = json.load(config_file)
    else:
        json_config = {
            "project_name": "Projcect Number %i" % project_number,
            "num_clusters": 5,
            "cluster_names": []
        }

    print(json_config)
    project_name = json_config.get("project_name", "Projcect Number %i" % project_number)
    num_clusters = json_config.get("num_clusters", 5)
    cluster_names = json_config.get("cluster_names", [])

    # Load and clean the raw table, then switch to long (uid, feature, value) format.
    df_tabular = read_data(data_folder_path)
    df_tabular = standardize_column_names(df_tabular)
    df_tabular = fix_data(df_tabular)

    df = melt_data(df_tabular)
    df = fix_column_names(df)

    dict_stat = get_statistics(df)
    df_keyword = get_keyword_embeddings(data_folder_path)

    # Text embeddings and everything derived from them.
    df = add_text_feature(df, df[df.feature=="source_title"], df[df.feature=="title"])
    df = add_text_embeddings(df, data_folder_path)

    df = pca_fit_transform(df, data_folder_path)
    df = calcualte_clusters(df, num_clusters)
    # train_classifier reads the loop variable data_folder_path as a global.
    df = train_classifier(df)
    df = add_x_trend(df)

    # Derived "from_*" flags; features missing from a data set add no rows.
    df = add_from_feature(df, feature_type = "org", feature_name = "research_organizations_standardized", top_n=TOP_N, do_split=True)
    df = add_from_feature(df, feature_type = "funding", feature_name = "funder", top_n=TOP_N, do_split=True)
    df = add_from_feature(df, feature_type = "author", feature_name = "authors", top_n=TOP_N, do_split=True)
    df = add_from_feature(df, feature_type = "country", feature_name = "country_of_research_organization", top_n=TOP_N, do_split=True)
    df = add_from_feature(df, feature_type = "org", feature_name = "affiliation", top_n=TOP_N, do_split=True)
    df = add_from_feature(df, feature_type = "keyword", feature_name = "author_keywords", top_n=TOP_N, do_split=True)
    df = add_from_feature(df, feature_type = "keyword", feature_name = "index_keywords", top_n=TOP_N, do_split=True)
    df = add_from_feature(df, feature_type = "department", feature_name = "groep", top_n=TOP_N)
    df = add_from_feature(df, feature_type = "source", feature_name = "source_title", top_n=TOP_N, do_split=False)
    df = add_word_feature(df, top_n=TOP_N)

    # Export the wide table plus meta data, and persist the configuration used.
    df_data_json = get_data_json(df)
    df_meta_data_json = get_meta_data_json(df_data_json, project_number, project_name, cluster_names)

    full_json = export_json_for_js(df_data_json, df_meta_data_json, data_folder_path, download_file=True)

    export_json_config(json_config, data_folder_path)

    print(dict_stat)
    print(list(df_data_json.columns))
    print(df_meta_data_json)



Processing Project: 0
--------------------------------------------------
{'project_name': 'Dutch Research Landscape of COVID19', 'num_clusters': 5, 'cluster_names': []}
780 rows with 33 columns is read from /content/drive/My Drive/Freelancing/rixai/projects/project-000/data
column names standardized
0 rows removed to fix data.
data is melted
keyword embedding loaded from file.
text embedding loaded from file.
Model for PCA 2D is loaded from file.
Model for PCA 1D is loaded from file.
Model for Logistic Regression is loaded from file.
0 rows removed to fix data.
/content/drive/My Drive/Freelancing/rixai/projects/project-000/data/data_for_js_p0.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'df_size': (25740, 3), 'df_num_uid': 780, 'df_num_features': 33}
['uid', 'pubyear', 'source_title', 'doi', 'title', 'dimensions_url', 'x_1', 'y_1', 'x_2', 'y_2', 'cluster_1', 'cluster_2', 'from_author_bosch,-berend-jan', 'from_author_bouaziz,-bassem', 'from_author_de-mast,-quirijn', 'from_author_drosten,-christian', 'from_author_epstein,-monique', 'from_author_gargouri,-faiez', 'from_author_haagmans,-bart', 'from_author_haagmans,-bart-l.', 'from_author_jmail,-mohamed', 'from_author_koopmans,-marion', 'from_author_li,-wentao', 'from_author_mcaleer,-michael', 'from_author_meijer,-adam', 'from_author_molenkamp,-richard', 'from_author_netea,-mihai-g.', 'from_author_sanderman,-robbert', 'from_author_snijder,-eric-j.', 'from_author_van-crevel,-reinout', 'from_author_van-de-veerdonk,-frank-l.', 'from_author_wallinga,-jacco', 'from_country_australia', 'from_country_austria', 'from_country_belgium', 'from_country_brazil', 'from_country_canada', 'from_country_china', 'from_country_denmark', 'fr

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'df_size': (260000, 3), 'df_num_uid': 5000, 'df_num_features': 52}
['uid', 'pubyear', 'source_title', 'doi', 'title', 'dimensions_url', 'x_1', 'y_1', 'x_2', 'y_2', 'cluster_1', 'cluster_2', 'from_author_bakermans-kranenburg,-marian-j.', 'from_author_bakker,-arnold-b.', 'from_author_bal,-roland', 'from_author_de-koster,-rené', 'from_author_dekker,-rommert', 'from_author_frantzeskaki,-niki', 'from_author_frasincar,-flavius', 'from_author_mcaleer,-michael', 'from_author_paas,-fred', 'from_author_prinzie,-peter', 'from_author_scholten,-peter', 'from_author_serruys,-patrick', 'from_author_tiemeier,-henning', 'from_author_van-der-linden,-dimitri', 'from_author_van-doorslaer,-eddy', 'from_author_van-exel,-job', 'from_author_van-gog,-tamara', 'from_author_van-ijzendoorn,-marinus-h.', 'from_author_verhulst,-frank-c.', 'from_author_zwart,-h.-a.-e.', 'from_country_australia', 'from_country_austria', 'from_country_belgium', 'from_country_canada', 'from_country_china', 'from_country_denmark', 'fro

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'df_size': (37530, 3), 'df_num_uid': 834, 'df_num_features': 45}
['uid', 'pubyear', 'source_title', 'doi', 'title', 'dimensions_url', 'x_1', 'y_1', 'x_2', 'y_2', 'cluster_1', 'cluster_2', 'from_author_de-kok-a.g.', 'from_author_de-kok-t.', 'from_author_de-koster-r.', 'from_author_de-koster-r.b.m.', 'from_author_de-schutter-b.', 'from_author_dekker-r.', 'from_author_geerlings-h.', 'from_author_lodewijks-g.', 'from_author_negenborn-r.r.', 'from_author_ottjes-j.a.', 'from_author_pang-y.', 'from_author_schott-d.l.', 'from_author_tavasszy-l.', 'from_author_tavasszy-l.a.', 'from_author_van-woensel-t.', 'from_author_veenstra-a.w.', 'from_author_vis-i.f.a.', 'from_author_xin-j.', 'from_author_zuidwijk-r.', 'from_author_zuidwijk-r.a.', 'from_keyword_algorithms', 'from_keyword_automated-container-terminals', 'from_keyword_belt-conveyors', 'from_keyword_computer-simulation', 'from_keyword_container-terminal', 'from_keyword_container-terminals', 'from_keyword_containers', 'from_keyword_costs', 'f

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'df_size': (46644, 3), 'df_num_uid': 1014, 'df_num_features': 46}
['uid', 'pubyear', 'source_title', 'doi', 'title', 'dimensions_url', 'x_1', 'y_1', 'x_2', 'y_2', 'cluster_1', 'cluster_2', 'from_author_bioch-j.c.', 'from_author_collins-j.', 'from_author_de-bruin-a.', 'from_author_de-jong-f.', 'from_author_dekker-r.', 'from_author_frasincar-f.', 'from_author_hogenboom-a.', 'from_author_hogenboom-f.', 'from_author_huisman-d.', 'from_author_kaymak-u.', 'from_author_ketter-w.', 'from_author_kroon-l.', 'from_author_kroon-l.g.', 'from_author_milea-v.', 'from_author_paas-f.', 'from_author_schmidt-m.', 'from_author_schouten-k.', 'from_author_tan-y.-h.', 'from_author_van-gog-t.', 'from_author_vandic-d.', 'from_keyword_algorithms', 'from_keyword_article', 'from_keyword_artificial-intelligence', 'from_keyword_aspect-based-sentiment-analysis', 'from_keyword_commerce', 'from_keyword_data-mining', 'from_keyword_decision-making', 'from_keyword_decision-support-systems', 'from_keyword_disruption-mana