## Project 4:

### Purpose

### Usage

Upload this notebook or DBC archive on databricks platform

---------------

##  Task1.  Extract 
#### a. Fetching The 7z archive

**Skip this Section if you already have performed the extraction process and jump to checkpoint for pulling data from split json files.**

In [0]:
# Checking if archive is downloaded in memory.
try:
    dbutils.fs.ls("file:/databricks/driver/dblp.v13.7z")
    print("Archive in filesystem (file:/databricks/driver/dblp.v13.7z)")
except:
    # If archive is not in memory, Checking databricks store for cached version and pulling into memory.
    try:
        dbutils.fs.ls("dbfs:/FileStore/data/dblp.v13.7z")
        print("Archive located in FileStore. Copying into local store..")
        dbutils.fs.cp("dbfs:/FileStore/data/dblp.v13.7z", "file:/databricks/driver/dblp.v13.7z")
        print("Completed")
    except:
        # If archive is not cached, downloading and storing in databricks store.
        print("7z archive not found. Fetching from URL...")
        !wget https://originalstatic.aminer.cn/misc/dblp.v13.7z
        print("7z archive Downloaded. Moving archive to FileStore..")
        dbutils.fs.mkdirs("dbfs:/FileStore/data")
        dbutils.fs.cp("file:/databricks/driver/dblp.v13.7z", "dbfs:/FileStore/data/dblp.v13.7z")
        print("Completed.")

In [0]:
# The returned array should have one object of FileInfo with size =2568255035

dbutils.fs.ls("file:/databricks/driver/dblp.v13.7z")

### b. Extracting Archive into json chunks

#### b1. Extracting 7zip file into json.

In [0]:
!pip install py7zr -q

In [0]:
import py7zr

archive = py7zr.SevenZipFile('dblp.v13.7z', mode='r')
archive.extractall()
archive.close()

In [0]:
dbutils.fs.ls("file:/databricks/driver/dblpv13.json")

#### b2. Cleaning NumberInt(#) tags

The json data contains non-confirming tags, and so cannot be parsed as it is. We will read each line and substitute the tag. (This should take about 25 minutes)

In [0]:
import re

# Cleaning the `NumberInt` tag
fin = open(f"dblpv13.json")
fout = open(f"dblpv13_clean.json", "wt")
for line in fin:
    fout.write(re.sub(r"NumberInt\([\d]*\)", lambda x: "".join(re.findall(r"\d", x.group(0))), line))
fin.close()
fout.close()

#### b3. Partitioning Dataset into JSON files
Since the whopping 16 GB of json data cannot be loaded into memory directly, we need to partition the data into smaller chunks (300k objects per chunk) for processing.  
We also parse data encoded as Decimal data with DecimalEncoder.

In [0]:
%mkdir data

In [0]:
import ijson
import json
import decimal

class DecimalEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, decimal.Decimal):
            return str(o)
        return super(DecimalEncoder, self).default(o)

data_dir = 'data/'
with open('dblpv13_clean.json', 'r') as f:
    counter, file_id = 0, 0
    file_buffer = []
    for obj_data in ijson.items(f, 'item'):
        file_buffer.append(obj_data)
        counter += 1
        if counter % 300000 == 0:
            print(f" Saving, data_PART_{file_id}.json in {data_dir}")
            f = open(f'{data_dir}data_PART_{file_id}.json', 'w')
            dump = json.dumps(file_buffer, cls=DecimalEncoder)
            f.write(dump)
            f.close()
            file_id += 1
            file_buffer = []
f = open(f'{data_dir}data_PART_{file_id}.json', 'w')
dump = json.dumps(file_buffer, cls=DecimalEncoder)
print(f" Saving, data_PART_{file_id}.json in {data_dir}")
f.write(dump)
f.close()
file_id += 1
file_buffer = []

#### b4. Moving files to dbfs FileStore from instance storage. Building Checkpoint.

In [0]:
# removing old json stored in filestore.
dbutils.fs.rm("dbfs:/FileStore/data/split_data/", recurse = True)
# Creating dir to store json in filestore..
dbutils.fs.mkdirs("dbfs:/FileStore/data/split_data")
# confirming dir is empty
dbutils.fs.ls("dbfs:/FileStore/data/split_data")

In [0]:
# Copying all json parts into filestore.
dbutils.fs.cp("file:/databricks/driver/data/", "dbfs:/FileStore/data/split_data", recurse = True)

## Task2. NLP Unsupervised Learning

#### Goal: Read data from databricks Filestore into dataframes (Checkpoint after data load)

In [0]:
dbutils.fs.ls("dbfs:/delta/tables/partial/")

Out[1]: [FileInfo(path='dbfs:/delta/tables/partial/Author/', name='Author/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/delta/tables/partial/FactTable/', name='FactTable/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/delta/tables/partial/FieldOfStudy/', name='FieldOfStudy/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/delta/tables/partial/Language/', name='Language/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/delta/tables/partial/Organization/', name='Organization/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/delta/tables/partial/Publication/', name='Publication/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/delta/tables/partial/Venue/', name='Venue/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/delta/tables/partial/graphCheckPoint/', name='graphCheckPoint/', size=0, modificationTime=0)]

In [0]:
import uuid
from functools import reduce
from pyspark.sql import Row
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType, IntegerType
from typing import List
from pyspark.sql.functions import udf

# Here Path indicates input file path, and delta_dir points to file
path = "dbfs:/FileStore/data/split_data/"
delta_dir = "dbfs:/delta/tables/"

# There should be 18 files each with 300 k records. This would change if you change split value.
file_count = len(dbutils.fs.ls(path))
assert file_count == 18, "Data not found. You may want to check the path or run the notebook from start again. If you updated the split value, ignore this assertion error"

In [0]:
# Build map of spark dataframes by reading json partition chunk files
# dataframes_map = map(lambda r: spark.read.option("inferSchema", True).json(r), [f"{path}data_PART_{num}.json" for num in range(file_count)])
# reduce the dataframes into single dataframe by performing union over the mapped frames.
# union = reduce(lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), dataframes_map)

# Reading first chunk for Testing
union = spark.read.option("inferSchema", True).json(f"{path}data_PART_0.json")

# jsonSchema = StructType([
# 		StructField("_id", StringType(), True),
# 		StructField("abstract", StringType(), True),
# 		StructField("authors", ArrayType(StructType([
# 			StructField("_id", StringType(), True),
# 			StructField("bio", StringType(), True),
# 			StructField("email", StringType(), True),
# 			StructField("gid", StringType(), True),
# 			StructField("name", StringType(), True),
# 			StructField("name_zh", StringType(), True),
# 			StructField("oid", StringType(), True),
# 			StructField("oid_zh", StringType(), True),
# 			StructField("orcid", StringType(), True),
# 			StructField("org", StringType(), True),
# 			StructField("org_zh", StringType(), True),
# 			StructField("orgid", StringType(), True),
# 			StructField("orgs", ArrayType(StringType(), True), True),
# 			StructField("orgs_zh", ArrayType(StringType(), True), True),
# 			StructField("sid", StringType(), True)
# 		]), True), True),
# 		StructField("doi", StringType(), True),
# 		StructField("fos", ArrayType(StringType(), True), True),
# 		StructField("isbn", StringType(), True),
# 		StructField("issn", StringType(), True),
# 		StructField("issue", StringType(), True),
# 		StructField("keywords", ArrayType(StringType(), True), True),
# 		StructField("lang", StringType(), True),
# 		StructField("n_citation", LongType(), True),
# 		StructField("page_end", StringType(), True),
# 		StructField("page_start", StringType(), True),
# 		StructField("pdf", StringType(), True),
# 		StructField("references", ArrayType(StringType(), True), True),
# 		StructField("title", StringType(), True),
# 		StructField("url", ArrayType(StringType(), True), True),
# 		StructField("venue", StructType([
# 			StructField("_id", StringType(), True),
# 			StructField("issn", StringType(), True),
# 			StructField("name", StringType(), True),
# 			StructField("name_d", StringType(), True),
# 			StructField("name_s", StringType(), True),
# 			StructField("online_issn", StringType(), True),
# 			StructField("publisher", StringType(), True),
# 			StructField("raw", StringType(), True),
# 			StructField("raw_zh", StringType(), True),
# 			StructField("sid", StringType(), True),
# 			StructField("src", StringType(), True),
# 			StructField("t", StringType(), True),
# 			StructField("type", LongType(), True)
# 		]), True),
# 		StructField("volume", StringType(), True),
# 		StructField("year", LongType(), True)
# 	])


# union = spark.readStream.schema(jsonSchema).option("maxFilesPerTrigger", 1).json(path)

union = union.na.drop(subset=["authors"])
union = union.dropDuplicates(["_id"])
union = union.filter(union.lang == 'en')
union = union.na.drop(subset=["title", "abstract"])
union.printSchema()

root
 |-- _id: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- sid: string (nullable = true)
 |-- doi: string (nulla

In [0]:
# union = spark.read.format('delta').load('dbfs:/delta/tables/partial/Publication/')
# union = union.dropDuplicates(["_id"])
union.show()

+--------------------+--------------------+--------------------+--------------------+----+-------------+--------------------+-----+--------------------+----+----------+------------+------------+--------------------+----------+-------------+--------------------+--------------------+-------------+----+
|                 _id|            abstract|             authors|                 doi| fos|         isbn|                issn|issue|            keywords|lang|n_citation|    page_end|  page_start|                 pdf|references|        title|                 url|               venue|       volume|year|
+--------------------+--------------------+--------------------+--------------------+----+-------------+--------------------+-----+--------------------+----+----------+------------+------------+--------------------+----------+-------------+--------------------+--------------------+-------------+----+
|53e99784b7602d970...|360° represents t...|[{53f46946dabfaec...|10.1145/1665137.1...|null|    

#### Cleaning bad records (empty author lists, small titles)

In [0]:
# Deleting entries with small Titles (less than 3 words) and empty author list
text_size_ = udf(lambda s: len(s.split()), IntegerType())
arr_len_ = udf(lambda s: len(s), IntegerType())
union = union.filter(text_size_(F.col("Title")) > 3)

In [0]:
# TODO; exploratory data analysis

### Data preprocessing

##### https://towardsdatascience.com/nlp-text-preprocessing-a-practical-guide-and-template-d80874676e79

In [0]:
!pip install spacy unidecode contractions nltk tqdm -q
!spacy download en_core_web_md

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m
Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
[?25l[K     |                                | 10 kB 21.5 MB/s eta 0:00:02[K     |                                | 20 kB 20.8 MB/s eta 0:00:02[K     |                                | 30 kB 10.5 MB/s eta 0:00:04[K     |                                | 40 kB 6.4 MB/s eta 0:00:06[K     |                                | 51 kB 7.3 MB/s eta 0:00:05[K     |                                | 61 kB 6.8 MB/s eta 0:00:05[K     |                                | 71 kB 7.8 MB/s eta 0:00:05[K     |                                | 81 kB 7.3 MB/s eta 0:00:05[K     |                                | 92 kB 6.3 MB/s eta 0:00:06[K     |                                | 102 kB 7.0 MB/s eta 0:00:

In [0]:
custom_stopwords = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure','rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 'CZI', 'www', 'i.e.', 'copyrights', 'peers', 'authors', 's', 'index', 'table', 'present', 'paper', 'papers']

In [0]:
import re
import spacy
import nltk
from tqdm import tqdm
import unidecode
import contractions
from nltk.stem import WordNetLemmatizer
from pyspark.ml.feature import Word2Vec
from pyspark.ml.clustering import KMeans

en = spacy.load('en_core_web_md')
sw_spacy = set(en.Defaults.stop_words)

nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

sw_spacy = list(sw_spacy) + custom_stopwords

numIterations = 200

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [0]:
def expand_contractions(text):
    """expand shortened words, e.g. don't to do not"""
    text = contractions.fix(text)
    return text

def remove_accented_chars(text):
    """remove accented characters from text, e.g. café"""
    text = unidecode.unidecode(text)
    return text

def preprocess_udf():
    def preprocess(title, abstract):
        tokens = []
        # cleaning symbols
        if title:
            title = re.sub(r"[^A-Za-z]+", " ", title)
            title = remove_accented_chars(title)
            title = expand_contractions(title)
            tokens = [lemmatizer.lemmatize(word.lower()) for word in title.split() if word.lower() not in sw_spacy]
        if abstract:
            # cleaning symbols
            abstract = re.sub(r"[^A-Za-z ]+", "", abstract)
            # removing accented characters (required?)
            abstract = remove_accented_chars(abstract)
            abstract = expand_contractions(abstract)
            tokens += [lemmatizer.lemmatize(word.lower()) for word in abstract.split() if word.lower() not in sw_spacy]
        return tokens
    return udf(preprocess, ArrayType(StringType()))

In [0]:
union = union.withColumn("tokens", preprocess_udf()("abstract", "title"))

documents = union.select("_id", "tokens").filter(arr_len_(F.col("tokens")) > 0)

In [0]:
display(documents)

_id,tokens
53e99792b7602d9701f5b114,"List(positive, feedback, pilot, system, second, loop, control, feedforward, power, amplifier)"
53e99792b7602d9701f5b191,"List(human, sensibility, ergonomics, hse, apprehend, human, sensitivity, feature, measuring, human, sens, developing, table, related, psychology, physiology, main, purpose, hse, developing, human, centered, good, environment, relevant, technology, improved, life, quality, order, achieve, goal, test, bed, simulator, useful, tool, controlling, monitoring, physical, environment, deal, requirement, design, concept, specification, computing, environment, hse, hse, technology, development, program, sponsored, korean, ministry, science, technology, integrated, computing, system, composed, real, time, non, real, time, environment, non, real, time, development, environment, comprises, pc, window, nt, graphical, user, interface, coded, microsoft, visual, c, pc, independently, control, monitor, thermal, light, audio, video, environment, software, database, developed, non, real, time, environment, directly, ported, real, time, environment, local, area, network, real, time, computing, system, based, cpci, bus, control, integrated, hse, environment, collect, necessary, information, cpci, computing, system, composed, pentium, cpu, board, dedicated, o, board, quantity, determined, expandability, considered, integrated, computing, environment, hse, simulator, guarantee, real, time, capability, stability, expandability, hardware, maximize, portability, compatibility, maintainability, software, requirement, analysis, design, computational, environment, hse, humansensibility, ergonomics, simulator)"
53e99792b7602d9701f5b2d3,"List(present, novel, ia, instrumentation, amplifier, design, implantable, biomedical, device, system, db, cmrr, common, mode, rejection, ratio, proposed, ia, composed, stage, including, preamplifier, nd, order, bpf, band, pas, filer, dc, level, shifter, output, buffer, stage, low, noise, gm, c, amplifier, preamplifier, stage, reduce, coupled, thermal, noise, overwhelm, weak, neural, signal, bpf, designed, based, ota, operational, transconductance, amplifier, dual, current, switch, aiming, low, power, low, noise, demand, source, follower, employed, carry, dc, level, shifter, output, buffer, provides, output, signal, adequate, drive, following, stage, usually, adc, analog, digital, converter, detailed, analysis, proposed, circuitry, derived, solidify, proposed, architecture, proposed, design, implemented, tsmc, mum, p, m, cmos, process, result, post, layout, simulation, verify, performance, design, cmrr, better, db, important, input, noise, rms, merely, db, pvt, process, supply, voltage, temperature, corner, db, cmrr, lownoise, instrumentation, amplifier, neural, signal, sensing)"
53e99792b7602d9701f5b391,"List(prototyping, development, environment, graphical, interactive, telematic, application)"
53e99792b7602d9701f5b3e5,"List(augmented, reality, technology, overlay, virtual, image, information, generated, computer, real, scene, technology, combine, virtual, object, real, world, put, forward, registration, method, multi, marker, augmented, system, paleontology, magic, book, designed, realized, book, special, virtual, education, interactive, real, time, end, result, showed, study, prosperous, future, augmented, reality, applying, education, seen, method, multiplemarker, register, application, virtual, education)"
53e99792b7602d9701f5b3f3,"List(abstract, industrial, research, project, demonstrated, feasibility, applying, category, theoretic, method, specification, synthesis, maintenance, industrial, strength, software, system, demonstration, kind, tool, purpose, kestrel, specware, software, development, system, describe, experience, discus, broadening, application, category, theoretic, method, industry, technology, promising, need, additional, development, generally, usable, surprising, given, mathematical, foundation, hand, believe, demonstration, turning, point, use, mathematically, rigorous, approach, industrial, software, development, maintenance, demonstrated, capture, mathematical, method, software, engineering, design, rationale, product, design, manufacturing, process, rationale, different, engineering, discipline, production, usable, software, directly, captured, rationale, feel, evolution, tool, technology, formal, system, engineering, reality, invited, talk, applying, category, theory, derive, engineering, software, encoded, knowledge)"
53e99796b7602d9701f5be93,"List(address, problem, finding, set, contour, curve, image, consider, problem, perceptual, grouping, contour, completion, data, set, point, image, new, method, find, complete, curve, set, contour, edge, point, presented, approach, based, previous, work, finding, contour, minimal, path, end, point, fast, marching, algorithm, l, d, cohen, r, kimmel, international, journal, computer, vision, vol, pp, given, set, key, point, find, pair, point, linked, path, join, use, saddle, point, minimal, action, map, path, obtained, backpropagation, saddle, point, point, pair, second, propose, scheme, need, key, point, initialization, set, key, point, automatically, selected, larger, set, admissible, point, time, saddle, point, pair, key, point, extracted, path, drawn, image, minimal, path, selected, pair, point, set, minimal, path, completes, initial, set, contour, allows, close, illustrate, capability, approach, close, contour, example, image, set, edge, point, shape, missing, contour, multiple, contour, finding, perceptual, grouping, minimal, path)"
53e99796b7602d9701f5c157,"List(describes, new, core, multi, resolution, data, structure, real, time, visualization, interactive, editing, externally, efficient, processing, large, point, cloud, describe, editing, system, make, use, novel, data, structure, provide, interactive, editing, preprocessing, tool, large, scanner, data, set, new, data, structure, provide, complete, tool, chain, d, scanner, data, processing, data, preprocessing, filtering, manual, touch, real, time, visualization, particular, describe, core, outlier, removal, bilateral, geometry, filtering, algorithm, toolset, interactive, selection, painting, transformation, filtering, huge, core, point, cloud, data, set, real, time, rendering, algorithm, use, data, structure, storage, backend, interactive, tool, work, real, time, small, model, modification, large, scale, editing, operation, employ, resolution, approach, editing, planned, real, time, executed, externally, efficient, offline, computation, evaluate, implementation, example, data, set, size, gb, demonstrating, proposed, technique, effectively, real, world, application, special, section, pointbased, graphic, processing, interactive, editing, huge, point, cloud, d, scanner)"
53e99796b7602d9701f5c193,"List(power, distribution, network, include, distributed, generation, need, new, control, strategy, based, distributed, hierarchical, structure, system, replicator, dynamic, strategy, dynamic, resource, allocation, dispatch, distributed, generator, microgrid, presented, approach, us, characteristic, defined, subsystem, order, offer, simple, algorithm, optimal, feasible, solution, case, method, satisfy, problem, constraint, compare, performance, replicator, dynamic, strategy, analyze, optimality, obtained, solution, market, multiagent, based, scheme, adapted, result, implemented, simulation, model, different, scenario, applicability, proposed, strategy, population, dynamic, approach, dispatch, distributed, generator)"
53e99796b7602d9701f5c1a2,"List(existing, medical, vocabulary, lack, rich, term, describe, finding, generated, modem, molecular, diagnostic, procedure, bioinformatics, resource, designed, primarily, support, need, research, community, describe, development, curated, resource, clinical, bioinformatics, ontology, cbo, semantic, network, appropriate, describing, clinically, significant, genomics, concept, cbo, includes, concept, appropriate, molecular, diagnostics, cytogenetics, standardized, methodology, based, consistent, application, refseq, information, applied, curation, cbo, order, provide, reproducible, reliable, tool, challenge, related, curation, process, discussed, time, submission, cbo, included, concept, associated, relationship, clinical, bioinformatics, ontology, curated, semantic, network, utilizing, refseq, information)"


In [0]:
# word2vec = Word2Vec()
# model = word2vec.fit(documents.rdd.map(lambda x: x.tokens).collect())

word2Vec = Word2Vec(inputCol="tokens", outputCol="features").fit(documents)
documents = word2Vec.transform(documents)

In [0]:
display(documents)

_id,tokens,features
53e99792b7602d9701f5b114,"List(positive, feedback, pilot, system, second, loop, control, feedforward, power, amplifier)","Map(vectorType -> dense, length -> 100, values -> List(0.0010927247814834118, -0.048946963064372545, 0.25271623916924, -0.10146973873488606, 0.0929327979683876, 0.0620644030161202, 0.18247236199676992, -0.008089794171974063, 0.011958000995218755, -3.759678453207016E-4, -0.07504244782030584, -0.1550638496875763, -0.05285673774778843, 0.07350046783685685, 0.15567383375018837, 0.22132079489529133, -0.19899859167635442, -0.12370968582108617, 0.032582087069749834, 0.05979675310663879, 0.14004724696278573, -0.17189322207123042, -0.009475089702755214, -0.005496566742658616, -0.09328386827837676, -0.00999482460319996, -0.039211823139339685, 0.026010511815547945, 0.049516450613737106, 0.0878322945907712, -0.11589562557637692, -0.16261781491339208, -0.07318642595782876, 0.14487319104373456, -0.07714137155562639, 0.05137619879096747, -0.051670830836519605, -0.09527750127017498, -0.017030235566198825, -0.23367526158690455, 0.002070311829447746, -0.0012378905434161426, -0.14453946468420328, -0.0705340939341113, -0.12340338854119182, -0.0851744070649147, 0.011616210453212261, 0.08518979027867318, -0.15146196018904448, 0.0819782167673111, 0.043000865587964655, -0.022688549198210242, -0.02991621606051922, 0.016066246293485166, 0.08565911119803787, 0.13398654172196986, 0.10416028469335288, 0.13412011228501797, 0.11769673302769662, -0.12895780727267267, 0.05083277886733413, -0.08135408144444228, -0.017674032598733902, -0.14100532587617637, 0.1559406191110611, 0.0865834909491241, -0.08153893876587973, -0.018087121285498144, 0.05385218560695648, -0.19007823567371818, -0.0645210187882185, -0.027516840398311617, -0.19985381439328195, -0.0546268779784441, -0.07770143551751972, 0.023293661139905456, 0.04728543199598789, 0.07786396192386746, 0.07273276895284653, -0.062126229703426364, 0.11661941409111024, 0.07477662162855268, 0.04439299148507417, 0.07757203783839942, -0.01541152037680149, -0.010049501061439516, 0.016276248078793288, -0.3513743249815889, 0.20911250058561565, 0.09275583177804947, -0.25134466248564424, -0.01449076607823372, -0.09569526314735413, -0.18853178117424252, 0.040441786218434576, 0.18679111078381538, 0.07140682929893956, -0.16679169202689081, 0.06341372523456812, -0.11312212012708188))"
53e99792b7602d9701f5b191,"List(human, sensibility, ergonomics, hse, apprehend, human, sensitivity, feature, measuring, human, sens, developing, table, related, psychology, physiology, main, purpose, hse, developing, human, centered, good, environment, relevant, technology, improved, life, quality, order, achieve, goal, test, bed, simulator, useful, tool, controlling, monitoring, physical, environment, deal, requirement, design, concept, specification, computing, environment, hse, hse, technology, development, program, sponsored, korean, ministry, science, technology, integrated, computing, system, composed, real, time, non, real, time, environment, non, real, time, development, environment, comprises, pc, window, nt, graphical, user, interface, coded, microsoft, visual, c, pc, independently, control, monitor, thermal, light, audio, video, environment, software, database, developed, non, real, time, environment, directly, ported, real, time, environment, local, area, network, real, time, computing, system, based, cpci, bus, control, integrated, hse, environment, collect, necessary, information, cpci, computing, system, composed, pentium, cpu, board, dedicated, o, board, quantity, determined, expandability, considered, integrated, computing, environment, hse, simulator, guarantee, real, time, capability, stability, expandability, hardware, maximize, portability, compatibility, maintainability, software, requirement, analysis, design, computational, environment, hse, humansensibility, ergonomics, simulator)","Map(vectorType -> dense, length -> 100, values -> List(0.060469724318403514, 0.012265218924845993, 0.05002128224773907, 0.028776487841466324, 0.02628694784095982, 0.02407731276212467, -0.02292923959630346, -0.0398160745087942, -0.0016345079585099247, 0.052771995373337564, 0.0022296798415482044, 0.06216848775583469, 0.05932464655258773, -0.016588177577948865, 0.024572606911353857, -0.014395827420320315, -0.05369596241076511, -0.022936926729959884, 0.03874633242347404, 0.08224202271042115, 0.009770581776486065, 0.0028554206817514366, -0.028808185119644655, 0.015287506377560166, -0.017858647323863143, -0.062418228296999945, 0.034570706669057226, 0.011471077873342617, 0.006491302756914569, 0.004524683959948842, -0.01508282520233389, -0.051069639771199424, -0.0028255281044331226, 0.03226826534021565, -0.08067346698271087, -0.06043699758420525, 0.09559401165964196, -0.04609761058559848, 0.009421452205128002, -0.025258083676884847, -0.04994807315610901, -0.0010521742120028737, 0.0076211095197603424, -0.06129672111983606, -0.08282889594197457, 0.040626291761226535, -0.0119385408024867, 0.07457874244501897, 0.05334272244699941, -0.06647713057535849, -0.024419172063960697, -0.056956264998096924, 0.08414075793076829, 0.03679809179501953, -0.059707276801935126, 0.02892202139799023, 0.028864601777900973, 0.003777557794280626, 0.0071645446170558345, -0.09804574770200422, 0.02308187124184275, 0.05651724142493841, -0.09955058938731658, -0.09055241092070822, 0.12999281147494912, -0.013875346838490876, -0.022932946934928127, -0.03404275230692561, 0.026045733526705497, 0.009926900425695894, 0.005273130678059389, -0.014133681856486715, 0.03230528546609329, -0.04935929287270999, 0.01798620200888426, 0.03675210596217463, -0.016758929899272818, 0.07105729419071181, -0.001614184747144643, -0.08612251206140183, 0.025220982345212018, 0.035980279978514176, 0.040421790170287825, 0.07088525654362905, 0.05583394430922689, 0.04498456727755924, 0.02361187388844512, 0.039097530361305186, -0.024643555081616166, 0.020017441035173604, 7.571454453109591E-4, 0.13417523393923428, 0.1717390216955984, -0.1417629664489298, 0.04487327339535832, -0.008986143171097393, 0.05668280047007382, -0.07792751925381584, 0.08152848125843584, -0.06861280182370755))"
53e99792b7602d9701f5b2d3,"List(present, novel, ia, instrumentation, amplifier, design, implantable, biomedical, device, system, db, cmrr, common, mode, rejection, ratio, proposed, ia, composed, stage, including, preamplifier, nd, order, bpf, band, pas, filer, dc, level, shifter, output, buffer, stage, low, noise, gm, c, amplifier, preamplifier, stage, reduce, coupled, thermal, noise, overwhelm, weak, neural, signal, bpf, designed, based, ota, operational, transconductance, amplifier, dual, current, switch, aiming, low, power, low, noise, demand, source, follower, employed, carry, dc, level, shifter, output, buffer, provides, output, signal, adequate, drive, following, stage, usually, adc, analog, digital, converter, detailed, analysis, proposed, circuitry, derived, solidify, proposed, architecture, proposed, design, implemented, tsmc, mum, p, m, cmos, process, result, post, layout, simulation, verify, performance, design, cmrr, better, db, important, input, noise, rms, merely, db, pvt, process, supply, voltage, temperature, corner, db, cmrr, lownoise, instrumentation, amplifier, neural, signal, sensing)","Map(vectorType -> dense, length -> 100, values -> List(-0.006423055098034945, -0.022796694318854942, 0.24569532859202164, -0.07244183677616284, 0.06823541614037931, 0.012829776419391434, 0.17153565631829257, 0.10230580898688027, 0.01588294462167791, 0.09618347528726702, -0.04787358434494896, -0.026209623983507988, 0.00732483637442154, 0.0805032578387991, 0.019777637644656898, 0.1151656073586371, -0.09428208402210944, -0.032660354320940214, -0.04084619025246168, 0.027445028292687426, 0.1726229181775781, -0.07589726596265414, 0.004150662123036564, -0.04060111875834881, -0.10207095300285075, -0.08320843441415354, -0.022125415419949623, -0.052951417337311776, -0.056476103901667025, 0.04844972785541269, -0.04484483222757235, -0.09819031718950298, -0.030005888111409695, 0.0281547055548211, -0.006472882655400194, -0.009949051736524902, 0.016914316080160495, 0.03380732302364886, -0.09828578367110698, -0.10568350647588516, 0.021461584528249904, -0.05960251406536269, -0.03278529176600557, 0.0419092186060524, 0.025522810886649035, -0.02512743578141877, 0.011868712312850008, 0.08063850810101844, -0.1395326196846265, 0.05116450346663202, 0.05386773817905547, 0.011461730675309673, -0.02351040121961579, -0.012618845965034027, 0.03260002226421707, 0.07158066209797796, 0.1286435021865234, 0.10657573614402541, 0.1724078462658716, -0.043077435115128196, 0.057043661065600965, 0.03733284127044218, 0.011249919520004798, -0.020349592378335002, 0.10361390315911809, 0.11922341655533676, -0.012474387334449622, -0.030753286578167314, 0.03137183671071425, -0.06764251943432578, -0.08062731565319393, 0.022018421917481528, -0.06549492384691344, -0.07148709947145299, -0.055347233587049675, 0.05049868242612066, -0.009778640128381522, 0.0954454800681232, 0.023306643131672213, -0.01972080141663047, 0.01672592060027742, 0.04951050201486936, 0.09944043629732739, 0.07895909794921377, 0.021293599977809233, 0.0509718082134246, -0.004938554126398175, -0.13089106564980726, 0.010830188373145751, 0.1031686409105874, -0.0859378996739318, 0.11634544403768403, -0.06485427285306491, -0.21585747354866977, 0.08903912289498751, 0.06768813893405538, 0.09437756670863871, -0.03877841274751196, 0.09058875840899527, 0.05841750914692227))"
53e99792b7602d9701f5b391,"List(prototyping, development, environment, graphical, interactive, telematic, application)","Map(vectorType -> dense, length -> 100, values -> List(0.20943157467991114, -0.03674944796200309, -0.023596685379743576, 0.010848087125590869, -0.12066685434963022, 0.06343371421098709, 0.020533149662826742, -0.1613763328641653, 0.03162163254039894, 0.08615139552525111, -0.021535647234746387, 0.09831319962229047, 0.07388124189206531, -0.012025927060416766, -0.17011748520391326, 0.10763745275991303, -0.21020700064088615, -0.1449795306793281, 0.15657994310770715, 0.1308358194572585, 0.015317566692829132, 0.020373945581793253, 0.014882583171129227, -0.0013458324330193655, 0.07744509779981204, -0.0937221460044384, 0.06812340819409915, 0.008071533271244593, -0.08002695467855249, 0.009783254138060977, -0.12577217923743383, -0.05671112611889839, 0.06424740768436875, 0.07994059420057704, -0.17162986206156866, -0.26970437701259337, 0.2270297056862286, 0.040406089009983194, 0.04851978511682578, -0.015359959432056971, 0.07080636266618967, -0.13507348405463354, -0.06786022608035376, -0.038749188650399446, -0.18834713447306836, -0.09892801840656569, -0.18717711245907204, 0.1777722348592111, 0.251859045454434, -0.1325211099215916, -0.18551279923745562, -0.19201838863747459, 0.19486084048237118, 0.01720610154526574, -0.12034376378038099, -0.02611840037362916, 0.03139085841498204, -0.05133372997598988, -0.079067234482084, -0.1383159075464521, 0.07857661374977656, 0.20406030437776018, -0.19788543454238344, -0.1141298623489482, 0.3074831671214529, -0.154635237795966, -0.04206468803542001, -0.013013225314872605, -0.015418569557368755, 0.10949414396392447, 0.01713070087134838, -0.010235424286552837, 0.1305580958724022, 0.09495478575783116, 0.06740668030189616, -0.029202542334262813, 0.1192163691989013, 0.07998476057712521, 0.011383269528908255, -0.25288584854985985, 0.07681512806032385, 0.0012451689690351486, 0.11734674046082155, 0.03385271544435194, 0.11527094085301672, 0.13790507641221794, -0.039358525936092646, 0.1723091189882585, 0.005141409646187509, 0.002770177088677883, 0.12369331132088388, -0.0010070066366876873, 0.18073567787983585, -0.1599334765757833, 0.04931722288685185, -0.03510428299861295, 0.006887388548680714, -0.11294892829443727, 0.09474585571193268, -0.05103820439295045))"
53e99792b7602d9701f5b3e5,"List(augmented, reality, technology, overlay, virtual, image, information, generated, computer, real, scene, technology, combine, virtual, object, real, world, put, forward, registration, method, multi, marker, augmented, system, paleontology, magic, book, designed, realized, book, special, virtual, education, interactive, real, time, end, result, showed, study, prosperous, future, augmented, reality, applying, education, seen, method, multiplemarker, register, application, virtual, education)","Map(vectorType -> dense, length -> 100, values -> List(0.11641680176318105, 0.05755183164082053, 0.09729976831348958, 0.03523605828673613, 0.015688262474550692, 0.0792203733732027, -0.07490053041665642, -0.12423626081241915, 0.01700567918036271, 0.08437190147082287, 0.018162144449349767, 0.014126064462794198, 0.02498670037904823, -0.0038242576648136253, 0.019855015100566323, -0.06747303532298515, -0.06761286364592335, -0.06601572477396715, 0.1305097748991102, 0.06118441672771479, 0.009569806262367853, -0.04863067290366248, -0.024901338257930342, 0.02583061252651667, -0.06431867745360015, -0.020741819119494822, 0.07670988001067329, 0.08428343034487355, -0.09732125607250189, -0.018044237917315006, 0.031346808373065735, 0.011225882395929483, -0.013654095681246231, 0.0395046881986437, -0.07736885601102754, -0.08661432766045132, 0.11156605801726173, 0.049162987880494995, 0.02116763190780249, 0.0825036248166321, -0.15737209355251655, 0.004216488536999181, 0.05126983327877328, -0.12368142314651911, -0.051765726068643506, -0.03550168851399104, -0.05272172107921775, 0.11781638647921176, 0.09156047530618876, -0.10110601979572997, -0.041985443620770065, -0.1225056825938669, 0.0655942385999003, 0.02296471241775348, -0.045202492154203355, 0.15499655164226336, 0.05545656300253338, -0.004428701169133462, -0.070023522246629, -0.030882383394917404, -0.00883605967586239, 0.11364883053357953, -0.17474552808777877, 0.005406219779548269, 0.11355910567497765, -0.04875171532492256, -0.006890821274956343, -0.03938415151572337, -0.01183313065363715, 0.1047238223987666, 0.043808461904215314, -0.03587354165588126, -0.05981779219237742, -0.0158500458907198, 0.031587020249356064, 0.018320190168365284, -0.09256502264799092, 0.06267832840482393, 0.07337612090459852, -0.20164292592003388, 0.0032410537134166116, -0.02153350127412489, 0.0421475690570703, -0.049827700452792834, 0.0946640493385968, 0.05585947610368883, -0.011809290041802105, 0.026441385682792987, 0.025872800059409604, 0.019049692709274865, 0.02774626298385017, 0.07259273131515968, 0.1075912888282565, -0.1180005535500607, 0.05575328336649195, -0.1035772086923114, 0.014016481978749787, -0.04831127734440896, 0.12580829057670026, -0.03811332743282913))"
53e99792b7602d9701f5b3f3,"List(abstract, industrial, research, project, demonstrated, feasibility, applying, category, theoretic, method, specification, synthesis, maintenance, industrial, strength, software, system, demonstration, kind, tool, purpose, kestrel, specware, software, development, system, describe, experience, discus, broadening, application, category, theoretic, method, industry, technology, promising, need, additional, development, generally, usable, surprising, given, mathematical, foundation, hand, believe, demonstration, turning, point, use, mathematically, rigorous, approach, industrial, software, development, maintenance, demonstrated, capture, mathematical, method, software, engineering, design, rationale, product, design, manufacturing, process, rationale, different, engineering, discipline, production, usable, software, directly, captured, rationale, feel, evolution, tool, technology, formal, system, engineering, reality, invited, talk, applying, category, theory, derive, engineering, software, encoded, knowledge)","Map(vectorType -> dense, length -> 100, values -> List(0.0834862913917562, 0.014189588018890585, 0.026201695688257014, 0.15795752048172584, 0.033869223005956774, 0.05929844872438058, 0.02292684689540454, -0.09614085049528366, 0.003665019945603692, 0.06456452840410004, -0.07965551637527016, -0.025758834358191852, 0.0892971157933576, -0.05865025628155869, -0.08176505548239808, 0.06552716319077659, -0.0529902631374584, 0.004799165423122244, 0.0410102773171137, 0.08757044180211697, 0.0337264341760234, 0.08111001735534332, -0.06535882389612703, 0.02454138907422829, 0.0579074450881418, -0.11237666771436731, 0.07335574581344245, -0.10279356618176658, -0.012504326109068865, -0.03261773131684591, -0.08888586854087095, 0.027161524127792533, -0.031261731344839616, -0.06297557617565899, -0.09100588159447517, -0.06003813647354643, 0.06379394968135244, -0.08700358674619253, -0.11016607776077258, -0.05393275343154492, -0.09123165127492013, -0.07806757641862137, 0.05513938271435862, -0.023388036997104535, -0.11575150768263172, -0.0012073131807905746, -0.0665433050428442, 0.06269463988942932, 0.010770490974163745, -0.06750144001162339, -0.0503821713624127, -0.025304583611550056, 0.08710736824161928, 0.020361050862273335, -0.11816993529786066, 0.016117329354106327, -0.019533140138567738, -0.07477443953129378, -0.09237331914247457, -0.07769711689571991, 0.081527724509325, 0.06248771851280272, -0.05216274690823724, -0.06787355468994138, 0.19913044129263122, -0.0668706887823793, -0.02672531291597871, -0.09609580296091737, 0.08650069546443646, 0.08073702898121092, 0.02810803795706086, 0.011132715439254589, -0.005419469347360046, 0.04535778162817499, 0.032453795415208195, -0.01658954540493362, 0.02442704944523296, 0.078898816851598, 0.11334151641766758, -0.15956929717401064, -0.006856995230220785, -0.005576174979032291, 3.5492154461510406E-4, 0.10782868480497991, -0.05437123021252002, 0.060712540999638166, -0.06074544721673422, 0.08215072453774588, -0.052131859102342845, -0.01799014276912203, 0.11317082352829261, 0.027194769663802076, 0.0997995647432452, -0.1414517178717587, 0.017387475319808783, -0.09059981118906478, 0.011521829572633249, -0.17570341935127298, 0.05639631757418616, -0.06957950983949081))"
53e99796b7602d9701f5be93,"List(address, problem, finding, set, contour, curve, image, consider, problem, perceptual, grouping, contour, completion, data, set, point, image, new, method, find, complete, curve, set, contour, edge, point, presented, approach, based, previous, work, finding, contour, minimal, path, end, point, fast, marching, algorithm, l, d, cohen, r, kimmel, international, journal, computer, vision, vol, pp, given, set, key, point, find, pair, point, linked, path, join, use, saddle, point, minimal, action, map, path, obtained, backpropagation, saddle, point, point, pair, second, propose, scheme, need, key, point, initialization, set, key, point, automatically, selected, larger, set, admissible, point, time, saddle, point, pair, key, point, extracted, path, drawn, image, minimal, path, selected, pair, point, set, minimal, path, completes, initial, set, contour, allows, close, illustrate, capability, approach, close, contour, example, image, set, edge, point, shape, missing, contour, multiple, contour, finding, perceptual, grouping, minimal, path)","Map(vectorType -> dense, length -> 100, values -> List(0.08281155804119163, 0.03710953948616203, -0.05456733573992639, 0.18066877546831409, 0.07845861331849453, -0.001511186286598555, 0.0207138542127587, 0.09650017890597345, -0.10815517867067412, -0.08339314814309465, 0.0900534553359499, 0.038954655862232644, -0.0996982243625315, 0.0796716198769968, 0.013719462393435524, -0.1269692162776352, 0.09380513745700635, 0.024466574516967494, 0.023069896388611295, 0.19847735839916755, 0.12528314207981986, -0.08467636861205935, -0.005901578052747828, 0.09759401372302004, -0.031303130781778425, 0.06100647423048017, 0.01197502512568191, 0.005535411040529734, -0.10363606742145355, -0.0243366043754061, 0.08712006889547763, -0.15093828204744703, -0.016440874722320586, -0.06083159880198316, -0.11428813537610555, 0.05068496610524494, -0.05788024376778961, 0.04808284552314374, 0.049763233703213616, -0.010339620884563496, -0.01797517000197141, -0.025523697272804912, 0.08422142587369308, -0.02289781507466044, 0.15794916783890395, -0.08585224979871592, 0.016932227080183992, 0.04253825272516862, 7.07019579165907E-4, 0.016623180013831094, 0.14035456636642565, -0.020454378850730273, -0.03916177072965387, -0.02624504148365874, -0.016817733211053617, 0.016024821327561372, 0.014577670905516663, -0.039030434282274165, -0.02219252059333471, 0.0032877555506220504, -0.1311716430424365, 0.016924915744909154, 0.05273907114543132, -0.04901741532636667, -0.01660123506453194, 0.07920513937333182, -0.07247618628481625, -0.061366361843894664, 0.0039111665476058936, 0.1395398312102336, 0.08538666637371113, 1.4085531123538515E-4, -0.0831767194904387, -0.10366008657468621, -0.006846467283234668, 0.02835850412053848, -0.028003890083845594, 0.0854057161234764, -0.008991422420523977, -0.10147240856287083, 0.04058226428182323, -0.014150232027993717, -0.09684610135554433, 0.0027280322614007756, -0.04199116218682212, -0.09109422900547176, -0.024796823136953274, -0.04030965207797934, -0.023480662179471395, 0.05838759316812589, 0.016628813231363893, 0.011563924740680229, -0.05011589014416199, -0.0787095581911115, -0.08951686488798083, -0.03119820510141496, 0.0026641431361881652, -0.06455566520184111, 0.0142795378637653, -0.05913000019465281))"
53e99796b7602d9701f5c157,"List(describes, new, core, multi, resolution, data, structure, real, time, visualization, interactive, editing, externally, efficient, processing, large, point, cloud, describe, editing, system, make, use, novel, data, structure, provide, interactive, editing, preprocessing, tool, large, scanner, data, set, new, data, structure, provide, complete, tool, chain, d, scanner, data, processing, data, preprocessing, filtering, manual, touch, real, time, visualization, particular, describe, core, outlier, removal, bilateral, geometry, filtering, algorithm, toolset, interactive, selection, painting, transformation, filtering, huge, core, point, cloud, data, set, real, time, rendering, algorithm, use, data, structure, storage, backend, interactive, tool, work, real, time, small, model, modification, large, scale, editing, operation, employ, resolution, approach, editing, planned, real, time, executed, externally, efficient, offline, computation, evaluate, implementation, example, data, set, size, gb, demonstrating, proposed, technique, effectively, real, world, application, special, section, pointbased, graphic, processing, interactive, editing, huge, point, cloud, d, scanner)","Map(vectorType -> dense, length -> 100, values -> List(0.08490747415613549, -0.010236758203593207, 0.0535923481776393, -0.030567652321726417, 0.01494058554328773, 0.017622724354420024, -0.056776569033764414, -0.0016320991928649219, -0.020897953687652723, 0.04685180667620986, -0.004543513730307346, 0.055625217921559264, -0.0856234133174059, 0.03887463058605651, 0.03347458425885824, -0.018111120506342667, 0.05551424438008733, -0.007105291207710198, 0.08752237455988887, 0.1362245205525698, 0.09669472217045502, -0.04221200169005028, -0.03679170261081586, 0.05302463166690223, -0.01979473641838569, 0.005415849180009438, 0.07480507207610436, -0.02357175887713066, -0.01059853814024506, -0.064860856139671, 0.027219877229072154, -0.07335647154050365, -0.015803200665360957, 0.03713558566471589, -0.14727674999636264, -0.13399853047205887, 0.09213880326862989, -0.005365119767444792, 0.06392277792485347, 0.0017476204257986662, -0.05249328907481881, -0.043334912661629824, 0.07434680194197807, -0.08773213938580575, 0.057486800568998415, -0.09296393032713945, 0.047674104192558284, 0.05447977443213519, 0.07487069626439558, -0.022078444675625815, -0.0059002028698268445, -0.10074540214098768, 0.04642627003547082, -0.01756846574274128, -0.0641290892832982, -0.04169132204642936, 0.024401043324536686, -0.008973094369302183, -0.033855539596285116, -0.08574084217187518, 0.07041864759903818, 0.15116284916232994, -0.08030022121270868, -0.09951638457115128, 0.06927322517079648, -0.005711062107610502, -0.022186015759518285, -0.05984539800121991, 0.040221620560667144, 0.08452116336096634, 0.031007231200989614, 0.0023166957557194433, -5.573842938600192E-4, -0.06521145608832142, 0.02824495341620102, 0.020296053827128638, 0.027347528285333024, 0.02366626604960592, -0.011766141478611684, -0.10594766964760623, 0.09595557805602509, -0.023075809288742158, 0.052405218517324374, 1.5637085602771656E-4, 0.02425706565380965, 0.05827822450047998, -0.014783320878979874, 0.004881641814492142, -0.10174513221092855, 0.024205107262719478, 0.04812293589712857, 0.04650173378055359, 0.03141660999277356, -0.13135144714467617, 0.013276547130975706, 0.01593722842995133, 0.05875640775862414, 0.042048371730566914, 0.04434699344851855, -0.02484096514422502))"
53e99796b7602d9701f5c193,"List(power, distribution, network, include, distributed, generation, need, new, control, strategy, based, distributed, hierarchical, structure, system, replicator, dynamic, strategy, dynamic, resource, allocation, dispatch, distributed, generator, microgrid, presented, approach, us, characteristic, defined, subsystem, order, offer, simple, algorithm, optimal, feasible, solution, case, method, satisfy, problem, constraint, compare, performance, replicator, dynamic, strategy, analyze, optimality, obtained, solution, market, multiagent, based, scheme, adapted, result, implemented, simulation, model, different, scenario, applicability, proposed, strategy, population, dynamic, approach, dispatch, distributed, generator)","Map(vectorType -> dense, length -> 100, values -> List(0.024770934833213687, 0.0216165802848991, 0.054791179635988854, -0.02226595306476358, 0.08146809333831899, 0.052913552745141916, -0.09072303990898135, 0.007069233817876212, 0.008415289297772688, 0.032055406394647434, 0.006550509915945844, -0.02110789990466502, 0.05542198819522228, 0.0015026030822708788, 0.0789560348526316, -0.040009658891196724, -0.06290871102828532, -0.07875416159125355, -0.028425152777951247, 0.13275325203762298, 0.07562961589347751, -0.09294806450553651, -0.016559747901434697, 0.10299791831898296, 0.006152326564511491, 0.058268537997112915, -0.0023166681470077792, 0.04995213548924463, 0.0712702051581194, -0.008253182203690004, -0.16273805616138917, -0.051343320965922125, 0.033672400370253146, -0.014870593104408018, -0.04637837852998119, 0.05585940109772814, 0.06186693760698997, -0.04271823048798574, -0.050001811857024826, -0.06187535189868261, -0.048707685963664614, -0.012231451088155154, 0.05312147303549055, 0.04044410786526795, -0.004854334874026891, 0.03140186135553651, 0.06295086380931833, 0.0400122051917909, 0.03908033099206578, -0.03573508467525244, -0.0037057840705124866, -0.07023824931174102, 0.006065088608819577, 0.07862846422196727, -0.08788643236847646, 0.003314898051838908, 0.07595533744056916, -0.026552259964066453, 0.051472736496685274, -0.04398290583614855, -0.04417685428618117, 0.026738248972429168, 0.057644830479855746, -0.12727445863290793, 0.14632128587173712, -0.004761815043619006, -0.07671279223480572, -0.07903644727452451, 0.028328617006789295, 0.05519128460501734, -0.07491240124606216, -0.010961726172051081, 0.02746724806881199, -0.030321361839823008, 0.006127959095465485, -0.02113956710996313, 0.027329118379081287, -0.0025982373036741693, 0.018231877914836835, 0.031002232779024374, 0.08737692640473445, 0.07597173844502927, -0.050402445847996406, 0.07309714693433812, 0.00622251891117129, 0.017506536684878584, 0.0508383997496114, -0.09906006897896683, 0.04006875320596413, -0.013311541933540462, 0.057496429194644504, 0.01987523170019914, -0.07211979292333126, -0.12988308594988968, 0.05595044119076596, -0.016462642725350127, -0.0023480415968353753, -0.03834785628391223, 0.035998028444333204, -0.05021463851800136))"
53e99796b7602d9701f5c1a2,"List(existing, medical, vocabulary, lack, rich, term, describe, finding, generated, modem, molecular, diagnostic, procedure, bioinformatics, resource, designed, primarily, support, need, research, community, describe, development, curated, resource, clinical, bioinformatics, ontology, cbo, semantic, network, appropriate, describing, clinically, significant, genomics, concept, cbo, includes, concept, appropriate, molecular, diagnostics, cytogenetics, standardized, methodology, based, consistent, application, refseq, information, applied, curation, cbo, order, provide, reproducible, reliable, tool, challenge, related, curation, process, discussed, time, submission, cbo, included, concept, associated, relationship, clinical, bioinformatics, ontology, curated, semantic, network, utilizing, refseq, information)","Map(vectorType -> dense, length -> 100, values -> List(0.020511164917843418, -0.05388507656462025, -0.021090219833422453, 0.12781627777003451, 0.002228059939807281, 0.11700426095630974, 0.006514009585953318, -0.05104409130581189, 0.027130619451781968, 0.09461630404257448, -0.22652590540092207, 0.022721983364317567, -0.050137396377976984, -0.09336374073900516, -0.007669353423989379, -0.06613850460271352, 0.003754517249763012, 0.00890919074154226, 0.03260013097606134, 0.0371949038701132, -0.013505348707258237, 0.10631686984052068, -0.033079709642333914, 0.04221364697586978, -0.035403903073165564, -0.027217306336387994, 0.038839493616978873, -0.02454478390864097, -0.034495810551015894, -0.028722663025837394, -0.03068526181159541, -0.03838280635663978, 0.006428619009966496, -0.05065827173384605, -0.14671766895335167, -0.030873416348913454, 0.008191733744752128, -0.07234457358135842, 0.05411884000350256, -0.022947175083390906, -0.048464312119176615, -0.03544870220539451, 0.15260799533571118, -0.07382208274357253, -0.08248079518089071, 0.08808226654073224, 0.04107362584327348, 0.015532380307558925, -0.050192483956925577, -1.2056828563800083E-5, -0.007937608312931844, -0.11075013899535407, 0.1158981551823672, -0.09844269143068232, -0.1377766135381535, -0.019494939193828033, -0.03623538256797474, -0.0284094712274964, 0.0069687308154243515, -0.11925892657600344, 0.019500766051351093, 0.09204169153126714, -0.07931844733539038, -0.030011808553172162, 0.07466658634220949, -0.012540578420157544, 0.03493870257370873, -0.04395172323856969, 0.0534270625685167, 0.08196401873865398, -0.02160879155708244, -0.08899172859964893, 0.03861846089130268, 0.007636802725028247, 0.010772213226300665, -0.027794397051911803, 0.09199715774739162, 0.06063898917636834, 2.764079545158893E-5, -0.039457952301017944, 0.025215255294460805, 0.007214406560524367, -0.007376648577337619, 0.06151562281302177, -0.02847607118965243, -0.028093471754255007, -0.11864702686434613, 0.11716438830480912, 0.04586944161419524, -0.02318930846713556, 0.048789071708597476, 0.05013481356145349, 0.09789282415877097, -0.16617262909421698, 0.02085963550489396, -0.06849418674828485, -0.048221289693901784, -0.04931220234138892, 0.07581146073061973, 0.03619858901947737))"


In [0]:
costs = {}

for k in tqdm(range(30, 65, 10)):
    kmeans_model = KMeans(maxIter=numIterations, k=k, seed=100).fit(documents)
    costs[k] = kmeans_model.summary.trainingCost
    print(k, costs[k])

  0%|          | 0/4 [00:00<?, ?it/s] 25%|██▌       | 1/4 [26:24<1:19:12, 1584.05s/it]30 87401.15461510382
 50%|█████     | 2/4 [54:12<54:27, 1633.76s/it]  40 83780.20496213346


In [0]:
costs

In [0]:
import matplotlib.pyplot as plt

plt.plot(list(range(15, 45, 3)), costs)
plt.xlabel("Clusters")
plt.ylabel(TrainingCost)
plt.title("Elbow plot")
plt.show()

In [0]:
# saving best model
import numpy as np
from pyspark.ml.clustering import KMeansModel

best_k = np.argmin(costs)
kmeans_models[best_k].save("dbfs:/FileStore/data/clustering_model")

In [0]:

best_model = KMeansModel.load("dbfs:/FileStore/data/clustering_model")
predictions = best_model.transform(documents)
predictions.show()