In [11]:
from collections import Counter
import pandas as pd
from ast import literal_eval
from tqdm import tqdm
import numpy as np
tqdm.pandas()

# Read the concept table. Each cell of the `concepts` column is parsed with
# literal_eval and wrapped in a set, so duplicates within a row collapse.
df = pd.read_csv("../data/concepts.csv")
df["concepts"] = df["concepts"].progress_apply(lambda raw: set(literal_eval(raw)))

# Tally concepts over all rows. Because every row is a set, a concept's count
# is the number of rows it appears in.
concepts = Counter(
    name for row_concepts in df["concepts"] for name in row_concepts
)

100%|████████████████████████████████████████████████████████████████████████████████| 195370/195370 [00:06<00:00, 31373.37it/s]


In [12]:
# NOTE: only the last expression of a cell is rendered, so the result of
# most_common(25) is computed but never shown here; the cell output is just
# len(concepts). Move one of the two expressions to its own cell to see both.
concepts.most_common(25)
# IDEA: automatically merge highly connected nodes.
# Can we merge abbreviations with their concepts automatically?
len(concepts)

1384427

In [13]:
# Name of the DataFrame column that holds each row's collection of concepts.
CONCEPTS_COLNAME = "concepts"
# Path of the concepts CSV.
# NOTE(review): not referenced in the visible cells; the loading cell
# hard-codes the same path instead of using this constant.
INPUT_FILE = "../data/concepts.csv"
# 1970-01-01 as a pandas Timestamp.
# NOTE(review): unused in the visible cells; presumably a reference date for
# later day-offset computations - confirm.
ORIGIN_DAY = pd.to_datetime("1970-01-01")


class OccuranceFilter:
    """Callable filter that keeps concepts whose count lies within bounds.

    Both bounds are inclusive and optional; a bound left as ``None`` is not
    enforced.
    """

    def __init__(self, min_occurance=None, max_occurance=None):
        """Store the inclusive lower/upper count bounds (``None`` = unbounded)."""
        self.min_occurance = min_occurance
        self.max_occurance = max_occurance

    def _in_range(self, count):
        """Return a truthy value when ``count`` satisfies both configured bounds."""
        if not (self.min_occurance is None or count >= self.min_occurance):
            return False
        return self.max_occurance is None or count <= self.max_occurance

    def __call__(self, concepts):
        """Return a new dict with the ``concept -> count`` entries that pass.

        Args:
            concepts: Mapping of concept to occurrence count (anything with
                ``.items()``). It is not modified.

        Returns:
            A plain ``dict`` of the surviving entries, in the input's
            iteration order.
        """
        kept = {}
        # tqdm wraps the iteration to show progress over the entries.
        for name, count in tqdm(concepts.items()):
            if self._in_range(count):
                kept[name] = count
        return kept

# Filters applied, in order, to the concept -> count mapping.
c_filters = [OccuranceFilter(min_occurance=3)]


print(f"Number of concepts: {len(concepts)} -> ", end="")
# Chain the filters: each one receives the output of the previous one.
# Starting from `concepts` also keeps `concepts_filtered` defined when
# `c_filters` is empty. The loop variable is deliberately not called `filter`
# so the builtin stays usable in later cells.
concepts_filtered = concepts
for concept_filter in c_filters:
    concepts_filtered = concept_filter(concepts_filtered)
print(len(concepts_filtered))

concept_list = list(concepts_filtered.keys())

# Transform concepts into numbers.
# Strip and de-duplicate BEFORE enumerating: names that only differ in
# surrounding whitespace then share a single id, and ids stay contiguous
# (0 .. len(lookup) - 1) instead of leaving gaps where a stripped key was
# overwritten.
unique_concepts = sorted({concept.strip() for concept in concept_list})
lookup = {concept: index for index, concept in enumerate(unique_concepts)}

# Save lookup as csv (index = concept name, single column "id").
lookup_df = pd.DataFrame.from_dict(lookup, orient="index", columns=["id"])
lookup_df.to_csv("lookup.csv")

def get_pairs(items):
    """Return every unordered pair of ``items`` once, without self-pairs.

    For each element, it is paired with all elements that come before it in
    the iteration order of ``items``, so the result is the lower triangle of
    the pairing matrix (diagonal excluded). Each tuple is
    ``(later_item, earlier_item)``.

    Args:
        items: A re-iterable collection (e.g. a set or list).

    Returns:
        A list of 2-tuples.
    """

    def _paired_with_predecessors(current):
        # Walk the collection from the start and stop at the first element
        # equal to `current`: that is the diagonal, and everything past it
        # belongs to the other (mirrored) half of the matrix.
        for other in items:
            if current == other:
                return
            yield (current, other)

    return [
        pair
        for current in items
        for pair in _paired_with_predecessors(current)
    ]


print("Building edge list")
all_edges = []
# `row_concepts` (not `concept_list`) so the loop does not clobber the global
# list of filtered concepts.
for row_concepts in tqdm(list(df[CONCEPTS_COLNAME])):
    # Set of ids because rake doesn't filter out duplicates.
    concept_ids = set()
    for c in row_concepts:
        # The keys of `lookup` were stripped when it was built, so strip here
        # as well; otherwise whitespace-padded concepts are silently dropped.
        key = c.strip() if isinstance(c, str) else c
        concept_id = lookup.get(key)
        # Concepts removed by the filters have no id and are skipped.
        if concept_id is not None:
            concept_ids.add(concept_id)

    # Collect plain tuples; allocating a tiny ndarray per edge is slow and
    # memory-hungry, and the single conversion below yields the same array.
    all_edges.extend(get_pairs(concept_ids))


# One row per edge, two id columns. dtype/reshape keep an integer (0, 2)
# array even when no edges were found.
all_edges = np.array(all_edges, dtype=int).reshape(-1, 2)

Number of concepts: 1384427 -> 

100%|████████████████████████████████████████████████████████████████████████████| 1384427/1384427 [00:00<00:00, 4679835.96it/s]


147297
Building edge list


100%|████████████████████████████████████████████████████████████████████████████████| 195370/195370 [00:08<00:00, 22674.76it/s]


In [8]:
# Peek at the first five edges; each row is a (v1, v2) pair of concept ids.
all_edges[:5]

array([[123396,  73153],
       [119910,  73153],
       [119910, 123396],
       [ 73223,  73153],
       [ 73223, 123396]])