In [10]:
import re
import os
import json
import time
import nltk
import argparse
import pymorphy2
import pickle as pkl
import os.path as op
import networkx as nx
from tqdm import tqdm
from copy import deepcopy
from stop_words import get_stop_words

In [58]:
class CatIndex():
    
    ru = pymorphy2.MorphAnalyzer()
    en_linkable = ["NN", "VB"]
    ru_linkable = ["NOUN", "VERB"]
    en_stops = get_stop_words('en')
    ru_stops = get_stop_words('ru')
    
    @classmethod
    def empty(C):
        g = nx.Graph()
        last = None
        return(C(g, last))
    
    @staticmethod
    def load(fn):
        with open(fn, "rb") as ih:
            return(pkl.load(ih))
        
    def __init__(self, g, last):
        self.g = g
        self.last = last
    
    @staticmethod
    def examine(s, do_implicits=False):
        explicits = re.findall("\[\[\w+\]\]", s)
        explicits = [a.replace("[[", "").replace("]]", "").lower() for a in explicits]
        implicits = []
        if do_implicits:
            spl = list(
                filter(
                    lambda x: re.match("\w+", x) and x not in CatIndex.en_stops and x not in CatIndex.ru_stops, 
                    re.split("\s", s)
                )
            )
            for a in spl:
                ru = CatIndex.ru.parse(a)
                for b in ru:
                    for c in CatIndex.ru_linkable:
                        if str(b.tag).startswith(c):
                            implicits.append(a)
                try:
                    en = nltk.pos_tag([a])
                except:
                    pass
                else:
                    for b in en:
                        for c in CatIndex.en_linkable:
                            if b[1].startswith(c):
                                implicits.append(a)
        r = {
            "explicits": explicits, "implicits": implicits
        }
        return(r)
    
    def save(self, fn):
        with open(fn, "wb") as oh:
            pkl.dump(self, oh)
    
    def index(self, list_of_files, do_implicits=False):
        lst = sorted(
            deepcopy(list_of_files), 
            key=lambda x: int(op.split(x)[-1].replace(".md", ""))
        )
        if self.last:
            lst = lst[lst.index(self.last):]
        for a in tqdm(list_of_files):
            with open(a, "r") as ih:
                a_file = ih.read().lower()
            a_linkables = CatIndex.examine(a_file, do_implicits)
            a_cid = op.split(a)[-1].replace(".md", "")
            for b in lst:
                if a != b:
                    with open(b, "r") as ih:
                        b_file = ih.read()
                    b_linkables = CatIndex.examine(b_file, do_implicits)
                    ebunch = {}
                    add_edge = False
                    for c in b_linkables:
                        ints = list(
                            set(a_linkables[c]).intersection(set(b_linkables[c]))
                        )
                        if len(ints) > 0:
                            add_edge = True
                            ebunch[c] = ",".join(ints)
                        else:
                            add_edge = False
                        if add_edge:
                            b_cid = op.split(b)[-1].replace(".md", "")
                            self.g.add_edges_from([(a_cid, b_cid, ebunch)])

## Initialization

In [59]:
I = CatIndex.empty()

In [60]:
files = [a for a in os.walk("/home/bakirillov/exocortex/cards/")][0][2]
files = list(filter(lambda x: ".md" in x, [op.join("/home/bakirillov/exocortex/cards/", a) for a in files]))

In [61]:
I.index(files, True)

100%|██████████| 27/27 [00:21<00:00,  1.24it/s]


In [62]:
u = [a for a in I.g.edges]

## Explicit links

In [57]:
len(u)

17

In [63]:
len(u)

130

In [51]:
u

[('01062020193116', '01062020194307'),
 ('01062020193116', '15062020115055'),
 ('01062020194307', '15062020115055'),
 ('15062020115055', '09062020203928'),
 ('15062020115055', '11062020065219'),
 ('01062020131840', '01062020092806'),
 ('01062020092806', '02062020000421'),
 ('07062020112220', '07062020105212'),
 ('07062020112220', '13062020070230'),
 ('07062020105212', '13062020070230'),
 ('10062020021421', '12062020015744'),
 ('10062020021421', '13062020140024'),
 ('12062020015744', '08062020103846'),
 ('09062020203928', '11062020065219'),
 ('06062020215251', '06062020222710'),
 ('06062020215251', '10062020112919'),
 ('06062020222710', '10062020112919')]

In [13]:
I.save("test.pkl")

In [3]:
I = CatIndex.load("test.pkl")