In [12]:
# Run lib.py and load the names it defines into the notebook namespace.
# Later cells use `pd`, `np` and `joblib` without importing them in any
# visible cell, so they are presumably provided by this script -- TODO confirm.
%run lib.py

---

In [13]:
from pipe import *
from taxonomy import print_tree
import pgfs
from anytree.importer import JsonImporter

In [14]:
# Read the taxonomy tree from its JSON export into `tax`.
importer = JsonImporter()

# State the encoding explicitly: without it open() falls back to the platform
# locale, so non-ASCII topic labels could be decoded differently per machine.
with open('ds_taxonomy.json', 'r', encoding='utf-8') as f:
    tax = importer.read(f)

In [None]:
# Show the loaded taxonomy using `print_tree` imported from `taxonomy`
# (its output format is not visible here).
# NOTE(review): this cell has no execution count, so it was not run in the saved session.
print_tree(tax)

In [16]:
# Cluster/topic membership table. The first two spreadsheet columns become a
# two-level row index; later cells select level 0 by cluster id and read column 'u'.
clusters = pd.read_excel(
    'data/cluster_topics_u_nonzero.xlsx',
    index_col=[0, 1],
)

In [17]:
# Membership vector for cluster 5: keep only memberships above 0.1, rescale
# the survivors to unit Euclidean length, and store them as a {label: weight} dict.
cluster_memberships = (
    clusters.loc[5, 'u']
    .loc[lambda u: u > 0.1]  # drop weak (noise) memberships
    .pipe(lambda u: u / np.sqrt(u.pow(2).sum()))  # divide by the L2 norm
    .to_dict()
)
# Iterating a dict yields its keys, so this is the set of retained labels.
cluster_elements = set(cluster_memberships)

In [18]:
# Build the lifter over the loaded taxonomy. `lmbda` and `gamma` are handed
# straight to pgfs.PGFS; their meaning is defined in that module (not visible here).
lifter = pgfs.PGFS(tax, lmbda=0.1, gamma=0.9)

# Lift the cluster's {label: weight} dict. The result exposes `H`, `L` and `p`,
# which the reporting cell further down reads.
lifted = lifter.lift(cluster_memberships)

In [19]:
# Display the set of topic labels retained for this cluster.
cluster_elements

{'2.1.1.1. -- Bayesian networks',
 '2.1.1.2. -- Markov networks',
 '2.1.1.6. -- Causal networks',
 '3.1.1.3.2. -- Network data models',
 '3.1.3.9.2. -- Deadlocks',
 '3.4.7.2.1. -- Image search',
 '3.4.7.3.2. -- Desktop search',
 '5.1.2.2. -- Semantic networks',
 '5.2.3.13.1. -- Deep belief networks',
 '5.2.3.3.3.1 -- Rule-based netwok archirtecture',
 '5.2.3.5.6. -- Bayesian network models',
 '5.2.3.5.7. -- Markov network models'}

In [20]:
# Split the lifting result into report sections. `lifted.H` is combined with
# `cluster_elements` through set operators, so it is treated as a set of labels.
# Built-in sorted() / str.join() replace the `as_list` and `concat` pipe helpers,
# which were dropped from the `pipe` package in its 2.x line; output is unchanged.
gaps = sorted(lifted.L)
heads = sorted(lifted.H - cluster_elements)  # members of H that are not cluster topics
offs = sorted(cluster_elements & lifted.H)   # cluster topics that are themselves in H
clust = [(i, round(j, 3)) for i, j in cluster_memberships.items()]

# One item per line; str() is applied so the (label, weight) tuples in `clust`
# are rendered the same way as before.
gaps_str = '\n'.join(map(str, gaps))
heads_str = '\n'.join(map(str, heads))
offs_str = '\n'.join(map(str, offs))
clust_str = '\n'.join(map(str, clust))


print(f'Cluster elements: \n{clust_str}\n\n'
      f'Head subjects: \n{heads_str}\n\n'
      f'Offshoots: \n{offs_str}\n\n'
      f'Gaps: \n{gaps_str}\n\n'
      f'p = {lifted.p}')
      

Cluster elements: 
('2.1.1.2. -- Markov networks', 0.456)
('2.1.1.6. -- Causal networks', 0.454)
('5.2.3.13.1. -- Deep belief networks', 0.418)
('5.1.2.2. -- Semantic networks', 0.365)
('2.1.1.1. -- Bayesian networks', 0.347)
('5.2.3.5.7. -- Markov network models', 0.176)
('5.2.3.5.6. -- Bayesian network models', 0.172)
('5.2.3.3.3.1 -- Rule-based netwok archirtecture', 0.168)
('3.4.7.2.1. -- Image search', 0.14)
('3.1.1.3.2. -- Network data models', 0.137)
('3.4.7.3.2. -- Desktop search', 0.134)
('3.1.3.9.2. -- Deadlocks', 0.113)

Head subjects: 
2.1.1. -- Probabilistic representations

Offshoots: 
3.1.1.3.2. -- Network data models
3.1.3.9.2. -- Deadlocks
3.4.7.2.1. -- Image search
3.4.7.3.2. -- Desktop search
5.1.2.2. -- Semantic networks
5.2.3.13.1. -- Deep belief networks
5.2.3.3.3.1 -- Rule-based netwok archirtecture
5.2.3.5.6. -- Bayesian network models
5.2.3.5.7. -- Markov network models

Gaps: 
2.1.1.3. -- Factor graphs
2.1.1.4. -- Decision diagrams
2.1.1.5. -- Equational model

---

Batch: lift every cluster with the same settings

In [21]:
# Re-create the lifter for the batch run, with the same taxonomy and the same
# lmbda/gamma values as the single-cluster example earlier in the notebook.
# NOTE(review): whether PGFS keeps state between lift() calls is not visible here.
lifter = pgfs.PGFS(tax, lmbda=0.1, gamma=0.9)

In [22]:
# Lift every cluster and collect (a) one results table per cluster and
# (b) the raw lifted objects, in cluster-id order.
lifting_frames = []  # per-cluster DataFrames; concatenated into `lifting_df` after the loop
lifted_trees = []

for cluster_id in clusters.index.levels[0]:

    # Same preprocessing as the single-cluster example: keep memberships above
    # 0.1 and rescale them to unit Euclidean length.
    cluster_memberships = (
        clusters.loc[cluster_id, 'u']
        .loc[lambda x: x > 0.1] # filter noise
        .pipe(lambda x: x / np.sqrt((x ** 2).sum())) # normalize
        .to_dict()
    )
    cluster_elements = set(cluster_memberships)

    lifted = lifter.lift(cluster_memberships)

    # Built-in sorted() replaces the `as_list | sort` pipe chain (`as_list` is
    # no longer shipped by pipe 2.x) and makes the set-operator grouping explicit.
    gaps = sorted(lifted.L)
    heads = sorted(lifted.H - cluster_elements)
    offs = sorted(cluster_elements & lifted.H)
    clust = [(i, round(j, 3)) for i, j in cluster_memberships.items()]

    # The four lists generally differ in length; concatenating them as columns
    # aligns on the positional index and pads the shorter ones with NaN.
    df = (
        pd.concat((
            pd.Series(clust, name='topics_and_memberships'),
            pd.Series(heads, name='head_subjects'),
            pd.Series(gaps, name='gaps'),
            pd.Series(offs, name='offshoots')), axis=1)
        .assign(cluster_id=cluster_id)
        .set_index('cluster_id', append=True)
        .reorder_levels([1,0])  # index becomes (cluster_id, row position)
    )

    lifting_frames.append(df)
    lifted_trees.append(lifted)

# Single concat after the loop; `lifting_df` is only ever bound to a DataFrame.
lifting_df = pd.concat(lifting_frames)


In [23]:
# Write the combined per-cluster table to Excel, keeping its row index in the sheet.
lifting_df.to_excel('data/lifting_results.xlsx', index=True)


In [24]:
# Persist the list of lifted results with joblib. joblib.dump returns the list
# of file names it wrote, which is what this cell displays.
# NOTE(review): the file is pickle-based; only load it back from a trusted source.
joblib.dump(lifted_trees, 'data/lifted_trees.pkl')

['data/lifted_trees.pkl']