In [10]:
from lib import *

---


In [14]:
# Read the taxonomy spreadsheet without treating any row as a header, name the
# first column 'level', and keep positional rows 3..458 (456 rows in total).
_taxonomy_sheet = pd.read_excel('acm_css_taxonomy_modified.xlsx', header=None)
_taxonomy_sheet = _taxonomy_sheet.rename(columns={0: 'level'})
tax_raw = _taxonomy_sheet.iloc[3:459]

In [15]:
# Sanity check: the iloc[3:459] slice should leave 456 rows.
tax_raw.shape

(456, 11)

---

In [16]:
# Concatenate columns 1..6 of every row (missing cells count as ''), trim and
# lower-case the result, then de-duplicate into a set of topic labels.
_label_cells = tax_raw.iloc[:, 1:7].fillna('')
_row_labels = _label_cells.sum(axis=1).str.strip().str.lower()
topics = set(_row_labels)

In [17]:
# Number of distinct (trimmed, lower-cased) topic labels.
len(topics)

437

---

In [18]:
# Parse the raw sheet into one tidy row per taxonomy entry:
#   label - text of columns 1..6 concatenated and stripped
#   new   - True when the raw level string contains a '*' marker
#   level - the level string with every '*' removed
#   lvls  - the purely numeric components of the dot-separated level string
#   depth - number of numeric components in `lvls`
# Rows whose label is empty are dropped.
ds_tax = (
    tax_raw
    .fillna('')
    .assign(
        label=lambda x: x.iloc[:, 1:7].sum(axis=1).str.strip(),
        # assign() evaluates keyword arguments in order, so `new` still sees
        # the raw level string before `level` is cleaned on the next line.
        # '*' is matched literally (regex=False): as a regular expression a
        # bare '*' is a dangling quantifier, and '\*' in a normal string
        # literal is an invalid escape sequence.
        new=lambda x: x['level'].str.contains('*', regex=False),
        level=lambda x: x['level'].str.replace('*', '', regex=False),
    )
    .assign(
        lvls=lambda x: x['level'].str.split('.')
                                 .apply(lambda parts: [p for p in parts if p.isdecimal()]))
    .assign(
        depth=lambda x: x['lvls'].apply(len)
    )
    .loc[lambda x: x['label'] != '', ['level', 'label', 'depth', 'lvls', 'new']]
)

In [19]:
# Preview the first parsed rows.
ds_tax.head()

Unnamed: 0,level,label,depth,lvls,new
3,1. + 0,Theory of computation,1,[1],False
4,1.1.,Theory and algorithms for application domains,2,"[1, 1]",False
5,1.1.1.,Machine learning theory,3,"[1, 1, 1]",False
6,1.1.1.1.,Sample complexity and generalization bounds,4,"[1, 1, 1, 1]",False
7,1.1.1.2.,Boolean function learning,4,"[1, 1, 1, 2]",False


In [24]:
# Build the taxonomy tree from the rows of ds_tax, which are expected to be in
# outline (parent-before-children) order.
#
# `ancestors[d]` holds the most recently created node at depth d (the root is
# depth 0), so a row of depth d always attaches to `ancestors[d - 1]`.
root = anytree.Node('root')

nodes = [root]       # every node in row order, root first
ancestors = [root]   # current path from the root down to the last node
for row_idx, s in ds_tax.iterrows():
    curr_depth = s['depth']

    # A row without any numeric level component has depth 0 and cannot be
    # placed under the root; fail loudly instead of silently detaching it
    # (and everything after it) from the tree.
    if curr_depth < 1:
        raise ValueError(
            'row {!r} has no numeric level component: {!r}'.format(row_idx, s['level'])
        )

    # The outline may only descend one level at a time.
    if curr_depth > len(ancestors):
        raise ValueError(
            'row {!r} ({!r}) jumps from depth {} to depth {}'.format(
                row_idx, s['level'], len(ancestors) - 1, curr_depth)
        )

    # Drop the part of the path that is at or below the new node's depth;
    # what remains ends with the new node's parent.
    del ancestors[curr_depth:]

    n = anytree.Node(
        s['level'] + ' -- ' + s['label'],
        raw_name=s['label'],
        parent=ancestors[-1],
    )
    ancestors.append(n)
    nodes.append(n)




In [25]:
# for pre, fill, node in anytree.RenderTree(root):
#     print(pre, node.name)


In [26]:
from anytree.exporter import JsonExporter
# JSON serializer for the tree; indent=2 is passed through for pretty-printing.
exporter = JsonExporter(indent=2)

In [27]:
# Serialize the whole tree under `root` to JSON text and save it to disk.
_tree_json = exporter.export(root)
with open('ds_taxonomy.json', 'w') as f:
    f.write(_tree_json)

In [28]:
# Raw (unprefixed) label of every childless node, visited in level order.
leaves = [
    leaf_node.raw_name
    for leaf_node in anytree.LevelOrderIter(
        root, filter_=lambda candidate: candidate.is_leaf
    )
]

In [29]:
# Number of leaf nodes found in the tree.
len(leaves)

353

In [30]:
# Flag leaf rows positionally instead of by label text: labels are not unique
# across the taxonomy, so matching on the label would also mark an internal
# node as a leaf whenever a leaf elsewhere carries the same name.
# Rows are in outline order, so a row has children exactly when the next row
# is deeper; the last row (no successor, filled with depth 0) is always a leaf.
ds_tax['isleaf'] = ds_tax['depth'].shift(-1).fillna(0) <= ds_tax['depth']

In [39]:
# Replace each label with '<level> -- <label>'.
# NOTE: not idempotent - running this cell again prefixes the level a second
# time, because it reads and overwrites the same `label` column.
_prefixed_label = ds_tax['level'] + ' -- ' + ds_tax['label']
ds_tax = ds_tax.assign(label=_prefixed_label)

In [40]:
# Write the parsed taxonomy to CSV, omitting the DataFrame index.
ds_tax.to_csv('data/ds_taxonomy_parsed_full.csv', index=False)

---

In [41]:
# Number of leaf rows flagged as new (each True counts as 1 in the sum).
ds_tax.loc[lambda x: x.isleaf, 'new'].sum()

60

In [42]:
# Number of rows flagged as new, leaf or not.
ds_tax.loc[:,  'new'].sum()

68

In [43]:
# Final row/column count of the parsed taxonomy.
ds_tax.shape

(456, 6)

---

In [32]:
from pipe import *

In [33]:
# Print a LaTeX table of the two top taxonomy levels. Labels are indented
# with four '~' per depth level; escape=False keeps those characters
# unescaped in the generated LaTeX.
(ds_tax.loc[lambda x: x['depth'] <= 2, ['level', 'label', 'depth']]
.assign(
    # Strip spaces and the literal '+0' suffix from the level code. Both are
    # literal substitutions (regex=False): '+' is a regex metacharacter, and
    # '\+0' in a normal string literal is an invalid escape sequence that
    # only matches when the pattern is interpreted as a regular expression.
    level=lambda x: x['level'].str.replace(' ', '', regex=False)
                              .str.replace('+0', '', regex=False),
    label=lambda x: (x['depth'].apply(lambda y: '~~~~'*y) + x['label'])
)
 .loc[:, ['level', 'label']]
 .to_latex(index=False, escape=False)

) | stdout

\begin{tabular}{ll}
\toprule
level &                                                  label \\
\midrule
 1. &  ~~~~Theory of computation \\
 1.1. &  ~~~~~~~~Theory and algorithms for application domains \\
 2. &  ~~~~Mathematics of computing \\
 2.1. &  ~~~~~~~~Probability and statistics \\
 3. &  ~~~~Information systems \\
 3.1. &  ~~~~~~~~Data management systems \\
 3.2. &  ~~~~~~~~Information systems applications \\
 3.3. &  ~~~~~~~~World Wide Web \\
 3.4. &  ~~~~~~~~Information retrieval \\
 4. &  ~~~~Human-centered computing \\
 4.1. &  ~~~~~~~~Visualization \\
 5. &  ~~~~Computing methodologies \\
 5.1. &  ~~~~~~~~Artificial intelligence \\
 5.2. &  ~~~~~~~~Machine learning \\
\bottomrule
\end{tabular}


In [162]:
# Print the whole taxonomy as a LaTeX longtable with one label column per
# depth: each row's label lands in the column of its own depth and the other
# depth columns are left empty.
(ds_tax.loc[:,  ['level', 'label', 'depth']]
.assign(
    # Strip spaces and the literal '+0' suffix from the level code. Both are
    # literal substitutions (regex=False): '+' is a regex metacharacter, and
    # '\+0' in a normal string literal is an invalid escape sequence that
    # only matches when the pattern is interpreted as a regular expression.
    level=lambda x: x['level'].str.replace(' ', '', regex=False)
                              .str.replace('+0', '', regex=False),
)
# Keep the original row index as the outermost index level so every row stays
# unique, then pivot `depth` into the columns.
.set_index(['level', 'depth'], append=True)
.unstack()
.fillna('')
 .reset_index()
 # 'level_0' is the original row index restored by reset_index(); not needed.
 .drop('level_0', axis=1)
 .to_latex(longtable=True, index=False)
 ) | stdout

\begin{longtable}{lllllll}
\toprule
        level & \multicolumn{6}{l}{label} \\
              &                         1 &                                              2 &                                             3 &                                                       4 &                                               5 &                                   6 \\
\midrule
\endhead
\midrule
\multicolumn{7}{r}{{Continued on next page}} \\
\midrule
\endfoot

\bottomrule
\endlastfoot
 1. &  Theory of computation &   &   &   &   &   \\
 1.1. &   &  Theory and algorithms for application domains &   &   &   &   \\
 1.1.1. &   &   &  Machine learning theory &   &   &   \\
 1.1.1.1. &   &   &   &  Sample complexity and generalization bounds &   &   \\
 1.1.1.2. &   &   &   &  Boolean function learning &   &   \\
 1.1.1.3. &   &   &   &  Unsupervised learning and clustering &   &   \\
 1.1.1.4. &   &   &   &  Kernel methods &   &   \\
 1.1.1.4.1. &   &   &   &   &  Support vector machines &  

---
Plot

In [34]:
from anytree import Resolver

In [72]:
# Look up a subtree by absolute path; each path segment is a node name of the
# form '<level> -- <label>'. Note that a later cell rebinds `node`.
node = Resolver().get(root, '/root/5.+0 -- Computing methodologies/5.1. -- Artificial intelligence/5.1.1. -- Natural language processing')

In [36]:
# Look up a subtree by absolute path; each path segment is a node name of the
# form '<level> -- <label>'. This rebinds `node`, replacing any earlier lookup.
node = Resolver().get(root, '/root/5.+0 -- Computing methodologies/5.1. -- Artificial intelligence/5.1.3. -- Computer vision')

In [37]:
# Direct children of the currently selected node.
node.children

(Node('/root/5.+0 -- Computing methodologies/5.1. -- Artificial intelligence/5.1.3. -- Computer vision/5.1.3.1. -- Computer vision problems', raw_name='Computer vision problems'),
 Node('/root/5.+0 -- Computing methodologies/5.1. -- Artificial intelligence/5.1.3. -- Computer vision/5.1.3.2. -- Computer vision representations', raw_name='Computer vision representations'))

In [38]:
from anytree.exporter import DotExporter
# Graphviz exporter for the subtree under the currently selected `node`.
dot = DotExporter(node)

In [74]:
# Write the Graphviz description of the selected subtree to disk.
dot.to_dotfile('tax_2.dot')

In [69]:
# Render the subtree graph to PDF with the Graphviz `dot` command-line tool.
!dot -Tpdf tax_2.dot -o tax_2.pdf


  rankdir="LR";
    node [ fontname="arial bold", fontsize=20];

In [58]:
# Render tax.dot to PDF with the Graphviz `dot` command-line tool.
# NOTE(review): no visible cell writes tax.dot - confirm where it comes from,
# otherwise this cell fails on a fresh run.
!dot -Tpdf tax.dot -o tax.pdf

---

In [77]:
# Depth of the root node (the reference point for the pruning threshold below).
root.depth

0

In [76]:
# Depth of the currently selected node, counted from the root.
node.depth

3

In [78]:
from anytree import LevelOrderIter

In [80]:
# Prune the tree IN PLACE so that only nodes of depth <= 2 stay attached to
# `root`. This is destructive: cells that need the deeper levels must run
# before this one, and the tree has to be rebuilt to get them back.
#
# The traversal is materialized with list() first, so nodes are not detached
# while the lazy level-order generator is still walking the tree.
for n in list(LevelOrderIter(root)):
    # `depth` is evaluated at visit time: once a depth-3 node is cut off, it
    # becomes the root of its own detached subtree.
    if n.depth > 2:
        n.parent = None

In [81]:

# Graphviz exporter for the (now pruned) tree under `root`; rebinds `dot`.
dot = DotExporter(root)

In [82]:
# Write the Graphviz description of the pruned tree to disk.
dot.to_dotfile('tax_2upper.dot')

In [86]:
# Render the pruned-tree graph to PDF with the Graphviz `dot` command-line tool.
!dot -Tpdf tax_2upper.dot -o tax_2upper.pdf