# Classification Trees

In [2]:
import pickle
import random
import numpy as np
from igraph import *
from scipy import stats
from scipy.special import cbrt
import sqlite3 as lite

### Create Classification Trees

#### Create Dewey Decimal Tree

* Build tree from raw text

In [4]:
# Build the Dewey Decimal tree: a root vertex 'Books' plus one vertex per
# non-blank line of dd.txt. Each line is "<code> <label words...>".
# Assumes a parent code appears in the file before its children, because the
# edge is added by vertex name as soon as the child is read.
T = Graph(directed=True)
T.add_vertex('Books', label='Books')
with open('../Classifications/dd.txt') as dd_file:  # closed even if parsing fails
    for line in dd_file:
        data = line.strip().split()
        if not data:
            # Blank line: nothing to add (data[0] would raise IndexError).
            continue
        code = data[0]
        T.add_vertex(code, label=' '.join(data[1:]))
        # A one-character code hangs off the root; a longer code hangs off the
        # code obtained by dropping its last character.
        parent = 'Books' if len(code) == 1 else code[:-1]
        T.add_edge(parent, code)

* Add science fields (optional)

In [None]:
# Disabled cell: everything below is a single string literal, so none of these
# statements run. If enabled, it would add two intermediate vertices
# ('5Physical' and '5Life') under vertex '5' and re-parent eight subject
# vertices, looked up by label, from '5' to one of them.
# NOTE(review): the label 'Life Scicences' below looks like a typo for
# 'Life Sciences' - fix it if this cell is ever re-enabled.
# NOTE(review): the kb/books/kr attributes set here are not set on any other
# vertex in the visible cells - confirm they are still needed.
'''
T.add_vertex('5Physical',label='Physical Sciences',kb=0,books=set(),kr=0)
T.add_vertex('5Life',label='Life Scicences',kb=0,books=set(),kr=0)
T.add_edge('5','5Physical')
T.add_edge('5','5Life')

v=T.vs.find(label='Astronomy')
T.delete_edges(('5',v['name']))
T.add_edge('5Physical',v['name'])

v=T.vs.find(label='Physics')
T.delete_edges(('5',v['name']))
T.add_edge('5Physical',v['name'])

v=T.vs.find(label='Chemistry')
T.delete_edges(('5',v['name']))
T.add_edge('5Physical',v['name'])

v=T.vs.find(label='Earth sciences & geology')
T.delete_edges(('5',v['name']))
T.add_edge('5Physical',v['name'])

v=T.vs.find(label='Fossils & prehistoric life')
T.delete_edges(('5',v['name']))
T.add_edge('5Life',v['name'])

v=T.vs.find(label='Life sciences')
T.delete_edges(('5',v['name']))
T.add_edge('5Life',v['name'])

v=T.vs.find(label='Plants')
T.delete_edges(('5',v['name']))
T.add_edge('5Life',v['name'])

v=T.vs.find(label='Zoological sciences/Animals')
T.delete_edges(('5',v['name']))
T.add_edge('5Life',v['name'])
'''

In [5]:
# Persist the Dewey Decimal tree; the `with` block guarantees the pickle is
# flushed and the handle closed before any later cell reads the file.
with open('../Classifications/dd_tree.pkl', 'wb') as pkl_file:
    pickle.dump(T, pkl_file)

#### Create the Library of Congress Tree

* Build tree from raw text

In [6]:
# Build the Library of Congress tree from the outline in loc_fix.txt.
# "CLASS <code> - <label...>" lines become children of the root 'Books'.
# Other entries are placed by the number of tab characters on their line
# (1 to 4); lines with any other tab count are ignored.
T = Graph(directed=True)
T.add_vertex('Books', label='Books')
# Most recently added vertex at each outline depth; an entry with k tabs is
# attached to the latest entry that had k-1 tabs.
level2 = ''
level3 = ''
level4 = ''
with open('../Classifications/loc_fix.txt') as infile:  # closed even on error
    for line in infile:
        if not line.strip():
            continue
        if line[0] == '?':
            continue
        data = line.strip().split()
        if data[0] in ('LIBRARY', 'Subclass'):
            # Header lines carry no classification entry.
            continue
        if data[0] == 'CLASS':
            # data[2] is skipped: presumably a separator token between the
            # class code and its label - TODO confirm against loc_fix.txt.
            T.add_vertex(data[1], label=' '.join(data[3:]))
            T.add_edge('Books', data[1])
            continue
        if line[0] in (' ', '\t'):
            # Lines starting with whitespace are not outline entries.
            continue

        depth = line.count('\t')
        if depth == 1:
            # Entries starting with 'F' are filed under class 'E'; presumably
            # E and F share a single class vertex - TODO confirm.
            parent = 'E' if line[0] == 'F' else line[0]
        elif depth == 2:
            parent = level2
        elif depth == 3:
            parent = level3
        elif depth == 4:
            parent = level4
        else:
            # Deeper (or tab-less) lines are intentionally left out of the tree.
            continue

        T.add_vertex(data[0], label=' '.join(data[1:]))
        T.add_edge(parent, data[0])
        # Remember this entry as the attachment point for the next depth.
        if depth == 1:
            level2 = data[0]
        elif depth == 2:
            level3 = data[0]
        elif depth == 3:
            level4 = data[0]

# Replace the '\x1e' control character found in some labels with a hyphen.
for t in T.vs:
    t['label'] = t['label'].replace('\x1e', '-')

In [7]:
# Persist the Library of Congress tree; the `with` block guarantees the pickle
# is flushed and the handle closed before any later cell reads the file.
with open('../Classifications/loc_tree.pkl', 'wb') as pkl_file:
    pickle.dump(T, pkl_file)

#### Create Amazon Tree

* Build tree from raw data

In [10]:
# Collect every distinct category path (columns c1..c10) assigned to a book in
# the Amazon database.
amazon_db = lite.connect("amazon.db")
try:
    amazon_db.text_factory = str
    cur = amazon_db.cursor()
    cur.execute("select id, isbn, c1,c2,c3,c4,c5,c6,c7,c8,c9,c10 from unique_book_attribute")
    book_entries = cur.fetchall()
finally:
    # All rows are already in memory, so release the database handle even if
    # the query fails.
    amazon_db.close()

topics = set()
for entry in book_entries:
    topics.update(entry[2:12])  # skip id and isbn; keep c1..c10
# Drop the placeholders used for unused category slots.
topics.discard('')
topics.discard(None)

# Build the Amazon category tree. Each topic is a '>>'-separated path; every
# prefix of the path becomes a vertex named by the full prefix and labelled
# with its last component, linked to the vertex of the next-shorter prefix.
T = Graph(directed=True)
nodes = set()  # vertex names already added, to avoid duplicates
for key in topics:
    parts = key.strip().split('>>')
    # range() instead of xrange() so the cell runs on Python 2 and Python 3.
    for i in range(len(parts)):
        name = '>>'.join(parts[:i + 1])
        if name in nodes:
            continue
        nodes.add(name)
        T.add_vertex(name, label=parts[i])
        if i > 0:
            # The parent prefix was added on an earlier iteration or topic.
            T.add_edge('>>'.join(parts[:i]), name)

* Add fiction and nonfiction nodes (not necessary)

In [None]:
# Disabled cell: everything below is a single string literal, so none of these
# statements run. If enabled, it would add 'Books>>Fiction' and
# 'Books>>NonFiction' vertices under 'Books', move every neighbor of vertex 0
# except Calendars, Science&Math and Politics&SocialSciences under NonFiction,
# and then re-parent the fiction-related categories (and selected
# sub-categories, via new intermediate vertices) under Fiction.
# NOTE(review): T.vs[0] is assumed to be the 'Books' root - confirm before
# re-enabling, since vertex order follows set iteration over the topics.
'''
T.add_vertex('Books>>Fiction',label='Fiction')
T.add_vertex('Books>>NonFiction',label='NonFiction')
for u in T.vs[0].neighbors():
    if u['name']=="Books>>Calendars" or u['name']=="Books>>Science&Math" or u['name']=="Books>>Politics&SocialSciences":
        pass
    else:
        T.delete_edges(('Books',u['name']))
        T.add_edge('Books>>NonFiction',u['name'])

T.add_edge('Books','Books>>Fiction')
T.add_edge('Books','Books>>NonFiction')
v=T.vs.find('Books>>Religion&Spirituality')
T.add_vertex(name='Books>>Fiction>>Religion&Spirituality',label=v['label'])
T.add_edge('Books>>Fiction','Books>>Fiction>>Religion&Spirituality')
T.delete_edges(('Books>>Religion&Spirituality','Books>>Religion&Spirituality>>Fiction'))
T.add_edge('Books>>Fiction>>Religion&Spirituality','Books>>Religion&Spirituality>>Fiction')

v=T.vs.find("Books>>Children'sBooks")
T.add_vertex(name="Books>>Fiction>>Children'sBooks",label=v['label'])
T.add_edge('Books>>Fiction',"Books>>Fiction>>Children'sBooks")
T.delete_edges(("Books>>Children'sBooks","Books>>Children'sBooks>>Literature&Fiction"))
T.add_edge("Books>>Fiction>>Children'sBooks","Books>>Children'sBooks>>Literature&Fiction")
T.delete_edges(("Books>>Children'sBooks","Books>>Children'sBooks>>FairyTales,FolkTales&Myths"))
T.add_edge("Books>>Fiction>>Children'sBooks","Books>>Children'sBooks>>FairyTales,FolkTales&Myths")
T.delete_edges(("Books>>Children'sBooks","Books>>Children'sBooks>>ScienceFiction&Fantasy"))
T.add_edge("Books>>Fiction>>Children'sBooks","Books>>Children'sBooks>>ScienceFiction&Fantasy")

T.delete_edges(("Books>>NonFiction","Books>>Literature&Fiction"))
T.add_edge('Books>>Fiction',"Books>>Literature&Fiction")

v=T.vs.find("Books>>Teens")
T.add_vertex(name="Books>>Fiction>>Teens",label=v['label'])
T.add_edge('Books>>Fiction',"Books>>Fiction>>Teens")
T.delete_edges(("Books>>Teens","Books>>Teens>>ScienceFiction&Fantasy"))
T.add_edge('Books>>Fiction>>Teens',"Books>>Teens>>ScienceFiction&Fantasy")
T.delete_edges(("Books>>Teens","Books>>Teens>>Literature&Fiction"))
T.add_edge('Books>>Fiction>>Teens',"Books>>Teens>>Literature&Fiction")

v=T.vs.find("Books>>ChristianBooks&Bibles")
T.add_vertex(name="Books>>Fiction>>ChristianBooks&Bibles",label=v['label'])
T.add_edge('Books>>Fiction',"Books>>Fiction>>ChristianBooks&Bibles")
T.delete_edges(("Books>>ChristianBooks&Bibles","Books>>ChristianBooks&Bibles>>Literature&Fiction"))
T.add_edge('Books>>Fiction>>ChristianBooks&Bibles',"Books>>ChristianBooks&Bibles>>Literature&Fiction")

v=T.vs.find("Books>>Gay&Lesbian")
T.add_vertex(name="Books>>Fiction>>Gay&Lesbian",label=v['label'])
T.add_edge('Books>>Fiction',"Books>>Fiction>>Gay&Lesbian")
T.delete_edges(("Books>>Gay&Lesbian","Books>>Gay&Lesbian>>Literature&Fiction"))
T.add_edge('Books>>Fiction>>Gay&Lesbian',"Books>>Gay&Lesbian>>Literature&Fiction")

T.delete_edges(("Books>>NonFiction","Books>>Mystery,Thriller&Suspense"))
T.add_edge('Books>>Fiction',"Books>>Mystery,Thriller&Suspense")
T.delete_edges(("Books>>NonFiction","Books>>Comics&GraphicNovels"))
T.add_edge('Books>>Fiction',"Books>>Comics&GraphicNovels")
T.delete_edges(("Books>>NonFiction","Books>>Romance"))
T.add_edge('Books>>Fiction',"Books>>Romance")
T.delete_edges(("Books>>NonFiction","Books>>ScienceFiction&Fantasy"))
T.add_edge('Books>>Fiction',"Books>>ScienceFiction&Fantasy")
'''

In [11]:
# Persist the Amazon tree; the `with` block guarantees the pickle is flushed
# and the handle closed before any later cell reads the file.
with open('../Classifications/amazon_tree.pkl', 'wb') as pkl_file:
    pickle.dump(T, pkl_file)

#### Create JSON files from the trees

In [12]:
def makeJSON(T,filename):
    """Write the tree rooted at vertex 0 of ``T`` to ``filename`` as nested JSON.

    Each vertex becomes ``{ "name" : <label> }``; a vertex with outgoing edges
    also gets a ``"children"`` list holding its out-neighbors, written
    recursively and indented with one tab per depth level.

    Parameters
    ----------
    T : igraph.Graph
        Directed graph whose vertices carry a ``label`` attribute. Vertex 0 is
        used as the root.
    filename : str
        Path of the JSON file to create (overwritten if it exists).

    Notes
    -----
    Labels are serialized with ``json.dumps`` so that quotes, backslashes and
    control characters cannot produce invalid JSON. Plain ASCII labels are
    written exactly as before; non-ASCII characters are emitted as ``\\uXXXX``
    escapes.
    """
    import json  # local import keeps this cell self-contained

    def traverse(node,depth):
        indent = depth * '\t'
        name = json.dumps(T.vs[node]['label'])
        # The string mode avoids relying on the OUT constant from the igraph
        # star import; query the neighbors once and reuse the list.
        children = T.neighbors(node, 'out')
        if not children:
            outfile.write(indent + '{{ "name" : {} }}'.format(name))
            return
        outfile.write(indent + '{{ "name" : {}, "children" : [\n'.format(name))
        for position, child in enumerate(children):
            if position:
                # Separator goes between siblings only, never after the last.
                outfile.write(',\n')
            traverse(child, depth + 1)
        outfile.write('\n')
        outfile.write(indent + '] }')

    with open(filename,'w') as outfile:
        traverse(0,0)

In [14]:
# Export each pickled tree as a nested JSON file in the working directory.
# The pickles were written in binary mode, so they are read back with 'rb'.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# the cells above.
for tree_path, json_path in [
    ('../Classifications/dd_tree.pkl', 'dd.json'),
    ('../Classifications/amazon_tree.pkl', 'amazon.json'),
    ('../Classifications/loc_tree.pkl', 'loc.json'),
]:
    with open(tree_path, 'rb') as pkl_file:
        tree = pickle.load(pkl_file)
    makeJSON(tree, json_path)