In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import json
import datetime
from dotenv import dotenv_values
from neomodel import (config, db, StructuredNode, StructuredRel,StringProperty, IntegerProperty, UniqueIdProperty, DateProperty, DateTimeProperty,RelationshipTo, RelationshipFrom, Q, Traversal, OUTGOING, INCOMING, EITHER)

In [3]:
env = dotenv_values("../.env")
#-----Neo4j database----- #
config.DATABASE_URL = env["NEO4J_BOLT_URL"]

class SubclassRelationship(StructuredRel):
    """
    A very simple relationship between two BasePersons that simply
    records the date at which an acquaintance was established.
    """
    on_date = DateProperty(default = datetime.datetime.now)

class SameasRelationship(StructuredRel):
    """
    A very simple relationship between two BasePersons that simply
    records the date at which an acquaintance was established.
    """
    on_date = DateProperty(default = datetime.datetime.now)

class DeweyDomain(StructuredNode):
    _id = UniqueIdProperty()
    meta_id = IntegerProperty(index=True,default=0)
    classe =  StringProperty(unique_index=True,required=True)
    name = StringProperty(unique_index=True,required=True)
    dewey_has_subclass = RelationshipTo("DeweyDomain", "HAS_SUBCLASS", model = SubclassRelationship)
    dewey_sameas_hal = RelationshipFrom("HalDomain", "SAME_AS", model = SameasRelationship)
    dewey_sameas_bso = RelationshipFrom("BsoDomain", "SAME_AS", model = SameasRelationship)
    def to_json(self):
        return {
        "_id": self._id,
        "meta_id": self.meta_id,
        "classe": self.classe,
        "name": self.name,
        "group": "dewey"
      }
    def post_save(self):
        """
        The hook methods pre_save and post_save are available on StructuredRel models. They are executed when calling save on the object directly or when creating a new relationship via connect.
        """
        print(self)

dewey_all_definition = dict(node_class=DeweyDomain, direction=EITHER,
                  relation_type=None, model=None)
dewey_sub_definition = dict(node_class=DeweyDomain, direction=OUTGOING,
                  relation_type='HAS_SUBCLASS', model=SubclassRelationship)
dewey_sameas_definition = dict(node_class=DeweyDomain, direction=INCOMING,
                  relation_type='SAME_AS', model=SameasRelationship)

class BsoDomain(StructuredNode):
    _id = UniqueIdProperty()
    meta_id = IntegerProperty(index=True,default=0)
    name = StringProperty(unique_index=True,required=True)
    bso_sameas_dewey = RelationshipTo("DeweyDomain", "SAME_AS", model = SameasRelationship)
    def to_json(self):
        return {
        "_id": self._id,
        "meta_id": self.meta_id,
        "name": self.name,
        "group": "hal"
      }

class HalDomain(StructuredNode):
    _id = UniqueIdProperty()
    meta_id = IntegerProperty(index=True,default=0)
    docid = StringProperty(unique_index=True,required=True)
    name = StringProperty(unique_index=True,required=True)
    code = StringProperty(unique_index=True,required=True)
    level = IntegerProperty(index=True,required=True)
    parent_id = IntegerProperty(index=True)
    hal_has_subclass = RelationshipTo("HalDomain", "HAS_SUBCLASS", model = SubclassRelationship)
    hal_sameas_dewey = RelationshipTo("DeweyDomain", "SAME_AS", model = SameasRelationship)
    def to_json(self):
        return {
        "_id": self._id,
        "meta_id": self.meta_id,
        "docid": self.docid,
        "name": self.name,
        "code": self.code,
        "level": self.level,
        "parent_id": self.parent_id,
        "group": "bso"
      }

hal_all_definition = dict(node_class=HalDomain, direction=EITHER,
                  relation_type=None, model=None)
hal_sub_definition = dict(node_class=HalDomain, direction=OUTGOING,
                  relation_type='HAS_SUBCLASS', model=SubclassRelationship)
hal_sameas_definition = dict(node_class=HalDomain, direction=OUTGOING,
                  relation_type='SAME_AS', model=SameasRelationship)

# Fonctions de récupération des données des 3 référentiels Dewey, Hal et BSO

## Dewey

### Source temporaire thèses Abes (Dewey partielle)

In [43]:
def  scrapping_oai_sets_dewey():
    url = "https://www.theses.fr/schemas/tef/recommandation/oai_sets.html"
    resp = requests.get(url).text  # ou f = http.request('GET', url).data
    soup = BeautifulSoup(resp, features="lxml")
    oai_list = []
    for row in soup.findAll("table")[0].findAll("tr"):
        label = re.sub('<!--.*-->|\r|\n', '', str(row.findAll("td")[0].get_text(strip=True)), flags=re.DOTALL)
        label = re.sub('\s{2,}|&nbsp;', ' ', label)
        oai_list.append(
            {
                "label": label,
                "code": row.findAll("td")[1].get_text(strip=True),
            }
        )
    df = pd.DataFrame(oai_list[1:])
    df.to_csv("data/dewey_abes.csv", sep=",", index=False, encoding='utf8')

scrapping_oai_sets_dewey()

### Wikipedia (Dewey complète)

In [4]:
main_classes = ['000','100','200','300','400','500','600','700','800','900']

In [5]:
def scrapping_wikipedia():
    url = "https://fr.wikipedia.org/wiki/Liste_des_classes_de_la_Classification_d%C3%A9cimale_de_Dewey"
    resp = requests.get(url).text  # ou f = http.request('GET', url).data
    soup = BeautifulSoup(resp, features="lxml")
    class_list = []
    for row in soup.findAll("span", {"class": "mw-headline"}):
        s = row.get_text(strip=True)
        if s[0:6] == "Classe":
            class_list.append(
            {
                "code": s[6:10].lstrip(),
                "label": s[12:].lstrip(),
            }
            )
    for row in soup.findAll("b")[1:]:
        s = row.get_text(strip=True)
        if s[0:3] not in main_classes:
            class_list.append(
            {
                "code": s[0:3].lstrip(),
                "label": s[3:].lstrip(),
            }
            )
    df = pd.DataFrame(class_list)
    df.to_csv("data/dewey_wikipedia.csv", sep=",", index=False, encoding='utf8')
            
scrapping_wikipedia()

### Source Hal

In [None]:
def get_hal_domains():
    """
    
    """
    url = 'http://api.archives-ouvertes.fr/ref/domain/?q=*:*'
    params = {'fl': 'docid,code_s,level_i,parent_i,label_s','rows': 400}
    resp = requests.get(url,params=params).text
    result = []
    if json.loads(resp)['response']['docs']:
        data = json.loads(resp)['response']['docs']
        for node in data:
            if not('parent_i' in node):
                parent_id = 0
            else:
                parent_id = node['parent_i']
            print(node)
            result.append({"docid": node['docid'], "level": node['level_i'],"parent_id": parent_id, "code": node['code_s'], "label": node['label_s']})
    else:
        pass
    # dedup in case of duplicate relatioships
    df = pd.DataFrame(result)
    df.to_csv("data/hal_domains.csv", sep=",", index=False, encoding='utf8')

get_hal_domains()

### Source BSO

Harvest done in a Colab Notebook [https://colab.research.google.com/drive/1uwS7CVt8pUhq8be-QpVS2qSE8s5YaZSA](https://colab.research.google.com/drive/1uwS7CVt8pUhq8be-QpVS2qSE8s5YaZSA).

The resulting bso_classification categories are manually reported in a dict :

In [12]:
dict_bso_classification = [{'label': 'Medical research'},{'label': 'Biology (fond.)'},{'label': 'Social sciences'},{'label': 'Earth, Ecology, Energy and applied biology'},
                           {'label': 'Chemistry'},{'label': 'Physical sciences, Astronomy'},{'label': 'Computer and information sciences'},{'label': 'Humanities'},{'label': 'Engineering'},{'label': 'Mathematics'}]

# Populate the db with Dewey classes

## Dewey Referentiel data

### Load data

In [6]:
dict_dewey = pd.read_csv("data/dewey_wikipedia.csv",sep=",",encoding="utf-8", dtype={"code": str, "label": str}).to_dict(orient='records')
main_classes = ['000','100','200','300','400','500','600','700','800','900']

### Create nodes

In [7]:
for rows in dict_dewey:
    DeweyDomain(classe=str(rows['code']), name=str(rows['label'])).save()

{'meta_id': 0, 'classe': '000', 'name': 'Informatique, information, ouvrages généraux', 'id': 0}
{'meta_id': 0, 'classe': '100', 'name': 'Philosophie,ParapsychologieetOccultisme,Psychologie', 'id': 1}
{'meta_id': 0, 'classe': '200', 'name': 'Religions', 'id': 2}
{'meta_id': 0, 'classe': '300', 'name': 'Sciences sociales', 'id': 3}
{'meta_id': 0, 'classe': '400', 'name': 'Langues', 'id': 4}
{'meta_id': 0, 'classe': '500', 'name': 'Sciencesde la nature etMathématiques', 'id': 5}
{'meta_id': 0, 'classe': '600', 'name': 'Technologie (Sciences appliquées)', 'id': 6}
{'meta_id': 0, 'classe': '700', 'name': 'Arts,LoisirsetSports', 'id': 7}
{'meta_id': 0, 'classe': '800', 'name': 'Littérature(Belles-Lettres) et techniques d’écriture', 'id': 8}
{'meta_id': 0, 'classe': '900', 'name': 'Géographie,Histoireet disciplines auxiliaires', 'id': 9}
{'meta_id': 0, 'classe': '010', 'name': 'Bibliographies', 'id': 10}
{'meta_id': 0, 'classe': '020', 'name': "Sciences de l'information et des bibliothèques"

### Create subclass relationships

In [8]:
for c in main_classes:
    main = DeweyDomain.nodes.get(classe=c)
    sub = DeweyDomain.nodes.filter(Q(classe__startswith = main.classe[0:1]), Q(classe__ne = main.classe))
    for x in sub:
        main.dewey_has_subclass.connect(x)

## Hal referentiel data

### Load data

In [9]:
dict_hal = pd.read_csv("data/hal_domains.csv",sep=",",encoding="utf-8").to_dict(orient='records')

### Create nodes

In [10]:
for rows in dict_hal:
    HalDomain(docid=rows['docid'], name=rows['label'].split("=")[1],code=rows['code'],level=rows["level"],parent_id=rows['parent_id']).save()

### Create relationships

In [11]:
sub_levels_all = HalDomain.nodes.filter(level__ne=0)
for x in sub_levels_all:
    current_level =  HalDomain.nodes.get(docid=x.docid)
    top_level = HalDomain.nodes.get(docid=x.parent_id)
    top_level.hal_has_subclass.connect(current_level)

## BSO referentiel

In [13]:
for rows in dict_bso_classification:
    b = BsoDomain(name=rows['label']).save()   

## Add a meta_id to every node to be requested bu the neomodel libraries's methods

In [14]:
Z = db.cypher_query("match (n) set n.meta_id = id(n) return n")

 # Resulting graph

!["neo4j_screenshot"](../static/img/neo4j_screenshot.png)

# Graph request examples

## Get a specific node by id

In [14]:
dewey_node = DeweyDomain.nodes.get(meta_id=5)
dewey_node

<DeweyDomain: {'meta_id': 5, 'classe': '500', 'name': 'Sciencesde la nature etMathématiques', 'id': 5}>

## Traversal HAS_SUBCLASS from dewey nodes

In [None]:
dewey_sub_definition = dict(node_class=DeweyDomain, direction=OUTGOING,
                  relation_type='HAS_SUBCLASS', model=SubclassRelationship)

### Starting from a specific node

In [4]:
dewey_node_700 = DeweyDomain.nodes.get(classe=700)
relations_traversal = Traversal(dewey_node_700, DeweyDomain.__label__,
                                dewey_sub_definition)
all_700_relations = relations_traversal.all()
print(all_700_relations)

[<DeweyDomain: {'id': 77, 'meta_id': 77, 'classe': '796', 'name': 'Sport'}>, <DeweyDomain: {'id': 76, 'meta_id': 76, 'classe': '790', 'name': 'Arts du spectacle, loisirs'}>, <DeweyDomain: {'id': 75, 'meta_id': 75, 'classe': '780', 'name': 'Musique'}>, <DeweyDomain: {'id': 74, 'meta_id': 74, 'classe': '770', 'name': 'Photographie et les photographies, art numérique'}>, <DeweyDomain: {'id': 73, 'meta_id': 73, 'classe': '760', 'name': 'Arts graphiques'}>, <DeweyDomain: {'id': 72, 'meta_id': 72, 'classe': '750', 'name': 'Peinture'}>, <DeweyDomain: {'id': 71, 'meta_id': 71, 'classe': '740', 'name': 'Dessin. Arts décoratifs'}>, <DeweyDomain: {'id': 70, 'meta_id': 70, 'classe': '730', 'name': 'Arts plastiques. Sculpture'}>, <DeweyDomain: {'id': 69, 'meta_id': 69, 'classe': '720', 'name': 'Architecture'}>, <DeweyDomain: {'id': 68, 'meta_id': 68, 'classe': '710', 'name': 'Urbanisme'}>]


In [13]:
hal_node_108 = HalDomain.nodes.get(meta_id=108)
print(hal_node_108.hal_sameas_dewey.all())

[<DeweyDomain: {'id': 4, 'meta_id': 4, 'classe': '060', 'name': 'Organisations générales et muséologie'}>]


### Retrieve all subgraph

In [None]:
all_dewey_nodes = DeweyDomain.nodes.all()
for x in all_dewey_nodes:
    relations_traversal = Traversal(x, DeweyDomain.__label__,
                                dewey_all_definition) 
    if relations_traversal:
        print(x,relations_traversal.all())  

In [None]:
all_hal_nodes = HalDomain.nodes.all()
for x in all_hal_nodes:
    relations_traversal = Traversal(x, HalDomain.__label__,
                                hal_all_definition) 
    if relations_traversal:
        print(x,relations_traversal.all()) 