In [2]:
from pathlib import Path
import pandas as pd
import yaml
from neo4j_lib import NeoApp


In [3]:
# connect to neo4j
with Path("neo4j_config.yaml").open() as nc:
    neo_config = yaml.safe_load(nc)
    
neo_con = NeoApp(neo_config["uri"], neo_config["user"], neo_config["password"])

In [15]:
# 1. Who is most productive researcher writing papers on koalas?

query="""
MATCH (p)<-[:AUTHORED]-(a)
WITH a, COLLECT(p) as papers
ORDER BY SIZE(papers) DESC
RETURN a.name, SIZE(papers)
"""
response=neo_con.query(query)
pd.DataFrame(response).head(5)

Unnamed: 0,a.name,SIZE(papers)
0,Peter Timms,90
1,Carel Thijs,57
2,Adam Polkinghorne,55
3,P J Canfield,31
4,Stephen D Johnston,25


In [17]:
# 2. What is the most common topic in publications on koalas?
# find most common keywords

query="""
MATCH (p)-[:HAS_KEYWORD]->(k)
WITH k, COLLECT(p) as papers
ORDER BY SIZE(papers) DESC LIMIT 20
RETURN k.name, SIZE(papers)
"""
response=neo_con.query(query)
pd.DataFrame(response).head(10)

Unnamed: 0,k.name,SIZE(papers)
0,koala,66
1,Koala,51
2,Phascolarctos cinereus,41
3,Chlamydia,32
4,Chlamydia pecorum,20
5,koalas,17
6,koala retrovirus,14
7,KoRV,12
8,marsupial,11
9,Australia,8


In [6]:
# 1. What are diseases that affect the animal?
# return top 5 most common diseases and their paper count

"""
MATCH (p)-[:ABOUT_DISEASE]->(d)
WITH d, COLLECT(p) as papers
ORDER BY SIZE(papers) DESC LIMIT 5
RETURN d.name, SIZE(papers)
"""

'\nMATCH (p)-[:ABOUT_DISEASE]->(d)\nWITH d, COLLECT(p) as papers\nORDER BY SIZE(papers) DESC LIMIT 5\nRETURN d.name, SIZE(papers)\n'

In [7]:
# 2. What are the rare diseases?
# return 5 least common diseases
"""
MATCH (p)-[:ABOUT_DISEASE]->(d)
WITH d, COLLECT(p) as papers
ORDER BY SIZE(papers) ASC LIMIT 5
RETURN d.name, SIZE(papers)
"""

'\nMATCH (p)-[:ABOUT_DISEASE]->(d)\nWITH d, COLLECT(p) as papers\nORDER BY SIZE(papers) ASC LIMIT 5\nRETURN d.name, SIZE(papers)\n'

In [8]:
# 4. What disease often occur in conjuction?
# 

In [9]:
# 5. What genetic factors contribute to these diseases?
# no genetic entities in graph, here are chemicals
# 7. Do chemical factors play role?
"""
MATCH (p)-[:ABOUT_DISEASE]->(d)
WITH d, COLLECT(p) as papers
WITH d ORDER BY SIZE(papers) DESC LIMIT 2
WITH collect(d) AS top_dis
UNWIND top_dis as d
MATCH (p)-[:ABOUT_DISEASE]->(d)
WITH p as paper
MATCH (p)-[:ABOUT_CHEMICAL]->(c)
RETURN c.name

"""

'\nMATCH (p)-[:ABOUT_DISEASE]->(d)\nWITH d, COLLECT(p) as papers\nWITH d ORDER BY SIZE(papers) DESC LIMIT 2\nWITH collect(d) AS top_dis\nUNWIND top_dis as d\nMATCH (p)-[:ABOUT_DISEASE]->(d)\nWITH p as paper\nMATCH (p)-[:ABOUT_CHEMICAL]->(c)\nRETURN c.name\n\n'

In [10]:
# 6. How does the prevalence of these diseases change over time
# filter papers by date and rerun disease search

"""
MATCH (p)-[:ABOUT_DISEASE]->(d)
WHERE p.date < 1980
WITH d, COLLECT(p) as papers
ORDER BY SIZE(papers) DESC LIMIT 20
RETURN d.name, SIZE(papers)
"""

# insight: chlamydia started to appear after 1980s, 
# then became one of the top 5 koala diseases
"""

MATCH (p)-[:ABOUT_DISEASE]->(d)
WHERE p.date > 1979 and p.date < 1990
WITH d, COLLECT(p) as papers
ORDER BY SIZE(papers) DESC LIMIT 20
RETURN d.name, SIZE(papers)


MATCH (p)-[:ABOUT_DISEASE]->(d)
WHERE p.date < 1990
WITH d, COLLECT(p) as papers
ORDER BY SIZE(papers) DESC LIMIT 20
RETURN d.name, SIZE(papers)
"""

# infections and chlamydia dominate the last 20 years of
# publications featuring koala diseases

'\n\nMATCH (p)-[:ABOUT_DISEASE]->(d)\nWHERE p.date > 1979 and p.date < 1990\nWITH d, COLLECT(p) as papers\nORDER BY SIZE(papers) DESC LIMIT 20\nRETURN d.name, SIZE(papers)\n\n\nMATCH (p)-[:ABOUT_DISEASE]->(d)\nWHERE p.date < 1990\nWITH d, COLLECT(p) as papers\nORDER BY SIZE(papers) DESC LIMIT 20\nRETURN d.name, SIZE(papers)\n'

In [11]:
# 8. Does region have some effect?
# 

# 0.1 how has the field evolved over time?
# what were the most common keywords in the 90's - a signal of
# for the koala

# doesn't seem to be keywords before 90's and not many in 00's,
# hard to make any comparison - probably not enough papers