In [1]:
import pymupdf
import os
import pandas as pd

# Text analysis
import spacy
import scispacy
import kindred
import en_core_web_sm

from spacy import displacy

# Networking
import networkx as nx
from pyvis import network as net
from pyvis.network import Network



In [2]:
print(pymupdf.__doc__)

PyMuPDF 1.25.1: Python bindings for the MuPDF 1.25.2 library (rebased implementation).
Python 3.11 running on linux (64-bit).



# Extracting text from articles

In [3]:
article = []

for filename in os.listdir("../input/"):
    doc = pymupdf.open(f"../input/{filename}")
    for page in doc:
        text = str(page.get_text("text"))
        text = text.replace("\n", "")
        article.append(text)
    with open(f"../build/{filename}.txt", "w") as file: # Save as a text file in build for each pdf file
        file.write(str(article))

In [4]:
article = str(article)

# Identify Scientific vocabulary

In [5]:
# python -m spacy download en_core_web_sm --> run in terminal

In [6]:
nlp = spacy.load("en_core_sci_sm") # Load the model

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [7]:
document = nlp(article)
#print(list(document.sents))

In [8]:
#print(document.ents) # Print a list of selected scientific words

In [9]:
#displacy.render(next(document.sents), style='dep', jupyter=True)

# Create a Dataframe from the extracted data

In [10]:
data = []
df = pd.DataFrame()

for filename in os.listdir("../build/"):
    url = str(f"../build/{filename}")
    file = open(url, "r").read()
    document = nlp(file)
    #print([(ent.text) for ent in document.ents])
    for ent in document.ents:
        data.append(ent)

df["weight"] = int(1)
df["keyword"] = data
df["filename"] = filename
df["filename"] = df["filename"].str[:-4]

df.to_csv("../table/keywords.csv")

In [11]:
table = pd.read_csv("../table/filter.csv") # Handmade filter imported from table
df = pd.read_csv("../table/keywords.csv")

In [55]:
# Sort the dataframe and ponderate depending on the number of occurences
filtered_keywords = pd.merge(df, table, on=["keyword"])
filtered_keywords = filtered_keywords.sort_values(by=['filename', 'keyword'])
collapsed_df = filtered_keywords.groupby(['filename', 'keyword']).size().reset_index(name="weight")

filtered_keywords.to_csv("../output/test.csv")

In [56]:
collapsed_df

Unnamed: 0,filename,keyword,weight
0,nihms27532.pdf,GABA,46
1,nihms27532.pdf,brain,29
2,nihms27532.pdf,pathway,3


# Create a network from keywords and articles

In [57]:
G = nx.Graph()
G.add_weighted_edges_from([(1, 2, 0.125), (1, 3, 0.75), (2, 4, 1.2), (3, 4, 0.375)])
for n, nbrs in G.adj.items():
   for nbr, eattr in nbrs.items():
       wt = eattr['weight']
       if wt < 0.5: print(f"({n}, {nbr}, {wt:.3})")

(1, 2, 0.125)
(2, 1, 0.125)
(3, 4, 0.375)
(4, 3, 0.375)


In [68]:
G = nx.Graph()

for index, row in collapsed_df.iterrows():
    G.add_weighted_edges_from([(row['filename'], row['keyword'], row['weight'])])
    for n, nbrs in G.adj.items():
        for nbr, eattr in nbrs.items():
            wt = eattr['weight']
            if wt < 0.5: print(f"({n}, {nbr}, {wt:.3})")

In [69]:
## Use Pyvis to generate a dynamical network representation
g = net.Network(height="750px", width="100%", font_color="white", select_menu=True, notebook=True) # bgcolor="#222222",
g = net.Network(directed = True, notebook=True)

node_degree = dict(G.degree) # Count the degree of the node

# Setting up node size attribute
nx.set_node_attributes(G, node_degree, 'size')

nxg = G
g.from_nx(nxg)
g.toggle_physics(True) # Toggle the physic in-between nodes
g.show(name="example.html", local=True, notebook=True)
# g.show('example.html')

example.html
