In [2]:
import pymupdf
import os
import shutil
import pandas as pd

# Text analysis
import spacy
import scispacy
import kindred
import en_core_web_sm
#import en_core_sci_sm

from spacy import displacy

# Networking
import networkx as nx
from pyvis import network as net
from pyvis.network import Network

In [3]:
print(pymupdf.__doc__)

PyMuPDF 1.25.3: Python bindings for the MuPDF 1.25.4 library (rebased implementation).
Python 3.12 running on linux (64-bit).



# Extracting text from articles

In [8]:
article = []

for folder in os.listdir("../input/"):
    for filename in os.listdir(f"../input/{folder}"):
        doc = pymupdf.open(f"../input/{folder}/{filename}")
        for page in doc:
            text = str(page.get_text("text"))
            text = text.replace("\n", "")
            article.append(text)
        with open(f"../build/txt/{filename}.txt", "w") as file: # Save as a text file in build for each pdf file
            file.write(str(article))

In [5]:
article = str(article)

# Identify Scientific vocabulary

In [5]:
# python -m spacy download en_core_web_sm --> run in terminal

In [6]:
nlp = spacy.load("en_core_web_sm") # Load the model

In [7]:
document = nlp(article)
#print(list(document.sents))

In [8]:
#print(document.ents) # Print a list of selected scientific words

In [9]:
#displacy.render(next(document.sents), style='dep', jupyter=True)

# Create a Dataframe from the extracted data

In [53]:
for filename in os.listdir("../build/txt/"):
    data = []
    df = pd.DataFrame()
    
    url = str(f"../build/txt/{filename}")
    file = open(url, "r").read()
    document = nlp(file)
    #print([(ent.text) for ent in document.ents])
    for ent in document.ents:
        data.append(ent)

    df["weight"] = int(1)
    df["keyword"] = data
    df["filename"] = filename
    df["filename"] = df["filename"].str[:-4]

    df.to_csv(f"../build/csv/{filename}.csv")

In [55]:
# Define a function that deletes all files in a folder
def files_delete(folder_path):
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(filepath):
                os.unlink(file_path)
            elif os.path.isdri(filepath):
                shutil.rmtree(filepath)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")

In [56]:
files_delete("../build/txt/")

In [31]:
# Build a mastersheet out of all the csv files in ./build/csv
#nb_files = len(os.listdir("../build/csv/"))
mastersheet = pd.DataFrame(columns = ['weight', 'keyword', 'filename'])
mastersheet["weight"] = int(1)

for filename in os.listdir("../build/csv/"):
    read_csv = pd.read_csv(f"../build/csv/{filename}")
    read_csv["weight"] = int(1)
    mastersheet = pd.concat([mastersheet, read_csv], axis=0)

mastersheet.to_csv("../build/mastersheet/mastersheet.csv")

In [32]:
table = pd.read_csv("../table/filter.csv") # Handmade filter imported from table
table

Unnamed: 0,keyword
0,GABA
1,Interneuron
2,pathway
3,brain
4,cell
5,cells
6,PV
7,L2
8,L1
9,CMM


In [36]:
# Sort the dataframe and ponderate depending on the number of occurences
filtered_keywords = pd.merge(mastersheet, table, on=["keyword"])
filtered_keywords = filtered_keywords.sort_values(by=['filename', 'keyword'])
collapsed_df = filtered_keywords.groupby(['filename', 'keyword']).size().reset_index(name="weight")

collapsed_df.to_csv("../output/test.csv")

In [37]:
collapsed_df

Unnamed: 0,filename,keyword,weight
0,S1234567817020070.pdf,CMM,19
1,S1234567817020070.pdf,L1,19
2,S1234567817020070.pdf,L2,35
3,S1234567817020070.pdf,PV,17
4,S1234567817020070.pdf,avian,11
5,nihms27532.pdf,CMM,26
6,nihms27532.pdf,GABA,80
7,nihms27532.pdf,GABAergic,88
8,nihms27532.pdf,L1,30
9,nihms27532.pdf,L2,35


# Create a network from keywords and articles

In [38]:
G = nx.Graph()
G.add_weighted_edges_from([(1, 2, 0.125), (1, 3, 0.75), (2, 4, 1.2), (3, 4, 0.375)])
for n, nbrs in G.adj.items():
   for nbr, eattr in nbrs.items():
       wt = eattr['weight']
       if wt < 0.5: print(f"({n}, {nbr}, {wt:.3})")

(1, 2, 0.125)
(2, 1, 0.125)
(3, 4, 0.375)
(4, 3, 0.375)


In [39]:
G = nx.Graph()

for index, row in collapsed_df.iterrows():
    G.add_weighted_edges_from([(row['filename'], row['keyword'], row['weight'])])
    for n, nbrs in G.adj.items():
        for nbr, eattr in nbrs.items():
            wt = eattr['weight']
            if wt < 0.5: print(f"({n}, {nbr}, {wt:.3})")

In [50]:
## Use Pyvis to generate a dynamical network representation
g = net.Network(width= "100%", height="100%", font_color="white", select_menu=True, bgcolor="#222222", notebook=True) # bgcolor="#222222",
g = net.Network(directed = False, notebook=True)

node_degree = dict(G.degree) # Count the degree of the node

# Setting up node size attribute
nx.set_node_attributes(G, node_degree, 'size')

nxg = G
g.from_nx(nxg)
g.toggle_physics(True) # Toggle the physic in-between nodes
g.show(name="../output/example.html", local=True, notebook=True)
# g.show('example.html')

../output/example.html


In [57]:
files_delete("../build/csv/")