## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

## Input data directory
data_dir = "OrfPathHealth"
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [2]:
## Dir Loader
documents = PyPDFDirectoryLoader(inputdirectory).load()
## File Loader
# documents = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf").load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))


Number of chunks =  226


## Create a dataframe of all the chunks

In [3]:
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(226, 4)


Unnamed: 0,text,source,page,chunk_id
0,147\n147Mental Health as a \nPathway to Health...,data_input/OrfPathHealth/MentalHealthPathwayTo...,0,6a13f1b883a641d698148be3652fae52
1,"sense, mental health encompasses promotive and...",data_input/OrfPathHealth/MentalHealthPathwayTo...,0,9867158fca1e4e2e9e6c9a70472fea37
2,148Accelerating Global Health: Pathways to Hea...,data_input/OrfPathHealth/MentalHealthPathwayTo...,1,6c16b5fb1bdf4e7481a58a41f3935d01
3,adversity is a recognised risk factor in condi...,data_input/OrfPathHealth/MentalHealthPathwayTo...,1,310128b7580441d99c7ac8466a60c4df
4,both in healthcare service delivery.\nThe chal...,data_input/OrfPathHealth/MentalHealthPathwayTo...,1,995ff2fe191a41a2ae6e99b87beb01fc


## Extract Concepts

In [4]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2ConceptsList
from helpers.df_helpers import concepts2Df

In [9]:
concepts_list = df2ConceptsList(df[10:12])

 [
    {
        "entity": "income",
        "importance": 4,
        "category": "condition"
    },
    {
        "entity": "mental illness",
        "importance": 5,
        "category": "concept"
    },
    {
        "entity": "middle-aged",
        "importance": 3,
        "category": "condition"
    },
    {
        "entity": "mental healthcare",
        "importance": 4,
        "category": "object"
    },
    {
        "entity": "mental health professionals",
        "importance": 3,
        "category": "organisation"
    },
    {
        "entity": "mental health literacy",
        "importance": 3,
        "category": "condition"
    },
    {
        "entity": "stigma and discrimination",
        "importance": 2,
        "category": "misc"
    }
] [
    {
        "entity": "mental health equity",
        "importance": 4,
        "category": "concept"
    },
    {
        "entity": "social inequality",
        "importance": 3,
        "category": "condition"
    },
    {
        "e

In [26]:
dfne = concepts2Df(concepts_list)
dfne.head()

Unnamed: 0,entity,importance,category,chunk_id,type
0,Mental Health,5,concept,83d4d0367bb0467e811782a4ada3bbb9,concept
1,Health Equity,4,concept,83d4d0367bb0467e811782a4ada3bbb9,concept
2,World Health Organization (WHO),3,organisation,83d4d0367bb0467e811782a4ada3bbb9,concept
3,United Nations (UN),3,organisation,83d4d0367bb0467e811782a4ada3bbb9,concept
4,Sustainable Development Goals (SDGs),4,document,83d4d0367bb0467e811782a4ada3bbb9,concept


### Write CSV to an output directory

Both the dataframes are written in the csv format so we dont have to calculate them again. 

        dfne = dataframe of names entities

        df = dataframe of chunks

In [28]:
if not os.path.exists(outputdirectory):
   os.makedirs(outputdirectory)
   
dfne.to_csv(outputdirectory/"concepts.csv", sep="|", index=False)
df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)

## Named Entities from Concepts

**Not using this right now**

Extracting named entities our of concepts. 


In [3]:
from transformers import pipeline

ner = pipeline("token-classification", model="2rtl3/mn-xlm-roberta-base-named-entity", aggregation_strategy="simple")
# ner = pipeline("token-classification", model="dslim/bert-large-NER", aggregation_strategy="simple")

def row2NamedEntities(row):
    ner_results = ner(row['entity'])
    metadata = {'chunk_id': row['chunk_id'], 'type': 'entity'}
    entities = []
    for result in ner_results:
        entities = entities + [{'entity': result['word'], 'catetory': result['entity_group'], **metadata}]
        
    return entities



def dfText2DfNE(dataframe: pd.DataFrame):
    ## Takes a dataframe from the parsed data and returns dataframe with named entities. 
    ## The input dataframe must have a entity and a chunk_id column. 

    ## 1. Calculate named entities for each row of the dataframe. 
    results = dataframe.apply(row2NamedEntities, axis=1).reset_index(drop=True)

    ## Flatten the list of lists to one single list of entities. 
    entities_list = np.concatenate(results).ravel().tolist()

    ## Remove all NaN entities
    entities_dataframe = pd.DataFrame(entities_list).replace(' ', np.nan)
    entities_dataframe = entities_dataframe.dropna(subset=['entity'])

    ## Count the number of occurances per chunk id
    # entities_dataframe = entities_dataframe.groupby(['entity', 'category', 'chunk_id']).size().reset_index(name='count')

    return entities_dataframe

In [5]:

dataframe_dir = 'OrfPathHealth'
df_concepts = pd.read_csv(f"./data/output/{dataframe_dir}/concepts.csv", sep="|")

dfc_split = dfText2DfNE(df_concepts)
dfc_split

Unnamed: 0,entity,catetory,chunk_id,type
0,Mental Health,MISC,83d4d0367bb0467e811782a4ada3bbb9,entity
1,Health Equity,ORG,83d4d0367bb0467e811782a4ada3bbb9,entity
2,World Health Organization (WHO),ORG,83d4d0367bb0467e811782a4ada3bbb9,entity
3,United Nations (UN),ORG,83d4d0367bb0467e811782a4ada3bbb9,entity
4,Sustainable Development Goals (SDGs),MISC,83d4d0367bb0467e811782a4ada3bbb9,entity
...,...,...,...,...
967,National University of Singapore's Saw Swee Ho...,ORG,4dbae5e4a3ee45bdbf9d55dec8921c2c,entity
968,Associate Professor,MISC,4dbae5e4a3ee45bdbf9d55dec8921c2c,entity
969,Jeremy Lim,PER,4dbae5e4a3ee45bdbf9d55dec8921c2c,entity
970,Janice Tan,PER,4dbae5e4a3ee45bdbf9d55dec8921c2c,entity
