In [217]:
import sys, os
# Make the sibling ``scripts`` directory (one level up from the working
# directory) importable; presumably this is where the project helper modules
# imported further down live -- TODO confirm.
module_path = os.path.join(os.path.abspath(os.path.join('..')), 'scripts')
print(module_path)
# Only extend sys.path once, so re-running the cell does not add duplicates.
if module_path not in sys.path:
    sys.path.append(module_path)

C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\scripts


## Helper Functions

In [218]:
def matchLogic(strOriginal, strSearch):
    '''Score how well ``strSearch`` matches ``strOriginal``.

    Parameters
    ----------
    strOriginal : str
        Text taken from the document.
    strSearch : str
        Text that is being searched for.

    Returns
    -------
    0 when ``strOriginal`` is shorter than 20% of the length of ``strSearch``
    (too short to be a meaningful match); otherwise whatever
    ``fuzz.partial_ratio(strOriginal, strSearch)`` returns.
    '''
    # Very short document strings are rejected outright.
    if len(strOriginal) < .2*len(strSearch):
        return 0
    # The original compared against an undefined name (`textToMatch`), which
    # raised NameError; the search text is the `strSearch` parameter.
    # NOTE(review): `fuzz` is not imported in the visible part of the notebook
    # (presumably fuzzywuzzy/thefuzz, maybe via a star import) -- confirm.
    return fuzz.partial_ratio(strOriginal, strSearch)

    
def checkLevDistance(token_ , list_, p):
    '''Tell whether ``token_`` approximately matches any element of ``list_``.

    *called by checkMissingTokens

    An element counts as a match when its Levenshtein distance to ``token_``
    is strictly smaller than ``p`` percent of the length of ``token_``.

    Parameters
    ----------
    token_ : str
        Token to look for.
    list_ : iterable of str
        Candidate tokens to compare against.
    p : number
        Allowed distance, as a percentage of ``len(token_)``.

    Returns
    -------
    bool
        True as soon as one candidate is close enough, False otherwise
        (including when ``list_`` is empty).
    '''
    # The allowed distance depends only on the token, so compute it once.
    maxDistance = p*(len(token_))/100
    return any(levenshtein(token_, candidate) < maxDistance for candidate in list_)

def checkMissingTokens(str1_Search, str2_Original):
    '''Find the words of ``str1_Search`` that are absent from ``str2_Original``.

    Both strings are split on single spaces. A search token is reported as
    missing when all of the following hold:
        - it is not literally present among the original tokens,
        - checkLevDistance (called with p=10) finds no approximate match for
          it among the original tokens, so e.g. a trailing 's' on a long word
          can be tolerated, while very short tokens effectively need an
          exact match,
        - isImportantToken accepts it.

    Returns
    -------
    tuple
        (missing tokens, number of search tokens, number of original tokens,
         number of missing tokens, number of search tokens not missing,
         Levenshtein distance between the two full strings)
    '''
    searchTokens = str1_Search.split(" ")
    originalTokens = str2_Original.split(" ")
    missingTokens = [
        token for token in searchTokens
        if token not in originalTokens
        and not checkLevDistance(token, originalTokens, 10)
        and isImportantToken(token)
    ]
    searchCount = len(searchTokens)
    missingCount = len(missingTokens)
    fullDistance = levenshtein(str1_Search, str2_Original)
    return (missingTokens, searchCount, len(originalTokens), missingCount,
            searchCount - missingCount, fullDistance)

def isImportantToken(token):
    '''Tell whether ``token`` is long enough to be treated as important.

    The only criterion implemented is length: a token is important when it
    has more than two characters. The stop-word, pure-number and
    optional-token filters (and the "len > 5" rule) mentioned in earlier
    notes are NOT applied by this function.

    Parameters
    ----------
    token : str (anything supporting ``len``)

    Returns
    -------
    bool
    '''
    return len(token) > 2
    
def extractUniqueMatches(dfmatched, N):
    '''Return the rows belonging to the top ``N`` unique matches.

    The frame is ordered by ``score`` (descending) and then ``index``
    (ascending); the first ``N`` distinct values of the ``original`` column in
    that order are selected, and every row carrying one of those values is
    returned, still in sorted order.

    Parameters
    ----------
    dfmatched : pandas.DataFrame
        Must contain the columns 'score', 'index' and 'original'.
    N : int
        Number of distinct 'original' values to keep.

    Returns
    -------
    pandas.DataFrame
    '''
    dfmatched = dfmatched.sort_values(['score', 'index'], ascending = [False, True])
    topOriginals = dfmatched['original'].unique()[0:N]
    # One membership test instead of OR-ing a mask per value. np.isin replaces
    # np.in1d (deprecated in NumPy 2.0) and always yields a boolean mask, even
    # when the frame is empty.
    keep = np.isin(np.asarray(dfmatched['original']), np.asarray(topOriginals))
    return dfmatched.loc[keep, :]
    
    
def foundString(dfmatched, textToMatch):
    '''Decide whether ``textToMatch`` has been convincingly found in the document.

    Placeholder: neither argument is inspected yet and the function always
    returns True. The planned decision criteria are:
        0- pick the top 10 best *unique* matches,
        1- look for a 100% match,
        2- if there is none, look at the imperfect matches,
        3- check that all important tokens are present,
        4- check where the string was detected and how likely that location
           is for this string.
    '''
    return True

def normalize_(series_, invert = False):
    '''Scale ``series_`` so that its entries sum to one.

    Parameters
    ----------
    series_ : array-like of numbers
    invert : bool, default False
        When True, each entry is replaced by its reciprocal before scaling.

    Returns
    -------
    numpy.ndarray
        ``values / sum(values)`` where ``values`` is the input, or its
        element-wise reciprocal when ``invert`` is set.
    '''
    values = np.array(series_)
    if invert:
        values = 1/values
    return values/sum(values)

    
def levenshtein(seq1, seq2):
    '''Compute the Levenshtein (edit) distance between two sequences.

    A (len(seq1)+1) x (len(seq2)+1) table is filled in, where cell [x, y]
    holds the distance between the first x items of ``seq1`` and the first y
    items of ``seq2``. Insertions, deletions and substitutions all cost 1.

    Returns
    -------
    numpy.float64
        The value in the bottom-right cell of the table.
    '''
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    matrix = np.zeros((rows, cols))
    # Distance from/to the empty prefix is just the prefix length.
    matrix[:, 0] = np.arange(rows)
    matrix[0, :] = np.arange(cols)

    for x, left in enumerate(seq1, start=1):
        for y, right in enumerate(seq2, start=1):
            # Matching items can be carried over from the diagonal for free.
            substitution = 0 if left == right else 1
            matrix[x, y] = min(
                matrix[x-1, y] + 1,
                matrix[x-1, y-1] + substitution,
                matrix[x, y-1] + 1
            )
    return matrix[rows - 1, cols - 1]

### 1. Importing Modules

In [219]:
import pandas as pd
import uuid
import json
import glob
import time
import pandas as pd
# Show up to 200 characters of each cell so long text columns stay readable.
pd.set_option("display.max_colwidth", 200)
# Show every row when a frame is displayed. The fully qualified key is needed:
# the bare "max_rows" pattern also matches `styler.render.max_rows` in
# pandas >= 1.4 and then raises OptionError.
pd.set_option("display.max_rows", None)
import numpy as np
from bs4 import BeautifulSoup
from collections import defaultdict
from htmlParsingUtils import createDomEleData, createPIJsonFromHTML
from jsonHandlingUtils import loadJSON_Convert_to_DF, mkdir, addjson
from qrdReaderUtils import * 
# from htmlParsingUtils import createDomEleData, createPIJsonFromHTML

### 2. Loading Files

In [232]:
# Folder (relative to the parent of the working directory) holding the HTML inputs.
path_data = os.path.join(os.path.abspath(os.path.join('..')), 'data', 'converted_to_html')
print(path_data)
# Keep only directory entries whose name contains both 'html' and 'Kalydeco'.
filesHtml = [
    entry for entry in os.listdir(path_data)
    if 'html' in entry and 'Kalydeco' in entry
]
print(filesHtml)

C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html
['Abasaglar-h-2835-en.html', 'Abilify-h-471-e.html', 'AbilifyMaintena-h-2755-e.html', 'Adakveo-4874 EN PI.html', 'Adcetris-ema-combined-h-2455-en-IB-SOB-C2500.html', 'Advate-h-520-e.html', 'ADYNOVI II-03 - EN PI clea.html', 'AERIUS-H-C-0313-N-XX2-PI-en-WRM.html', 'Aimovig-PSUSA-10699-201911-PostPO-EN.html', 'Alpivab EN PI clean.html', 'Ammonaps-h-219-e.html', 'Apixaban Accord - EN annexe.html', 'Armisarte R-22 PI - clean.html', 'Arzerra II-50 & II-51 - PI - EN - clea.html', 'Betaferon II-130 - EN PI (clean) Corr.1.html', 'Comirnaty-h-5735-en.html', 'COVID-19 Vaccine AstraZeneca.html', 'COVID-19 Vaccine Moderna 5791 - Product Information final.html', 'CRYSVITA II-10-G - EN annexes clean.html', 'ELOCTA II-39 - EN PI clean.html', 'Epclusa-h-4210-e.html', 'Erleada-II-009 EN PI Clea.html', 'Exjade-II-073-EN PI.html', 'Eylea_IB_007.html', 'Febuxostat Mylan not most recent.html', 'Gardasil II-080-PI-en-WR.html', 'Hepser.h

#### 2.1 Converting HTML Files to JSON

In [236]:
# Output folder for the generated JSON files.
path_json = os.path.join(os.path.abspath(os.path.join('..')), 'data', 'outputJSON')
# NOTE(review): `_del = True` presumably wipes an existing folder first -- confirm in jsonHandlingUtils.
mkdir(path_json, _del = True)
for html_name in filesHtml:
    input_filename = os.path.join(path_data, html_name)
    # Same name with a .json extension, placed in the output folder.
    json_name = os.path.basename(
        input_filename.replace('converted_to_html','json').replace('.html','.json')
    )
    output_filename = os.path.join(path_json, json_name)
    print(input_filename, "|||" , output_filename)
    createPIJsonFromHTML(input_filename,output_filename)

C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\Abasaglar-h-2835-en.html ||| C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\Abasaglar-h-2835-en.json
C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\Abilify-h-471-e.html ||| C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\Abilify-h-471-e.json
C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\AbilifyMaintena-h-2755-e.html ||| C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\AbilifyMaintena-h-2755-e.json
C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\Adakveo-4874 EN PI.html ||| C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\Adakveo-4874 EN PI.json
C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\Adcetris-ema-combined-h-2455-en-IB-SOB-C2500.html ||| C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\Adcetris-ema-combined-h-2455-en-IB-SOB-C2500.

C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\Myalept.html ||| C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\Myalept.json
C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\NexoBrid.html ||| C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\NexoBrid.json
C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\Ninlaro II-25 EN PI clea.html ||| C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\Ninlaro II-25 EN PI clea.json
C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\Nivolumab BMS-3840 - EN PI clea.html ||| C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\Nivolumab BMS-3840 - EN PI clea.json
C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\Norvir.html ||| C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\Norvir.json
C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\converted_to_html\OPDIVO-H-39

#### 2.2 Picking a JSON and getting a Pandas DF

In [237]:
# Load one converted document (hard-coded file name) into a DataFrame.
path_json = os.path.join(os.path.abspath(os.path.join('..')), 'data', 'outputJSON')
output_filename = os.path.join(path_json, 'Kalydeco II-86-PI-clean.json')
print('File being processed: ' + output_filename)
df = pd.DataFrame(loadJSON_Convert_to_DF(output_filename))
print(df.shape)
display(df.head(5))

print('***************************************************************************************************')
print('*************************** Texts with more than 2 characters**************************************')
def lenCheck(str):
    '''Return True when ``str`` has more than two characters.'''
    minimumExclusive = 2
    return len(str) > minimumExclusive
# Boolean mask of the rows whose text is longer than two characters.
ind = df['Text'].apply(lenCheck)
display(df.loc[ind, :].head(100))

File being processed: C:\Users\psaga\source\repos\EMA%20EPI%20PoC\code\data\outputJSON\Kalydeco II-86-PI-clean.json
(11333, 6)


Unnamed: 0,Element,ID,Styles,Classes,Text,ParentId
0,"<div class=""WordSection1"">\n<p align=""center"" class=""MsoNormal"" style=""text-align:center;line-height:normal""><b><span lang=""EN-GB""> </span></b></p>\n<p align=""center"" class=""MsoNormal"" style=""text...",d015cc91-5bca-4c9b-bbc2-962b9d425def,,['WordSection1'],...,04f57b24-69b6-48fa-880f-5210cef75f15
1,"<p align=""center"" class=""MsoNormal"" style=""text-align:center;line-height:normal""><b><span lang=""EN-GB""> </span></b></p>",026ac808-654a-4951-902a-46ac50b55556,text-align:center;line-height:normal,['MsoNormal'],,d015cc91-5bca-4c9b-bbc2-962b9d425def
2,"<b><span lang=""EN-GB""> </span></b>",563aa208-56e8-4632-9656-34dc3f951645,,,,026ac808-654a-4951-902a-46ac50b55556
3,"<span lang=""EN-GB""> </span>",b5c0b158-5648-47cd-bdcc-234548ccc5a6,,,,563aa208-56e8-4632-9656-34dc3f951645
4,"<p align=""center"" class=""MsoNormal"" style=""text-align:center;line-height:normal""><b><span lang=""EN-GB""> </span></b></p>",e08d509d-e0f7-4a21-bc55-1a058206ad01,text-align:center;line-height:normal,['MsoNormal'],,d015cc91-5bca-4c9b-bbc2-962b9d425def


***************************************************************************************************
*************************** Texts with more than 2 characters**************************************


Unnamed: 0,Element,ID,Styles,Classes,Text,ParentId
0,"<div class=""WordSection1"">\n<p align=""center"" class=""MsoNormal"" style=""text-align:center;line-height:normal""><b><span lang=""EN-GB""> </span></b></p>\n<p align=""center"" class=""MsoNormal"" style=""text...",d015cc91-5bca-4c9b-bbc2-962b9d425def,,['WordSection1'],...,04f57b24-69b6-48fa-880f-5210cef75f15
72,"<span lang=""EN-GB"">ANNEX I</span>",2cf65c61-a3bd-4a81-bad9-998735a9a03b,,,ANNEX I,090a4949-c5c6-4b66-aad7-aca191cf2d67
76,"<span lang=""EN-GB"">SUMMARY OF PRODUCT CHARACTERISTICS</span>",5a6ccb65-d764-4985-9ce2-6bea2c7e1c39,,,SUMMARY OF PRODUCT CHARACTERISTICS,efa7d380-16a5-4a5a-a062-f7d1f41a2486
87,"<span lang=""EN-GB"" style='font-size:11.0pt;font-family:\n""Times New Roman"",serif'>1. NAME OF THE MEDICINAL PRODUCT</span>",ac6f0087-5507-4991-9ece-b02c8aad6ccd,"font-size:11.0pt;font-family:\n""Times New Roman"",serif",,1. NAME OF THE MEDICINAL PRODUCT,b36371d8-151b-424b-b507-b4c1703a518c
91,"<span lang=""EN-GB"">Kalydeco 150 mg film‑coated tablets</span>",492011b4-4117-47ed-86b3-4f0f22f404d1,,,Kalydeco 150 mg film‑coated tablets,3bd60d40-d8c2-4308-a5bc-bedb154242fb
97,"<span lang=""EN-GB"" style='font-size:\n11.0pt;font-family:""Times New Roman"",serif;color:black'>2. QUALITATIVE\nAND QUANTITATIVE COMPOSITION</span>",13f5e4f5-30f0-4460-9013-0734b40e44f2,"font-size:\n11.0pt;font-family:""Times New Roman"",serif;color:black",,2. QUALITATIVE AND QUANTITATIVE COMPOSITION,e08228e2-0cbb-4e0c-9b2a-d84c8b41418b
101,"<span lang=""EN-GB"">Each film‑coated tablet contains 150 mg of ivacaftor.</span>",4fd3cfe1-9383-42e6-843e-ef67cdb439b7,,,Each film‑coated tablet contains 150 mg of ivacaftor.,76f98e5e-467e-4bd9-9004-a698c2ce9703
108,"<span lang=""EN-GB"">Excipient with known effect</span>",338bae81-d209-4b5e-b82a-a81ac4e27cb3,,,Excipient with known effect,2d8b9a48-5186-4db6-a49a-312a31b4945b
112,"<span lang=""EN-GB"">Each film‑coated tablet contains 167.2 mg of lactose\nmonohydrate.</span>",3bca705e-9461-4748-8de8-4059ffaf9b33,,,Each film‑coated tablet contains 167.2 mg of lactose monohydrate.,0b23e123-cdca-4868-812a-8b353a6db60e
116,"<span lang=""EN-GB"">For the full list of excipients, see section 6.1.</span>",9c2e0743-a751-4425-a667-dfb16a2b9d3a,,,"For the full list of excipients, see section 6.1.",1fca8b02-aa83-4d0c-b6a6-c56e5a59a077


#### 2.3 Preprocessing Texts

In [238]:
import re
import unicodedata

def preprocessStr(str_):
    '''Reduce ``str_`` to ASCII letters, digits, '.', '-' and single spaces.

    The text is first NFKD-normalised so that compatibility characters
    (ligatures, non-breaking spaces, full-width forms, ...) are decomposed
    into plain equivalents where Unicode defines them; every remaining run of
    characters outside ``[A-Za-z0-9.-]`` is then collapsed into one space.
    Case is preserved.

    Previously the normalisation ran *after* the regex, at which point only
    ASCII was left and the call had no effect.

    Parameters
    ----------
    str_ : str

    Returns
    -------
    str
    '''
    # Normalise first, while the non-ASCII characters are still present.
    str_ = unicodedata.normalize("NFKD", str_)
    str_ = re.sub('[^A-Za-z0-9.-]+', ' ', str_)
    return str_

### 3. Using the QRD template CSV to Match Strings

In [239]:
# Location of the QRD canonical model (multilingual template headings).
filePath = os.path.join(os.path.abspath(os.path.join('..')), 'data', 'control')
fileName = r'qrd_canonical_model_multilingual.csv'
filePathQRD = os.path.join(filePath, fileName)

# Columns kept from the CSV, in display order.
colsofInterest  = [
    'id', 'Procedure type', 'Document type', 'Language code',
    'Display code', 'Name', 'parent_id', 'Mandatory',
]
dfCanonicalModel = pd.read_csv(filePathQRD)[colsofInterest]
display(dfCanonicalModel)


Unnamed: 0,id,Procedure type,Document type,Language code,Display code,Name,parent_id,Mandatory
0,1,CAP,SmPC,bg,,КРАТКА ХАРАКТЕРИСТИКА НА ПРОДУКТА,,True
1,2,CAP,SmPC,bg,,qТози лекарствен продукт подлежи на допълнително наблюдение. Това ще позволи бързото установяване на нова информация относно безопасността. От медицинските специалисти се изисква да съобщават всяк...,1.0,False
2,3,CAP,SmPC,bg,1,ИМЕ НА ЛЕКАРСТВЕНИЯ ПРОДУКТ,1.0,True
3,4,CAP,SmPC,bg,2,КАЧЕСТВЕН И КОЛИЧЕСТВЕН СЪСТАВ,1.0,True
4,5,CAP,SmPC,bg,2.1,Общо описание,4.0,False
5,6,CAP,SmPC,bg,2.2,Качествен и количествен състав,4.0,False
6,7,CAP,SmPC,bg,,Помощно(и) вещество(а) с известно действие,6.0,False
7,8,CAP,SmPC,bg,3,ЛЕКАРСТВЕНА ФОРМА,1.0,True
8,9,CAP,SmPC,bg,4,КЛИНИЧНИ ДАННИ,1.0,True
9,10,CAP,SmPC,bg,4.1,Терапевтични показания,9.0,True


#### Getting String Lengths in HTML

In [240]:
def getTrueLength(str_):
    '''Return the length of ``str_`` after it went through preprocessStr.'''
    cleaned = preprocessStr(str_)
    return len(cleaned)

# Length of every HTML text once pre-processed (adds a column to df in place).
df['StringLength']  = df['Text'].apply(getTrueLength)

# Rows whose cleaned text is longer than 5 and shorter than 200 characters.
ind = (df['StringLength']<200) & (df['StringLength']>5)
print('Number of string lengths greater than 5 and less than 200: ' , sum(ind))

Number of string lengths greater than 5 and less than 200:  2247


#### Filtering QRD file for SmPC

In [241]:
# Keep only the English SmPC headings of the centralised (CAP) procedure.
ind = (
    dfCanonicalModel['Procedure type'].eq('CAP')
    & dfCanonicalModel['Document type'].eq('SmPC')
    & dfCanonicalModel['Language code'].eq('en')
)

print(sum(ind))
# drop = False keeps the original row label as an 'index' column,
# which storeResults reads later on.
dfModelwRulesF = dfCanonicalModel[ind].reset_index(drop = False)


60


### Matching

In [242]:
%%time
# Accumulator for all matches found by the matching loop.
collection_ = {}

def storeResults(collection_ , qrd_str_row, str_, indexDF ):
    '''Record one QRD-heading / HTML-text match in ``collection_``.

    Each field is stored through ``addjson(collection_, key, value)`` and the
    collection returned by the last call is handed back.

    Parameters
    ----------
    collection_ : the accumulator understood by ``addjson``
    qrd_str_row : mapping / pandas.Series
        QRD row; 'index', 'id', 'Procedure type', 'Display code', 'Name' and
        'parent_id' are copied from it.
    str_ : mapping / pandas.Series
        HTML row; its 'Text' is stored under 'htmlText'.
    indexDF :
        Label of the HTML row, stored under 'htmlIndex'.
    '''
    # Fields copied verbatim from the QRD row, in storage order.
    for key in ('index', 'id', 'Procedure type', 'Display code', 'Name', 'parent_id'):
        collection_ = addjson(collection_ , key, qrd_str_row[key])
    collection_ = addjson(collection_ , 'htmlText', str_['Text'])
    collection_ = addjson(collection_ , 'htmlIndex', indexDF)
    return collection_

# Initiate matching. Pick a string from an HTML and match it to QRD.

# The QRD headings are the same for every HTML row, so pre-process them once
# instead of once per (HTML row, QRD row) pair.
qrd_candidates = [
    (qrd_str_row, qrd_str_row['Name'], preprocessStr(qrd_str_row['Name']))
    for _ , qrd_str_row in dfModelwRulesF.iterrows()
]

# NOTE(review): only row labels 0..6000 are scanned -- confirm this cap is intended.
for  indexDF , str_ in df.loc[0:6000, :].iterrows():

    # Only texts whose cleaned length is between 6 and 99 characters are candidates.
    if not ((str_['StringLength']>5) and (str_['StringLength']<100)):
        continue
    str_processed = preprocessStr(str_['Text'])
    # We perform a check only if the length is greater than 5.
    if len(str_processed) <= 5:
        continue

    found_vec = []
    for qrd_str_row, qrd_str, qrd_str_processed in qrd_candidates:
        # Will only perform processing over texts that are within reasonable
        # length compared to the qrd string (less than twice as long).
        if len(str_processed) >= 2*len(qrd_str_processed):
            continue
        dist = levenshtein(str_processed, qrd_str_processed)
        # Edit distance as a percentage of the HTML text length.
        perc = round(dist*100/len(str_processed), 2)
        if perc<=20:
            print(perc, ' || ',str_['Text'], ' || ' , qrd_str)
            found_vec.append(qrd_str_row)
            collection_ = storeResults(collection_ , qrd_str_row, str_, indexDF )
    # Flag HTML texts that matched more than one QRD heading.
    if len(found_vec)>1:
        print('******************************************************************')
        print('found_vec length: ', len(found_vec))

0.0  ||  SUMMARY OF PRODUCT CHARACTERISTICS  ||  SUMMARY OF PRODUCT CHARACTERISTICS
9.38  ||  1.       NAME OF THE MEDICINAL PRODUCT  ||  NAME OF THE MEDICINAL PRODUCT
6.98  ||  2.       QUALITATIVE AND QUANTITATIVE COMPOSITION  ||  QUALITATIVE AND QUANTITATIVE COMPOSITION
7.41  ||  Excipient with known effect  ||  Excipient(s) with known effect
13.64  ||  3.       PHARMACEUTICAL FORM  ||  PHARMACEUTICAL FORM
0.0  ||  CLINICAL PARTICULARS  ||  CLINICAL PARTICULARS
14.81  ||  4.1     Therapeutic indications  ||  Therapeutic indications
9.76  ||  4.2     Posology and method of administration  ||  Posology and method of administration
0.0  ||  Posology  ||  Posology
0.0  ||  Paediatric population  ||  Paediatric population
0.0  ||  Paediatric population  ||  Paediatric population
0.0  ||  Paediatric population  ||  Paediatric population
0.0  ||  Paediatric population  ||  Paediatric population
0.0  ||  Paediatric population  ||  Paediatric population
0.0  ||  Paediatric population  ||  Pa

10.81  ||  4.6       Fertility, pregnancy and lactation  ||  Fertility, pregnancy and lactation
0.0  ||  Pregnancy  ||  Pregnancy
7.14  ||  Breast‑feeding  ||  Breast-feeding
0.0  ||  Fertility  ||  Fertility
8.33  ||  4.7     Effects on ability to drive and use machines  ||  Effects on ability to drive and use machines
17.39  ||  4.8     Undesirable effects  ||  Undesirable effects
0.0  ||  Paediatric population  ||  Paediatric population
0.0  ||  Paediatric population  ||  Paediatric population
0.0  ||  Paediatric population  ||  Paediatric population
0.0  ||  Paediatric population  ||  Paediatric population
0.0  ||  Paediatric population  ||  Paediatric population
0.0  ||  Paediatric population  ||  Paediatric population
******************************************************************
found_vec length:  6
0.0  ||  Reporting of suspected adverse reactions  ||  Reporting of suspected adverse reactions
10.34  ||  5.       PHARMACOLOGICAL PROPERTIES  ||  PHARMACOLOGICAL PROPERTIES
0.0

In [243]:
# Turn the matches accumulated in collection_ by the matching loop into a DataFrame.
# assumes addjson builds a structure pd.DataFrame accepts directly -- TODO confirm
dfExtractedHier = pd.DataFrame(collection_)
print(dfExtractedHier.shape)
def convertToInt(x):
    '''Return ``x`` as an integer-valued string when possible, else ``x`` itself.

    Values accepted by ``int()`` (e.g. 28001.0 or '12') come back as
    ``str(int(x))``. Values that cannot be converted (NaN, infinity, None,
    non-numeric text) are returned unchanged.
    '''
    try:
        return str(int(x))
    # Catch only conversion failures; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return x
    
# Store both id columns as integer-looking strings where the value allows it.
for id_column in ('parent_id', 'id'):
    dfExtractedHier[id_column] = dfExtractedHier[id_column].apply(convertToInt)
# Persist the matches; the file is written to the current working directory.
dfExtractedHier.to_csv('dfExtractedHier.csv')
dfExtractedHier.head(100)

(147, 8)


Unnamed: 0,index,id,Procedure type,Display code,Name,parent_id,htmlText,htmlIndex
0,952,28001,CAP,,SUMMARY OF PRODUCT CHARACTERISTICS,,SUMMARY OF PRODUCT CHARACTERISTICS,76
1,954,28003,CAP,1.0,NAME OF THE MEDICINAL PRODUCT,28001.0,1. NAME OF THE MEDICINAL PRODUCT,87
2,955,28004,CAP,2.0,QUALITATIVE AND QUANTITATIVE COMPOSITION,28001.0,2. QUALITATIVE AND QUANTITATIVE COMPOSITION,97
3,958,28007,CAP,,Excipient(s) with known effect,28006.0,Excipient with known effect,108
4,959,28008,CAP,3.0,PHARMACEUTICAL FORM,28001.0,3. PHARMACEUTICAL FORM,122
5,960,28009,CAP,4.0,CLINICAL PARTICULARS,28001.0,CLINICAL PARTICULARS,138
6,961,28010,CAP,4.1,Therapeutic indications,28009.0,4.1 Therapeutic indications,143
7,962,28011,CAP,4.2,Posology and method of administration,28009.0,4.2 Posology and method of administration,185
8,963,28012,CAP,,Posology,28011.0,Posology,197
9,964,28013,CAP,,Paediatric population,28012.0,Paediatric population,587


In [244]:
# Reload the saved matches and discard the unnamed column that holds the
# row index written by to_csv.
dfExtractedHier = pd.read_csv('dfExtractedHier.csv').drop(columns=['Unnamed: 0'])

### Resolving the Case of Repetitive Matches

In [245]:
# Let's check whether the same source text (htmlIndex) was matched more than once.

# Build the (htmlIndex, count) table explicitly rather than through
# value_counts().reset_index().rename(...): the column names produced by
# reset_index changed in pandas 2.0, and the old rename then yields two
# 'count' columns and no 'htmlIndex'.
occurrences = dfExtractedHier['htmlIndex'].value_counts()
df_temp = pd.DataFrame({'htmlIndex': occurrences.index, 'count': occurrences.values}) \
            .sort_values(['htmlIndex'], ascending = True)
# HTML rows that were matched to more than one QRD heading.
ind = df_temp['count']>1
repetitiveIndexes = df_temp.loc[ind, :]
display(repetitiveIndexes)

Unnamed: 0,htmlIndex,count
3,587,6
10,787,6
4,964,6
6,1489,6
12,1663,2
9,2732,6
2,2908,6
5,3648,6
1,3813,6
7,3989,6


#### Resolve Many to 1 Matches

In [246]:
# Work on a copy so the unresolved matches in dfExtractedHier stay untouched.
dfExtractedHierRR = dfExtractedHier.copy()


# For each HTML row that matched several QRD headings, keep a candidate whose
# parent heading was already matched earlier and drop the others.
# NOTE(review): [0:2] limits this to the first two repeated htmlIndex values --
# looks like a debugging cap; confirm before relying on the result.
for i in repetitiveIndexes['htmlIndex'][0:2]:
    print(i)
    # All candidate matches for this HTML row.
    ind = dfExtractedHierRR['htmlIndex'] == i
    # NOTE(review): lastInd is assigned but not used in the visible code.
    lastInd = dfExtractedHierRR.loc[ind,:].index[-1]
    dfInterest = dfExtractedHierRR.loc[ind,:]
    lastPreIndex = dfInterest.index[0]
    # Label-based slice, so it runs up to and including the first candidate row.
    dfInterestPre = dfExtractedHierRR.loc[0:lastPreIndex, :]
    display(dfInterestPre)
    lastFound = False
    # Walk the candidates from the last one to the first one.
    for j in np.flip(dfInterest.index):
        parent_id_considered = dfInterest.loc[j , 'parent_id']
        # Keep the candidate when its parent id is among the earlier matched ids
        # and the previous iteration did not keep one.
        if (parent_id_considered in list(dfInterestPre['id'])) & (lastFound == False):
            print(j, parent_id_considered , True)
            lastFound  = True
        else:
            # NOTE(review): lastFound is reset here, so after one rejection a later
            # candidate can be kept again -- confirm that more than one survivor
            # per HTML row is intended.
            lastFound = False
            dfExtractedHierRR = dfExtractedHierRR.drop(j)
            print(j, parent_id_considered ,False)
    # dfInterest was taken before the drops, so this still shows every candidate.
    display(dfInterest)

587


Unnamed: 0,index,id,Procedure type,Display code,Name,parent_id,htmlText,htmlIndex
0,952,28001,CAP,,SUMMARY OF PRODUCT CHARACTERISTICS,,SUMMARY OF PRODUCT CHARACTERISTICS,76
1,954,28003,CAP,1.0,NAME OF THE MEDICINAL PRODUCT,28001.0,1. NAME OF THE MEDICINAL PRODUCT,87
2,955,28004,CAP,2.0,QUALITATIVE AND QUANTITATIVE COMPOSITION,28001.0,2. QUALITATIVE AND QUANTITATIVE COMPOSITION,97
3,958,28007,CAP,,Excipient(s) with known effect,28006.0,Excipient with known effect,108
4,959,28008,CAP,3.0,PHARMACEUTICAL FORM,28001.0,3. PHARMACEUTICAL FORM,122
5,960,28009,CAP,4.0,CLINICAL PARTICULARS,28001.0,CLINICAL PARTICULARS,138
6,961,28010,CAP,4.1,Therapeutic indications,28009.0,4.1 Therapeutic indications,143
7,962,28011,CAP,4.2,Posology and method of administration,28009.0,4.2 Posology and method of administration,185
8,963,28012,CAP,,Posology,28011.0,Posology,197
9,964,28013,CAP,,Paediatric population,28012.0,Paediatric population,587


14 28033.0 False
13 28030.0 False
12 28027.0 False
11 28020.0 False
10 28017.0 False
9 28012.0 True


Unnamed: 0,index,id,Procedure type,Display code,Name,parent_id,htmlText,htmlIndex
9,964,28013,CAP,,Paediatric population,28012.0,Paediatric population,587
10,970,28019,CAP,,Paediatric population,28017.0,Paediatric population,587
11,972,28021,CAP,,Paediatric population,28020.0,Paediatric population,587
12,979,28028,CAP,,Paediatric population,28027.0,Paediatric population,587
13,982,28031,CAP,,Paediatric population,28030.0,Paediatric population,587
14,988,28037,CAP,,Paediatric population,28033.0,Paediatric population,587


787


Unnamed: 0,index,id,Procedure type,Display code,Name,parent_id,htmlText,htmlIndex
0,952,28001,CAP,,SUMMARY OF PRODUCT CHARACTERISTICS,,SUMMARY OF PRODUCT CHARACTERISTICS,76
1,954,28003,CAP,1.0,NAME OF THE MEDICINAL PRODUCT,28001.0,1. NAME OF THE MEDICINAL PRODUCT,87
2,955,28004,CAP,2.0,QUALITATIVE AND QUANTITATIVE COMPOSITION,28001.0,2. QUALITATIVE AND QUANTITATIVE COMPOSITION,97
3,958,28007,CAP,,Excipient(s) with known effect,28006.0,Excipient with known effect,108
4,959,28008,CAP,3.0,PHARMACEUTICAL FORM,28001.0,3. PHARMACEUTICAL FORM,122
5,960,28009,CAP,4.0,CLINICAL PARTICULARS,28001.0,CLINICAL PARTICULARS,138
6,961,28010,CAP,4.1,Therapeutic indications,28009.0,4.1 Therapeutic indications,143
7,962,28011,CAP,4.2,Posology and method of administration,28009.0,4.2 Posology and method of administration,185
8,963,28012,CAP,,Posology,28011.0,Posology,197
9,964,28013,CAP,,Paediatric population,28012.0,Paediatric population,587


23 28033.0 False
22 28030.0 False
21 28027.0 False
20 28020.0 False
19 28017.0 True
18 28012.0 False


Unnamed: 0,index,id,Procedure type,Display code,Name,parent_id,htmlText,htmlIndex
18,964,28013,CAP,,Paediatric population,28012.0,Paediatric population,787
19,970,28019,CAP,,Paediatric population,28017.0,Paediatric population,787
20,972,28021,CAP,,Paediatric population,28020.0,Paediatric population,787
21,979,28028,CAP,,Paediatric population,28027.0,Paediatric population,787
22,982,28031,CAP,,Paediatric population,28030.0,Paediatric population,787
23,988,28037,CAP,,Paediatric population,28033.0,Paediatric population,787


In [247]:
# NOTE(review): this displays the unresolved matches; the rows dropped in the
# previous cell were removed from dfExtractedHierRR -- confirm which was intended.
dfExtractedHier

Unnamed: 0,index,id,Procedure type,Display code,Name,parent_id,htmlText,htmlIndex
0,952,28001,CAP,,SUMMARY OF PRODUCT CHARACTERISTICS,,SUMMARY OF PRODUCT CHARACTERISTICS,76
1,954,28003,CAP,1.0,NAME OF THE MEDICINAL PRODUCT,28001.0,1. NAME OF THE MEDICINAL PRODUCT,87
2,955,28004,CAP,2.0,QUALITATIVE AND QUANTITATIVE COMPOSITION,28001.0,2. QUALITATIVE AND QUANTITATIVE COMPOSITION,97
3,958,28007,CAP,,Excipient(s) with known effect,28006.0,Excipient with known effect,108
4,959,28008,CAP,3.0,PHARMACEUTICAL FORM,28001.0,3. PHARMACEUTICAL FORM,122
5,960,28009,CAP,4.0,CLINICAL PARTICULARS,28001.0,CLINICAL PARTICULARS,138
6,961,28010,CAP,4.1,Therapeutic indications,28009.0,4.1 Therapeutic indications,143
7,962,28011,CAP,4.2,Posology and method of administration,28009.0,4.2 Posology and method of administration,185
8,963,28012,CAP,,Posology,28011.0,Posology,197
9,964,28013,CAP,,Paediatric population,28012.0,Paediatric population,587
