In [2]:
import os
import re
import json
import shutil
import textract
import datetime
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
import subprocess

In [33]:
!mkdir data
!mkdir pdfs
!gdown 1Xgb7GwakWa_wEiOAeiFz-sUg8Uo0NO3F -O data/tree.xlsx
!rm -r pdfs

mkdir: data: File exists


Downloading...
From: https://drive.google.com/uc?id=1Xgb7GwakWa_wEiOAeiFz-sUg8Uo0NO3F
To: /Users/AliTarik/OntoGPT_FHC_EFSA/data/tree.xlsx
100%|████████████████████████████████████████| 541k/541k [00:00<00:00, 19.8MB/s]


In [129]:
claims = pd.read_excel('data/tree.xlsx')

In [130]:
claims = claims.drop(index=0)

In [131]:
columns_to_keep = ['EFSA Opinion Reference','Food',
       'Target population',
       'Phenotype',
       'Supporting Evidence Reference 1',
       'Supporting Evidence Reference 2',
       'Supporting Evidence Reference 3',
       'Supporting Evidence Reference 4',
       'Supporting Evidence Reference',
       'Supporting Evidence Reference 5',
       'Supporting Evidence Reference 6',
       'Supporting Evidence Reference 7',
       'Supporting Evidence Reference 8',
       'Supporting Evidence Reference 9']
claims = claims[columns_to_keep]

In [132]:
claims.shape

(236, 14)

In [133]:
claims.head()

Unnamed: 0,EFSA Opinion Reference,Food,Target population,Phenotype,Supporting Evidence Reference 1,Supporting Evidence Reference 2,Supporting Evidence Reference 3,Supporting Evidence Reference 4,Supporting Evidence Reference,Supporting Evidence Reference 5,Supporting Evidence Reference 6,Supporting Evidence Reference 7,Supporting Evidence Reference 8,Supporting Evidence Reference 9
1,"2009;7(9):1252, 2011;9(6):2203",Alpha-linolenic acid (ALA),General population,Normal blood cholesterol,Mensink et al. (2003): 10.1093/ajcn/77.5.1146;...,,,,,,,,,
2,2011;9(4):2049,Activated charcoal,General population,Excessive flatulence,Jain et al. (1986a): PMID 3521259,Jain et al (1986b): 10.7326/0003-4819-105-1-61,Hall et al (1981): PMID 7015846,,,,,,,
3,2011;9(6):2249,Barley grain fibre,General population,Increase in faecal bulk,Cummings et al. (2001): 10.1093/ajcn/73.2.415s,,,,,,,,,
4,"2009;7(9):1254, 2011;9(6):2207",Beta-glucans,Adults with normal or mildly elevated blood ch...,Normal blood cholesterol,Ripsin et al. (1992): 10.1001/jama.1992.034802...,Newman et al. (1989): no DOI ; McIntosh et al....,,,,,,,,
5,2011;9(4):2052,Betaine,General population,Normal blood homocysteine,Schwab et al. (2002): 10.1093/ajcn/76.5.961,Olthof et al. (2003): 10.1093/jn/133.12.4135,Schwab et al. (2006): 10.1093/jn/136.1.34,,,,,,,


In [134]:
last_10 = claims.iloc[:,-10:]
claims['References'] = last_10.apply(lambda row: '|'.join(row.dropna().values.astype(str)), axis=1)
claims = claims.drop(claims.columns[-11:-1], axis=1)
claims['EFSA Opinion Reference'] = claims['EFSA Opinion Reference'].astype(str)
claims['Food'] = claims['Food'].astype(str)
claims.head()

Unnamed: 0,EFSA Opinion Reference,Food,Target population,Phenotype,References
1,"2009;7(9):1252, 2011;9(6):2203",Alpha-linolenic acid (ALA),General population,Normal blood cholesterol,Mensink et al. (2003): 10.1093/ajcn/77.5.1146;...
2,2011;9(4):2049,Activated charcoal,General population,Excessive flatulence,Jain et al. (1986a): PMID 3521259|Jain et al (...
3,2011;9(6):2249,Barley grain fibre,General population,Increase in faecal bulk,Cummings et al. (2001): 10.1093/ajcn/73.2.415s
4,"2009;7(9):1254, 2011;9(6):2207",Beta-glucans,Adults with normal or mildly elevated blood ch...,Normal blood cholesterol,Ripsin et al. (1992): 10.1001/jama.1992.034802...
5,2011;9(4):2052,Betaine,General population,Normal blood homocysteine,Schwab et al. (2002): 10.1093/ajcn/76.5.961|Ol...


In [135]:
claims.shape

(236, 5)

In [136]:
claims['EFSA Opinion Reference'] = claims['EFSA Opinion Reference'].str.split(',')
claims = claims.explode('EFSA Opinion Reference')

In [137]:
claims.shape

(304, 5)

In [138]:
claims.head()

Unnamed: 0,EFSA Opinion Reference,Food,Target population,Phenotype,References
1,2009;7(9):1252,Alpha-linolenic acid (ALA),General population,Normal blood cholesterol,Mensink et al. (2003): 10.1093/ajcn/77.5.1146;...
1,2011;9(6):2203,Alpha-linolenic acid (ALA),General population,Normal blood cholesterol,Mensink et al. (2003): 10.1093/ajcn/77.5.1146;...
2,2011;9(4):2049,Activated charcoal,General population,Excessive flatulence,Jain et al. (1986a): PMID 3521259|Jain et al (...
3,2011;9(6):2249,Barley grain fibre,General population,Increase in faecal bulk,Cummings et al. (2001): 10.1093/ajcn/73.2.415s
4,2009;7(9):1254,Beta-glucans,Adults with normal or mildly elevated blood ch...,Normal blood cholesterol,Ripsin et al. (1992): 10.1001/jama.1992.034802...


In [139]:
def process_efsa_reference(ref):
    parts = ref.split(';')
    if len(parts) < 2:
        return ref
    first_part = parts[0]
    second_parts = parts[1].split(':')
    if len(second_parts) < 2:
        return ref
    second_part = second_parts[1]
    return f"{first_part}_{second_part}"

def process_references(ref):
    citations = []
    dois = []
    for group in ref.split('|'):
        for part in group.split(';'):
            subparts = part.split(':')
            if len(subparts) == 2:
                citations.append(subparts[0].strip())
                dois.append(subparts[1].strip())
    return pd.Series([','.join(citations), ','.join(dois)])
def remove_parentheses(text):
    return pd.Series(text).str.replace(r'\(.*?\)', '', regex=True).str.strip()

In [140]:
claims['EFSA Opinion Reference'] = claims['EFSA Opinion Reference'].apply(process_efsa_reference)
claims[['Citations', 'DOI']] = claims['References'].apply(process_references)
claims = claims.drop(columns=['References'])
claims['Food'] = claims['Food'].apply(remove_parentheses)


In [141]:
claims.shape

(304, 6)

In [142]:
claims.head()

Unnamed: 0,EFSA Opinion Reference,Food,Target population,Phenotype,Citations,DOI
1,2009_1252,Alpha-linolenic acid,General population,Normal blood cholesterol,"Mensink et al. (2003),Mantzioris et al. (1994)...","10.1093/ajcn/77.5.1146,10.1093/ajcn/59.6.1304,..."
1,2011_2203,Alpha-linolenic acid,General population,Normal blood cholesterol,"Mensink et al. (2003),Mantzioris et al. (1994)...","10.1093/ajcn/77.5.1146,10.1093/ajcn/59.6.1304,..."
2,2011_2049,Activated charcoal,General population,Excessive flatulence,"Jain et al. (1986a),Jain et al (1986b),Hall et...","PMID 3521259,10.7326/0003-4819-105-1-61,PMID 7..."
3,2011_2249,Barley grain fibre,General population,Increase in faecal bulk,Cummings et al. (2001),10.1093/ajcn/73.2.415s
4,2009_1254,Beta-glucans,Adults with normal or mildly elevated blood ch...,Normal blood cholesterol,"Ripsin et al. (1992),Brown et al. (1999),Karma...","10.1001/jama.1992.03480240079039,10.1093/ajcn/..."


In [149]:
sum(claims["EFSA Opinion Reference"]=="")

1

In [143]:

base_directory = "path_to/EFSA_DOCUMENTATION"
grouped = claims.groupby('EFSA Opinion Reference')

for efsa_ref, group in grouped:
    folder_path = os.path.join(base_directory, efsa_ref)

    if not os.path.exists(folder_path):
        print(f"Directory {folder_path} does not exist. Skipping {efsa_ref}.")
        continue
    

    entries = []
    
    for index, row in group.iterrows():
        entry = {
            "Food": row["Food"],
            "Target_population": row["Target population"],
            "Phenotype": row["Phenotype"],
            "Citations": row["Citations"],
            "DOI": row["DOI"]
        }
        entries.append(entry)
    
    json_str = json.dumps(entries, indent=4)
    
    file_path = os.path.join(folder_path, "data.json")
    
    with open(file_path, "w") as json_file:
        json_file.write(json_str)

Directory /Users/AliTarik/Documents/EFSA_DOCUMENTATION/ 2009_1272 does not exist. Skipping  2009_1272.
Directory /Users/AliTarik/Documents/EFSA_DOCUMENTATION/ 2010_1725 does not exist. Skipping  2010_1725.
Directory /Users/AliTarik/Documents/EFSA_DOCUMENTATION/ 2010_1727 does not exist. Skipping  2010_1727.
Directory /Users/AliTarik/Documents/EFSA_DOCUMENTATION/ 2010_1728 does not exist. Skipping  2010_1728.
Directory /Users/AliTarik/Documents/EFSA_DOCUMENTATION/ 2010_1740 does not exist. Skipping  2010_1740.
Directory /Users/AliTarik/Documents/EFSA_DOCUMENTATION/ 2010_1754 does not exist. Skipping  2010_1754.
Directory /Users/AliTarik/Documents/EFSA_DOCUMENTATION/ 2010_1756 does not exist. Skipping  2010_1756.
Directory /Users/AliTarik/Documents/EFSA_DOCUMENTATION/ 2010_1757 does not exist. Skipping  2010_1757.
Directory /Users/AliTarik/Documents/EFSA_DOCUMENTATION/ 2010_1758 does not exist. Skipping  2010_1758.
Directory /Users/AliTarik/Documents/EFSA_DOCUMENTATION/ 2010_1760 does no