In [2]:
import os
import re
import json
import shutil
import textract
import datetime
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
import subprocess

In [33]:
!mkdir data
!mkdir pdfs
!gdown 1Xgb7GwakWa_wEiOAeiFz-sUg8Uo0NO3F -O data/tree.xlsx
!rm -r pdfs

mkdir: data: File exists


Downloading...
From: https://drive.google.com/uc?id=1Xgb7GwakWa_wEiOAeiFz-sUg8Uo0NO3F
To: /Users/AliTarik/OntoGPT_FHC_EFSA/data/tree.xlsx
100%|████████████████████████████████████████| 541k/541k [00:00<00:00, 19.8MB/s]


In [69]:
claims = pd.read_excel('data/tree.xlsx')

In [70]:
claims = claims.drop(index=0)

In [71]:
columns_to_keep = ['EFSA Opinion Reference','Food',
       'Target population',
       'Phenotype',
       'Supporting Evidence Reference 1',
       'Supporting Evidence Reference 2',
       'Supporting Evidence Reference 3',
       'Supporting Evidence Reference 4',
       'Supporting Evidence Reference',
       'Supporting Evidence Reference 5',
       'Supporting Evidence Reference 6',
       'Supporting Evidence Reference 7',
       'Supporting Evidence Reference 8',
       'Supporting Evidence Reference 9']
claims = claims[columns_to_keep]

In [72]:
last_10 = claims.iloc[:,-10:]
claims['References'] = last_10.apply(lambda row: '|'.join(row.dropna().values.astype(str)), axis=1)
claims = claims.drop(claims.columns[-11:-1], axis=1)
claims['EFSA Opinion Reference'] = claims['EFSA Opinion Reference'].astype(str)
claims['Food'] = claims['Food'].astype(str)

In [73]:
claims['EFSA Opinion Reference'] = claims['EFSA Opinion Reference'].str.split(',')
claims = claims.explode('EFSA Opinion Reference')

In [74]:
claims.head()

Unnamed: 0,EFSA Opinion Reference,Food,Target population,Phenotype,References
1,2009;7(9):1252,Alpha-linolenic acid (ALA),General population,Normal blood cholesterol,Mensink et al. (2003): 10.1093/ajcn/77.5.1146;...
1,2011;9(6):2203,Alpha-linolenic acid (ALA),General population,Normal blood cholesterol,Mensink et al. (2003): 10.1093/ajcn/77.5.1146;...
2,2011;9(4):2049,Activated charcoal,General population,Excessive flatulence,Jain et al. (1986a): PMID 3521259|Jain et al (...
3,2011;9(6):2249,Barley grain fibre,General population,Increase in faecal bulk,Cummings et al. (2001): 10.1093/ajcn/73.2.415s
4,2009;7(9):1254,Beta-glucans,Adults with normal or mildly elevated blood ch...,Normal blood cholesterol,Ripsin et al. (1992): 10.1001/jama.1992.034802...


In [75]:
def process_efsa_reference(ref):
    parts = ref.split(';')
    if len(parts) < 2:
        return ref
    first_part = parts[0]
    second_parts = parts[1].split(':')
    if len(second_parts) < 2:
        return ref
    second_part = second_parts[1]
    return f"{first_part}_{second_part}"

def process_references(ref):
    citations = []
    dois = []
    for part in ref.split(';'):
        subparts = part.split(':')
        if len(subparts) == 2:
            citations.append(subparts[0].strip())
            dois.append(subparts[1].strip())
    return pd.Series([','.join(citations), ','.join(dois)])
def remove_parentheses(text):
    return pd.Series(text).str.replace(r'\(.*?\)', '', regex=True).str.strip()

In [76]:
claims['EFSA Opinion Reference'] = claims['EFSA Opinion Reference'].apply(process_efsa_reference)
claims[['Citations', 'DOI']] = claims['References'].apply(process_references)
claims = claims.drop(columns=['References'])
claims['Food'] = claims['Food'].apply(remove_parentheses)
claims.head()

Unnamed: 0,EFSA Opinion Reference,Food,Target population,Phenotype,Citations,DOI
1,2009_1252,Alpha-linolenic acid,General population,Normal blood cholesterol,"Mensink et al. (2003),Mantzioris et al. (1994)...","10.1093/ajcn/77.5.1146,10.1093/ajcn/59.6.1304,..."
1,2011_2203,Alpha-linolenic acid,General population,Normal blood cholesterol,"Mensink et al. (2003),Mantzioris et al. (1994)...","10.1093/ajcn/77.5.1146,10.1093/ajcn/59.6.1304,..."
2,2011_2049,Activated charcoal,General population,Excessive flatulence,,
3,2011_2249,Barley grain fibre,General population,Increase in faecal bulk,Cummings et al. (2001),10.1093/ajcn/73.2.415s
4,2009_1254,Beta-glucans,Adults with normal or mildly elevated blood ch...,Normal blood cholesterol,"Ripsin et al. (1992),Brown et al. (1999),Karma...","10.1001/jama.1992.03480240079039,10.1093/ajcn/..."


In [79]:
json_dict = {}
for index, row in claims.iterrows():
    key = row["EFSA Opinion Reference"]
    json_dict[key] = {
        "Food": row["Food"],
        "Target_population": row["Target population"],
        "Phenotype": row["Phenotype"],
        "Citations": row["Citations"],
        "DOI": row["DOI"]
    }


json_str = json.dumps(json_dict, indent=4)

directory = "/Users/AliTarik/OntoGPT_FHC_EFSA"
file_name = "data.json"
file_path = os.path.join(directory, file_name)
os.makedirs(directory, exist_ok=True)

with open(file_path, "w") as json_file:
    json_file.write(json_str)
