In [None]:
import pyarrow as pa
import pyarrow.json as paj
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pandas as pd
import json
import numpy as np
from IPython.core.display import display, HTML

# Location of the FooDB "Content" JSON export; the Parquet copy is written next to it.
json_file_path = r"C:\Users\bramd_finhsgu\OneDrive - UGent\Thesis\Thesis_bestanden\__MACOSX\foodb_2020_04_07_json\Content.json"
parquet_file_path = json_file_path.replace(".json", ".parquet")

# food_id values of the meats to keep:
# 334 chicken, 358 mallard duck, 483 mutton, 506 beef, 549 domesticated pig
# NOTE(review): the labels are taken from the original comment; confirm the id -> food mapping.
food_ids = [334, 358, 483, 506, 549]
columns_to_keep = [
    "id", "food_id", "orig_food_id", "orig_food_common_name",
    "source_id", "orig_source_id", "orig_source_name",
    "source_type", "orig_content", "orig_unit"
]

print("Converting JSON to Parquet (this may take a moment)...")
table = paj.read_json(json_file_path)
pq.write_table(table, parquet_file_path, compression="snappy")

print("Filtering data...")
table = pq.read_table(parquet_file_path)
# The lookup set must be built after `table` exists: it is cast to the dtype of the
# "food_id" column so that `is_in` compares values of the same Arrow type.
food_ids_arrow = pa.array(food_ids, type=table.column("food_id").type)
filtered_table = table.filter(pc.is_in(table["food_id"], value_set=food_ids_arrow))
# Keep only the columns of interest and hand the result to pandas.
foodb_meat = filtered_table.select(columns_to_keep).to_pandas()

  from IPython.core.display import display, HTML


Converting JSON to Parquet (this may take a moment)...
Filtering data...

Aantal rijen per food_id:
Voor food_id 334 zijn er 51588 rijen in de dataset.
Voor food_id 358 zijn er 49873 rijen in de dataset.
Voor food_id 483 zijn er 51102 rijen in de dataset.
Voor food_id 506 zijn er 138620 rijen in de dataset.
Voor food_id 549 zijn er 101966 rijen in de dataset.


In [57]:
# Keep only the rows that have a value in "orig_content"; renumber the index afterwards.
foodb_meat_filtered = foodb_meat.dropna(subset=["orig_content"]).reset_index(drop=True)

# Row counts per food_id before and after dropping the rows without content.
counts_before = foodb_meat["food_id"].value_counts().sort_index()
counts_after = foodb_meat_filtered["food_id"].value_counts().sort_index()

print("\nAantal rijen per food_id vóór en na filtering:")
# Iterate over the union of both indexes: a food_id that lost all of its rows is
# absent from `counts_after` and must still be reported (with a count of 0).
for food_id in sorted(set(counts_before.index) | set(counts_after.index)):
    count_before = counts_before.get(food_id, 0)
    count_after = counts_after.get(food_id, 0)
    percentage_remaining = (count_after / count_before * 100) if count_before > 0 else 0
    print(f"Food ID {food_id}: {count_before} rijen initieel → {count_after} rijen na filtering ({percentage_remaining:.2f}%)")

total_before = len(foodb_meat)
total_after = len(foodb_meat_filtered)
# Same zero guard as in the loop above, so an empty selection prints 0.00% instead of
# raising ZeroDivisionError.
total_percentage = (total_after / total_before * 100) if total_before > 0 else 0
print(f"\nIn totaal waren er initieel {total_before} rijen, maar na filtering blijven er slechts {total_after} over. ({total_percentage:.2f}% resterend)\n")

def calculate_missing_matrix(df):
    """Return, per food_id, the percentage of missing values in every column.

    The result has one row per column of ``df`` (including ``food_id`` itself) and
    one column per distinct ``food_id``. An extra row ``"% HMDB aanwezigheid"`` holds
    the percentage of rows of that food whose ``orig_food_common_name`` equals
    ``"Endogenous compounds from human (HMDB)"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``food_id`` and ``orig_food_common_name``.

    Returns
    -------
    pandas.DataFrame
        Percentages in the range 0-100.
    """
    food_id_key = df["food_id"]

    # Group the boolean "is missing" frame by the food_id values instead of calling
    # groupby(...).apply on the frame itself: apply operating on the grouping column
    # is deprecated in pandas, and this form keeps the "food_id" row in the result.
    missing_matrix = (df.isnull().groupby(food_id_key).mean() * 100).T

    # Share of rows per food whose common name is the HMDB placeholder entry.
    hmdb_percentage = (
        df["orig_food_common_name"]
        .eq("Endogenous compounds from human (HMDB)")
        .groupby(food_id_key)
        .mean()
        * 100
    )

    # Appended as an extra row; the Series is aligned on the food_id columns.
    missing_matrix.loc["% HMDB aanwezigheid"] = hmdb_percentage
    return missing_matrix

# Missing-value profile of the full selection and of the selection after the
# rows without content were dropped.
missing_matrix_before, missing_matrix_after = (
    calculate_missing_matrix(frame) for frame in (foodb_meat, foodb_meat_filtered)
)

# Put both profiles side by side under a two-level column header.
comparison_matrix = pd.concat(
    [missing_matrix_before, missing_matrix_after],
    axis=1,
    keys=["Vóór filtering", "Na filtering"],
)

# Widen the console output and show every column so the matrix is not truncated.
for option_name, option_value in (("display.width", 200), ("display.max_columns", None)):
    pd.set_option(option_name, option_value)

print(
    "\nMatrix van ontbrekende waarden en HMDB-aanwezigheid per food_id (%):",
    comparison_matrix,
    sep="\n",
)



Aantal rijen per food_id vóór en na filtering:
Food ID 334: 51588 rijen initieel → 1715 rijen na filtering (3.32%)
Food ID 358: 49873 rijen initieel → 0 rijen na filtering (0.00%)
Food ID 483: 51102 rijen initieel → 1229 rijen na filtering (2.40%)
Food ID 506: 138620 rijen initieel → 88746 rijen na filtering (64.02%)
Food ID 549: 101966 rijen initieel → 52093 rijen na filtering (51.09%)

In totaal waren er initieel 393149 rijen, maar na filtering blijven er slechts 143783 over. (36.57% resterend)


Matrix van ontbrekende waarden en HMDB-aanwezigheid per food_id (%):
                      Vóór filtering                                          Na filtering                                 
food_id                          334    358         483        506        549          334         483       506        549
id                          0.000000    0.0    0.000000   0.000000   0.000000     0.000000    0.000000  0.000000   0.000000
food_id                     0.000000    0.0    0.00000

  missing_matrix = df.groupby("food_id").apply(lambda x: x.isnull().mean() * 100).T
  missing_matrix = df.groupby("food_id").apply(lambda x: x.isnull().mean() * 100).T


In [37]:
import json
import tempfile
import os
import shutil
import subprocess

# Food to inspect; change this value to look at another food_id.
selected_food_id = 358

# Rows of the unfiltered selection that belong to the chosen food.
filtered_df = foodb_meat[foodb_meat["food_id"] == selected_food_id]

# One dict per row. Missing values are replaced by None first so they are written as
# JSON null; json.dump would otherwise emit the non-standard token NaN for them.
json_data = (
    filtered_df.astype(object)
    .where(filtered_df.notna(), None)
    .to_dict(orient="records")
)
temp_json_path = os.path.join(tempfile.gettempdir(), "temp_json_view.json")

# Write the records as JSON Lines: one compact object per line.
with open(temp_json_path, "w", encoding="utf-8") as f:
    for record in json_data:
        json.dump(record, f, separators=(',', ':'), ensure_ascii=False)
        f.write("\n")

# Open the file in VS Code. The path is passed as a separate argument rather than
# interpolated into a shell command, so temp directories containing spaces work.
# shutil.which resolves the launcher (e.g. code.cmd on Windows) to a full path.
code_executable = shutil.which("code")
if code_executable is None:
    print(f"VS Code ('code') niet gevonden; JSON-bestand opgeslagen in {temp_json_path}")
else:
    launch_result = subprocess.run([code_executable, temp_json_path], check=False)
    if launch_result.returncode == 0:
        print(f"JSON-bestand geopend in VS Code voor food_id {selected_food_id}")
    else:
        print(f"VS Code kon niet gestart worden (exit code {launch_result.returncode}); JSON-bestand opgeslagen in {temp_json_path}")


JSON-bestand geopend in VS Code voor food_id 358
