# Source:

https://fdc.nal.usda.gov/download-datasets.html

In [1]:
import json

In [2]:
# Path to the USDA FoodData Central "SR Legacy" JSON export, relative to the
# current working directory.
file_path = './sr_legacy_food_json_2021-10-28.json'

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep only the list of food records stored under the top-level key.
data = data['SRLegacyFoods']


In [3]:
def extract_nested_values(data, fields):
    """Extract a selected subset of values from one food record.

    Args:
        data: A single food record as a dict.
        fields: Iterable of dot-separated key paths, e.g. ``"description"``
            or ``"foodCategory.description"``.

    Returns:
        A new dict, keyed in order of first appearance in ``fields``:

        * Any field whose first segment is ``"foodNutrients"`` produces a
          single ``"foodNutrients"`` entry: a list with one
          ``{"name", "unitName", "amount"}`` dict per nutrient. The rest of
          the path is ignored; all three values are always extracted.
        * Any field whose first segment is ``"foodCategory"`` produces a
          single ``"foodCategory"`` entry of the form
          ``{"description": ...}``; the rest of the path is ignored.
        * Every other field is stored under its full dotted path. Lists
          met along the path are mapped element-wise, and missing keys
          yield ``None``.
    """

    def get_nested_value(d, keys):
        # Follow `keys` through nested dicts. A list at any level is mapped
        # element-wise with the remaining keys; anything that cannot be
        # descended into (missing key, scalar) resolves to None.
        if not keys:
            return d
        if isinstance(d, list):
            return [get_nested_value(item, keys) for item in d]
        if isinstance(d, dict) and keys[0] in d:
            return get_nested_value(d[keys[0]], keys[1:])
        return None

    extracted_data = {}
    for field in fields:
        keys = field.split('.')
        root = keys[0]
        if root == "foodNutrients":
            # Several fields usually share this prefix and all map to the
            # same entry, so build it only once.
            if root in extracted_data:
                continue
            # `or []` also covers a key that is present but null, which
            # would otherwise fail on iteration.
            nutrients = data.get("foodNutrients") or []
            extracted_data[root] = [
                {
                    "name": get_nested_value(nutrient, ["nutrient", "name"]),
                    "unitName": get_nested_value(nutrient, ["nutrient", "unitName"]),
                    "amount": get_nested_value(nutrient, ["amount"]),
                }
                for nutrient in nutrients
            ]
        elif root == "foodCategory":
            if root in extracted_data:
                continue
            food_category = data.get("foodCategory", {})
            extracted_data[root] = {
                "description": get_nested_value(food_category, ["description"])
            }
        else:
            extracted_data[field] = get_nested_value(data, keys)

    return extracted_data


def process_data_list(data_list, fields):
    """Apply ``extract_nested_values`` to every record and collect the results.

    Args:
        data_list: Iterable of food records.
        fields: Dot-separated key paths passed through unchanged for each
            record.

    Returns:
        A list with one extracted dict per input record, in input order.
    """
    results = []
    for record in data_list:
        results.append(extract_nested_values(record, fields))
    return results


# Dot-separated key paths to pull out of every food record.
fields = [
    "description",
    "foodNutrients.nutrient.name",
    "foodNutrients.nutrient.unitName",
    "foodNutrients.amount",
    "publicationDate",
    "foodCategory.description",
]

# One reduced dict per food record in the loaded dataset.
extracted_data = process_data_list(data_list=data, fields=fields)

## DB Schema Design
1. Food Table:
- food_id (Primary Key)
- food_name
- category_id (Foreign Key)
- publication_date

2. Category Table:
- category_id (Primary Key)
- category_name

3. Nutrient Table:
- nutrient_id (Primary Key)
- nutrient_name

4. FoodNutrient Table (Mapping Table):
- food_nutrient_id (Primary Key)
- food_id (Foreign Key)
- nutrient_id (Foreign Key)
- amount
- unit_name

### ER Diagram:
- Food -> Category (Many to One)
- Food -> FoodNutrient (One to Many)
- Nutrient -> FoodNutrient (One to Many)


### Indexing
To optimize search queries, particularly semantic searches, consider:

1. Full-text Index:
Create a full-text index on `food_name` and `nutrient_name` to improve the performance of text searches.

2. B-tree Index:
Create B-tree indexes on `category_id` and `publication_date` to optimize filter queries.


# Store data into DB

In [4]:
import sys
import os

# Resolve the project root: two directory levels above the current working
# directory.
root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Put the project root at the front of sys.path, unless it is already listed,
# so that imports are resolved against it first.
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)

root_dir

'/Users/albert/Downloads/project/nutrisense'

In [5]:
from src.container import Application
# from src.modules.food.domain.models.food import Food
# from src.modules.food.domain.models.category import Category

# Create the project's application container and load its settings from the
# YAML config file (the path is relative to the current working directory).
application = Application()
application.config.from_yaml("../../src/config.yml")


# embedding_service_factory = application.llm_package.embedding_service
# food_repo_factory = application.food_package.food_repo

# NOTE(review): `food_service` appears to be a provider/factory exposed by the
# container (it is called below to obtain the object) — confirm against
# src.container.
factory = application.food_package.food_service
# embedding_service = embedding_service_factory()
# food_repo = food_repo_factory()
# Calling the factory yields the food service object used by the later cells.
instance = factory()

In [6]:
# Show how many food records were extracted.
len(extracted_data)

7793

In [7]:
# Pass records 1500-2499 of the extracted data to the food service.
# NOTE(review): judging by the log output, load_data generates an embedding
# per food and gets-or-creates its category and food rows — confirm against
# src.modules.food.domain.food_service.
instance.load_data(extracted_data[1500:2500])

  0%|          | 0/49 [00:00<?, ?it/s][32m[INFO] 2024-06-23 14:17:42,082[0m | module: src.modules.food.domain.food_service | line no: 37 | Processing item 1/49: Turnover, filled with egg, meat and cheese, frozen[0m
[32m[INFO] 2024-06-23 14:17:42,086[0m | module: src.modules.llm.infrastructure.openai_repository | line no: 23 | Generating embedding for text: Turnover, filled with egg, meat and cheese, frozen[0m
[32m[INFO] 2024-06-23 14:17:42,791[0m | module: src.modules.food.infrastructure.data.category_repository | line no: 32 | Getting or creating category with name: Meals, Entrees, and Side Dishes[0m
[32m[INFO] 2024-06-23 14:17:42,792[0m | module: src.modules.food.infrastructure.data.category_repository | line no: 17 | Getting category with name: Meals, Entrees, and Side Dishes[0m
[32m[INFO] 2024-06-23 14:17:43,823[0m | module: src.modules.food.infrastructure.data.food_repository | line no: 35 | Getting or creating food with name: Turnover, filled with egg, meat and chee