## The research part

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import joblib

Reading given file and converting it to data frame

In [2]:
df=pd.read_csv('./data/epi_r.csv')
df.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
len(df)

20052

In [4]:
df.columns

Index(['title', 'rating', 'calories', 'protein', 'fat', 'sodium', '#cakeweek',
       '#wasteless', '22-minute meals', '3-ingredient recipes',
       ...
       'yellow squash', 'yogurt', 'yonkers', 'yuca', 'zucchini', 'cookbooks',
       'leftovers', 'snack', 'snack week', 'turkey'],
      dtype='object', length=680)

Whitelist for filtering

In [5]:
whitelist = {
    'milk', 'sugar', 'egg', 'butter', 'salt', 'pepper', 'carrot', 'onion',
    'chicken', 'beef', 'pork', 'fish', 'rice', 'bread', 'flour', 'water',
    'garlic', 'honey', 'jam', 'tomato', 'cheese', 'cream', 'spinach',
    'yogurt', 'zucchini', 'potato', 'almond', 'banana', 'apple', 'lemon',
    'lime', 'orange', 'cucumber', 'celery', 'parsley', 'basil', 'oregano',
    'thyme', 'rosemary', 'cinnamon', 'nutmeg', 'ginger', 'vanilla', 'chocolate',
    'coffee', 'tea', 'vinegar', 'oil', 'olive oil', 'vegetable oil', 'buttermilk',
    'mayonnaise', 'ketchup', 'mustard', 'soy sauce', 'tofu', 'lentils',
    'beans', 'peas', 'corn', 'pumpkin', 'mushroom', 'onion powder', 'garlic powder',
    'paprika', 'cumin', 'coriander', 'turmeric', 'chili', 'cabbage', 'lettuce',
    'broccoli', 'cauliflower', 'eggplant', 'sweet potato', 'maple syrup',
    'molasses', 'baking powder', 'baking soda', 'yeast', 'sour cream',
    'coconut', 'pineapple', 'strawberry', 'blueberry', 'raspberry',
    'blackberry', 'peach', 'pear', 'plum', 'apricot', 'fig', 'date',
    'walnut', 'pecan', 'cashew', 'pistachio', 'hazelnut', 'sesame',
    'chia', 'quinoa', 'barley', 'oats', 'buckwheat', 'rye', 'spelt',
    'mustard seed', 'clove', 'anise', 'cardamom', 'star anise', 'bay leaf',
    'dill', 'chives', 'scallion', 'shallot', 'leek', 'coriander leaf',
    'capsicum', 'bell pepper', 'jalapeno', 'habanero', 'chili powder',
    'brown sugar', 'powdered sugar', 'cornstarch', 'arrowroot', 'gelatin',
    'almond milk', 'coconut milk', 'rice milk', 'cashew milk',
    'maple syrup', 'agave', 'sesame oil', 'peanut', 'peanut butter',
    'beef broth', 'chicken broth', 'vegetable broth', 'mayo',
    'whipped cream', 'cream cheese', 'feta', 'mozzarella', 'parmesan',
    'cheddar', 'swiss', 'gouda', 'brie', 'ricotta', 'blue cheese',
    'tofu', 'tempeh', 'seitan', 'black beans', 'kidney beans',
    'chickpeas', 'edamame', 'seaweed', 'wasabi', 'sriracha',
    'tahini', 'miso', 'salsa', 'guacamole', 'curry powder',
    'curry paste', 'lemongrass', 'galangal', 'kaffir lime leaf',
    'ginger powder', 'cocoa powder', 'dark chocolate', 'white chocolate',
    'matcha', 'espresso', 'espresso powder', 'coconut sugar',
    'brown rice', 'wild rice', 'basmati rice', 'jasmine rice',
    'zucchini noodles', 'pasta', 'noodles', 'breadcrumbs',
    'chicken breast', 'ground beef', 'ground turkey',
    'bacon', 'sausage', 'ham', 'salami', 'prosciutto', 'shrimp',
    'crab', 'lobster', 'salmon', 'tuna', 'cod', 'tilapia', 'trout',
    'scallops', 'oysters', 'mussels', 'clams', 'octopus',
    'egg yolk', 'egg white', 'quail egg', 'duck egg',
    'butternut squash', 'acorn squash', 'spaghetti squash',
    'sweet corn', 'baby corn', 'edible flowers', 'microgreens',
    'arugula', 'kale', 'collard greens', 'bok choy',
    'brussels sprouts', 'artichoke', 'asparagus',
    'beets', 'fennel', 'turnip', 'parsnip', 'radish', 'rutabaga',
    'watercress', 'endive', 'chicory', 'dandelion greens',
    'mustard greens', 'pea shoots', 'snap peas', 'snow peas',
    'tomatillo', 'avocado', 'plantain', 'cassava', 'yam',
    'sorghum', 'teff', 'amaranth', 'millet',
}

ingredient_columns = [col for col in df.columns if col.lower() in whitelist]


df_ingredients = df[ingredient_columns]

  


In [6]:
df_ingredients.columns

Index(['almond', 'anise', 'apple', 'apricot', 'artichoke', 'arugula',
       'asparagus', 'avocado', 'bacon', 'banana',
       ...
       'tuna', 'turnip', 'vanilla', 'vinegar', 'walnut', 'wasabi',
       'watercress', 'wild rice', 'yogurt', 'zucchini'],
      dtype='object', length=139)

Making **X** and **y**

In [7]:

X=df_ingredients.copy()
y=df['rating']

Saving colimns order for future using

In [8]:
joblib.dump(X.columns.tolist(), "./data/ingredient_columns.pkl")

['./data/ingredient_columns.pkl']

## The regression part

In [9]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error


Spliting X and y on test and train cases

In [10]:
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.2, random_state=21)

Calculating the Naive RMSE

In [11]:
mean_rating=y_train.mean()
y_pred_naive=np.full_like(y_test, fill_value=mean_rating, dtype=np.float64)

rmse_naive=np.sqrt(mean_squared_error(y_test, y_pred_naive))

print(f"Naive regrtessor RMSE: {rmse_naive}")

Naive regrtessor RMSE: 1.307237065771332


Grid Searching best params for model

In [12]:
models_params = {
    'Ridge': {
        'model': Ridge(random_state=21),
        'params': {'alpha': [0.1, 1, 10]}
    },
    'Lasso': {
        'model': Lasso(random_state=21),
        'params': {
            'alpha': [0.001, 0.01, 0.1, 1],
            'max_iter': [1000, 5000, 10000]
        }
    },
    'ElasticNet': {
        'model': ElasticNet(random_state=21),
        'params': {
            'alpha': [0.1, 1, 10],
            'l1_ratio': [0.1, 0.5, 0.9],
            'max_iter': [1000, 5000]
        }
    },
    'RandomForest': {
        'model': RandomForestRegressor(random_state=21),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 20, None]
        }
    },
    'GradientBoostingRegressor': {
        'model': GradientBoostingRegressor(random_state=21),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5, 10]
        }
    },
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor(random_state=21),
        'params': {
            'max_depth': [3, 5, 10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [None, 'sqrt', 'log2'],  
            'max_leaf_nodes': [None, 10, 20, 50],
            'min_weight_fraction_leaf': [0.0, 0.01, 0.05]
        }
    }
}


best_model={}

for name, mp, in models_params.items():
    print(f"Teach and search hyperparameters for {name}")
    grid=GridSearchCV(mp['model'], mp['params'], cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model[name]=grid.best_estimator_
    print(f"The best params{grid.best_params_}")

Teach and search hyperparameters for Ridge
The best params{'alpha': 10}
Teach and search hyperparameters for Lasso
The best params{'alpha': 0.001, 'max_iter': 1000}
Teach and search hyperparameters for ElasticNet
The best params{'alpha': 0.1, 'l1_ratio': 0.1, 'max_iter': 1000}
Teach and search hyperparameters for RandomForest
The best params{'max_depth': 20, 'n_estimators': 200}
Teach and search hyperparameters for GradientBoostingRegressor
The best params{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
Teach and search hyperparameters for DecisionTreeRegressor
The best params{'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.01}


In [13]:
for name, model in best_model.items():
    y_pred=model.predict(X_test)
    rmse=np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{name} RMSE on test: {rmse:.4f}")

Ridge RMSE on test: 1.2893
Lasso RMSE on test: 1.2890
ElasticNet RMSE on test: 1.3056
RandomForest RMSE on test: 1.2976
GradientBoostingRegressor RMSE on test: 1.2912
DecisionTreeRegressor RMSE on test: 1.2959


The best regression model is **Lasso** with this params: **{'alpha': 0.001, 'max_iter': 1000}**

## The classification part

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


Do the rounding for classification cases

In [15]:
df['rating_class']=df['rating'].round().astype(int)

y=df['rating_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)


In [16]:
models_params = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=5000),
        'params': {'C': [0.01, 0.1, 1, 10]}
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=21),
        'params': {'n_estimators': [50, 100], 'max_depth': [5, 10, 20]}
    },
    'SVC': {
        'model': SVC(),
        'params': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=21),
        'params': {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]}
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [3, 5, 7, 9]}
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        'params': {} 
    },
    'MLP': {
        'model': MLPClassifier(max_iter=1000, random_state=21),
        'params': {'hidden_layer_sizes': [(50,), (100,)], 'alpha': [0.0001, 0.001]}
    }
}


Grid searching for classification models

In [17]:
best_models = {}
for name, mp in models_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='accuracy', n_jobs=-1)
    clf.fit(X_train, y_train)
    best_models[name] = clf
    print(f"{name}:")
    print("  Best params:", clf.best_params_)
    print("  Best CV accuracy:", clf.best_score_)
    print()


LogisticRegression:
  Best params: {'C': 0.1}
  Best CV accuracy: 0.6548220248692697

RandomForest:
  Best params: {'max_depth': 20, 'n_estimators': 50}
  Best CV accuracy: 0.6547596613017161

SVC:
  Best params: {'C': 1, 'kernel': 'rbf'}
  Best CV accuracy: 0.6552584532747284

GradientBoosting:
  Best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
  Best CV accuracy: 0.6543856353196162

KNN:
  Best params: {'n_neighbors': 9}
  Best CV accuracy: 0.6283890810524329

NaiveBayes:
  Best params: {}
  Best CV accuracy: 0.027866004201089672





MLP:
  Best params: {'alpha': 0.001, 'hidden_layer_sizes': (100,)}
  Best CV accuracy: 0.5961606578754111



In [18]:
for name, clf in best_models.items():
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Test Accuracy: {acc:.4f}")


LogisticRegression Test Accuracy: 0.6707
RandomForest Test Accuracy: 0.6722
SVC Test Accuracy: 0.6724
GradientBoosting Test Accuracy: 0.6694
KNN Test Accuracy: 0.6507
NaiveBayes Test Accuracy: 0.0239
MLP Test Accuracy: 0.6006


In [19]:
most_common_class = df['rating_class'].mode()[0]
naive_accuracy = (y_test == most_common_class).mean()
print(f"The naive accuarcy: {naive_accuracy:.4f}")

The naive accuarcy: 0.6707


Categoraze the rating on bad, so-so, great

In [20]:
def categoraze_rating(r):
    if r in [0,1]:
        return 'bad'
    elif r in [2,3]:
        return 'so-so'
    elif r in [4,5]:
        return 'great'

df['rating_category']=df['rating_class'].apply(categoraze_rating)

In [21]:
y=df['rating_category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [22]:
models_params = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=5000),
        'params': {
            'C': [0.01, 0.1, 1, 10],
            'class_weight': ['balanced']  
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=21),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [5, 10, 20],
            'class_weight': ['balanced', 'balanced_subsample']
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'class_weight': ['balanced']
        }
    }
}


Grid Searching

In [23]:
best_models = {}
for name, mp in models_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='accuracy', n_jobs=-1)
    clf.fit(X_train, y_train)
    best_models[name] = clf
    print(f"{name}:")
    print("  Best params:", clf.best_params_)
    print("  Best CV accuracy:", clf.best_score_)
    print()


LogisticRegression:
  Best params: {'C': 1, 'class_weight': 'balanced'}
  Best CV accuracy: 0.41568535035113985

RandomForest:
  Best params: {'class_weight': 'balanced_subsample', 'max_depth': 20, 'n_estimators': 100}
  Best CV accuracy: 0.5166135961125544

SVC:
  Best params: {'C': 10, 'class_weight': 'balanced', 'kernel': 'rbf'}
  Best CV accuracy: 0.5513996055358643



In [24]:
for name, clf in best_models.items():
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Test Accuracy: {acc:.4f}")

LogisticRegression Test Accuracy: 0.4191
RandomForest Test Accuracy: 0.5196
SVC Test Accuracy: 0.5585


Researching part how to improve the model

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = RandomForestClassifier(
    max_depth=10,
    min_samples_leaf=1,
    n_estimators=50,
    class_weight='balanced_subsample',
    random_state=42
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         bad       0.14      0.72      0.24       372
       great       0.85      0.44      0.58      3200
       so-so       0.15      0.17      0.16       439

    accuracy                           0.44      4011
   macro avg       0.38      0.44      0.33      4011
weighted avg       0.71      0.44      0.50      4011



In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=21)


grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=21, class_weight='balanced_subsample'),
    param_grid={
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_leaf': [1, 2, 4]
    },
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)
print("Best weighted F1:", grid.best_score_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best params: {'max_depth': 30, 'min_samples_leaf': 1, 'n_estimators': 200}
Best weighted F1: 0.3849074096507138


In [27]:
from sklearn.metrics import classification_report

y_pred = grid.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

         bad      0.165     0.624     0.260       372
       great      0.852     0.607     0.709      3200
       so-so      0.189     0.139     0.160       439

    accuracy                          0.557      4011
   macro avg      0.402     0.456     0.376      4011
weighted avg      0.715     0.557     0.607      4011



In [28]:
best_rf_model=RandomForestClassifier(random_state=21)
params_rf = {
    'n_estimators': [50, 100],
    'max_depth': [20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'class_weight': [None]
}


grid_rf=GridSearchCV(best_rf_model, params_rf, cv=5,scoring='accuracy',n_jobs=-1)
grid_rf.fit(X_train, y_train)
print("  Best params:", grid_rf.best_params_)
print("  Best CV accuracy:", grid_rf.best_score_)

  Best params: {'class_weight': None, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
  Best CV accuracy: 0.7923446098061173


In [29]:

from sklearn.utils import resample
import pandas as pd


df_train = X_train.copy()
df_train['target'] = y_train


df_great = df_train[df_train['target'] == 'great']
df_bad = df_train[df_train['target'] == 'bad']
df_soso = df_train[df_train['target'] == 'so-so']


n_samples = min(len(df_bad), len(df_soso))
df_great_down = resample(df_great, replace=False, n_samples=n_samples, random_state=42)


df_balanced = pd.concat([df_great_down, df_bad, df_soso])
X_bal = df_balanced.drop(columns='target')
y_bal = df_balanced['target']


cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=21)


grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=21, class_weight='balanced_subsample'),
    param_grid={
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_leaf': [1, 2, 4]
    },
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_bal, y_bal)
print("Best params:", grid.best_params_)
print("Best weighted F1:", grid.best_score_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best params: {'max_depth': 20, 'min_samples_leaf': 1, 'n_estimators': 200}
Best weighted F1: 0.432210251113532


In [30]:
y_pred = grid.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         bad       0.15      0.70      0.25       372
       great       0.86      0.35      0.50      3200
       so-so       0.13      0.28      0.17       439

    accuracy                           0.38      4011
   macro avg       0.38      0.44      0.31      4011
weighted avg       0.72      0.38      0.44      4011



Saving the balanced model

In [31]:
import joblib
joblib.dump(grid.best_estimator_, './data/balanced_random_forest.pkl')


['./data/balanced_random_forest.pkl']

Accuracy of non balanced model

In [32]:
from sklearn.ensemble import RandomForestClassifier
best_model=RandomForestClassifier(class_weight=None, max_depth=20, max_features='sqrt', min_samples_leaf=2, min_samples_split=2, n_estimators=50, random_state=21)
best_model.fit(X_train, y_train)

y_pred=best_model.predict(X_test)

acc=accuracy_score(y_test, y_pred)

print(f"The accuarcy: {acc*100:.2f}%")

The accuarcy: 79.78%


Saving non balanced model

In [33]:
import joblib

joblib.dump(best_model,'./data/model_79_acc.pkl')

['./data/model_79_acc.pkl']

## Nutrients

Loading the api key from .env

In [34]:
from dotenv import load_dotenv
import os
load_dotenv()
api_key = os.getenv("API_KEY")

Parsing the Nutrients from API

In [35]:
import httpx 
import time 
import json
import nest_asyncio
import asyncio
BASE_URL="https://api.nal.usda.gov/fdc/v1/foods/search"

async def fetch_fdc_id(client, ingredient):
    params = {
        "query": ingredient,
        "pageSize": 1,
        "api_key": api_key
    }
    try:
        r = await client.get(BASE_URL, params=params)
        r.raise_for_status()
        data = r.json()
        if data["foods"]:
            return ingredient, data["foods"][0]["fdcId"]
    except Exception as e:
        print(f"{ingredient}: error — {e}")
    return ingredient, None

async def main(ingredients):
    fdc_ids = {}
    async with httpx.AsyncClient(timeout=10) as client:
        tasks = [fetch_fdc_id(client, ing) for ing in ingredients]
        results = await asyncio.gather(*tasks)
        fdc_ids = dict(results)

    with open("./data/fdc_ids_async.json", "w") as f:
        json.dump(fdc_ids, f, indent=2)

    print("Готово. Найдено:", len(fdc_ids))

ingredients = df_ingredients.columns.tolist()
nest_asyncio.apply() 
await main(ingredients)

Готово. Найдено: 139


In [36]:
BASE_URL = 'https://api.nal.usda.gov/fdc/v1/food/'

async def get_nutrients(fcd_id):
    url=f"{BASE_URL}{fcd_id}?api_key={api_key}"
    
    async with httpx.AsyncClient(timeout=10) as client:
        try:
            resp=await client.get(url)
            if resp.status_code==200:
                data=resp.json()
                nutrients={}
                for nutrient in data.get('foodNutrients', []):
                    name=nutrient.get('nutrient', {}).get('name')
                    amount=nutrient.get('amount')
                    unit=nutrient.get('nutrient', {}).get('unitName')
                    if name and amount:
                        nutrients[name]=(amount, unit)
                return nutrients
        except Exception as e:
            print(e)
        return None



In [37]:
        
with open("./data/fdc_ids_async.json", "r") as f:
    ingredient_ids = json.load(f)

results={}

async def fetch_all():
    for ing, fdc_id in ingredient_ids.items():
        nutrients = await get_nutrients(fdc_id)
        results[ing] = nutrients

await fetch_all()




The data about nutrients and daily gram given from the task

In [38]:
whitelist_nutrients={'Fat':78, 
                     'Saturated fat':20,
                     'Cholesterol':300, 
                     'Total carbohydrates':275,
                     'Sodium':0.23,
                     'Dietary Fiber':28, 
                     'Protein':50,
                     'Added sugars':50,
                     'Vitamin A':0.9, 
                     'Vitamin C':0.09,
                     'Calcium':0.13,
                     'Iron':0.18, 'Vitamin D':0.2,'Vitamin E':0.15, 'Vitamin K':0.00012, 'Thiamin':0.0012, 'Riboflavin':0.0013, 'Niacin':0.016, 'Vitamin B6':0.0017,
                     'Folate6':0.0004, 'Vitamin B12':0.00000024,
                     'Biotin':0.00003, 'Pantothenic acid':0.005,'Phosphorus':1.25,'Iodine':0.00015, 'Magnesium':0.420, 'Zinc':0.011, 'Selenium':0.00055, 'Copper':0.0009, 
                     'Manganese':0.0023, 
                     'Chromium':0.000035, 
                     'Molybdenum':0.000045,
                     'Chloride':2.3,
                     'Potassium':4.7, 'Choline':0.55}



Calculating the daily % of nutrients

In [39]:
whitelist_nutrients = {
    'Fat': 78,  
    'Saturated fat': 20,  
    'Cholesterol': 300,  
    'Total carbohydrates': 275, 
    'Sodium': 2300, 
    'Dietary Fiber': 28,  
    'Protein': 50,  
    'Added sugars': 50,  
    'Vitamin A': 900,  
    'Vitamin C': 90,  
    'Calcium': 1300,  
    'Iron': 18, 
    'Vitamin D': 20, 
    'Vitamin E': 15,  
    'Vitamin K': 120, 
    'Thiamin': 1.2, 
    'Riboflavin': 1.3,  
    'Niacin': 16, 
    'Vitamin B6': 1.7,  
    'Folate': 400,  
    'Vitamin B12': 2.4,  
    'Biotin': 30,  
    'Pantothenic acid': 5, 
    'Phosphorus': 1250,  
    'Iodine': 150,  
    'Magnesium': 420,  
    'Zinc': 11,  
    'Selenium': 55,  
    'Copper': 0.9, 
    'Manganese': 2.3,  
    'Chromium': 35,  
    'Molybdenum': 45,  
    'Chloride': 2300,  
    'Potassium': 4700,  
    'Choline': 550  
}


usda_to_common = {
    'Total lipid (fat)': 'Fat',
    'Carbohydrate, by difference': 'Total carbohydrates',
    'Fiber, total dietary': 'Dietary Fiber',
    'Total Sugars': 'Added sugars',
    'Calcium, Ca': 'Calcium',
    'Iron, Fe': 'Iron',
    'Magnesium, Mg': 'Magnesium',
    'Phosphorus, P': 'Phosphorus',
    'Potassium, K': 'Potassium',
    'Sodium, Na': 'Sodium',
    'Zinc, Zn': 'Zinc',
    'Copper, Cu': 'Copper',
    'Selenium, Se': 'Selenium',
    'Thiamin': 'Thiamin',
    'Riboflavin': 'Riboflavin',
    'Niacin': 'Niacin',
    'Vitamin B-6': 'Vitamin B6',
    'Folate, total': 'Folate',
    'Vitamin B-12': 'Vitamin B12',
    'Vitamin E (alpha-tocopherol)': 'Vitamin E',
    'Choline, total': 'Choline'
}


nutrient_data = {}


for ingredient, nutrients in results.items():
    if nutrients is None:
        continue 

    nutrient_row = {}
    for n_raw, (amount, unit) in nutrients.items():
        n = usda_to_common.get(n_raw, n_raw)
        if n not in whitelist_nutrients or amount is None:
            continue

        dv = whitelist_nutrients[n]

        
        if unit == 'g':
            val = amount
        elif unit == 'mg':
            val = amount / 1000  
        elif unit == 'µg':
            val = amount / 1_000_000 
        else:
            continue  

        percent_dv = (val / dv) * 100
        nutrient_row[n] = round(percent_dv, 2)

    nutrient_data[ingredient] = nutrient_row


df_nutrients = pd.DataFrame.from_dict(nutrient_data, orient='index').fillna(0)
df_nutrients.head()

df_nutrients.to_csv('./data/daily_nutrients.csv')

Parsing additional ingridient

In [40]:
import pandas as pd
import requests
import json

api_key='n4S3AoVDZCbUlaRHCePehlSboy2ELIedPCmaLR0S'
query = 'jam'
url = f'https://api.nal.usda.gov/fdc/v1/foods/search?api_key={api_key}&query={query}'

response = requests.get(url)
data = response.json()
with open('./data/usda_food_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4, ensure_ascii=False)

In [41]:
with open("./data/usda_food_data.json", "r") as f:
    data = json.load(f)

foods = data["foods"]

usda_to_common = {
    'Total lipid (fat)': 'Fat',
    'Carbohydrate, by difference': 'Total carbohydrates',
    'Fiber, total dietary': 'Dietary Fiber',
    'Total Sugars': 'Added sugars',
    'Calcium, Ca': 'Calcium',
    'Iron, Fe': 'Iron',
    'Magnesium, Mg': 'Magnesium',
    'Phosphorus, P': 'Phosphorus',
    'Potassium, K': 'Potassium',
    'Sodium, Na': 'Sodium',
    'Zinc, Zn': 'Zinc',
    'Copper, Cu': 'Copper',
    'Selenium, Se': 'Selenium',
    'Thiamin': 'Thiamin',
    'Riboflavin': 'Riboflavin',
    'Niacin': 'Niacin',
    'Vitamin B-6': 'Vitamin B6',
    'Folate, total': 'Folate',
    'Vitamin B-12': 'Vitamin B12',
    'Vitamin E (alpha-tocopherol)': 'Vitamin E',
    'Choline, total': 'Choline',
    'Protein': 'Protein',
    'Cholesterol': 'Cholesterol',
    'Vitamin D (D2 + D3)': 'Vitamin D',
    'Vitamin A, RAE': 'Vitamin A',
}

whitelist_nutrients = {
    'Fat': 78,
    'Saturated fat': 20,
    'Cholesterol': 300,
    'Total carbohydrates': 275,
    'Sodium': 2300,
    'Dietary Fiber': 28,
    'Protein': 50,
    'Added sugars': 50,
    'Vitamin A': 900,
    'Vitamin C': 90,
    'Calcium': 1300,
    'Iron': 18,
    'Vitamin D': 20,
    'Vitamin E': 15,
    'Vitamin K': 120,
    'Thiamin': 1.2,
    'Riboflavin': 1.3,
    'Niacin': 16,
    'Vitamin B6': 1.7,
    'Folate': 400,
    'Vitamin B12': 2.4,
    'Biotin': 30,
    'Pantothenic acid': 5,
    'Phosphorus': 1250,
    'Iodine': 150,
    'Magnesium': 420,
    'Zinc': 11,
    'Selenium': 55,
    'Copper': 0.9,
    'Manganese': 2.3,
    'Chromium': 35,
    'Molybdenum': 45,
    'Chloride': 2300,
    'Potassium': 4700,
    'Choline': 550
}

nutrient_data = {}

for food in foods:
    desc = food["description"].lower()
    nutrient_row = {}

    for nutrient in food.get("foodNutrients", []):
        raw_name = nutrient["nutrientName"]
        value = nutrient.get("value")
        unit = nutrient.get("unitName", "").lower()

        common_name = usda_to_common.get(raw_name, raw_name)
        if common_name not in whitelist_nutrients or value is None:
            continue

        # Перевод в граммы
        if unit == "g":
            val = value
        elif unit == "mg":
            val = value / 1000
        elif unit == "µg":
            val = value / 1_000_000
        else:
            continue

        dv = whitelist_nutrients[common_name]
        percent = (val / dv) * 100
        nutrient_row[common_name] = round(percent, 2)

    nutrient_data[desc] = nutrient_row

df_usda = pd.DataFrame.from_dict(nutrient_data, orient="index").fillna(0)
df_usda

Unnamed: 0,Protein,Fat,Total carbohydrates,Dietary Fiber,Phosphorus,Thiamin,Riboflavin,Cholesterol,Sodium,Choline,...,Calcium,Potassium,Zinc,Niacin,Pantothenic acid,Vitamin B6,Copper,Iron,Magnesium,Vitamin E
jam,0.8,0.13,23.78,7.86,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
jams and preserves,0.74,0.09,25.05,3.93,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
"jam or jelly, nfs",0.36,0.04,24.41,3.57,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"jams and preserves, apricot",1.4,0.26,23.42,1.07,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
"jam or jelly, reduced sugar",0.0,0.13,13.67,5.36,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"jam or jelly, sugar free",1.1,0.0,10.76,7.86,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"jam or jelly, fruit juice sweetened",0.0,0.0,19.25,3.21,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"jams, preserves, marmalade, reduced sugar",0.0,0.13,13.67,5.36,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"jams, preserves, marmalades, sweetened with fruit juice",0.0,0.0,19.24,3.21,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"jams and preserves, no sugar (with sodium saccharin), any flavor",0.6,0.38,19.42,8.93,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Saving csv of nutrients

In [42]:
df_existing = pd.read_csv("./data/new_daily_nutrients.csv")
df_existing.set_index(df_existing.columns[0], inplace=True)
df_combined = pd.concat([df_existing, df_usda])
df_combined = df_combined[~df_combined.index.duplicated(keep='first')]

df_combined.to_csv('./data/new_daily_nutrients.csv')