In [1]:
import pandas as pd
import numpy as np
import requests
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import json
import os

In [2]:
def load_api_key(config_path='api_config/config.json'):
    """Read the USDA API key from a JSON config file.

    Args:
        config_path: Path to a JSON file containing a ``usda_api_key`` entry.
            Defaults to the project's ``api_config/config.json``.

    Returns:
        The value stored under ``usda_api_key``.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        KeyError: If the config has no (or an empty) ``usda_api_key`` entry.
    """
    if not os.path.exists(config_path):
        raise FileNotFoundError(f'Конфиг {config_path} не найден')

    # Explicit encoding so the result does not depend on the platform default.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)

    api_key = config.get('usda_api_key')
    # Fail fast: a missing key would otherwise only show up later as
    # hard-to-read errors from the API requests.
    if not api_key:
        raise KeyError(f"В конфиге {config_path} нет ключа 'usda_api_key'")

    return api_key

In [3]:
class NutritionParser:
    """Fetch nutrient data for ingredients from the USDA FoodData Central search API.

    Nutrient amounts are converted to a percentage of the reference amounts
    stored in ``nutrients_list``.

    Attributes (set in ``__init__``):
        api_key: Key sent as the ``api_key`` query parameter.
        base_url: Search endpoint used for every lookup.
        nutrients_list: Mapping of nutrient name -> reference daily amount.
        list_of_food: Ingredient names queried by ``get_all_nutrients``.
    """

    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.nal.usda.gov/fdc/v1/foods/search"
        # Reference daily amounts used as the denominator of the percentage.
        # NOTE(review): units are not stated here; they are assumed to match the
        # units returned by the API for the same nutrient - confirm.
        self.nutrients_list = {
                'Vitamin A': 900,
                'Vitamin C': 90,
                'Calcium': 1300,
                'Iron': 18,
                'Vitamin D': 20,
                'Vitamin E': 15,
                'Vitamin K': 120,
                'Thiamin': 1.2,
                'Riboflavin': 1.3,
                'Niacin': 16,
                'Vitamin B6': 1.7,
                'Folate': 400,
                'Vitamin B12': 2.4,
                'Biotin': 30,
                'Pantothenic acid': 5,
                'Phosphorus': 1250,
                'Iodine': 150,
                'Magnesium': 420,
                'Zinc': 11,
                'Selenium': 55,
                'Copper': 0.9,
                'Manganese': 2.3,
                'Chromium': 35,
                'Molybdenum': 45,
                'Chloride': 2300,
                'Potassium': 4700,
                'Choline': 550,
                'Fat': 78,
                'Saturated fat': 20,
                'Cholesterol': 300,
                'Total carbohydrates': 275,
                'Sodium': 2300,
                'Dietary Fiber': 28,
                'Protein': 50,
                'Added sugars': 50
            }

        self.list_of_food = [
                'almond', 'amaretto', 'anchovy', 'anise', 'apple', 'apricot', 'artichoke', 'arugula', 'asparagus', 'avocado',
                'bacon', 'banana', 'barley', 'basil', 'beef', 'beet', 'bell pepper', 'berry', 'blackberry', 'blue cheese',
                'blueberry', 'bok choy', 'bran', 'bread', 'brie', 'broccoli', 'bulgur', 'burrito', 'butter', 'buttermilk',
                'butternut squash', 'cabbage', 'candy', 'cantaloupe', 'capers', 'carrot', 'cashew', 'cauliflower', 'caviar',
                'celery', 'cheddar', 'cheese', 'cherry', 'chestnut', 'chicken', 'chickpea', 'chile pepper', 'chili', 'chive',
                'chocolate', 'coconut', 'cod', 'coriander', 'corn', 'crab', 'cranberry', 'cream cheese', 'cucumber', 'curry',
                'custard', 'dairy', 'date', 'duck', 'egg', 'eggplant', 'endive', 'fennel', 'feta', 'fig', 'fish', 'garlic',
                'goat cheese', 'gouda', 'grape', 'grapefruit', 'green bean', 'ham', 'hamburger',
                'hazelnut', 'honey', 'hummus', 'ice cream', 'jalapeño', 'kale', 'kiwi', 'lamb', 'lemon', 'lentil', 'lettuce',
                'lima bean', 'lime', 'lobster', 'macaroni and cheese', 'mango', 'maple syrup', 'mayonnaise', 'meatball',
                'melon', 'mint', 'mushroom', 'mussel', 'mustard', 'nutmeg', 'oatmeal', 'olive', 'omelet', 'onion', 'orange',
                'oregano', 'oyster', 'pancake', 'papaya', 'paprika', 'parmesan', 'parsley', 'parsnip', 'pasta', 'peanut',
                'pear', 'pecan', 'pepper', 'persimmon', 'pineapple', 'pistachio', 'pizza', 'plum', 'pomegranate', 'pork',
                'potato', 'poultry', 'prosciutto', 'prune', 'pumpkin', 'quail', 'quinoa', 'radish', 'raisin', 'raspberry',
                'rice', 'ricotta', 'rosemary', 'salmon', 'salsa', 'sausage', 'scallop', 'seafood', 'sesame', 'shallot',
                'shrimp', 'spinach', 'squash', 'steak', 'strawberry', 'sugar snap pea', 'swiss cheese',
                'tangerine', 'tapioca', 'tarragon', 'tea', 'thyme', 'tilapia', 'tofu', 'tomato', 'trout', 'tuna', 'turnip',
                'vanilla', 'veal', 'vegetable', 'walnut', 'wasabi', 'watermelon', 'wild rice', 'yellow squash', 'yogurt',
                'zucchini'
            ]

    def get_ingredient_info(self, ingredient_name, timeout=10):
        """Look up one ingredient and return its nutrients as % of the reference amount.

        Only the first search hit (``foods[0]``) is used. A nutrient is kept when
        the text before the first comma of its ``nutrientName`` is a key of
        ``nutrients_list``; if several records reduce to the same name, the last
        one wins.

        NOTE(review): matching is by exact name, so nutrients the API labels
        differently from the keys of ``nutrients_list`` are dropped - verify the
        labels against the API response.

        Args:
            ingredient_name: Search query sent to the API.
            timeout: Seconds to wait for the HTTP response.

        Returns:
            dict mapping nutrient name -> percentage, or ``None`` when the request
            or the parsing fails (the error is printed, not raised, so that one
            bad ingredient does not abort a batch run).
        """
        try:
            response = requests.get(
                self.base_url,
                params={'api_key': self.api_key, 'query': ingredient_name},
                timeout=timeout,
            )
            # Surface HTTP errors (bad key, rate limiting) with a meaningful
            # message instead of a JSON decoding error on the error page.
            response.raise_for_status()
            payload = response.json()

            nutrients_dict = {}
            for nutrient in payload['foods'][0]['foodNutrients']:
                name = nutrient['nutrientName'].split(',')[0]
                value = nutrient.get('value')
                # Skip nutrients we do not track and records without an amount.
                if name in self.nutrients_list and value is not None:
                    nutrients_dict[name] = value / self.nutrients_list[name] * 100

            return nutrients_dict

        except Exception as e:
            print(f"Произошла ошибка при парсинге. Имя ошибки {e}")
            return None

    def get_all_nutrients(self, max_workers=50):
        """Query every ingredient in ``list_of_food`` concurrently.

        Args:
            max_workers: Size of the thread pool; lower it if the API starts
                rejecting requests.

        Returns:
            dict mapping ingredient name -> result of ``get_ingredient_info``
            (a nutrient dict, or ``None`` for failed lookups).
        """
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # executor.map yields results in input order, so zipping with the
            # ingredient list below pairs each name with its own result.
            results = list(tqdm(
                executor.map(self.get_ingredient_info, self.list_of_food),
                total=len(self.list_of_food),
            ))

        return dict(zip(self.list_of_food, results))

    def get_nutrient_df(self, nutrients: dict):
        """Build an ingredient x nutrient table from ``get_all_nutrients`` output.

        Args:
            nutrients: Mapping of ingredient name -> nutrient dict (or ``None``).

        Returns:
            DataFrame with one row per ingredient and one column per nutrient;
            missing values are filled with 0.
        """
        # Failed lookups are None; treat them as "no nutrients" so the frame
        # can still be built even when every lookup failed.
        cleaned = {name: (values or {}) for name, values in nutrients.items()}
        return pd.DataFrame(cleaned).T.fillna(0)
            

In [4]:
class AddLinks:
    """Find Epicurious recipe URLs for the titles stored in a recipe DataFrame."""

    def __init__(self, df):
        # DataFrame that must contain a 'title' column (read by links_for_df).
        self.df = df

    def get_link(self, title):
        """Search Epicurious for ``title`` and return the URL of an exact title match.

        Args:
            title: Recipe title. Non-string values (e.g. NaN for a missing title)
                and blank strings are treated as "no match".

        Returns:
            Full recipe URL when a search result's heading equals the title
            (case-insensitive, surrounding whitespace ignored); ``np.nan`` when
            nothing matches or the request fails (the error is printed).
        """
        # Guard first: a NaN title has no .strip(), and an exception raised here
        # would propagate out of executor.map and abort the whole batch.
        if not isinstance(title, str):
            return np.nan
        title = title.strip().lower()
        if not title:
            return np.nan

        try:
            with requests.Session() as session:
                response = session.get(
                    'https://www.epicurious.com/search',
                    params={'q': title},
                    timeout=10,
                    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
                )
                # Report blocked / failed requests instead of silently parsing
                # an error page and returning "no match".
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                recipe_links = soup.find_all('a', href=lambda x: x and '/recipes/food/views/' in x)

                for link in recipe_links:
                    # The heading is looked up in the nearest enclosing card element.
                    card = link.find_parent('article') or link.find_parent('div')
                    if card:
                        # NOTE(review): these look like build-generated class names
                        # and may change when the site is redeployed - verify.
                        title_elem = card.find('h2', class_='BaseWrap-sc-gzmcOU BaseText-eqOrNE Hed-gQCsFV deqABF gNRPUh fEQvtj')

                        if title_elem:
                            card_title = title_elem.get_text().strip().lower()
                            if card_title == title:
                                full_url = f"https://www.epicurious.com{link['href']}"
                                return full_url
                return np.nan

        except Exception as e:
            print(f'Произошла ошибка {e}')
            return np.nan

    def links_for_df(self, limit=500, max_workers=50):
        """Resolve links for the first ``limit`` titles of the DataFrame concurrently.

        Args:
            limit: Number of leading rows to process (``None`` processes all rows).
            max_workers: Size of the thread pool; lower it if requests time out.

        Returns:
            List with one entry per processed row, in row order: a URL string or
            ``np.nan``.
        """
        titles = self.df['title'].iloc[:limit]
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            return list(tqdm(executor.map(self.get_link, titles), total=len(titles)))
                

In [5]:
# Load the cleaned recipe table from CSV and preview its first row.
# NOTE(review): the preview shows an 'Unnamed: 0' column - presumably an index
# written by an earlier to_csv; consider index_col=0 after checking downstream use.
df = pd.read_csv('../data/clean_epi_r.csv')
df.head(1)

Unnamed: 0,title,rating,calories,protein,fat,sodium,alcoholic,almond,apple,apricot,...,no-cook,slow cooker,pressure cooker,grill,bake,roast,broil,braise,quick & easy.1,advance prep required
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Read the USDA API key from the local JSON config via load_api_key().
API_KEY = load_api_key()

In [7]:
# Parser instance holding the API key, the reference-amount table and the ingredient list.
nutrients = NutritionParser(API_KEY)

In [8]:
# Smoke test: look up a single ingredient. The method prints the error and
# returns None when the request or the parsing fails.
test = nutrients.get_ingredient_info('amaretto')
test

Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)


In [9]:
# Look up every ingredient in nutrients.list_of_food using a thread pool;
# the result maps ingredient name -> nutrient dict (or None for failed lookups).
result = nutrients.get_all_nutrients()

  0%|          | 0/176 [00:00<?, ?it/s]

Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (char 0)
Произошла ошибка при парсинге. Имя ошибки Expecting value: line 1 column 1 (

KeyboardInterrupt: 

In [18]:
# Display the ingredient -> nutrient mapping built by get_all_nutrients().
# NOTE(review): this cell's execution counter is out of order relative to its
# neighbours; re-run the notebook top to bottom before relying on the output.
result

{'almond': {'Protein': 41.34,
  'Calcium': 20.23076923076923,
  'Iron': 22.666666666666668,
  'Magnesium': 63.33333333333333,
  'Phosphorus': 40.32,
  'Potassium': 15.76595744680851,
  'Sodium': 10.08695652173913,
  'Zinc': 28.72727272727273,
  'Copper': 107.66666666666667,
  'Selenium': 1.4545454545454546,
  'Vitamin A': 0.0,
  'Vitamin C': 0.0,
  'Thiamin': 7.833333333333334,
  'Riboflavin': 71.76923076923077,
  'Niacin': 24.2875,
  'Folate': 10.5,
  'Choline': 9.418181818181818,
  'Vitamin E': 0.0,
  'Cholesterol': 0.0},
 'amaretto': {'Protein': 0.0,
  'Calcium': 0.0,
  'Iron': 0.0,
  'Potassium': 0.0,
  'Sodium': 2.9130434782608696,
  'Cholesterol': 0.0},
 'anchovy': {'Protein': 57.78,
  'Calcium': 17.846153846153847,
  'Iron': 25.722222222222225,
  'Magnesium': 16.428571428571427,
  'Phosphorus': 20.16,
  'Potassium': 11.574468085106384,
  'Sodium': 159.47826086956522,
  'Zinc': 22.18181818181818,
  'Copper': 37.66666666666667,
  'Selenium': 123.8181818181818,
  'Vitamin A': 1.333

In [17]:
# Turn the ingredient -> nutrient mapping into a table (rows: ingredients,
# columns: nutrients, missing values filled with 0) and list its columns.
df_nutrients = nutrients.get_nutrient_df(result)
df_nutrients.columns

Index(['Protein', 'Calcium', 'Iron', 'Magnesium', 'Phosphorus', 'Potassium',
       'Sodium', 'Zinc', 'Copper', 'Selenium', 'Vitamin A', 'Vitamin C',
       'Thiamin', 'Riboflavin', 'Niacin', 'Folate', 'Choline', 'Vitamin E',
       'Cholesterol', 'Pantothenic acid', 'Manganese', 'Biotin'],
      dtype='object')

In [14]:
# Persist the nutrient table; the index (ingredient names) is written as the first column.
df_nutrients.to_csv('../data/nutrients.csv')

In [12]:
# Resolve Epicurious links for the leading rows of df (links_for_df processes
# the first 500 titles); entries are URL strings or NaN when no match was found.
links = AddLinks(df)
links_column = links.links_for_df()

  0%|          | 0/500 [00:00<?, ?it/s]

Произошла ошибка HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Произошла ошибка HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Произошла ошибка HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Произошла ошибка HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Произошла ошибка HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Произошла ошибка HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Произошла ошибка HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Произошла ошибка HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Произошла ошибка HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Произошла ошибка HT

In [13]:
# Display the resolved links (one entry per processed row, NaN where nothing matched).
links_column

['https://www.epicurious.com/recipes/food/views/lentil-apple-and-turkey-wrap-239173',
 'https://www.epicurious.com/recipes/food/views/boudin-blanc-terrine-with-red-onion-confit-1085',
 'https://www.epicurious.com/recipes/food/views/potato-and-fennel-soup-hodge-10224',
 'https://www.epicurious.com/recipes/food/views/mahi-mahi-in-tomato-olive-sauce-352551',
 nan,
 'https://www.epicurious.com/recipes/food/views/the-best-blts-101977',
 'https://www.epicurious.com/recipes/food/views/ham-and-spring-vegetable-salad-with-shallot-vinaigrette-5212',
 'https://www.epicurious.com/recipes/food/views/spicy-sweet-kumquats-102726',
 nan,
 'https://www.epicurious.com/recipes/food/views/ham-persillade-with-mustard-potato-salad-and-mashed-peas-243208',
 'https://www.epicurious.com/recipes/food/views/yams-braised-with-cream-rosemary-and-nutmeg-15640',
 nan,
 nan,
 'https://www.epicurious.com/recipes/food/views/beef-tenderloin-with-garlic-and-brandy-5861',
 'https://www.epicurious.com/recipes/food/views/pe

In [14]:
# Start with an all-missing 'link' column. It is created with object dtype because
# URL strings are written into it afterwards; a float64 NaN column would make that
# assignment an incompatible-dtype setitem, which newer pandas deprecates.
df['link'] = pd.Series(np.nan, index=df.index, dtype='object')

In [15]:
# Write the resolved links into the leading rows of the 'link' column. The slice
# length follows links_column rather than a hard-coded 500, so the two cannot get
# out of sync if the batch size changes.
df.iloc[:len(links_column), df.columns.get_loc('link')] = links_column

In [16]:
# Convert absolute protein / fat / sodium amounts into percent of the reference
# daily amount. The reference is looked up under the capitalised column name
# ('protein' -> 'Protein'). Running this cell twice would rescale the values twice.
for i in ('protein', 'fat', 'sodium'):
    daily_value = nutrients.nutrients_list[i.capitalize()]
    df[i] = df[i] / daily_value * 100

In [17]:
# Mark the three rescaled columns as percentages by appending a '_%' suffix.
percent_columns = {column: f'{column}_%' for column in ('protein', 'fat', 'sodium')}
df.rename(columns=percent_columns, inplace=True)

In [18]:
# Preview the table after rescaling, renaming and adding the 'link' column.
df.head(3)

Unnamed: 0,title,rating,calories,protein_%,fat_%,sodium_%,alcoholic,almond,apple,apricot,...,slow cooker,pressure cooker,grill,bake,roast,broil,braise,quick & easy.1,advance prep required,link
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,60.0,8.974359,24.304348,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://www.epicurious.com/recipes/food/views/...
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,36.0,29.487179,62.565217,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,https://www.epicurious.com/recipes/food/views/...
2,Potato and Fennel Soup Hodge,3.75,165.0,12.0,8.974359,7.173913,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://www.epicurious.com/recipes/food/views/...


In [19]:
# Save the enriched recipe table; index=False keeps the row index out of the file.
df.to_csv('../data/clean_epi_r_links.csv', index=False)