## Getting more Data - All_Recipes

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import re
import matplotlib.pyplot as plt
import pickle
import itertools

In [2]:
with open('data/df_common_food.pickle', 'rb') as read_file:
    df_common_food = pickle.load(read_file)

## Web scraping All Recipes (n=so many)
#### Caveat: Foods grouped by region primarily rather than country specific. 

**Get the name of the countries and links to the recipes**

In [154]:
def getRegionNameLink(url):
    '''
    This function takes in the all_recipes world cuisine url and outputs a list of regions and a list of links
    to region pages
    -----
    input: string
    output: list, list
    '''
    response = requests.get(url)
    soup_food = BeautifulSoup(response.text, 'lxml')
    all_regions = [x.text for x in soup_food.find_all(class_ = 'category-title')]
    all_links = [x['href'] for x in soup_food.find_all(class_ = 'grid-col--subnav')]
    return all_links

In [155]:
url = 'https://www.allrecipes.com/recipes/86/world-cuisine/?internalSource=hub%20nav&referringId=231&referringContentType=Recipe%20Hub&referringPosition=1&linkName=hub%20nav%20exposed&clickId=hub%20nav%203'

In [156]:
all_links = getRegionNameLink(url)

In [175]:
region_links = ['https://www.allrecipes.com/recipes/728/world-cuisine/latin-american/'
                ,'https://www.allrecipes.com/recipes/723/world-cuisine/european/', 
               'https://www.allrecipes.com/recipes/227/world-cuisine/asian/', 
               'https://www.allrecipes.com/recipes/235/world-cuisine/middle-eastern/', 
                'https://www.allrecipes.com/recipes/226/world-cuisine/african/',
               'https://www.allrecipes.com/recipes/228/world-cuisine/australian-and-new-zealander/', 
               'https://www.allrecipes.com/recipes/733/world-cuisine/canadian/',
              'https://www.allrecipes.com/recipes/17425/us-recipes/us-recipes-by-state/']

**Scrape each recipe link**

In [176]:
def getSubLinks(region_links):
    '''
    This function takes in a list of region page links and returns a list of country page links.
    -----
    input: list
    output: list
    '''
    all_subs = []
    for region in region_links:
        response = requests.get(region)
        soup_food = BeautifulSoup(response.text, 'lxml')
        sub_links = [x['href'] for x in soup_food.find_all(class_ = 'grid-col--subnav')]
        all_subs.append(sub_links)
    all_subs = list(itertools.chain(*all_subs))
    return all_subs

In [177]:
all_subs = getSubLinks(region_links)

In [198]:
country_links = ['https://www.allrecipes.com/recipes/1470/world-cuisine/latin-american/mexican/authentic/',
 'https://www.allrecipes.com/recipes/1214/world-cuisine/latin-american/mexican/appetizers/',
 'https://www.allrecipes.com/recipes/17504/world-cuisine/latin-american/mexican/main-dishes/',
 'https://www.allrecipes.com/recipes/15936/world-cuisine/latin-american/mexican/drinks/',
 'https://www.allrecipes.com/recipes/1217/world-cuisine/latin-american/mexican/desserts/',
 'https://www.allrecipes.com/recipes/1526/world-cuisine/latin-american/mexican/side-dishes/',
 'https://www.allrecipes.com/recipes/17513/world-cuisine/latin-american/mexican/salads/', 
    'https://www.allrecipes.com/recipes/1525/world-cuisine/latin-american/mexican/bread/',
 'https://www.allrecipes.com/recipes/1216/world-cuisine/latin-american/mexican/main-dishes/burritos/',
 'https://www.allrecipes.com/recipes/16085/world-cuisine/latin-american/mexican/main-dishes/chile-rellenos/',
 'https://www.allrecipes.com/recipes/1218/world-cuisine/latin-american/mexican/main-dishes/enchiladas/',
 'https://www.allrecipes.com/recipes/1220/world-cuisine/latin-american/mexican/main-dishes/fajitas/',
 'https://www.allrecipes.com/recipes/1219/world-cuisine/latin-american/mexican/main-dishes/tacos/',
 'https://www.allrecipes.com/recipes/16562/world-cuisine/latin-american/mexican/main-dishes/tacos/fish/',
 'https://www.allrecipes.com/recipes/16082/world-cuisine/latin-american/mexican/side-dishes/rice/',
 'https://www.allrecipes.com/recipes/1215/world-cuisine/latin-american/mexican/soups-and-stews/',
 'https://www.allrecipes.com/recipes/1905/world-cuisine/latin-american/mexican/main-dishes/quesadillas/',
                'https://www.allrecipes.com/recipes/1793/world-cuisine/european/italian/appetizers/',
 'https://www.allrecipes.com/recipes/1798/world-cuisine/european/italian/bread/',
 'https://www.allrecipes.com/recipes/1791/world-cuisine/european/italian/desserts/',
 'https://www.allrecipes.com/recipes/17551/world-cuisine/european/italian/drinks/',
 'https://www.allrecipes.com/recipes/16767/world-cuisine/european/italian/main-dishes/',
 'https://www.allrecipes.com/recipes/1800/world-cuisine/european/italian/salads/',
 'https://www.allrecipes.com/recipes/1792/world-cuisine/european/italian/side-dishes/',
 'https://www.allrecipes.com/recipes/1790/world-cuisine/european/italian/soups-and-stews/',
 'https://www.allrecipes.com/recipes/1789/world-cuisine/european/italian/authentic/',
                'https://www.allrecipes.com/recipes/695/world-cuisine/asian/chinese/',
 'https://www.allrecipes.com/recipes/699/world-cuisine/asian/japanese/',
 'https://www.allrecipes.com/recipes/700/world-cuisine/asian/korean/',
 'https://www.allrecipes.com/recipes/233/world-cuisine/asian/indian/',
 'https://www.allrecipes.com/recipes/15974/world-cuisine/asian/pakistani/',
 'https://www.allrecipes.com/recipes/16100/world-cuisine/asian/bangladeshi/',
 'https://www.allrecipes.com/recipes/15937/world-cuisine/middle-eastern/persian/',
 'https://www.allrecipes.com/recipes/696/world-cuisine/asian/filipino/',
 'https://www.allrecipes.com/recipes/698/world-cuisine/asian/indonesian/',
 'https://www.allrecipes.com/recipes/701/world-cuisine/asian/malaysian/',
 'https://www.allrecipes.com/recipes/702/world-cuisine/asian/thai/',
 'https://www.allrecipes.com/recipes/703/world-cuisine/asian/vietnamese/',
               'https://www.allrecipes.com/recipes/1824/world-cuisine/middle-eastern/lebanese/',
 'https://www.allrecipes.com/recipes/1825/world-cuisine/middle-eastern/turkish/',
 'https://www.allrecipes.com/recipes/1826/world-cuisine/middle-eastern/israeli/',
               'https://www.allrecipes.com/recipes/15040/world-cuisine/australian-and-new-zealander',
 'https://www.allrecipes.com/recipes/17475/world-cuisine/african/main-dishes/',
 'https://www.allrecipes.com/recipes/17476/world-cuisine/african/side-dishes/',
 'https://www.allrecipes.com/recipes/17477/world-cuisine/african/soups-and-stews/',
 'https://www.allrecipes.com/recipes/17582/world-cuisine/african/north-african/',
 'https://www.allrecipes.com/recipes/15035/world-cuisine/african/south-african/',
 'https://www.allrecipes.com/recipes/17845/world-cuisine/african/east-african/',
               'https://www.allrecipes.com/recipes/1638/world-cuisine/canadian/toronto/',
 'https://www.allrecipes.com/recipes/15041/world-cuisine/canadian/occasions/',
 'https://www.allrecipes.com/recipes/16075/world-cuisine/canadian/vancouver/',
 'https://www.allrecipes.com/recipes/16104/world-cuisine/canadian/quebec/',
                'https://www.allrecipes.com/recipes/734/us-recipes/us-recipes-by-state/hawaii/',
 'https://www.allrecipes.com/recipes/1601/us-recipes/us-recipes-by-state/texas/',
 'https://www.allrecipes.com/recipes/1621/us-recipes/us-recipes-by-state/washington-dc/',
 'https://www.allrecipes.com/recipes/1622/us-recipes/us-recipes-by-state/colorado/',
 'https://www.allrecipes.com/recipes/1623/us-recipes/us-recipes-by-state/arizona/',
 'https://www.allrecipes.com/recipes/1632/us-recipes/us-recipes-by-state/north-carolina/',
 'https://www.allrecipes.com/recipes/1753/us-recipes/us-recipes-by-state/illinois/',
 'https://www.allrecipes.com/recipes/1754/us-recipes/us-recipes-by-state/california/',
 'https://www.allrecipes.com/recipes/1756/us-recipes/us-recipes-by-state/alaska/',
 'https://www.allrecipes.com/recipes/1757/us-recipes/us-recipes-by-state/oregon/',
 'https://www.allrecipes.com/recipes/1758/us-recipes/us-recipes-by-state/washington/',
 'https://www.allrecipes.com/recipes/1759/us-recipes/us-recipes-by-state/wyoming/',
 'https://www.allrecipes.com/recipes/1760/us-recipes/us-recipes-by-state/montana/',
 'https://www.allrecipes.com/recipes/1761/us-recipes/us-recipes-by-state/utah/',
 'https://www.allrecipes.com/recipes/1762/us-recipes/us-recipes-by-state/idaho/',
 'https://www.allrecipes.com/recipes/1763/us-recipes/us-recipes-by-state/new-mexico/',
 'https://www.allrecipes.com/recipes/1764/us-recipes/us-recipes-by-state/oklahoma/',
 'https://www.allrecipes.com/recipes/1765/us-recipes/us-recipes-by-state/georgia/',
 'https://www.allrecipes.com/recipes/1766/us-recipes/us-recipes-by-state/florida/',
 'https://www.allrecipes.com/recipes/1767/us-recipes/us-recipes-by-state/louisiana/',
 'https://www.allrecipes.com/recipes/1768/us-recipes/us-recipes-by-state/maryland/',
 'https://www.allrecipes.com/recipes/1769/us-recipes/us-recipes-by-state/new-york/',
 'https://www.allrecipes.com/recipes/1770/us-recipes/us-recipes-by-state/pennsylvania/',
 'https://www.allrecipes.com/recipes/1771/us-recipes/us-recipes-by-state/massachusetts/',
 'https://www.allrecipes.com/recipes/1772/us-recipes/us-recipes-by-state/ohio/',
 'https://www.allrecipes.com/recipes/1773/us-recipes/us-recipes-by-state/indiana/',
 'https://www.allrecipes.com/recipes/1774/us-recipes/us-recipes-by-state/michigan/',
 'https://www.allrecipes.com/recipes/1775/us-recipes/us-recipes-by-state/wisconsin/',
 'https://www.allrecipes.com/recipes/1776/us-recipes/us-recipes-by-state/minnesota/',
 'https://www.allrecipes.com/recipes/1777/us-recipes/us-recipes-by-state/missouri/',
 'https://www.allrecipes.com/recipes/1778/us-recipes/us-recipes-by-state/iowa/',
 'https://www.allrecipes.com/recipes/1779/us-recipes/us-recipes-by-state/north-dakota/',
 'https://www.allrecipes.com/recipes/1780/us-recipes/us-recipes-by-state/south-dakota/',
 'https://www.allrecipes.com/recipes/1781/us-recipes/us-recipes-by-state/kansas/',
 'https://www.allrecipes.com/recipes/1782/us-recipes/us-recipes-by-state/nebraska/',
 'https://www.allrecipes.com/recipes/1808/us-recipes/us-recipes-by-state/connecticut/',
 'https://www.allrecipes.com/recipes/1809/us-recipes/us-recipes-by-state/maine/',
 'https://www.allrecipes.com/recipes/1810/us-recipes/us-recipes-by-state/new-hampshire/',
 'https://www.allrecipes.com/recipes/1811/us-recipes/us-recipes-by-state/rhode-island/',
 'https://www.allrecipes.com/recipes/1812/us-recipes/us-recipes-by-state/vermont/',
 'https://www.allrecipes.com/recipes/1813/us-recipes/us-recipes-by-state/delaware/',
 'https://www.allrecipes.com/recipes/1814/us-recipes/us-recipes-by-state/new-jersey/',
 'https://www.allrecipes.com/recipes/1815/us-recipes/us-recipes-by-state/virginia/',
 'https://www.allrecipes.com/recipes/1816/us-recipes/us-recipes-by-state/south-carolina/',
 'https://www.allrecipes.com/recipes/1817/us-recipes/us-recipes-by-state/alabama/',
 'https://www.allrecipes.com/recipes/1818/us-recipes/us-recipes-by-state/arkansas/',
 'https://www.allrecipes.com/recipes/1819/us-recipes/us-recipes-by-state/kentucky/',
 'https://www.allrecipes.com/recipes/1820/us-recipes/us-recipes-by-state/tennessee/',
 'https://www.allrecipes.com/recipes/2593/us-recipes/us-recipes-by-state/west-virginia/',
 'https://www.allrecipes.com/recipes/2824/us-recipes/us-recipes-by-state/mississippi/',
 'https://www.allrecipes.com/recipes/2832/us-recipes/us-recipes-by-state/nevada/']

In [190]:
US = 'United-States'
US = ' '.join([US] * 51)
US = US.split(' ')
US = [x.replace('-', ' ') for x in US]

In [199]:
countries = ['Mexico','Mexico','Mexico','Mexico','Mexico','Mexico',
             'Mexico','Mexico','Mexico','Mexico','Mexico','Mexico',
             'Mexico','Mexico','Mexico','Mexico','Mexico','Italy','Italy','Italy',
             'Italy','Italy','Italy','Italy','Italy','Italy',
             'China', 'Japan', 'Korea', 'India', 'Pakistan', 
            'Bangladesh', 'Persia', 'Philippines', 'Indonesia', 'Malaysia', 'Thailand', 'Vietnam', 'Lebanon', 
             'Turkey', 'Israel','Australia', 'Africa','Africa','Africa','North Africa', 'South Africa', 'East Africa',
             'Canada','Canada','Canada','Canada']
countries = countries + US

In [200]:
len(country_links)

103

In [201]:
len(countries)

103

In [209]:
from tqdm.auto import tqdm

In [273]:
def getRecipelinks(countries,country_links):
    '''
    This function takes in a list of country page links and returns a list of recipe links and a list of countries.
    ------
    input: list
    output: list, list
    
    '''
    all_recipe_links = []
    all_countries = []
    for country, country_link in tqdm(zip(countries,country_links)):
        all_country_links = []
        for i in range(1,5):
            country_url = country_link +'?page=' + str(i)
            response = requests.get(country_url)
            soup = BeautifulSoup(response.text, 'lxml')
            country_links_page = [x.find('a')['href'] for x in soup.find_all(class_ = 'fixed-recipe-card__info')]
            all_country_links.append(country_links_page)
        all_country_links = list(itertools.chain(*all_country_links))
        for i in all_country_links:
            all_countries.append(country)
        all_recipe_links.append(all_country_links)
    all_recipe_links = list(itertools.chain(*all_recipe_links))
    return all_countries, all_recipe_links

In [274]:
all_countries, all_recipe_links = getRecipelinks(countries, country_links)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [275]:
len(all_countries)

6298

In [276]:
len(all_recipe_links)

6298

In [278]:
example = all_recipe_links[0]

In [281]:
response = requests.get(example)
soup = BeautifulSoup(response.text, 'lxml')

In [285]:
soup.prettify

<bound method Tag.prettify of <!DOCTYPE html>
<html lang="en-us">
<head>
<title>Mexican Oxtail Beef Soup Recipe - Allrecipes.com</title>
<script async="true" src="https://secureimages.allrecipes.com/assets/deployables/v-1.175.0.5165/karma.bundled.js"></script>
<!--Make our website baseUrl available to the client-side code-->
<script type="text/javascript">
        var AR = AR || {};

        AR.segmentWriteKey = "RnmsxUrjIjM7W62olfjKgJrcsVlxe68V";
        AR.baseWebsiteUrl = 'https://www.allrecipes.com';
window.dataLayer={"version":"1.0","pageInstanceId":"www.allrecipes.com/recipe/214090/mexican-oxtail-beef-soup/","externalLinkId":"","page":{"pageInfo":{"pageId":"214090","pageName":"/recipe/214090/mexican-oxtail-beef-soup/","destinationUrl":"https://www.allrecipes.com/recipe/214090/mexican-oxtail-beef-soup/","sysEnv":"RD0003FFA86350","variant":"Control","abTestName":"RDP_Video_Step","version":"20190401","issueDate":"10/10/2019 23:35:53","effectiveDate":"10/10/2019 23:35:53","domain":"w

In [308]:
example_ingredients = [x.text for x in soup.find_all(class_ = 'recipe-ingred_txt added')]

In [309]:
example_ingredients

['2 tablespoons olive oil',
 '2 pounds beef oxtail, cut into pieces',
 '1 pound cubed beef stew meat (optional)',
 '1 cube beef bouillon',
 '1 onion, chopped',
 '2 stalks celery, chopped',
 '1/2 teaspoon chili powder',
 '3/4 teaspoon ground cumin',
 'salt and pepper to taste',
 '4 ears corn on the cob, broken in half',
 '3 carrots, coarsely chopped',
 '2 russet potatoes, cut into bite-sized pieces',
 '1/3 cup lentils, picked over and rinsed',
 '1/3 cup long grain rice',
 '1 cup frozen mixed vegetables (optional)',
 '1 head cabbage, cored and cut into 8 wedges',
 '8 corn tortillas (optional)']

In [318]:
def listRecipePages(all_recipe_links):
    '''
    This function takes in a list of links to country pages and returns the content
    on the page in a list and the countries in a list
    ----
    input: list
    output: list, list
    '''
    food_pages = []
    for link in tqdm(all_recipe_links):
        response = requests.get(link)
        soup = BeautifulSoup(response.text, 'lxml')
        ingredients = [x.text for x in soup.find_all(class_ = 'recipe-ingred_txt added')]
        for each in ingredients:
            new = ' '.join(map(str, ingredients))
        food_pages.append(new)
    food_pages = [page.split(' ') for page in food_pages]
    return food_pages

In [319]:
food_pages = listRecipePages(all_recipe_links)

HBox(children=(IntProgress(value=0, max=6298), HTML(value='')))




**Pull out a list of foods, countries, and groups**

In [325]:
def countryFoodDF(df_common_food, food_pages, countries):
    '''
    This function takes in a dataframe of common foods and a list of text scraped from 
    each country page. It returns a dataframe of all foods, food groups, and associated countries
    -----
    inputs: DataFrame, list
    outputs: DataFrame
    '''
    list_foods = []
    list_countries = []
    list_groups = []
    i = 0
    for page in tqdm(food_pages):
        for food,group in zip(df_common_food['FOOD NAME'],df_common_food['GROUP']):
            if food in page:
                list_foods.append(food)
                list_countries.append(countries[i])
                list_groups.append(group)
        i += 1
    df_country_foods = pd.DataFrame({'food':list_foods, 'group':list_groups, 'country': list_countries})
    return df_country_foods

In [326]:
df_country_foods_3 = countryFoodDF(df_common_food, food_pages, all_countries)

HBox(children=(IntProgress(value=0, max=6298), HTML(value='')))




In [327]:
df_country_foods_3

Unnamed: 0,food,group,country
0,cumin,Herbs and Spices,Mexico
1,olive,Vegetables,Mexico
2,rice,Cereals and cereal products,Mexico
3,corn,Cereals and cereal products,Mexico
4,salt,Baking goods,Mexico
5,chili,Dishes,Mexico
6,stew,Dishes,Mexico
7,water,Beverages,Mexico
8,water,Beverages,Mexico
9,anise,Herbs and Spices,Mexico


In [328]:
with open('data/df_country_foods_3.pickle', 'wb') as to_write:
    pickle.dump(df_country_foods_3, to_write)

In [335]:
def cleanGroups(df_country_foods):
    '''
    This function takes in a dataframe of coutries, foods, and food groups. It returns a dataframe of 
    foods grouped into the 5 basic food groups.
    -----
    input: DataFrame
    output: DataFrame
    '''
    replace_groups = {'Teas': 'extra','Pulses': 'protein', 'Coffee and coffee products': 'extra', 
                 'Gourds': 'fruits', 'Vegetables': 'vegetables', 'Fruits':'fruits', 
                 'Cereals and cereal products':'grains', 'Milk and milk products':'dairy', 'Baking goods':'extra',
                 'Beverages':'extra', 'Beverages':'extra', 'Aquatic foods':'protein', 'Eggs':'protein', 
                 'Confectioneries':'extra', 'Cocoa and cocoa products':'extra', 'Animal foods':'protein',
                 'Nuts':'protein', 'Snack foods':'extra', 'Soy':'protein'}
    df_country_foods = df_country_foods[df_country_foods.food != 'water']
    df_country_foods = df_country_foods[df_country_foods.group != 'Herbs and spices']
    df_country_foods = df_country_foods[df_country_foods.group != 'Herbs and Spices']
    df_country_foods = df_country_foods[df_country_foods.group != 'Dishes']
    df_country_foods = df_country_foods[df_country_foods.group != 'Fats and oils']
    df_country_foods = df_country_foods.replace({'group':replace_groups})
    df_country_foods = df_country_foods.reset_index(drop = True)
    return df_country_foods

In [336]:
df_country_foods_3 = cleanGroups(df_country_foods_3)

In [338]:
df_country_foods_3['country'].unique()

array(['Mexico', 'Italy', 'China', 'Japan', 'Korea', 'India', 'Pakistan',
       'Bangladesh', 'Persia', 'Philippines', 'Indonesia', 'Malaysia',
       'Thailand', 'Vietnam', 'Lebanon', 'Turkey', 'Israel', 'Australia',
       'Africa', 'North Africa', 'South Africa', 'East Africa', 'Canada',
       'United States'], dtype=object)