## Getting more Data

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import re
import matplotlib.pyplot as plt
import pickle

In [5]:
# Load the common-foods table saved earlier; its 'FOOD NAME' and 'GROUP' columns
# are matched against the scraped ingredients further down.
# NOTE: pickle.load can execute arbitrary code, so only load pickles you created yourself.
with open('data/df_common_food.pickle', 'rb') as read_file:
    df_common_food = pickle.load(read_file)

## Web scraping Taste of Home (n=80 recipes)

In [6]:
# Checking robots: crawl rules are published at the site root (/robots.txt),
# so request them there rather than under the collection path
url = 'https://www.tasteofhome.com/robots.txt'
# A timeout keeps the cell from hanging forever if the site does not answer
response = requests.get(url, timeout=30)
# Checking status code (200 means the rules were returned)
response.status_code

200

In [8]:
# Show the robots.txt rules so the Disallow entries can be compared with the pages scraped below
print(response.text)

User-agent: *
Disallow: /wp-admin/ 
Allow: /wp-admin/admin-ajax.php
Disallow: /login/index*
Disallow: */print/

Sitemap: https://www.tasteofhome.com/sitemap_index.xml


**Get the names of the countries and the links to the recipes**

In [208]:
def getCountryNameLink(url, country_overrides=None):
    '''
    This function takes in the url of the recipe collection page and outputs a list of
    countries and a list of links to specific recipe pages
    -----
    input: string, dict (optional)
    output: list, list

    country_overrides maps the zero-based position of an entry on the page to the
    country name to use for that entry instead of its bold heading. It defaults to
    {11: 'United States'}, the correction this notebook has always applied.
    Positions that do not exist on the page are ignored.

    Raises requests.HTTPError when the page request fails, and ValueError when an
    entry has neither a bold heading nor an override.
    '''
    if country_overrides is None:
        country_overrides = {11: 'United States'}

    response = requests.get(url, timeout=30)
    # Fail loudly on an error response instead of parsing the error page
    response.raise_for_status()
    soup_food = BeautifulSoup(response.text, 'lxml')

    # Getting names of countries from the bold heading of each entry
    countries = []
    for position, entry in enumerate(soup_food.find_all(class_='listicle-page__content')):
        if position in country_overrides:
            name = country_overrides[position]
        else:
            heading = entry.find('b')
            if heading is None:
                raise ValueError(
                    'Entry %d on %s has no bold heading; pass a name for it in country_overrides'
                    % (position, url)
                )
            name = heading.text
        # Headings carry a trailing colon that is not part of the name
        countries.append(name.replace(':', ''))

    # Getting the recipe link from the anchor inside each call-to-action button
    recipe_links = [button.find('a')['href']
                    for button in soup_food.find_all(class_='listicle-page__cta-button')]
    return countries, recipe_links

In [209]:
# Collection page whose entries supply the country names and recipe links
url = 'https://www.tasteofhome.com/collection/travel-around-the-world-in-80-meals/'

In [210]:
# Fetch the collection page (network request) and pull out the country names and recipe links
countries, recipe_links = getCountryNameLink(url)

**Scrape each recipe link**

In [211]:
def listCountriesPages(recipe_links):
    '''
    This function takes in a list of links to recipe pages and returns, for each link,
    the text of the recipe's ingredient list split on single spaces
    ----
    input: list
    output: list (one list of tokens per link, in the same order as recipe_links)

    A page with no ingredient list, or with an empty one, contributes an empty
    token list so the result stays aligned with recipe_links.
    Raises requests.HTTPError when a recipe page request fails.
    '''
    food_pages = []
    for link in recipe_links:
        response = requests.get(link, timeout=30)
        # Fail loudly on an error response instead of parsing the error page
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        ingredient_lists = soup.find_all(class_='recipe-ingredients__list recipe-ingredients__collection')
        if ingredient_lists:
            # Only the first matching list on the page is read
            ingredients = [item.text for item in ingredient_lists[0].find_all('li')]
        else:
            ingredients = []
        # Build the text fresh for every page so nothing carries over from the previous recipe
        page_text = ' '.join(ingredients)
        food_pages.append(page_text.split(' ') if page_text else [])
    return food_pages

In [212]:
# Download every recipe page (one network request per link) and tokenize its ingredient list
food_pages = listCountriesPages(recipe_links)

**Pull out a list of foods, countries, and groups**

In [215]:
def countryFoodDF(df_common_food, food_pages, countries):
    '''
    Builds a table of the common foods found on each scraped page. Every food in
    df_common_food['FOOD NAME'] is tested with `food in page`; each hit becomes one row
    holding the food, its df_common_food['GROUP'] value and the country at the same
    position as the page.
    -----
    inputs: DataFrame, list, list
    outputs: DataFrame (columns: food, group, country)
    '''
    # One (food, group, country) triple per hit, page by page in food-table order
    matches = [
        (name, food_group, countries[position])
        for position, page in enumerate(food_pages)
        for name, food_group in zip(df_common_food['FOOD NAME'], df_common_food['GROUP'])
        if name in page
    ]
    return pd.DataFrame({
        'food': [match[0] for match in matches],
        'group': [match[1] for match in matches],
        'country': [match[2] for match in matches],
    })

In [219]:
# Match the common-food names against each recipe's ingredient tokens, tagging every hit with its country
df_country_foods_2 = countryFoodDF(df_common_food, food_pages, countries)

In [225]:
# Save the matched foods to disk; note this is the frame as it stands before cleanGroups is applied below
with open('data/df_country_foods_2.pickle', 'wb') as to_write:
    pickle.dump(df_country_foods_2, to_write)

In [226]:
def cleanGroups(df_country_foods):
    '''
    This function takes in a dataframe of countries, foods, and food groups. It drops the
    rows whose group is 'Herbs and spices', 'Dishes' or 'Fats and oils', relabels the
    remaining groups as protein, fruits, vegetables, grains, dairy or extra, and
    renumbers the index from 0. Groups without a mapping are left as they are, and
    the input dataframe is not modified.
    -----
    input: DataFrame
    output: DataFrame
    '''
    excluded_groups = ['Herbs and spices', 'Dishes', 'Fats and oils']
    # Source group names listed under the basic group they collapse into
    sources_by_basic_group = {
        'extra': ['Teas', 'Coffee and coffee products', 'Baking goods', 'Beverages',
                  'Confectioneries', 'Cocoa and cocoa products', 'Snack foods'],
        'protein': ['Pulses', 'Aquatic foods', 'Eggs', 'Animal foods', 'Nuts', 'Soy'],
        'fruits': ['Gourds', 'Fruits'],
        'vegetables': ['Vegetables'],
        'grains': ['Cereals and cereal products'],
        'dairy': ['Milk and milk products'],
    }
    replace_groups = {
        source: basic_group
        for basic_group, sources in sources_by_basic_group.items()
        for source in sources
    }
    kept_rows = df_country_foods[~df_country_foods.group.isin(excluded_groups)]
    relabelled = kept_rows.replace({'group': replace_groups})
    return relabelled.reset_index(drop=True)

In [227]:
# Drop the excluded groups and collapse the rest into the basic food groups (rebinds the same variable name)
df_country_foods_2 = cleanGroups(df_country_foods_2)