## Import things

In [54]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import chromedriver_binary
import re

## Read in Common Foods dataset

In [2]:
# Load the Common Foods table directly from its data.world query URL (requires network access)
df_food = pd.read_csv('https://query.data.world/s/ssqiubyapfnrrhrf2uhqqezfzhlr6q')

In [64]:
# Mainly using the food name column, so cast it to lower case to ensure accurate lookup.
# The vectorised .str accessor leaves a missing name as NaN, whereas calling
# .lower() on every element raises AttributeError on the first non-string value.
df_food['FOOD NAME'] = df_food['FOOD NAME'].str.lower()

In [65]:
# Preview the first rows to confirm the food names are now lower case
df_food.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP
0,angelica,Angelica keiskei,Herbs and Spices,Herbs
1,savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages
2,silver linden,Tilia argentea,Herbs and Spices,Herbs
3,kiwi,Actinidia chinensis,Fruits,Tropical fruits
4,allium (onion),Allium,Vegetables,Onion-family vegetables


## Web scraping foodbycountry.com (foodsaroundtheworld)

In [11]:
# Request the site's robots.txt before scraping
url = 'http://www.foodbycountry.com/robots.txt'
response = requests.get(url)
# Last expression in the cell, so the notebook displays the HTTP status code.
# NOTE(review): only the status code is inspected; the robots.txt rules
# themselves (response.text) are never read here.
response.status_code

In [13]:
# Base URL of the site. Replaces the robots.txt URL assigned earlier; it is passed to
# getCountryLinks and also read as a global inside listCountriesPages.
url = 'http://www.foodbycountry.com/'

In [329]:
def getCountryLinks(url):
    '''
    Download the foodbycountry landing page and collect the href of every
    element tagged as a country entry.
    -----
    input: string (url of the landing page)
    output: list (one href per matching element, in page order)
    '''
    landing_page = requests.get(url)
    parsed = BeautifulSoup(landing_page.text, 'lxml')
    # Country entries are located by their class attribute value
    entries = parsed.find_all(class_= 'list-group-item col-sm-4')
    country_links = []
    for entry in entries:
        country_links.append(entry.get('href'))
    return country_links

In [330]:
# Scrape the landing page once; the resulting hrefs drive the per-country requests below
country_links = getCountryLinks(url)

In [323]:
def listCountriesPages(country_links):
    '''
    This function takes in a list of links to country pages and returns the
    country names in a list and the lower-cased paragraph text of each page
    in a list. Both lists follow the order of country_links.
    ----
    input: list
    output: list, list (countries, food_pages)
    '''
    countries = []
    food_pages = []
    # Iterate over the argument; the previous version looped over an
    # undefined name (`country_table`) and only ran with leftover notebook state.
    for link in country_links:
        # Country name is the text between the first '/' and the first '.' of the link
        countries.append(link[link.find('/')+1:link.find('.')])
        # `url` is the module-level base URL of the site
        country_url = url + link
        response = requests.get(country_url)
        soup = BeautifulSoup(response.text, 'lxml')
        paragraphs = [p.text.replace('\n', ' ').lower() for p in soup.find_all('p')]
        # Join once per page. A page without <p> tags yields '' instead of
        # re-using the previous page's text (or raising on the first page).
        food_pages.append(''.join(paragraphs))
    return countries, food_pages

In [324]:
# One HTTP request per country link; countries[i] is the name for food_pages[i]
countries, food_pages = listCountriesPages(country_links)

## Combining foodsaroundtheworld and Common Foods

In [333]:
def countryFoodDict(df_food, food_pages, countries=None):
    '''
    This function takes in a dataframe of common foods and a list of text scraped from
    each country page. It returns a dictionary dict[country]: list(foods)

    A food is attributed to a country when its name occurs anywhere in that
    country's page text. This is a plain substring test, so short names can
    match inside longer words (for example 'gin' inside 'ginger').
    -----
    inputs: DataFrame (needs a 'FOOD NAME' column), list, list (optional)
        countries: country names paired positionally with food_pages. When
        omitted, the module-level `countries` list is used, which is what
        this function always relied on before the parameter existed.
    outputs: dictionary (countries with no matching food are absent)
    '''
    if countries is None:
        # Backward compatible fallback for callers that pass only two arguments
        countries = globals()['countries']
    all_countries = {}
    for food in df_food['FOOD NAME']:
        # Missing names (NaN) are not strings and cannot be searched for in text
        if not isinstance(food, str):
            continue
        for country, page in zip(countries, food_pages):
            if food in page:
                all_countries.setdefault(country, []).append(food)
    return all_countries

In [334]:
# Map each country to the common foods whose names occur in its scraped page text
all_countries = countryFoodDict(df_food, food_pages)

In [344]:
# Display the country -> foods mapping for a visual sanity check
all_countries

{'Pakistan': ['kiwi',
  'garlic',
  'oat',
  'tea',
  'papaya',
  'chickpea',
  'watermelon',
  'lime',
  'coriander',
  'saffron',
  'cumin',
  'lentils',
  'apple',
  'mango',
  'nutmeg',
  'rice',
  'date',
  'pistachio',
  'apricot',
  'almond',
  'potato',
  'corn',
  'ginger',
  'banana',
  'okra',
  'yam',
  'wheat',
  'cinnamon',
  'yogurt',
  'ice cream',
  'gin',
  'rum',
  'cake',
  'sauce',
  'salt',
  'cream',
  'sugar',
  'meatball',
  'water',
  'dried milk',
  'chili',
  'stew',
  'spread',
  'relish',
  'flour',
  'cabbage',
  'herbs and spices',
  'fruits'],
 'Philippines': ['kiwi',
  'garlic',
  'pineapple',
  'peanut',
  'oat',
  'tea',
  'papaya',
  'chickpea',
  'lime',
  'lemon',
  'apple',
  'mango',
  'olive',
  'rice',
  'guava',
  'pear',
  'eggplant',
  'potato',
  'mung bean',
  'corn',
  'ginger',
  'banana',
  'buffalo',
  'coconut',
  'shrimp',
  'persimmon',
  'cheese',
  'eggs',
  'bean',
  'gin',
  'vinegar',
  'cake',
  'pastry',
  'sauce',
  'salt',

In [348]:
def uniqueFoods(all_countries):
    '''
    This function takes in a dictionary of paired countries and foods. It returns all unique foods.
    Countries are visited in sorted key order and each food keeps the position
    of its first appearance.
    ----
    input: dictionary
    output: list
    '''
    # A dict preserves insertion order and gives constant-time membership,
    # replacing a repeated linear scan of the result list.
    seen = {}
    for country in sorted(all_countries):
        for food in all_countries[country]:
            seen.setdefault(food, None)
    return list(seen)

In [349]:
# De-duplicated food names; these become the indicator columns of the final frame
foods = uniqueFoods(all_countries)

## Final DF = countries, and binary classification of whether common foods show up in popular recipes

In [353]:
def finaldf(foods, countries, all_countries):
    '''
    This function takes in a list of unique foods and countries as well as the paired
    dictionary of country: foods. It returns a dataframe with one row per entry of
    `countries` (in the given order), one 0/1 column per food and a final
    'country' column. A cell is 1 if the associated country has the food, else 0.

    Rows are built from `countries` itself, so every label always sits on its
    own row. The previous version built rows in sorted-key order of
    `all_countries` and then attached `countries` as labels, which mislabelled
    rows when the two orders differed and raised ValueError when a country had
    no matched food. Countries absent from `all_countries` now get an all-zero row.
    -----
    inputs: list, list, dictionary
    output: DataFrame
    '''
    # One set per country for constant-time membership tests
    foods_by_row = [set(all_countries.get(country, ())) for country in countries]
    all_df = {}
    for food in foods:
        all_df[food] = [1 if food in present else 0 for present in foods_by_row]
    all_df['country'] = list(countries)
    final_df = pd.DataFrame(all_df)
    return final_df

In [354]:
# Build the country x food indicator table
final_df = finaldf(foods, countries, all_countries)

In [355]:
# Preview the first rows of the indicator table
final_df.head()

Unnamed: 0,garlic,oat,rape,tea,chickpea,coffee,coriander,saffron,cucumber,cumin,...,marshmallow,lard,dungeness crab,burrito,star anise,shallot,abalone,butternut,butternut squash,country
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,Algeria
1,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Argentina
2,0,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Australia
3,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Australia-Aborigines-and-Bush-Tucker
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Brazil


In [319]:
import pickle

In [320]:
# Pickle output for later.
# open() does not create directories, so the data/ folder must already exist.
with open('data/food_df.pickle', 'wb') as to_write:
    pickle.dump(final_df, to_write)

In [357]:
# Load the mean-height-by-country table from a local file (parsed with read_csv defaults)
df_height = pd.read_csv('data/height_country.txt')

In [358]:
# Preview the first rows of the height data
df_height.head()

Unnamed: 0,Country,ISO,Sex,Year of birth,Mean height (cm),Mean height lower 95% uncertainty interval (cm),Mean height upper 95% uncertainty interval (cm)
0,Afghanistan,AFG,Men,1896,161.164095,154.484285,167.754033
1,Afghanistan,AFG,Men,1897,161.196286,154.571603,167.659618
2,Afghanistan,AFG,Men,1898,161.228297,154.70734,167.602576
3,Afghanistan,AFG,Men,1899,161.260727,154.835644,167.528113
4,Afghanistan,AFG,Men,1900,161.293068,154.95954,167.508077


In [359]:
# Check column dtypes and non-null counts
df_height.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40400 entries, 0 to 40399
Data columns (total 7 columns):
Country                                            40400 non-null object
ISO                                                40400 non-null object
Sex                                                40400 non-null object
Year of birth                                      40400 non-null int64
Mean height (cm)                                   40400 non-null float64
Mean height lower 95% uncertainty interval (cm)    40400 non-null float64
Mean height upper 95% uncertainty interval (cm)    40400 non-null float64
dtypes: float64(3), int64(1), object(3)
memory usage: 2.2+ MB


In [360]:
# Find the most recent birth year available in the data
df_height['Year of birth'].max()

1996

In [362]:
# Keep only rows for birth year 1996.
# NOTE(review): 1996 is hard-coded; it equals the current maximum of
# 'Year of birth', but will not follow the data if the file is updated.
df_height_recent = df_height[df_height['Year of birth'] == 1996]

In [364]:
# List the column names before renaming them
df_height_recent.columns

Index(['Country', 'ISO', 'Sex', 'Year of birth', 'Mean height (cm)',
       'Mean height lower 95% uncertainty interval (cm)',
       'Mean height upper 95% uncertainty interval (cm)'],
      dtype='object')

In [377]:
# Shorten the verbose column names; 'country' is lower-cased so it matches
# the key column of the food table.
df_height_recent = df_height_recent.rename(
    columns={
        'Year of birth': 'year',
        'Mean height (cm)': 'mean',
        'Mean height lower 95% uncertainty interval (cm)': 'lower95',
        'Mean height upper 95% uncertainty interval (cm)': 'upper95',
        'Country': 'country',
    }
)

In [378]:
# Confirm the renamed columns
df_height_recent.head()

Unnamed: 0,country,ISO,Sex,year,mean,lower95,upper95
100,Afghanistan,AFG,Men,1996,165.255861,159.320913,171.273802
201,Albania,ALB,Men,1996,173.388856,171.386749,175.400316
302,Algeria,DZA,Men,1996,170.0718,167.129743,172.848693
403,American Samoa,ASM,Men,1996,176.045121,172.974744,179.083712
504,Andorra,AND,Men,1996,176.060167,170.108021,182.0402


In [374]:
# Renaming final to call it easier in future notebooks
food_df = final_df

In [379]:
# Left join on the country name: every food row is kept, and a country whose name has
# no exact match in the height table gets NaN in all height columns.
# NOTE(review): the height table presumably has one row per sex per country, so
# matched countries appear once per sex - confirm.
combined = food_df.set_index('country').join(df_height_recent.set_index('country'), how = 'left')

In [381]:
# Split by sex. Rows left unmatched by the join have NaN in 'Sex', fail both
# comparisons, and are therefore dropped from both frames.
combined_men = combined[combined['Sex'] =='Men']
combined_women = combined[combined['Sex'] == 'Women']

In [383]:
# Check how many countries survived the join and filter
combined_men.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Algeria to Zimbabwe
Columns: 298 entries, garlic to upper95
dtypes: float64(4), int64(292), object(2)
memory usage: 109.8+ KB


In [385]:
# Drop the identifier columns; the food indicators and mean/lower95/upper95 remain
combined_men = combined_men.drop(columns = ['ISO', 'year','Sex'])

In [386]:
# Same column clean-up for the women's frame
combined_women = combined_women.drop(columns = ['ISO', 'year','Sex'])

In [387]:
# Persist the men's frame; the data/ folder must already exist
with open('data/combined_men.pickle', 'wb') as to_write:
    pickle.dump(combined_men, to_write)

In [388]:
# Persist the women's frame; the data/ folder must already exist
with open('data/combined_women.pickle', 'wb') as to_write:
    pickle.dump(combined_women, to_write)

In [168]:
#Tried to use Selenium for something that BeautifulSoup can do better
#driver = webdriver.Chrome()
#driver.get(url)

In [169]:
##country_algeria=driver.find_element_by_class_name('list-group-item col-sm-4') # Find a country name
##country_algeria.send_keys() # Click it