## Import things

In [7]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import re

In [26]:
import matplotlib.pyplot as plt

In [27]:
import pickle

## Read in Common Foods dataset

In [28]:
def cleanCommonFoods(url):
    '''
    This function takes in the url of the common foods dataset and outputs
    a clean dataframe with lower case food names. Missing food names are
    kept as missing values rather than raising an error.
    -----
    Input: string (anything pd.read_csv accepts: URL, path or file-like object)
    Output: DataFrame
    '''
    df_common_food = pd.read_csv(url)
    # The vectorised string accessor leaves NaN entries untouched, whereas
    # calling .lower() element by element fails on the first missing name
    df_common_food['FOOD NAME'] = df_common_food['FOOD NAME'].str.lower()
    return df_common_food

In [29]:
# Download link for the common foods CSV (read by cleanCommonFoods via pd.read_csv).
# NOTE: the name `url` is reassigned further down for the scraping target.
url = 'https://query.data.world/s/ssqiubyapfnrrhrf2uhqqezfzhlr6q'

In [30]:
# Load the common foods table with lower-cased food names
df_common_food = cleanCommonFoods(url)

In [31]:
# Persist the cleaned common foods table for later reuse
with open('data/df_common_food.pickle', 'wb') as pickle_out:
    pickle.dump(df_common_food, pickle_out)

## Web scraping foodbycountry.com

In [32]:
# Checking robots
# Requests the site's robots.txt; only the HTTP status is inspected below,
# the rules inside the file are not displayed by this cell
url = 'http://www.foodbycountry.com/robots.txt'
response = requests.get(url)
# Checking status code
response.status_code

200

In [33]:
# Site root. listCountriesPages reads this notebook-level `url` when it builds
# each country page address, so it must hold the site root before that call.
url = 'http://www.foodbycountry.com/'

In [34]:
def getCountryLinks(url, timeout=30):
    '''
    This function takes in the foodbycountry url and outputs a list of links
    to specific country pages
    -----
    input: string, number (optional request timeout in seconds, default 30)
    output: list
    raises: requests.HTTPError when the page answers with an error status
    '''
    # A timeout keeps the notebook from hanging forever on an unresponsive server
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty link list
    response.raise_for_status()
    soup_food = BeautifulSoup(response.text, 'lxml')
    # Getting the country entries based on their class tag
    country_items = soup_food.find_all(class_='list-group-item col-sm-4')
    # Skip matches without an href so callers never receive None in place of a link
    return [item.get('href') for item in country_items if item.get('href')]

In [35]:
# Collect the relative links to every country page from the site root
country_links = getCountryLinks(url)

In [36]:
def listCountriesPages(country_links, base_url=None):
    '''
    This function takes in a list of links to country pages and returns the
    countries in a list and the content of each page as a list of lower-cased,
    space-separated tokens
    ----
    input: list, string (optional site root; defaults to the notebook-level `url`)
    output: list, list
    '''
    if base_url is None:
        # Backward compatible default: earlier calls relied on the global `url`
        base_url = url
    countries = []
    food_pages = []
    for link in country_links:
        # Country name is the part of the link between the first '/' and the first '.'
        countries.append(link[link.find('/')+1:link.find('.')])
        response = requests.get(base_url + link)
        soup = BeautifulSoup(response.text, 'lxml')
        paragraphs = [p.text.replace('\n', ' ').lower() for p in soup.find_all('p')]
        # Join exactly once per page. The previous inner loop rebuilt the same
        # string for every paragraph and, for a page without any <p>, left the
        # text undefined or silently reused the previous country's text.
        page_text = ''.join(paragraphs)
        food_pages.append(page_text.split(' '))
    return countries, food_pages

In [37]:
# Scrape every country page: country names plus the tokenised page text
countries, food_pages = listCountriesPages(country_links)

## Create a DF of food, food group, and country

In [38]:
def countryFoodDF(df_common_food, food_pages, countries=None):
    '''
    This function takes in a dataframe of common foods and a list of text scraped from
    each country page. It returns a dataframe of all foods, food groups, and associated countries.
    A food counts as present when its name equals one of the page's tokens.
    -----
    inputs: DataFrame, list, list (optional country names aligned with food_pages;
            defaults to the notebook-level `countries`)
    outputs: DataFrame
    '''
    if countries is None:
        # Backward compatible default: earlier calls relied on the global `countries`
        countries = globals()['countries']
    # Build the (food, group) pairs once instead of re-zipping them for every page
    food_groups = list(zip(df_common_food['FOOD NAME'], df_common_food['GROUP']))
    list_foods = []
    list_countries = []
    list_groups = []
    for idx, page in enumerate(food_pages):
        # Membership tests against a set are constant time; only token lists are
        # converted so any other container keeps its own `in` semantics
        page_tokens = set(page) if isinstance(page, (list, tuple)) else page
        for food, group in food_groups:
            if food in page_tokens:
                list_foods.append(food)
                list_countries.append(countries[idx])
                list_groups.append(group)
    df_country_foods = pd.DataFrame({'food':list_foods, 'group':list_groups, 'country': list_countries})
    return df_country_foods

In [39]:
# One row per (food, country) match between the common foods list and the scraped pages
df_country_foods = countryFoodDF(df_common_food, food_pages)

In [40]:
# Persist the raw food/group/country matches
with open('data/df_country_foods.pickle', 'wb') as pickle_out:
    pickle.dump(df_country_foods, pickle_out)

## Cleaning Foods and Grouping

In [97]:
# Reload the frame saved by the scraping section so the cleaning below starts
# from the persisted copy.
# NOTE(review): this cell used to read 'data/df_country_foods_1.pickle' into a
# df_country_foods_1 variable that no visible cell uses, and no visible cell
# writes that file - confirm nothing else depends on it.
with open('data/df_country_foods.pickle', 'rb') as read_file:
    df_country_foods = pickle.load(read_file)

Paring down the food groups (5 main groups according to https://www.eatforhealth.gov.au/food-essentials/five-food-groups, plus an "other" group):

    1) breads, cereals, rice, pasta, noodles and other grains

    2) vegetables and legumes
    
    3) fruit

    4) milk, yoghurt, cheese and/or alternatives

    5) lean meat, fish, poultry, eggs, nuts and legumes.

    6) Other: Baking products, Sugar, confectionaries, beverages, coffee, tea, etc.

I'm going to remove herbs and spices on the grounds that they don't make up a large enough volume of the food.

Removing dishes, because foods are not specified.

Removing fats and oils, because these are encapsulated by other food groups


Pulse = legume = 5
Gourds = fruits = 3


Legumes are in both the vegetables category and the meats category (this is because nutritionally they can substitute for
vegetables). I'm going to leave them as a separate feature for now.


In [99]:
# Inspect the distinct group labels present in the data
df_country_foods.group.unique()

array(['Teas', 'Pulses', 'Coffee and coffee products', 'Gourds',
       'Herbs and Spices', 'Vegetables', 'Fruits',
       'Cereals and cereal products', 'Milk and milk products',
       'Baking goods', 'Beverages', 'Dishes', 'Aquatic foods', 'Eggs',
       'Confectioneries', 'Cocoa and cocoa products', 'Animal foods',
       'Nuts', 'Snack foods', 'Soy', 'Fats and oils'], dtype=object)

In [43]:
# Maps each source food group onto one of the consolidated groups used in the
# analysis. 'Herbs and Spices', 'Dishes' and 'Fats and oils' are intentionally
# absent: those rows are dropped rather than remapped.
# (A duplicated 'Beverages' entry was removed; the mapping itself is unchanged.)
replace_groups = {
    'Teas': 'extra',
    'Pulses': 'legumes',
    'Coffee and coffee products': 'extra',
    'Gourds': 'fruits',
    'Vegetables': 'vegetables',
    'Fruits': 'fruits',
    'Cereals and cereal products': 'grains',
    'Milk and milk products': 'dairy',
    'Baking goods': 'extra',
    'Beverages': 'extra',
    'Aquatic foods': 'meats',
    'Eggs': 'meats',
    'Confectioneries': 'extra',
    'Cocoa and cocoa products': 'extra',
    'Animal foods': 'meats',
    'Nuts': 'meats',
    'Snack foods': 'extra',
    'Soy': 'meats',
}

In [44]:
# Drop the groups that are excluded from the analysis in a single pass
excluded_groups = ['Herbs and Spices', 'Dishes', 'Fats and oils']
is_kept = ~df_country_foods['group'].isin(excluded_groups)
df_country_foods = df_country_foods[is_kept]

In [45]:
# Relabel the source groups with their consolidated names (only the 'group' column is touched)
relabelled_group = df_country_foods['group'].replace(replace_groups)
df_country_foods = df_country_foods.assign(group=relabelled_group)

In [46]:
# Renumber the rows 0..n-1 after the filtering above; drop=True discards the old labels
df_country_foods = df_country_foods.reset_index(drop = True)

Putting legumes in the meat group and renaming this group "protein"

In [47]:
# Merge the intermediate 'legumes' and 'meats' labels into one 'protein' group.
# The source must be df_country_foods itself: df_country_groups is only created
# later by the groupby cell, so referencing it here fails on a fresh kernel (or
# silently swaps in a stale aggregated frame). The replacement is limited to the
# 'group' column so food or country values can never be rewritten by accident.
df_country_foods = df_country_foods.replace({'group': {'legumes': 'protein', 'meats': 'protein'}})

In [48]:
# Sanity check: list the group labels that remain after consolidation
df_country_foods.group.unique()

array(['dairy', 'extra', 'fruits', 'grains', 'protein', 'vegetables'],
      dtype=object)

**A few notes thus far**: 

    1) I'm removing the granular regions below to be able to better group the data. The process of this
        has caused me to think a bit more about food culture. Grouping all of the US together is a huge
        assumption. Not sure this will turn up many signals. I have a new thought:
        New Thought - Grouping by cultural origin. I actually think that some of these countries which are still
        fairly siloed may turn out a better signal
    
    2) I'm also not sure if these groups are grouping the right nutritional patterns. Might be worth it to find
        a dataset of nutritional content in these foods, then group by that

## Cleaning the countries

In [49]:
# Country names are derived from page links, so hyphens stand in for spaces.
# The vectorised literal replace (regex=False) also tolerates missing values,
# which the element-wise comprehension did not.
df_country_foods['country'] = df_country_foods['country'].str.replace('-', ' ', regex=False)

In [50]:
# Collapse every entry whose name starts with "United States" into one country.
# A boolean mask is label-safe; pairing enumerate() positions with .loc is only
# correct while the index happens to be exactly 0..n-1.
is_us_entry = df_country_foods['country'].str.startswith('United States', na=False)
df_country_foods.loc[is_us_entry, 'country'] = 'United States'

In [51]:
# Collapse every entry whose name starts with "Brazil" into one country.
# A boolean mask is label-safe; pairing enumerate() positions with .loc is only
# correct while the index happens to be exactly 0..n-1.
is_brazil_entry = df_country_foods['country'].str.startswith('Brazil', na=False)
df_country_foods.loc[is_brazil_entry, 'country'] = 'Brazil'

In [52]:
# Collapse every entry whose name starts with "Australia" into one country.
# A boolean mask is label-safe; pairing enumerate() positions with .loc is only
# correct while the index happens to be exactly 0..n-1.
is_australia_entry = df_country_foods['country'].str.startswith('Australia', na=False)
df_country_foods.loc[is_australia_entry, 'country'] = 'Australia'

In [53]:
# Collapse every entry whose name starts with "Canada" into one country.
# A boolean mask is label-safe; pairing enumerate() positions with .loc is only
# correct while the index happens to be exactly 0..n-1.
is_canada_entry = df_country_foods['country'].str.startswith('Canada', na=False)
df_country_foods.loc[is_canada_entry, 'country'] = 'Canada'

Grouping by count of foods in each food group

In [54]:
# Count the matched foods per (country, group) pair, then flatten back to columns
food_counts = df_country_foods.groupby(['country', 'group'])['food'].count()
df_country_groups = food_counts.reset_index()

Transforming to get normalized count

In [55]:
# Turn each count into the group's share of all foods matched for that country
country_totals = df_country_groups.groupby('country')['food'].transform('sum')
df_country_groups['food'] = df_country_groups['food'] / country_totals

In [56]:
# Persist the per-country group shares
with open('data/df_country_groups.pickle', 'wb') as pickle_out:
    pickle.dump(df_country_groups, pickle_out)

## Combining Mean Height Dataset

In [58]:
# Reload the per-country group shares saved above
with open('data/df_country_groups.pickle', 'rb') as pickle_in:
    df_country_groups = pickle.load(pickle_in)

In [59]:
# Mean height table; the .txt file is parsed with read_csv's default comma separator
df_height = pd.read_csv('data/height_country.txt')

In [60]:
# Latest birth year available in the height data
df_height['Year of birth'].max()

1996

In [61]:
# Filtering only most recent mean height. The year is derived from the data
# instead of hardcoding 1996, so a refreshed height file keeps working.
latest_birth_year = df_height['Year of birth'].max()
df_height_recent = df_height[df_height['Year of birth'] == latest_birth_year]

In [62]:
# Shorten the verbose height column names; 'country' matches the key used by the food table
height_column_names = {
    'Country': 'country',
    'Year of birth': 'year',
    'Mean height (cm)': 'mean',
    'Mean height lower 95% uncertainty interval (cm)': 'lower95',
    'Mean height upper 95% uncertainty interval (cm)': 'upper95',
}
df_height_recent = df_height_recent.rename(columns=height_column_names)

In [63]:
# Left join on country: every food-group row is kept, and height columns are
# missing for countries that are absent from the height table
groups_by_country = df_country_groups.set_index('country')
height_by_country = df_height_recent.set_index('country')
combined_groups = groups_by_country.join(height_by_country, how='left')

Putting legumes in the meats group, because it is essentially a "protein group"

In [66]:
# Split the joined table by sex; rows without a height match have no 'Sex' value and land in neither frame
combined_men_group = combined_groups.loc[combined_groups['Sex'].eq('Men')]
combined_women_group = combined_groups.loc[combined_groups['Sex'].eq('Women')]

In [67]:
# Columns dropped from the per-sex frames in the next step
cols = ['ISO', 'Sex', 'year']

In [68]:
# Remove the identifier columns listed in `cols` from both per-sex frames
combined_men_group = combined_men_group.drop(columns=cols)
combined_women_group = combined_women_group.drop(columns=cols)

In [82]:
# Keep only the group label and its share; these become the pivot inputs
feature_columns = ['food', 'group']
food_men = combined_men_group.loc[:, feature_columns]
food_women = combined_women_group.loc[:, feature_columns]

In [None]:
# Reshape to one row per index entry (country) and one column per food group,
# holding the 'food' share; combinations that do not occur become NaN
food_men = food_men.pivot(columns = 'group', values = 'food')

In [85]:
# Reshape to one row per index entry (country) and one column per food group,
# holding the 'food' share; combinations that do not occur become NaN
food_women = food_women.pivot(columns = 'group', values = 'food')

In [87]:
# Note: there are still a lot of food groups with zero...this is a reason to fill out dataset
# A missing share means no food of that group was matched for the country, so use 0
food_men = food_men.fillna(0)
food_women = food_women.fillna(0)

In [88]:
# Mean height per country ('country' is the index level here). The height value
# is presumably repeated on every food-group row of a country, so averaging just
# collapses the duplicates - TODO confirm one height row per country and sex.
y_men = combined_men_group.groupby('country')['mean'].mean()
y_women = combined_women_group.groupby('country')['mean'].mean()

In [89]:
# Attach the height target to the pivoted food-group features (joined on the index).
# NOTE: this rebinds combined_men_group / combined_women_group to the wide frames.
combined_men_group = food_men.join(y_men, how = 'left')
combined_women_group = food_women.join(y_women, how = 'left')

In [91]:
# Persist the women's feature/target table
with open('data/combined_women_group.pickle', 'wb') as pickle_out:
    pickle.dump(combined_women_group, pickle_out)

In [92]:
# Persist the men's feature/target table
with open('data/combined_men_group.pickle', 'wb') as pickle_out:
    pickle.dump(combined_men_group, pickle_out)

In [168]:
#Tried to use Selenium for something that BeautifulSoup can do better
#driver = webdriver.Chrome()
#driver.get(url)

In [169]:
##country_algeria=driver.find_element_by_class_name('list-group-item col-sm-4') # Find a country name
##country_algeria.send_keys() # Click it