In [2]:
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import matplotlib.pyplot as plt
import requests_html
from requests_html import HTMLSession, AsyncHTMLSession
import pathlib
import time
import datetime
import random
import os
from tqdm import tqdm
import altair as alt
import AP_Apple_funcs as aaf

sns.set(style='ticks')
plt.style.use('dark_background') # use this if plotting in a dark themed notebook
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
url_dir_path = aaf.convert_path(r"C:\Users\CDT - Admin\PycharmProjects\AP_Apple_Meal_Recommendations\Combined_CK_URLs.csv")

combined_urls = pd.read_csv(url_dir_path)
url_list = combined_urls['calorieking_urls'].tolist()

url_s = pd.Series(url_list)
print('With duplicates:', len(url_s))
url_s = url_s.drop_duplicates()
print('Without duplicates:',len(url_list))

With duplicates: 2065
Without duplicates: 2065


In [5]:
def scrape_data(url):

    print('Pulling data...')

    headers = {

        'authority': 'UVA Center For Diabetes Technology (asl4af@virginia.edu)',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8'}

    session = HTMLSession()
    r2 = session.get(url,headers=headers)

    r = requests.get(url,headers)
    ck_df = pd.read_html(r.text)
    #ck_df[1]

    nutrition_dict = {
        'brand_name': r2.html.find('li:nth-of-type(5) a.MuiLink-underlineHover',first=True).text,
        'meal_type': r2.html.find('li:nth-of-type(7) a',first=True).text,
        'item_name': r2.html.find('a.jss367',first=True).text,
        'serving_size': r2.html.find('.MuiInput-underline div',first=True).text,
        'servings_per_container': r2.html.find('.MuiInput-underline input.MuiInputBase-input',first=True).text,
        'metrics': ck_df[1][0],
        'values': ck_df[1][1]}

    nutrition_df = pd.DataFrame.from_dict(nutrition_dict)
    nutrition_df['metrics'] = nutrition_df['metrics'].str.replace('\d+','')
    nutrition_df.iloc[0][4] = ck_df[1].iloc[0,0].split()[1]

    nutrition_df_formatted = nutrition_df.pivot('item_name','metrics','values')
    nutrition_df_formatted = nutrition_df_formatted.reset_index()
    nutrition_df_formatted = nutrition_df_formatted.drop('% DV*',axis=1)
    nutrition_df_formatted.columns = nutrition_df_formatted.columns.str.strip()
    nutrition_df_formatted['Brand Name'] = nutrition_dict['brand_name']
    nutrition_df_formatted['Meal Type'] = nutrition_dict['meal_type']
    nutrition_df_formatted['Serving Size'] = nutrition_dict['serving_size']
    nutrition_df_formatted['Servings Per Container'] = nutrition_dict['servings_per_container']
    nutrition_df_formatted['Source URL'] = url

    session.close()

    output_dict = {'nutrition_df_formatted': nutrition_df_formatted,
                  'nutrition_df': nutrition_df}

    return output_dict

In [12]:
web_data_list = []
formatted_list = []

for link in tqdm(url_list):

  sleep_time_varying = random.randint(7,13)
  print()
  print(f"Scraping data for: {link}...",'\n')

  wb_data = scrape_data(link)
  web_data_list.append(wb_data)
  formatted_list.append(wb_data['nutrition_df_formatted'])
  #print(wb_data)
  #print(formated_list)
  print(wb_data['nutrition_df_formatted'],'\n')

  print(f"Percentage Complete: {(len(formatted_list) / len(url_list)) * 100}")

  print(f"Data pull complete going to sleep for {sleep_time_varying} seconds...",'\n')
  time.sleep(sleep_time_varying)


  nutrition_df['metrics'] = nutrition_df['metrics'].str.replace('\d+','')
  nutrition_df['metrics'] = nutrition_df['metrics'].str.replace('\d+','')
  0%|          | 1/2065 [00:24<13:54:08, 24.25s/it]



Scraping data for: https://www.calorieking.com/us/en/foods/f/calories-in-packaged-meals-homestyle-bakes-cheesy-chicken-alfredo/NN09Jb9cSemBAFpnXvJ_qw... 

Pulling data...
metrics                               item_name Alcohol Calcium   Calories  \
0        Homestyle Bakes Cheesy Chicken Alfredo     0 g  150 mg  (1714 kJ)   

metrics Cholesterol Dietary Fiber  Iron Protein Saturated Fat  Sodium  ...  \
0             35 mg           3 g  3 mg    15 g           7 g  880 mg  ...   

metrics Total Carbohydrate Total Fat Trans Fat Vitamin A Vitamin C Brand Name  \
0                     40 g      21 g       0 g    200 mg      2 mg    Banquet   

metrics       Meal Type      Serving Size Servings Per Container  \
0        Packaged Meals  serving (7.8 oz)                          

metrics                                         Source URL  
0        https://www.calorieking.com/us/en/foods/f/calo...  

[1 rows x 21 columns] 

Percentage Complete: 0.048426150121065374
Data pull complete going 

KeyboardInterrupt: 

In [None]:
fmt_df = pd.concat(formatted_list)
fmt_df.shape
fmt_df.head()

In [None]:
output_file_name = f"CalorieKing_nutrition_data_20210323_{random.randint(0,5000)}.csv"
fmt_df.to_csv(output_file_name,index=False)

# STOP HERE AND WE WILL PREPROCESS THE DATA IN THE NEXT STEP
