In [5]:
from bs4 import BeautifulSoup as Soup
import requests 
import pandas as pd
import numpy as np
from time import sleep

In [62]:
dataframe_columns = ['Name', 'Servings Per Container', 'Calories', 'Calories From Fat', 'Total Fat', 'Saturated Fat', 'Trans Fat', 'Polyunsaturated Fat', 'Monounsaturated Fat', 'Cholesterol', 'Sodium', 'Total Carbohydrate', 'Dietary Fiber', 'Sugars', 'Protein', 'Vitamin A', 'Vitamin C', 'Calcium', 'Iron', 'Image Link']

In [96]:
def clean_text(text):
    idx = text.lower().find('amount per serving')
    if idx != -1:
        text = text[:idx] + text[idx + len('amount per serving'):]
    text = text.replace("(-)", "0")
    text = text.replace("-", "")
    text = text.strip()
    return text

def get_safeway_data_by_pid(pid):
    response = requests.get(f"https://www.safeway.com/shop/product-details.{pid}.html")    
    return response

def get_product_data_from_html_response_approach_1(response):
    # Setup
    data = {'Success': True}
    
    # Use BeautifulSoup to parse the response
    soup = Soup(response.text)

    # Get product name
    product_name = soup.find('title')
    data['Name'] = product_name.text.replace(' - Safeway', '')

    # Find html table with info on product
    table = soup.find('table', attrs={'class':'tableOfIngredients'})
    if not table:
        data['Success'] = False
        return data
    table_body = table.find('tbody')

    # get serving info
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td', attrs={'class':'table-ingredients-head body-s body-s--semi-bold'})
        if cols:
            serving_info = clean_text(cols[0].text).split('\n')
            spc_idx = serving_info[1].find('Servings Per Container: ')
            if spc_idx == -1:
                continue
            servings_per_container = serving_info[1][spc_idx + len('Servings Per Container: '):].strip()
            data["Servings Per Container"] = servings_per_container

    # get nutritional info
    rows = table_body.find_all('tr')
    for row in rows:
        header_cols = row.find_all('th')
        data_cols = row.find_all('td', attrs={'class':'table-ingredients-text'})
        if not header_cols and not data_cols:
            continue
        if header_cols:
            data[clean_text(header_cols[0].text)] = clean_text(data_cols[-2].text)
        elif clean_text(data_cols[1].text):
            data[clean_text(data_cols[0].text)] = clean_text(data_cols[-2].text)
        # skip the things that have only % Daily Values (do not raw amounts)
    return data

def get_product_data_from_html_response_approach_2(response):
    # Setup
    data = {'Success': True}
    
    # Use BeautifulSoup to parse the response
    soup = Soup(response.text)

    # Get product name
    product_name = soup.find('title')
    data['Name'] = product_name.text.replace(' - Safeway', '')

    # Find table with info
    table = soup.find('table')
    if not table or table.find('caption').text != 'Nutrition Facts':
        data['Success'] = False
        return data
    
    # get serving info
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('th')
        if cols:
            serving_info = clean_text(cols[0].text).split('\n')
            spc_idx = serving_info[0].find('Servings Per Container')
            if spc_idx == -1:
                continue
        tds = row.find_all('td')
        if tds:
            servings_per_container = clean_text(tds[0].text)
            data["Servings Per Container"] = servings_per_container

    # get nutritional info
    rows = table.find_all('tr')
    for row in rows[:22]:
        header_cols = row.find_all('th')
        # skip if weirdly formatted 
        if not header_cols:
            continue
        # extract nutrient name
        nutrient = clean_text(header_cols[0].text)
        # Handle 'Calories' exception separately
        if nutrient.find('Calories') != -1 and 'Calories' not in data.keys():
            tds = row.find_all('td')
            calories = clean_text(tds[-1].text)
            data['Calories'] = calories
        # Handle all other nutrients
        for nutrient_name in dataframe_columns[4:-5]: 
            nutrient_name_mod = nutrient_name.lower().replace('Total', '').strip()
            if nutrient.lower().find(nutrient_name_mod) != -1:
                nutrient_value = clean_text(nutrient.lower().replace(nutrient_name_mod, ''))
                data[nutrient_name] = nutrient_value
    # handle nutrients we aren't getting from this
    for nutrient in ['Calories From Fat', 'Polyunsaturated Fat', 'Monounsaturated Fat', 'Vitamin A', 'Vitamin C', 'Calcium', 'Iron']:
        data[nutrient] = 0
    return data

def get_product_data_from_html_response(response):
    data = get_product_data_from_html_response_approach_1(response)
    # try other approach if og fails
    if data['Success'] == False:
        data = get_product_data_from_html_response_approach_2(response)
    # if still can't process, print failure message & return False 
    if data['Success'] == False:
        print('failed to find ingredients tab for: ', data['Name'])
        return False
    del data['Success']
    return data

In [99]:
# Create dataframe to store safeway data
product_data = pd.DataFrame(columns=dataframe_columns)

def run_data_processing(start=0, load_existing_data=False):
    # Load valid_pids at this moment
    valid_pids = np.load('valid_pids.npy')
    print(valid_pids, len(valid_pids))
    
    # Option to load saved df before continuing (prevent overwrite of data)
    global product_data
    if load_existing_data:
        product_data = pd.read_csv('Safeway Product Data.csv', index_col=0)
        
    # Option to skip a number of PIDs 
    i = start
    num_bad = 0
    while i < len(valid_pids):
        pid = valid_pids[i]
        # print current pid for debugging and progress info
        print(f'\r+{i-num_bad-start}/{i-start} ({pid})', flush=True, end="")
        # get safeway data
        response = get_safeway_data_by_pid(pid)
        # skip 'bad' responses
        if response.status_code != 200:
            print(f'bad response from safeway website (error code: {response.status_code})')
            i, num_bad = i + 1, num_bad + 1
            continue
        # adjust for products we've already seen
        previously_processed = (load_existing_data) and (pid in product_data.index)
        if previously_processed:
            i += 1
            continue
        data = get_product_data_from_html_response(response)
        # skip products that we can't get data from
        if not data:
#             print('no data was found from the website')
            i, num_bad = i + 1, num_bad + 1
            continue
        data['Image Link'] = f"https://images.albertsons-media.com/is/image/ABS/{pid}"
#         print(len(data.keys()), len(product_data.columns))
#         print([key for key in product_data.columns if key not in data.keys()])
        product_data.loc[pid, :] = data
        # Save dataframe as we go
        if i % 10 == 0:
            product_data.to_csv('Safeway Product Data.csv')
        # Prep for next loop
        i += 1
        # Refresh valid_pids as we go - assumes we're updating valid pids in Safeway Data Sourcing simultaenously
        while i == len(valid_pids): 
            sleep(1)
            added_pids = np.load('valid_pids.npy')
            if len(valid_pids) != len(added_pids):
                break
            else:
                i += 1
    print(f"Collected {len(product_data)} Products")
    return i

run_data_processing(start=0, load_existing_data=True)

[960088704 960094320 960138645 ... 125010049 960138612 960120462] 26600
+0/0 (960088704)failed to find ingredients tab for:  Cottonelle Flushable Wet Wipes Refills Pack - 168 Count
+0/1 (960094320)failed to find ingredients tab for:  Seafood Service Counter Fish Catfish Whole Fresh - 1.50 Lbs.
+1/3 (960064541)failed to find ingredients tab for:  Aveeno Active Naturals Pure Renewal Conditioner For All Hair Types - 10.5 Fl. Oz.
+1/4 (960103220)failed to find ingredients tab for:  Dove Advanced Care Antiperspirant Deodorant Stick Beauty Finish - 2.6 Oz
+4/8 (960110159)failed to find ingredients tab for:  Energizer MAX Batteries AA Alkaline - 16 Count
+5/10 (960286133)failed to find ingredients tab for:  Aveeno Active Naturals Body Wash Daily Moisturizing Yogurt Apricot & Honey - 18 Fl. Oz.
+8/14 (132070179)failed to find ingredients tab for:  Fancy Feast Cat Food Dry Ocean Fish & Salmon And Accents Of Garden Greens - 3 Lb
+9/16 (153200553)failed to find ingredients tab for:  Clean & Clear

+137/210 (960021635)failed to find ingredients tab for:  Beringer Main & Vine Wine Sauvignon Blanc California - 750 Ml
+137/211 (960026864)failed to find ingredients tab for:  Seafood Service Counter Crab Dungeness Whole Cooked Frozen 1 Count - 2.50 LB
+138/213 (960206577)failed to find ingredients tab for:  Cloralen Bathroom Cleaner with Bleach Fresh Scent - 22 Fl. Oz.
+142/218 (960149967)failed to find ingredients tab for:  Stuffing Mix - 20 Oz
+144/221 (960430530)failed to find ingredients tab for:  Clrx Atbc Blch/Blue F - 2 Count
+145/223 (960301684)failed to find ingredients tab for:  Maison Fortant Rose De Provence Wine - 750 Ml
+148/227 (960142756)failed to find ingredients tab for:  Seafood Service Counter Dockside Classics Crab Cake Hot & Spicy 1 Count - 2 Oz
+149/229 (960187580)failed to find ingredients tab for:  Healthy Hide Good N Fun Dog Treats Gourmet Chew Bone Triple Flavor Pouch - Each
+149/230 (960145144)failed to find ingredients tab for:  Village Candle Candle Premi

+296/446 (158010059)failed to find ingredients tab for:  Signature Care Allergy Relief Childrens Diphenhydramine HCI 12.5mg Cherry Flavor - 4 Fl. Oz.
+296/447 (960150322)failed to find ingredients tab for:  Mangosteens
+298/450 (960307810)failed to find ingredients tab for:  Open Nature Hand Soap Rosemary & Mint Scented - 12 Fl. Oz.
+303/456 (960547014)failed to find ingredients tab for:  Onions White Organic - 2 Lb
+306/460 (960497272)failed to find ingredients tab for:  Dr Scholls Duragel Corn Remover - 6 Count
+306/461 (960522121)failed to find ingredients tab for:  Alpha Foods Tamale Plant Based Chikn Verde - 5 Oz
+309/465 (960132515)failed to find ingredients tab for:  Bakery Brownie Bites Decorated 9 Count - Each
+314/471 (189054429)failed to find ingredients tab for:  Clos du Bois Sonoma Reserve Russian River Valley Wine White Chardonnay - 750 Ml
+314/472 (960492549)failed to find ingredients tab for:  Huggies Little Movers Diapers Size 5 - 19 Count
+314/473 (960495287)failed to

+424/648 (960516287)failed to find ingredients tab for:  Always Radiant Pads Size 5 Extra Heavy Overnight Absorbency Scented - 18 Count
+425/650 (960137769)failed to find ingredients tab for:  Folgers Coffee Ground Medium Roast House Blend - 24.2 Oz
+426/652 (960274656)failed to find ingredients tab for:  Ben & Jerrys Ice Cream Salted Caramel Almond 1 Pint - 16 Oz
+427/654 (970032949)failed to find ingredients tab for:  Evergreen Hand Sanitizer Aloe - 2 OZ
+428/656 (150150133)failed to find ingredients tab for:  ACT Mouthwash Anticavity Fluoride Mint - 18 Fl. Oz.
+431/660 (960327987)failed to find ingredients tab for:  Pamprin Multi Symptom Caps - 20 Count
+432/662 (970094542)failed to find ingredients tab for:  Signature Care Lens Wipes Pre Moistened - 100 CT
+432/663 (960517021)failed to find ingredients tab for:  Ocean Beauty Salmon Wild Alaskan Sockeye Smoked Apple Wood - 4 Oz
+432/664 (960537734)failed to find ingredients tab for:  Deli Catering Tray Fruit 16 Inch
+434/667 (960200

+607/906 (184560146)failed to find ingredients tab for:  Eggplant Tiger Ready Pac
+607/907 (970026569)failed to find ingredients tab for:  Signature Care Bandages Sheer Assorted - 80 CT
+607/908 (960331588)failed to find ingredients tab for:  Clock
+610/912 (189030898)failed to find ingredients tab for:  Cazadores Tequila Blanco 80 Proof - 750 Ml
+610/913 (970034205)failed to find ingredients tab for:  Clorox Splash-less Liquid Bleach Clean Linen - 40 OZ
+612/916 (960533901)failed to find ingredients tab for:  Al Safa Hafal Chicken Butter With Rice
+615/920 (960050354)failed to find ingredients tab for:  Olay Cleanse Facial Cloths Gentle Fragrance Free - 30 Count
+616/922 (960143799)failed to find ingredients tab for:  Wet N Wild Max Vol Plu Mascara Ampd Blk .27 Oz
+624/931 (173250166)failed to find ingredients tab for:  S.O.S Soap Pads Steel Wool Reusable Soap Filled - 10 Count
+626/934 (960523918)failed to find ingredients tab for:  Airwick Scented Oil Starter Kit Apple Cinnamon Medl

+774/1148 (960531473)failed to find ingredients tab for:  Crest 3D White Toothpaste Fluoride Anticavity Whitening Arctic Fresh - 3 Oz
+782/1157 (960057483)failed to find ingredients tab for:  Signature Care Famotidine Acid Reducer Controller Tablets - 30 Count
+782/1158 (960100191)failed to find ingredients tab for:  Hahn Santa Lucia Highlands Estate Pinot Noir Wine - 750 Ml
+784/1161 (960537604)failed to find ingredients tab for:  Fresh Step Cat Litter Clumping Extreme Mediterranean Lavender - 25 Lb
+791/1169 (960136878)failed to find ingredients tab for:  Hillshire Farm Hardwood Smoked Summer Sausage - 20 Oz
+791/1170 (960537717)failed to find ingredients tab for:  Deli Catering Tray Sandwich Croissant 16 Inch
+791/1171 (186350231)failed to find ingredients tab for:  Seafood Counter Fish Trout Rainbow Dressed Fresh - 1.00 LB
+791/1172 (150150127)failed to find ingredients tab for:  LISTERINE Total Care Mouthwash Anticavity Fresh Mint - 1 Liter
+791/1173 (137700509)failed to find ingr

+915/1362 (162100081)failed to find ingredients tab for:  BAND-AID Brand Adhesive Bandages Plus Antibiotic Assorted Sizes - 20 Count
+915/1363 (960337099)failed to find ingredients tab for:  Signature Care Lotion Anti Itch Hydrocortisone 1% Eczyma Itchy Dry Skin - 3.5 Oz
+915/1364 (960084782)failed to find ingredients tab for:  Old Spice High Endurance Anti Perspirant & Deodorant For Men Fresh Scent - 2-3 Oz
+915/1365 (960552442)failed to find ingredients tab for:  Degree Womens Anti-Perspirant Passion - 3.8 Fl. Oz.
+915/1366 (960191582)failed to find ingredients tab for:  Melissas Onions Boiler White - 7 Oz
+918/1370 (960102553)failed to find ingredients tab for:  Motrin Pain Reliever Fever Reducer Ibuprofen Tablets Usp 200 Mg - 225 Count
+918/1371 (960050429)failed to find ingredients tab for:  Cascade Complete Dishwasher Detergent Gel With Dawn Fresh Scent - 75 Oz
+921/1375 (960513647)failed to find ingredients tab for:  Huggies Overnites Baby Diapers Nighttime Size 6 - 15 Count
+92

+1097/1616 (164250275)failed to find ingredients tab for:  Monistat Vaginal Antifungal 7-Day Treatment Cream Simple Cure 7 Count - 1.59 Oz
+1098/1618 (960318702)failed to find ingredients tab for:  Colgate Optic White - 32 Fl. Oz.
+1099/1620 (960096113)failed to find ingredients tab for:  Olde Thompson Black Pepper Malabar - 6 Oz
+1108/1630 (960275957)failed to find ingredients tab for:  Dove Body Wash Instant Foaming Deep Moisture - 13.5 Fl. Oz.
+1108/1631 (960493357)failed to find ingredients tab for:  Air Wick Pure Beach Escapes Scented Oil Refills Florida Keys Coconut Water - 2-0.67 Fl. Oz.
+1114/1638 (970020102)failed to find ingredients tab for:  Cake Fudge 8in Sl - EA
+1115/1640 (189054763)failed to find ingredients tab for:  Markham Napa Chardonnay Wine - 750 Ml
+1116/1642 (131200133)failed to find ingredients tab for:  Downy Ultra Fabric Conditioner Liquid Clean Breeze 120 Loads - 103 Fl. Oz.
+1117/1644 (970022181)failed to find ingredients tab for:  Johnsonville Mild Italian 

+1221/1813 (960088216)failed to find ingredients tab for:  BAND-AID Brand Adhesive Bandages Tough Strips All One Size - 60 Count
+1221/1814 (960322014)failed to find ingredients tab for:  Sally Sh Insta Dri Clearly Quick - .14 Fl. Oz.
+1221/1815 (960057250)failed to find ingredients tab for:  Crest 3D White Mouthwash Multi Care Whitening Glamorous White Arctic Mint - 32 Fl. Oz.
+1222/1817 (960537715)failed to find ingredients tab for:  Deli Catering Tray Nibbler Fruit & Cheese 18 Inch
+1225/1821 (960325792)failed to find ingredients tab for:  Healthy Hide Dog Treat Good n Fun Triple Flavor Kabobs Bag 18 Count - 12 Oz
+1226/1823 (960548255)failed to find ingredients tab for:  Softsoap Body Wash Moisturizing Sweet Honeysuckle & Orange - 20 Fl. Oz.
+1227/1825 (960281693)failed to find ingredients tab for:  ARM & HAMMER Spinbrush Toothbrush Kids Powered Soft Nickelodeon Paw Patrol - Each
+1233/1832 (960333898)failed to find ingredients tab for:  Goody Hairbrush All Purpose Styling - 2 Coun

+1398/2062 (970064546)failed to find ingredients tab for:  Fever Tree Ginger Beer Cans - 40.56 FZ
+1398/2063 (960002432)failed to find ingredients tab for:  Tampax Tampons Cardboard Applicator Super Absorbency Unscented - 40 Count
+1401/2067 (960019946)failed to find ingredients tab for:  AXE Daily Fragrance Dark Temptation - 4 Oz
+1401/2068 (960048661)failed to find ingredients tab for:  Dingo Rawhide Chew Chicken Based Jumbo 2 Count - 9 Oz
+1406/2074 (189056969)failed to find ingredients tab for:  Opolo Vineyards Wine Summit Creek Zinfandel - 750 Ml
+1406/2075 (960197701)failed to find ingredients tab for:  Signature Care Bandage Roll Flexible 4.5in x 4.1yd - Each
+1414/2084 (970031958)failed to find ingredients tab for:  Elysian Full Contact Hazy Ipa Cns - 6-12 FZ
+1418/2089 (960298174)failed to find ingredients tab for:  New Amsterdam Vodka Raspberry Flavored 70 Proof - 750 Ml
+1418/2090 (960053264)failed to find ingredients tab for:  Curad Alcohol Swabs Value Pack - 200 Count
+142

+1573/2309 (132010116)failed to find ingredients tab for:  PEDIGREE Dog Food Ground Dinner Traditional Chicken & Rice Dinner Can - 13.2 Oz
+1574/2311 (960291742)failed to find ingredients tab for:  Betadine First Aid Solution - 8 Fl. Oz.
+1575/2313 (960522195)failed to find ingredients tab for:  Liberated Bread Cashew Creme - 17.6 Oz
+1578/2317 (970022408)failed to find ingredients tab for:  Pie Key Lime Half - EA
+1583/2323 (960300852)failed to find ingredients tab for:  Primavera Non Dairy Green Chile Tamales - 4 Count
+1585/2326 (289012847)failed to find ingredients tab for:  Boddingtons English Ale Cans - 4-16 Fl. Oz.
+1585/2327 (960314459)failed to find ingredients tab for:  Biomiracle Charcoal Peel Off - 3.5 Z
+1585/2328 (960022186)failed to find ingredients tab for:  Tecnu Extreme Poison Ivy Scrub Medicated - 4 Oz
+1586/2330 (960173255)failed to find ingredients tab for:  LISTERINE Ultraclean Mouthwash Antiseptic Cool Mint - 500 Ml
+1587/2332 (970023322)failed to find ingredient

+1729/2538 (960210751)failed to find ingredients tab for:  OxiClean Laundry Stain Remover White Revive - 50 Fl. Oz.
+1736/2546 (970002096)

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [49]:
product_data

Unnamed: 0.1,Unnamed: 0,Name,Servings Per Container,Calories,Calories From Fat,Total Fat,Saturated Fat,Trans Fat,Polyunsaturated Fat,Monounsaturated Fat,...,Sodium,Total Carbohydrate,Dietary Fiber,Sugars,Protein,Vitamin A,Vitamin C,Calcium,Iron,Image Link
0,960035113.0,waterfront BISTRO Swai Fillets Boneless & Skin...,8,70,0,1g,0g,0g,0,0,...,480mg,0g,0g,0,14g,,,,,https://images.albertsons-media.com/is/image/A...
1,108050410.0,Schweppes Soda Ginger Ale - 6-10 Fl. Oz.,,100,0,0G,0G,0G,0,0,...,50MG,28G,0G,27G,0G,,,,,https://images.albertsons-media.com/is/image/A...
2,137150062.0,Lucerne Cheese Crumbled Blue - 4 Oz,about 4,110,70,8g,6g,0g,0,0,...,300mg,1g,0g,<1g,6g,,,,,https://images.albertsons-media.com/is/image/A...
3,107100154.0,Mezzetta Peperoncini Greek Golden - 32 Oz,about 14,10,1,0g,0g,0g,0,0,...,390mg,1g,0g,0g,0g,,,,,https://images.albertsons-media.com/is/image/A...
4,117350051.0,Splenda Sweetener No Calories Taste Like Sugar...,100,0,0,0g,0,0,0,0,...,0mg,<1g,0,0,0g,,,,,https://images.albertsons-media.com/is/image/A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2984,960163631.0,IAMS Proactive Health Cat Food Indoor Weight &...,,0,0,0,0,0,0,0,...,0,0,0,0,,,,,,https://images.albertsons-media.com/is/image/A...
2985,960477621.0,Sukhis Chicken Coconut Curry Entree - 16 Oz,About 3,180,0,6g,2g,0g,0,0,...,600mg,11g,0g,0,22g,,,,,https://images.albertsons-media.com/is/image/A...
2986,188580048.0,Oscar Mayer Ham Smoked - 16 Oz,about 7,70,20,2G,0.5G,0G,0,0,...,610MG,0G,0,0,12G,,,,,https://images.albertsons-media.com/is/image/A...
2987,960021901.0,Valley Fresh Chicken Breast 100% Natural with ...,5,45,10,1g,0g,0g,0,0,...,260mg,0g,0g,0g,9g,,,,,https://images.albertsons-media.com/is/image/A...
