In [1]:
import urllib 
import lxml.html
from bs4 import BeautifulSoup 
import time
import os
import constants
import pandas as pd
import psycopg2
import re

In [8]:
# (label text, dictionary key) pairs for the lines of the "Guaranteed Analysis"
# section, in the order they are tested; the first label found in a line wins.
_ANALYSIS_FIELDS = (
    ("Crude Protein (min", 'crude_protein_min'),
    ("Crude Protein (max", 'crude_protein_max'),
    ("Crude Fat (min", 'crude_fat_min'),
    ("Crude Fat (max", 'crude_fat_max'),
    ("Crude Fiber (min", 'crude_fiber_min'),
    ("Crude Fiber (max", 'crude_fiber_max'),
    ("Moisture (min", 'moisture_min'),
    ("Moisture (max", 'moisture_max'),
    ("Ash (min", 'ash_min'),
    ("Ash (max", 'ash_max'),
    ("Taurine (min", 'taurine_min'),
    ("Taurine (max", 'taurine_max'),
)

# (label text, dictionary key, strip the value?) triples for the <b> labels in
# the product description. 'ingredients' and 'primary_ingredient' are stored
# exactly as found (unstripped) to keep the output unchanged.
_PRODUCT_FIELDS = (
    ("Ingredients:", 'ingredients', False),
    ('Food Type:', 'food_type', True),
    ('Life Stage:', 'life_stage', True),
    ('Health Consideration:', 'health_consideration', True),
    ('Flavor:', 'flavor', True),
    ('Primary Ingredient:', 'primary_ingredient', False),
    ('Package Weight:', 'package_weight', True),
)

# Captures whatever sits between a closing parenthesis and the next "%" sign.
_PERCENT_RE = re.compile(r"\)\s*(.*?)\s*%")

# Only the first few <b> and <p> elements of the tab content are examined.
_MAX_BOLD_TAGS = 10
_MAX_PARAGRAPHS = 5


def _fetch_soup(url):
    """Download ``url`` once and return ``(status_code, soup)``.

    The response is always closed, and the status code is read from the same
    response that is parsed, so every page is requested exactly one time.
    """
    response = urllib.urlopen(url)
    try:
        return response.getcode(), BeautifulSoup(response, 'lxml')
    finally:
        response.close()


def _extract_labelled_fields(bold_tags, nutrition_dict):
    """Store the text that follows each recognised <b> label in ``nutrition_dict``."""
    for info in bold_tags:
        for label, key, strip_value in _PRODUCT_FIELDS:
            if label not in info:
                continue
            value = info.next_sibling
            if strip_value:
                try:
                    value = value.strip()
                except (AttributeError, TypeError):
                    # The label is not followed by plain text (nothing, or
                    # another element): leave this field unset, don't crash.
                    break
            nutrition_dict[key] = value
            break


def _sibling_texts(paragraphs):
    """Return the stripped ``str()`` of the node following each paragraph."""
    texts = []
    for paragraph in paragraphs:
        try:
            texts.append(str(paragraph.next_sibling).strip())
        except UnicodeError:
            # str() of non-ASCII text can fail under Python 2; such a node is
            # skipped, as before.
            texts.append('')
    return texts


def _parse_guaranteed_analysis(tag_list, nutrition_dict):
    """Copy the "Guaranteed Analysis" percentages into ``nutrition_dict``.

    Input: a list of markup strings to search and the dictionary to fill.

    Output: the same dictionary. For every recognised line the matching key
    (e.g. 'crude_protein_min') holds the list of strings captured between the
    closing parenthesis and the "%" sign, e.g. ['13'].
    """
    for tag in tag_list:
        if "Guaranteed Analysis" not in tag:
            continue
        for line in tag.split("<br/>"):
            for label, key in _ANALYSIS_FIELDS:
                if label in line:
                    nutrition_dict[key] = _PERCENT_RE.findall(line)
                    break
            else:
                # Placeholder written for unrecognised lines. Kept so the
                # resulting table still has its 'tag' column.
                nutrition_dict['tag'] = None
    return nutrition_dict


def product_info(product_list, output_path='product_info_df.pkl'):
    """
    Scrape Pet Smart product pages into one dictionary per page and export
    the result as a pickled dataframe. The pickle file can be imported as a
    dataframe, cleaned up, then stored into a database.

    Input:
    product_list -- a list of urls of pet food product pages generated from
                    crawler.py
    output_path  -- where the pickled dataframe is written (default
                    'product_info_df.pkl'); pass None to skip writing.

    Output:
    Returns the list of dictionaries (one per url, in input order) and writes
    the same data as a dataframe to output_path. Possible keys:
    name, product_id, price, url, ingredients, food_type, life_stage,
    health_consideration, flavor, primary_ingredient, package_weight, plus
    the keys taken from the "Guaranteed Analysis" portion of the product page
    (crude_protein_min, moisture_max, ...), whose values are lists of strings.

    A page that cannot be downloaded or parsed still produces a row, holding
    only its url and an empty price, so it can never inherit data from the
    page scraped before it.
    """
    nutrition_dict_list = []

    for url in product_list:
        nutrition_dict = {'url': url}

        try:
            status, soup_page = _fetch_soup(url)
        except Exception:
            # Best-effort scraping: report the failure and move on rather than
            # aborting the whole run.
            print("Error getting page")
            nutrition_dict['price'] = ''
            nutrition_dict_list.append(nutrition_dict)
            continue

        # check HTTP status codes while running (200 is good)
        print(status)

        name_box = soup_page.find('div', attrs={'class': 'product-name'})
        if name_box is not None:
            nutrition_dict['name'] = name_box.get_text().strip()

        prodID_box = soup_page.find('span', attrs={'class': 'productID'})
        if prodID_box is not None:
            nutrition_dict['product_id'] = prodID_box.get_text()

        price_box = soup_page.find('span', attrs={'class': 'price-regular'})
        nutrition_dict['price'] = '' if price_box is None else price_box.get_text()

        # The html is messy, so the interesting values are located by scanning
        # the leading <b> labels and <p> elements of the tab content.
        type_box = soup_page.find('div', attrs={'class': 'tab-content'})
        if type_box is None:
            bold_tags, paragraphs = [], []
        else:
            bold_tags = type_box.find_all('b')[:_MAX_BOLD_TAGS]
            paragraphs = type_box.find_all('p')[:_MAX_PARAGRAPHS]

        _extract_labelled_fields(bold_tags, nutrition_dict)

        # "Guaranteed Analysis" may follow any of the first few paragraphs.
        nutrition_dict = _parse_guaranteed_analysis(
            _sibling_texts(paragraphs), nutrition_dict)

        nutrition_dict_list.append(nutrition_dict)

    # export the rows as a pickled dataframe to access later
    if output_path is not None:
        nutrition_df = pd.DataFrame(nutrition_dict_list)
        nutrition_df.to_pickle(output_path)

    return nutrition_dict_list

# Run the scraper over the URL list stored in constants.product_urls.
# As the last expression in the cell, the returned list of per-product
# dictionaries is echoed in the cell output below the printed status codes.
product_info(constants.product_urls) 

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


[{'ash_max': ['3.5'],
  'crude_fat_min': ['2'],
  'crude_fiber_max': ['1.5'],
  'crude_protein_min': ['13'],
  'flavor': u'COD, Sole & Shrimp',
  'food_type': u'Wet Food',
  'health_consideration': u'General Health',
  'ingredients': u' Cod, Liver, Meat by-Products, Fish, Fish Broth, Sole, Shrimp, \r\nArtificial and Natural Flavors, Guar Gum, Added Color (Red 3 and Other Color), Calcium \r\nPhosphate, Potassium Chloride, Salt, Zinc Sulfate, Thiamine Mononitrate, Vitamin E \r\nSupplement, Ferrous Sulfate, Niacin, Manganese Sulfate, Calcium Pantothenate, Vitamin A \r\nSupplement, Copper Sulfate, Menadione Sodium Bisulfite Complex (Source of Vitamin K \r\nActivity), Pyridoxine Hydrochloride, Riboflavin Supplement, Vitamin B-12 Supplement, \r\nBiotin, Folic Acid, Vitamin D-3 Supplement, Potassium Iodide. \r\n',
  'life_stage': u'Adult 1-10 yrs',
  'moisture_max': ['78'],
  'name': u'Fancy Feast\xae Classic Adult Cat Food',
  'package_weight': u'3oz',
  'price': u'$0.59',
  'primary_ingredi

In [11]:
# Import the pickle file as a dataframe.
# NOTE: this is an absolute path into the pet-products-scraper-data folder on
# the original author's machine; adjust it when running anywhere else.
PRODUCTS_PICKLE = '/Users/karigoodman/git-archives/pet-products-scraper-data/product_info_df.pkl'
products = pd.read_pickle(PRODUCTS_PICKLE)

# A bare `len(products)` in the middle of a cell is evaluated and discarded,
# so print the row count explicitly before showing the preview.
print(len(products))
products.head(50)

Unnamed: 0,ash_max,crude_fat_max,crude_fat_min,crude_fiber_max,crude_fiber_min,crude_protein_max,crude_protein_min,flavor,food_type,health_consideration,...,moisture_max,name,package_weight,price,primary_ingredient,product_id,tag,taurine_max,taurine_min,url
0,[3.5],,[2],[1.5],,,[13],"COD, Sole & Shrimp",Wet Food,General Health,...,[78],Fancy Feast® Classic Adult Cat Food,3oz,$0.59,COD,1221059,,,[0.05],http://www.petsmart.com/cat/food-and-health/fo...
1,[3.5],,[5],[1],,,[10],Mixed Grill,Wet Food,General Health,...,[78],Purina® Friskies® Classic Paté Cat Food,5.5oz,$0.48,Liver,1221026,,,[0.05],http://www.petsmart.com/cat/food-and-health/fo...
2,[2.5],,[2.5],[1.0],,,[9.0],,,,...,[82.0],Purina® Friskies® Savory Shreds Cat Food,,$0.48,,5121705,,,[0.05],http://www.petsmart.com/cat/food-and-health/fo...
3,[2.7],,[2],[1.5],,,[12],Seafood Feast,Wet Food,General Health,...,[78],Fancy Feast® Grilled Adult Cat Food,3oz,$0.59,Chicken Broth,5121709,,,[0.05],http://www.petsmart.com/cat/food-and-health/fo...
4,[2.2],,[6.5],[0.8],,,[10.0],Chicken,Wet Food,,...,[78.0],Authority® Pate Adult Cat Food,6 oz.,$0.89,Chicken,5219423,,,[0.07],http://www.petsmart.com/cat/food-and-health/fo...
5,,,,,,,,Turkey,Wet Food,General Health,...,,Fancy Feast® Gravy Lovers Adult Cat Food,3oz,$0.59,Poultry Broth,5164703,,,,http://www.petsmart.com/cat/food-and-health/fo...
6,[2.8],,[2.0],[1.5],,,[10.0],Chicken & Cheese,Wet Food,General Health,...,[80.0],Purina® Pro Plan® Adult Cat Food,3oz,,Water,5178589,,,[0.05],http://www.petsmart.com/cat/food-and-health/fo...
7,[2.7],,[2.0],[1.0],,,[11.0],,,,...,[80.0],Purina® Friskies® Prime Filets Cat Food,,$0.48,,5092179,,,[0.05],http://www.petsmart.com/cat/food-and-health/fo...
8,[2.3],,[5.0],[1.0],,,[10.0],Chicken & Herring,Wet Food,General Health,...,[78.0],"Wellness® Complete Health Cat Food - Natural, ...",3oz,$1.49,Chicken,5153734,,,[0.1],http://www.petsmart.com/cat/food-and-health/fo...
9,[6.5],[15.0],[12.0],[5.0],,,[33.0],,,,...,[10.0],Authority® Indoor Adult Cat Food - Chicken & Rice,,$9.99,,5220331,,,[0.15],http://www.petsmart.com/cat/food-and-health/fo...


In [39]:
# Does any product_id occur more than once? (Yes: both False and True show up.)
print(products.duplicated(subset='product_id').unique())

# List every row that shares its product_id with at least one other row.
products.loc[products.duplicated(subset=['product_id'], keep=False)]

# Good: only rows with a missing product_id appear, so no real ID is repeated.

[False True]


Unnamed: 0,ash_max,crude_fat_max,crude_fat_min,crude_fiber_max,crude_fiber_min,crude_protein_max,crude_protein_min,flavor,food_type,health_consideration,...,moisture_max,name,package_weight,price,primary_ingredient,product_id,tag,taurine_max,taurine_min,url
50,,,,,,,,,,,...,,,,,,,,,,http://www.petsmart.com/cat/food-and-health/fo...
729,,,,,,,,,,,...,,,,,,,,,,http://www.petsmart.com/cat/food-and-health/fo...
773,,,,,,,,,,,...,,,,,,,,,,http://www.petsmart.com/cat/food-and-health/fo...


In [15]:
# Fetch and parse the page behind one specific row of the dataframe so its
# Beautiful Soup markup can be inspected by eye.
link = products['url'].iloc[18]
r = urllib.urlopen(link)
soup_obj = BeautifulSoup(r, 'lxml')
print(soup_obj)

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="ie6 oldie" lang="en"> <![endif]--><!--[if IE 7]> <html class="ie7 oldie" lang="en"> <![endif]--><!--[if IE 8]> <html class="ie8 oldie" lang="en"> <![endif]--><!--[if gt IE 8]><!--><html lang="en"> <!--<![endif]-->
<head>
<script data-dtconfig="cors=1|doNotDetect=mup,kyu,scr,kyd,mdw,dcl|md=Username=a[class='user-account'],OrderConfirmation03=a.pt_order-confirmation|tp=200,50,5,0,10|fa=1|reportUrl=https://uxp.petsmartdmz.com/dynaTraceMonitor" src="http://demandware.edgesuite.net/abbb_prd/on/demandware.static/Sites-PetSmart-Site/-/default/v1487395019558/js/dtagent630_23ehjpqrtx_1305.js" type="text/javascript"></script>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="telephone=no" name="format-detection"/>
<title>

Grreat Choice® Pate Adult Cat Food | cat Canned Food | PetSmart

</title>
<link href="http://demandware.edgesuite.ne

In [12]:
# Spot-check a single column to judge how clean the scrape was. It is mostly
# good, though a few entries hold the wrong information.

products['package_weight'].unique()

array([u'3oz', u'5.5oz', nan, u'6 oz.', u'3 oz', u'2Lb', u'13oz',
       u'1.4 oz.', u'12Lb', u'5.5 oz', u'5Lb', u'22 lbs.', u'8 lbs.',
       u'15Lb', u'3 oz.', u'3.15Lb', u'5.5 oz.', u'4 lbs.', u'3Lb',
       u'16Lb', u'2.6 oz.', u'7Lb', u'(12) 2.75oz', u'2.8 oz.', u'6.5Lb',
       u'3.5Lb', u'11Lb', u'6Lb', u'22Lb', u'2.5 oz.', u'6.3Lb',
       u'11 lbs.', u'2.47 oz.', u'32ct', u'5 lbs.', u'14Lb', u'13.5oz',
       u'1.36 kg. (3 lbs.)', u'24ct of 5.5oz cans', u'5.8 oz.', u'15lb',
       u'4 Salmon, 3 Snapper & 3 Shrimp.', u'24ct', u'12 (3oz) Pouches',
       u'2.75 oz.', u'4.5LB', u'7 lbs.', u'12 Packs', u'24ct of 3oz',
       u'5.5 lbs.', u'12 Pack', u'2.5Lb', u'5 lb', u'11.5 lbs.',
       u'11.3 lbs.', u'6.75oz', u'13.5Lb', u'14.2Lb',
       u'(3)Tuna, (4)Tuna & Whitefish, (3)Sardine & Mackerel Entr\xc3\xa9es',
       u'5.1 lbs.', u'12ct', u'2 Lb',
       u'4 Chicken, 3 Chicken & Beef, and 3 Chicken & Duck.',
       u'5Lb\r\n14oz', u'1.98 lbs.', u'5.14Lb', u'5 oz.',
       u'12 pa