In [1]:
import urllib 
import lxml.html
from bs4 import BeautifulSoup 
import time
import os
import constants
import pandas as pd
import psycopg2
import re

#Define DB and User names, imported from constants file
dbname = constants.dbname
username = constants.username

In [None]:
dbname

In [None]:
def scrape_product_info(product_list):

    '''
    Scrape general product details from Pet Smart product pages and export
    them to a pickle file. Because the html markers are a bit unstable, the
    data needs to be cleaned up before putting it into a database. The
    pickle file can be imported as a dataframe, cleaned up, then stored
    into a database.

    Nutrition ("Guaranteed Analysis") and ingredient scraping are handled by
    the separate nutrition / ingredient functions in this notebook.

    Input:
    A list of urls of pet food product pages generated from crawler.py

    Output:
    product_df.pkl, written to the working directory, with one row per page
    that could be fetched and the columns
    name, product_id, price, food_type, life_stage, health_consideration,
    flavor, primary_ingredient, package_weight, url
    A field that cannot be located on the page is stored as ''.

    Returns:
    (failed_url, dictionary_list)
    failed_url      - urls that could not be opened or parsed. These pages
                      are skipped and get no row.
    dictionary_list - the per-product dictionaries the dataframe is built
                      from.
    '''

    #(output key, tag name, css class, strip the text?) for the fields that
    #are looked up directly on the page
    page_fields = [
        ('name', 'div', 'product-name', True),
        ('product_id', 'span', 'productID', False),
        ('price', 'span', 'price-regular', False),
    ]

    #(output key, position of the <b> label inside the 'tab-content' div,
    #strip the text?). The value is whatever node follows the label.
    #Only package_weight is stripped, as before.
    detail_fields = [
        ('food_type', 0, False),
        ('life_stage', 1, False),
        ('health_consideration', 2, False),
        ('flavor', 3, False),
        ('primary_ingredient', 4, False),
        ('package_weight', 5, True),
    ]

    #Error tracking
    failed_url = []

    #Data
    dictionary_list = []

    #Open product pages from the product url list created from the crawler function
    for url in product_list:

        try:
            # Open each url, turn each page into a Beautiful Soup object
            r = urllib.urlopen(url)
            #check HTTP status codes while running (200 is good). Read it
            #from the response we already have instead of requesting the
            #page a second time.
            print(r.getcode())
            soup_page = BeautifulSoup(r, 'lxml') #'html.parser'
        except Exception:
            print("Error getting page")
            failed_url.append(url)
            #Skip the page: carrying on would reuse the soup of the previous
            #url and store its data under this url.
            continue

        product = {}

        #Isolate each variable of interest from the web page using the
        #Beautiful Soup tags, put them in the dictionary.
        for key, tag_name, css_class, strip_text in page_fields:
            try:
                text = soup_page.find(tag_name, attrs={'class': css_class}).get_text()
                product[key] = text.strip() if strip_text else text
            except Exception:
                #the marker is missing on this page
                product[key] = ''

        type_box = soup_page.find('div', attrs={'class': 'tab-content'})
        labels = type_box.find_all('b') if type_box is not None else []

        for key, position, strip_text in detail_fields:
            try:
                value = labels[position].next_sibling
                if strip_text:
                    value = value.strip()
            except Exception:
                #fewer labels than expected, or nothing usable after the label
                value = ''
            product[key] = value

        product['url'] = url

        #append all keys:values to dictionary list
        dictionary_list.append(product)

    #turn dictionary list into a dataframe
    df = pd.DataFrame(dictionary_list)

    #export the dataframe from the function as a pickle file to access later
    df.to_pickle('product_df.pkl')

    return failed_url, dictionary_list

scrape_product_info(constants.product_urls)  

In [None]:
nutrition = pd.read_pickle('nutrition_df.pkl')
len(nutrition)

In [None]:
ingredients = pd.read_pickle('ingredient_df_test.pkl')
ingredients.head(15)

In [2]:
def nut_test_function(product_list):

    '''
    Scrape the "Guaranteed Analysis" percentages from each product page and
    export them to nutrition_df.pkl.

    The analysis is looked for in the node that follows the 2nd, 3rd and
    4th <p> tag of the 'tab-content' div, in that order, and the first one
    containing the text "Guaranteed Analysis" is used. That node is split
    on "<br/>" and, for every piece, the text between a ')' and the next
    '%' is collected.

    Input:
    A list of urls of pet food product pages

    Output:
    nutrition_df.pkl, written to the working directory, with one row per
    page that could be fetched. Columns are the positions of the "<br/>"
    separated pieces and each cell holds the list of matches for that piece.

    Returns:
    (failed_url, nutrition_url, nutrition_dict_list)
    failed_url          - urls that could not be opened or parsed. These
                          pages are skipped and get no row.
    nutrition_url       - urls that were fetched but had no "Guaranteed
                          Analysis" section. Their row is an empty
                          dictionary.
    nutrition_dict_list - the per-page dictionaries the dataframe is built
                          from.
    '''

    #Error tracking
    failed_url = []
    nutrition_url = []

    #Data
    nutrition_dict_list = []

    #Captures whatever sits between a ')' and the next '%', e.g. the '9' in
    #'Crude Protein (Min) 9%'
    percent_pattern = r"\)\s*(.*?)\s*%"

    #Open product pages from the product url list created from the crawler function
    for url in product_list:

        try:
            # Open each url, turn each page into a Beautiful Soup object
            r = urllib.urlopen(url)
            #check HTTP status codes while running (200 is good), using the
            #response we already have rather than a second request
            print(r.getcode())
            soup_page = BeautifulSoup(r, 'lxml') #'html.parser'
        except Exception:
            print("Error getting page")
            failed_url.append(url)
            #nothing to parse for this url
            continue

        nutrition_dict = {}

        type_box = soup_page.find('div', attrs={'class': 'tab-content'})
        paragraphs = type_box.findAll('p') if type_box is not None else []

        #The analysis does not always follow the same <p> tag, so try the
        #candidates in order and keep the first that mentions it.
        analysis = ''
        for position in (1, 2, 3):
            try:
                candidate = str(paragraphs[position].next_sibling)
            except Exception:
                #too few <p> tags, or the node cannot be turned into a str
                candidate = ''
            if "Guaranteed Analysis" in candidate:
                analysis = candidate
                break

        if analysis:
            for i, line in enumerate(analysis.split("<br/>")):
                nutrition_dict[i] = re.findall(percent_pattern, line)
        else:
            #Keep the (empty) dictionary as the row so every entry handed
            #to pd.DataFrame is a dict; the url is reported separately.
            nutrition_url.append(url)

        #append all keys:values to nutrition dictionary list
        nutrition_dict_list.append(nutrition_dict)

    #turn dictionary list into a dataframe
    nutrition_df = pd.DataFrame(nutrition_dict_list)

    #export the dataframe from the function as a pickle file to access later
    nutrition_df.to_pickle('nutrition_df.pkl')

    return failed_url, nutrition_url, nutrition_dict_list

nut_test_function(constants.product_urls)  

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


([],
 ['http://www.petsmart.com/cat/food-and-health/food/purina-friskies-indoor-cat-food-1517.html?cgid=200004',
  'http://www.petsmart.com/cat/food-and-health/food/iams-purrfect-delight-cat-food-20920.html?cgid=200004',
  'http://www.petsmart.com/cat/food-and-health/food/wellness-complete-health-cat-food---natural-grain-free-25941.html?cgid=200004',
  'http://www.petsmart.com/cat/food-and-health/food/wellness-signature-selects-cat-food---natural-grain-free-12914.html?cgid=200004',
  'http://www.petsmart.com/cat/food-and-health/food/blue-healthy-gourmet-flaked-adult-cat-food-22163.html?cgid=200004',
  'http://www.petsmart.com/cat/food-and-health/food/iams-purrfect-delicacies-cat-food-20922.html?cgid=200004',
  'http://www.petsmart.com/cat/food-and-health/food/royal-canin-feline-health-nutritionandtrade-instinctive-kitten-food-21021.html?cgid=200004',
  'http://www.petsmart.com/cat/food-and-health/food/blue-healthy-gourmet-adult-cat-food---indoor-chicken-entree-pate-36987.html?cgid=2000

In [19]:
nutrition = pd.read_pickle('nutrition_df.pkl')
nutrition.head(200)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,name,product_id,url
0,[],,,,,,,,,,...,,,,,,,,Fancy Feast® Classic Adult Cat Food,1221059,http://www.petsmart.com/cat/food-and-health/fo...
1,[],[10],[5],[1],[78],[3.5],[0.05],,,,...,,,,,,,,Purina® Friskies® Classic Paté Cat Food,1221026,http://www.petsmart.com/cat/food-and-health/fo...
2,[],[],[],,,,,,,,...,,,,,,,,Purina® Friskies® Savory Shreds Cat Food,5121705,http://www.petsmart.com/cat/food-and-health/fo...
3,[],[12],[2],[1.5],[78],[2.7],[0.05],,,,...,,,,,,,,Fancy Feast® Grilled Adult Cat Food,5121709,http://www.petsmart.com/cat/food-and-health/fo...
4,[],[10.0],[6.5],[0.8],[78.0],[2.2],[0.025],[0.07],[0.5],[0.15],...,,,,,,,,Authority® Pate Adult Cat Food,5219423,http://www.petsmart.com/cat/food-and-health/fo...
5,[],[9],[2],[1.5],[82],[3],[0.05],,,,...,,,,,,,,Fancy Feast® Gravy Lovers Adult Cat Food,5164703,http://www.petsmart.com/cat/food-and-health/fo...
6,[],,,,,,,,,,...,,,,,,,,Purina® Pro Plan® Adult Cat Food,5178589,http://www.petsmart.com/cat/food-and-health/fo...
7,[],[],[],,,,,,,,...,,,,,,,,Purina® Friskies® Prime Filets Cat Food,5092179,http://www.petsmart.com/cat/food-and-health/fo...
8,[],,,,,,,,,,...,,,,,,,,"Wellness® Complete Health Cat Food - Natural, ...",5153734,http://www.petsmart.com/cat/food-and-health/fo...
9,[],[],[],,,,,,,,...,,,,,,,,Authority® Indoor Adult Cat Food - Chicken & Rice,5220331,http://www.petsmart.com/cat/food-and-health/fo...


In [31]:
failedLink = nutrition.iloc[1]['url']
print failedLink

http://www.petsmart.com/cat/food-and-health/food/purina-friskies-classic-pate-cat-food-1610.html?cgid=200004


In [20]:
r = urllib.urlopen(failedLink)
soup_obj = BeautifulSoup(r, 'lxml') #'html.parser'
#print soup_obj

In [None]:
#Nutrition scraper that accommodates the inconsistent html markup: the
#"Guaranteed Analysis" section does not always follow the same <p> tag.
def nutrition_info(product_list):

    '''
    Scrape the product name, id, food type and "Guaranteed Analysis"
    percentages from each product page and export them to nutrition_df.pkl.

    The analysis is looked for in the node that follows the 2nd, 3rd and
    4th <p> tag of the 'tab-content' div, in that order, and the first one
    containing the text "Guaranteed Analysis" is used. That node is split
    on "<br/>" and, for every piece, the text between a ')' and the next
    '%' is collected.

    Input:
    A list of urls of pet food product pages

    Output:
    nutrition_df.pkl, written to the working directory, with one row per
    page that could be fetched. Columns: url, name, product_id, food_type,
    plus one column per position of the "<br/>" separated pieces, each cell
    holding the list of matches for that piece. name / product_id are left
    out of a row when their markers are missing; food_type is '' when it
    cannot be found.

    Returns:
    (failed_url, nutrition_url, nutrition_dict_list)
    failed_url          - urls that could not be opened or parsed. These
                          pages are skipped and get no row.
    nutrition_url       - urls that were fetched but had no "Guaranteed
                          Analysis" section. Their row keeps the
                          identifying fields but has no analysis columns.
    nutrition_dict_list - the per-page dictionaries the dataframe is built
                          from.
    '''

    #Error tracking
    failed_url = []
    nutrition_url = []

    #Data
    nutrition_dict_list = []

    #Captures whatever sits between a ')' and the next '%', e.g. the '9' in
    #'Crude Protein (Min) 9%'
    percent_pattern = r"\)\s*(.*?)\s*%"

    #Open product pages from the product url list created from the crawler function
    for url in product_list:

        try:
            # Open each url, turn each page into a Beautiful Soup object
            r = urllib.urlopen(url)
            #check HTTP status codes while running (200 is good), using the
            #response we already have rather than a second request
            print(r.getcode())
            soup_page = BeautifulSoup(r, 'lxml') #'html.parser'
        except Exception:
            print("Error getting page")
            failed_url.append(url)
            #nothing to parse for this url
            continue

        #The url is known regardless of what the page contains, so record
        #it up front; it is what identifies a row whose markup is broken.
        nutrition_dict = {'url': url}

        try:
            name_box = soup_page.find('div', attrs={'class': 'product-name'})
            prodID_box = soup_page.find('span', attrs={'class': 'productID'})
            nutrition_dict['name'] = name_box.get_text().strip()
            nutrition_dict['product_id'] = prodID_box.get_text()
        except Exception:
            #best effort: a missing marker just leaves the field out
            pass

        type_box = soup_page.find('div', attrs={'class': 'tab-content'})

        try:
            nutrition_dict['food_type'] = type_box.find_all('b')[0].next_sibling.strip()
        except Exception:
            nutrition_dict['food_type'] = ''

        paragraphs = type_box.findAll('p') if type_box is not None else []

        #Try the candidate nodes in order and keep the first that mentions
        #the analysis.
        analysis = ''
        for position in (1, 2, 3):
            try:
                candidate = str(paragraphs[position].next_sibling)
            except Exception:
                #too few <p> tags, or the node cannot be turned into a str
                candidate = ''
            if "Guaranteed Analysis" in candidate:
                analysis = candidate
                break

        if analysis:
            for i, line in enumerate(analysis.split("<br/>")):
                nutrition_dict[i] = re.findall(percent_pattern, line)
        else:
            #Keep the row (with its name / id / url / food_type) instead of
            #replacing it with '', and report the url separately.
            nutrition_url.append(url)

        #append all keys:values to nutrition dictionary list
        nutrition_dict_list.append(nutrition_dict)

    #turn dictionary list into a dataframe
    nutrition_df = pd.DataFrame(nutrition_dict_list)

    #export the dataframe from the function as a pickle file to access later
    nutrition_df.to_pickle('nutrition_df.pkl')

    return failed_url, nutrition_url, nutrition_dict_list

#link = 'http://www.petsmart.com/cat/food-and-health/food/purina-friskies-classic-pate-cat-food-1610.html?cgid=200004'
nutrition_info(constants.product_urls)  

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


In [52]:
nutrition_test = pd.read_pickle('nutrition_df_test.pkl')
nutrition_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,food_type,name,product_id,url
0,[],[13],[2],[1.5],[78],[3.5],[0.05],,Wet Food,Fancy Feast® Classic Adult Cat Food,1221059,http://www.petsmart.com/cat/food-and-health/fo...
1,[],[10],[5],[1],[78],[3.5],[0.05],,Wet Food,Purina® Friskies® Classic Paté Cat Food,1221026,http://www.petsmart.com/cat/food-and-health/fo...
2,[],[9.0],[2.5],[1.0],[82.0],[2.5],[0.05],[],Shreds in Gravy,Purina® Friskies® Savory Shreds Cat Food,5121705,http://www.petsmart.com/cat/food-and-health/fo...
3,[],[12],[2],[1.5],[78],[2.7],[0.05],,Wet Food,Fancy Feast® Grilled Adult Cat Food,5121709,http://www.petsmart.com/cat/food-and-health/fo...
4,[],[9],[2],[1.5],[82],[3],[0.05],,Wet Food,Fancy Feast® Gravy Lovers Adult Cat Food,5164703,http://www.petsmart.com/cat/food-and-health/fo...


In [None]:
def ingredient_test_function(product_list):

    '''
    Scrape the ingredient text from each product page and export it to
    ingredient_df.pkl, to be parsed later.

    The ingredients are taken from the nodes that follow the 7th and 8th
    <b> label of the 'tab-content' div.

    Input:
    A list of urls of pet food product pages

    Output:
    ingredient_df.pkl, written to the working directory, with one row per
    page that could be fetched and the columns url, name, product_id,
    ingredients, ingredients2. name / product_id are left out of a row when
    their markers are missing; ingredients / ingredients2 are '' when the
    corresponding label does not exist.

    Returns:
    (failed_url, ingredients_url, ingredient_dict_list)
    failed_url           - urls that could not be opened or parsed. These
                           pages are skipped and get no row.
    ingredients_url      - urls that were fetched but had no 7th <b> label,
                           i.e. no primary ingredient text.
    ingredient_dict_list - the per-page dictionaries the dataframe is built
                           from.
    '''

    #Error tracking
    failed_url = []
    ingredients_url = []

    #Data
    ingredient_dict_list = []

    #Open product pages from the product url list created from the crawler function
    for url in product_list:

        try:
            # Open each url, turn each page into a Beautiful Soup object
            r = urllib.urlopen(url)
            #check HTTP status codes while running (200 is good), using the
            #response we already have rather than a second request
            print(r.getcode())
            soup_page = BeautifulSoup(r, 'lxml') #'html.parser'
        except Exception:
            print("Error getting page")
            failed_url.append(url)
            #nothing to parse for this url
            continue

        #The url is known regardless of what the page contains, so record
        #it up front; it is what identifies a row whose markup is broken.
        ingredient_dict = {'url': url}

        try:
            name_box = soup_page.find('div', attrs={'class': 'product-name'})
            prodID_box = soup_page.find('span', attrs={'class': 'productID'})
            ingredient_dict['name'] = name_box.get_text().strip()
            ingredient_dict['product_id'] = prodID_box.get_text()
        except Exception:
            #best effort: a missing marker just leaves the field out
            pass

        type_box = soup_page.find('div', attrs={'class': 'tab-content'})
        labels = type_box.find_all('b') if type_box is not None else []

        try:
            ingredient_dict['ingredients'] = labels[6].next_sibling
        except IndexError:
            ingredient_dict['ingredients'] = ''
            #report the page so it can be inspected by hand
            ingredients_url.append(url)

        try:
            ingredient_dict['ingredients2'] = labels[7].next_sibling
        except IndexError:
            #a second ingredient list is optional
            ingredient_dict['ingredients2'] = ''

        ingredient_dict_list.append(ingredient_dict)

    #turn dictionary list into a dataframe
    ingredient_df = pd.DataFrame(ingredient_dict_list)

    #export the dataframe from the function as a pickle file to access later
    ingredient_df.to_pickle('ingredient_df.pkl')

    return failed_url, ingredients_url, ingredient_dict_list

ingredient_test_function(constants.product_urls)  

In [None]:
ingredients = pd.read_pickle('ingredient_df.pkl')
ingredients.head(15)

In [None]:
#TODO: foreign key -- keep a separate db for all possible ingredients.

#NOTE: head(50) is not the last expression in this cell, so only the url
#lookup below is displayed.
ingredients.head(50)
ingredients.iloc[2]['url']
#TODO: for the ingredients, split on "," and space, then stem and tokenize,
#then build a separate db of ingredients. Open question: one-to-many or
#many-to-many? See andy's for an example.

In [None]:
#Connect to the Postgres database:
con = psycopg2.connect(database = dbname, user = username)

# query:
sql_query = """
SELECT * FROM test_cat_food;
"""
try:
    pet_food_from_sql = pd.read_sql_query(sql_query,con)
finally:
    #release the connection even when the query fails, so re-running this
    #cell does not pile up open connections
    con.close()

#show the last five rows
pet_food_from_sql[-5:]
# WHERE product_id=5121705; 
# WHERE product_id=1221059 = 35???
#len(pet_food_from_sql)

In [None]:
failed_link ='http://www.petsmart.com/cat/food-and-health/food/purina-friskies-savory-shreds-cat-food-1547.html?cgid=200004'

In [None]:
#Scratch cell: list every <p> tag inside the 'tab-content' div of the page
#currently held in soup_obj (soup_obj is built in another cell, so that cell
#has to be run first).
new_dict = {}
type_box2 = soup_obj.find('div', attrs={'class': 'tab-content'})
tag = type_box2.findAll('p')

print tag       

In [None]:
#Scratch cell: prototype of the "Guaranteed Analysis" parsing on the page
#currently held in soup_obj (built in another cell).
import re

new_dict = {}
type_box2 = soup_obj.find('div', attrs={'class': 'tab-content'})
#take the node that follows the second <p> tag and work on its string form
tag = type_box2.findAll('p')[1].next_sibling
tag_s = str(tag)
#one piece per "<br/>" separated line
new = tag_s.split("<br/>")
for i, line in enumerate(new):
    #keep the text between a ')' and the next '%', without surrounding
    #whitespace; the trailing comment is an earlier pattern that did not
    #trim the whitespace
    new_dict[i]= re.findall("\)\s*(.*?)\s*%", line)             #("\)(.*?)%", line)
print new_dict

#Sample of the pieces produced by the split on "<br/>":
#['<p><b>Guaranteed Analysis:</b>\n', 'Crude Protein (Min) 9%\r\n', 'Crude Fat (Min)\t2%\r\n', 'Crude Fiber (Max) 1.5%\r\n', 'Moisture (Max) 82%\r\n', 'Ash (Max) 3%\r\n', 'Taurine (Min) 0.05%\r\n\r\n\t\t\t\t\t</p>']

In [None]:
#Scratch cell: fetch the page stored in failed_link (set in another cell)
#and rebind soup_obj to it. Any cell that reads soup_obj will see this page
#when run after this one. The final print dumps the whole parsed page.
r = urllib.urlopen(failed_link)
soup_obj = BeautifulSoup(r, 'lxml') #'html.parser'
print soup_obj