In [1]:
import time
import requests
import re
from random import shuffle


from bs4 import BeautifulSoup
from fake_useragent import UserAgent

import pandas as pd
import numpy as np

def get_proxies(link):  
    response = requests.get(link)
    soup = BeautifulSoup(response.text,"lxml")
    https_proxies = filter(lambda item: "yes" in item.text,
                           soup.select("table.table tr"))
    for item in https_proxies:
        yield "{}:{}".format(item.select_one("td").text,
                             item.select_one("td:nth-of-type(2)").text)

def get_random_proxies_iter():
    proxies = list(get_proxies('https://www.sslproxies.org/'))
    shuffle(proxies)
    return iter(proxies)  # iter so we can call next on it to get the next proxy


def get_proxy(session, proxies, validated=False):
    session.proxies = {'https': 'https://{}'.format(next(proxies))}
    if validated:
        while True:
            try:
                return session.get('https://httpbin.org/ip').json()
            except Exception:
                session.proxies = {'https': 'https://{}'.format(next(proxies))}


def get_response(url):
    session = requests.Session()
    ua = UserAgent()
    proxies = get_random_proxies_iter()
    while True:
        try:
            session.headers = {'User-Agent': ua.random}
            print(get_proxy(session, proxies, validated=True))  #collect a working proxy to be used to fetch a valid response
            return session.get(url) # as soon as it fetches a valid response, it will break out of the while loop
        except StopIteration:
            raise  # No more proxies left to try
        except Exception:
            pass  # Other errors: try again

In [6]:
def get_random_ua():
    random_ua = ''
    ua_file = 'user-agents.txt'
    try:
        with open(ua_file) as f:
            lines = f.readlines()
        if len(lines) > 0:
            prng = np.random.RandomState()
            index = prng.permutation(len(lines) - 1)
            idx = np.asarray(index, dtype=np.integer)[0]
            random_proxy = lines[int(idx)]
    except Exception as ex:
        print('Exception in random_ua')
        print(str(ex))
    finally:
        return random_ua

def get_random_proxy():
    random_pr = ''
    pr_file = 'proxy-list-raw.txt'
    try:
        with open(pr_file) as f:
            lines = f.readlines()
        if len(lines) > 0:
            prng = np.random.RandomState()
            index = prng.permutation(len(lines) - 1)
            idx = np.asarray(index, dtype=np.integer)[0]
            random_proxy = lines[int(idx)]
    except Exception as ex:
        print('Exception in random_pr')
        print(str(ex))
    finally:
        return random_pr

In [7]:
###FUNCIÓN SPIDER
def spider_multipage(pages_to_scrape, parsing_func, sleep_interval):
    container = pd.DataFrame()
    user_agent = get_random_ua()
    proxy = get_random_proxy()
    headers = {'user-agent': user_agent, 
               'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
              'accept-encoding': 'gzip, deflate, br',
               'accept-language': 'es-ES,es;q=0.9,en;q=0.8,pt;q=0.7',
               'cache-control': 'max-age=0'}
    if type(pages_to_scrape) == int:
        for i in range(1, pages_to_scrape+1):
            response = requests.get(f'https://www.amazon.com.mx/s?i=grocery&rh=n%3A17724602011&page={i}&__mk_es_MX=%C3%85M%C3%85%C5%BD%C3%95%C3%91&qid=1567531660&ref=sr_pg_{i}', 
                                    headers=headers, 
                                    proxies=proxy)
            container = container.append(parsing_func(response.content), ignore_index=True)
            if sleep_interval > 0:
                time.sleep(sleep_interval)
    else:
        for el in pages_to_scrape:
            response = requests.get(f'https://www.amazon.com.mx{el}', 
                                    headers=headers, 
                                    proxies=proxy)
            container = container.append(parsing_func(response.content), ignore_index=True)
            if sleep_interval > 0:
                time.sleep(sleep_interval)
    return pd.DataFrame(container)

In [8]:
#FUNCIÓN PARA TOMAR TODOS LOS LINKS DE PÁGINA DE TODOS LOS PRODUCTOS
def prodlist_parser(content):
    page_soup = BeautifulSoup(content, 'html')
    names_raw = page_soup.select('.a-size-base-plus')
    names = pd.Series([name.text for name in names_raw])
    links_raw = page_soup.select('h2 > a.a-link-normal')
    links = pd.Series([link.attrs['href'] for link in links_raw])
    df = pd.concat([names, links], axis=1)
    return df

In [9]:
PAGES_TO_SCRAPE = 2 #115
SLEEP = 5

productlist = spider_multipage(PAGES_TO_SCRAPE, prodlist_parser, SLEEP)
productlist

Unnamed: 0,0,1
0,"Nescafé Taster's Choice Café Soluble Orgánico,...",/Nescaf%C3%A9-Tasters-Choice-Soluble-Org%C3%A1...
1,Café de La Parroquia de Veracruz Bolsa de Café...,/Caf%C3%A9-Parroquia-Veracruz-Bolsa-Molido/dp/...
2,"Nescafe Cafe Olla, 170 g",/Nescafe-Cafe-Olla-Caf%C3%A9-gramos/dp/B078P16...
3,"Café Solo Dios Café en Grano, Sabor Artesanal,...",/Caf%C3%A9-Solo-Dios-Grano-Artesanal/dp/B07FPK...
4,"Nescafe, Café soluble, 120 gramos",/Nescafe-Caf%C3%A9-soluble-120-gramos/dp/B07QX...
5,K Cup Bulk Kirkland Pacific Bold 30 Pack,/Bulk-Kirkland-Pacific-Bold-Pack/dp/B07FNDDP4Q...
6,Café de La Parroquia de Veracruz Café Puro Sol...,/Caf%C3%A9-Parroquia-Veracruz-Puro-Soluble/dp/...
7,Los Portales de Cordoba Café Super Premium Gol...,/Los-Portales-Cordoba-Premium-Liofilizado/dp/B...
8,"Nescafé Café de Olla, 46 g",/Nescaf%C3%A9-Caf%C3%A9-Olla-46-g/dp/B07R9R767...
9,"Café Solo Dios Café en molido de 1kg, SABOR AR...",/Caf%C3%A9-Solo-Dios-Molido-Artesanal/dp/B07FP...


In [23]:
def select_text(soup, lista):
    return [el.text for item in lista for el in soup.select(item)]

def prodpage_parser(content):
    page_soup = BeautifulSoup(content, 'html')
    
    titles_raw = page_soup.select('#productTitle')
    titles = pd.Series([title.text for title in titles_raw])
    
    #AGREGAR POR
    
    prices_raw = page_soup.select('span#priceblock_ourprice')
    prices = pd.Series([price.text for price in prices_raw])
    
    links_raw = page_soup.select('#dp-summary-see-all-reviews')
    links = pd.Series([link.attrs['href'] for link in links_raw])
    
    df = pd.concat([titles, 
                    prices, 
                    links], axis=1)
    return df

In [11]:
LINK = list(productlist[1][0:3])

productpage = spider_multipage(LINK, prodpage_parser, SLEEP)
productpage

Unnamed: 0,0,1,2
0,\n \n \n ...,$152.16,/Nescafé-Tasters-Choice-Soluble-Orgánico/produ...
1,\n \n \n ...,$99.00,/Café-Parroquia-Veracruz-Bolsa-Molido/product-...
2,\n \n \n ...,$45.50,/Nescafe-Cafe-Olla-170-g/product-reviews/B078P...


In [24]:
def reviews_parser(links, sleep_interval):
    df = pd.DataFrame()
    user_agent = get_random_ua()
    proxy = get_random_proxy()
    headers = {'user-agent': user_agent, 
               'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
              'accept-encoding': 'gzip, deflate, br',
               'accept-language': 'es-ES,es;q=0.9,en;q=0.8,pt;q=0.7',
               'cache-control': 'max-age=0'}
    
    for el in links:
        names = pd.Series()
        titles = pd.Series()
        dates = pd.Series()
        reviews = pd.Series()
        
        response = requests.get(f'https://www.amazon.com.mx{el}', 
                                    headers=headers, 
                                    proxies=proxy)
        content = response.content
        page_soup = BeautifulSoup(content, 'html')
        
        while len(page_soup.select('#cm_cr-pagination_bar > ul > li.a-disabled.a-last'))==0:
            names_raw = page_soup.select('.a-row.product-title')
            names = names.append(pd.Series([name.text for name in names_raw]))
            
            titles_raw = page_soup.find_all('span', attrs={'data-hook':'review-title'})
            titles = titles.append(pd.Series([title.text for title in titles_raw]))

            dates_raw = page_soup.find_all('span', attrs={'data-hook':'review-date'})
            dates = dates.append(pd.Series([date.text for date in dates_raw]))
            
            reviews_raw = page_soup.find_all('span', attrs={'data-hook':'review-body'})
            reviews = reviews.append(pd.Series([review.text for review in reviews_raw]))
            
            next_raw = page_soup.select('cm_cr-pagination_bar > ul > li.a-last > a')
            next_page = [next_raw.attrs['href'] for link in next_raw]
            
            if sleep_interval > 0:
                time.sleep(sleep_interval)
            
            res = requests.get(f'https://www.amazon.com.mx{next_page}', 
                                    headers=headers, 
                                    proxies=proxy)
            cont = res.content
            page_soup = BeautifulSoup(cont, 'html')
                
        else:
            names_raw = page_soup.select('.a-row.product-title')
            names = names.append(pd.Series([name.text for name in names_raw]))
            
            titles_raw = page_soup.find_all('span', attrs={'data-hook':'review-title'})
            titles = titles.append(pd.Series([title.text for title in titles_raw]))
            
            dates_raw = page_soup.find_all('span', attrs={'data-hook':'review-date'})
            dates = dates.append(pd.Series([date.text for date in dates_raw]))
            
            reviews_raw = page_soup.find_all('span', attrs={'data-hook':'review-body'})
            reviews = reviews.append(pd.Series([review.text for review in reviews_raw]))

        df_prod = pd.concat([titles, dates, reviews])
        df_prod['ProductName'] = names[0]
        df = df.append(df_prod, ignore_index=True)

        if sleep_interval > 0:
            time.sleep(sleep_interval)
    
    df.columns = ['ReviewTitle', 'ReviewDate', 'ReviewText', 'ProductName']
    
    return df


In [28]:
LINKS_REV = productpage[2][0]


product_reviews = reviews_parser(LINKS_REV, SLEEP)
product_reviews

ConnectionError: HTTPSConnectionPool(host='www.amazon.com.mx[]', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x11c3bf2b0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))