"""
Created on Mon Oct 2021

@author: Alex Nascimento Rodrigues
"""

# In [None]:
from typing import Counter
from selenium import webdriver
import bs4
import os
import json
from time import sleep  

### Initializing the webdriver ###
def web_driver_creator():
  """Create and return a headless Chrome WebDriver.

  The browser is configured with the ``--headless``, ``--log-level=1``,
  ``--no-sandbox`` and ``--disable-dev-shm-usage`` arguments. This function
  does not close the driver; the code that receives it has to call
  ``quit()`` on it.

  Returns:
    The newly created ``webdriver.Chrome`` instance.
  """
  browser_flags = (
    '--headless',
    '--log-level=1',
    '--no-sandbox',
    '--disable-dev-shm-usage',
  )

  options = webdriver.ChromeOptions()
  for flag in browser_flags:
    options.add_argument(flag)

  # NOTE(review): the driver path is passed positionally; newer Selenium
  # releases may expect a Service object instead - confirm against the
  # installed Selenium version.
  return webdriver.Chrome('chromedriver', options=options)

### Gerando a url inicial e posteriores ###
def generate_url(search_term, page):
  """Generate the Amazon Brazil search URL for a search term and page.

  Args:
    search_term: Text to search for. The values 'livros', 'eletronicos' and
      'brinquedos' additionally select the matching department; for any
      other value the department parameter is left empty. Spaces are
      removed from the term before it is placed in the URL.
    page: Result page number; it fills both the ``page`` and the ``ref``
      query parameters.

  Returns:
    The search URL as a string.
  """
  # Department code used in the ``i`` query parameter for each known search.
  departments = {
    'livros': 'stripbooks',
    'eletronicos': 'electronics',
    'brinquedos': 'toys',
  }
  departamento = departments.get(search_term, '')

  base_template = 'https://www.amazon.com.br/s?k={}&i={}&page={}&ref=sr_pg_{}'
  search_term = search_term.replace(' ', '')

  return base_template.format(search_term, departamento, page, page)
    
### Retirando os dados da primeira página ###
def extract_record(item):
  """Pull the product name and ASIN code out of one search-result element.

  Args:
    item: Search-result element. Its name is read from ``item.h2.a.text``
      and its code from ``item.get('data-asin')``.

  Returns:
    A ``(name, asin)`` tuple, or ``None`` when reading either value raises
    ``AttributeError`` (for instance when the element has no ``h2`` link).
  """
  try:
    product_name = item.h2.a.text.strip()
    asin_code = item.get('data-asin')
  except AttributeError:
    # Element without the expected structure: nothing to report.
    return None
  else:
    return (product_name, asin_code)

def target_and_urls(search_term, page_number):
  """Load one search-result page and return the items found on it.

  A new webdriver is created for the request and is always closed before
  returning, even when loading or parsing the page fails.

  Args:
    search_term: Search term handed to ``generate_url``.
    page_number: Number of the result page to load.

  Returns:
    A list with one ``extract_record`` result (name, ASIN) per
    ``s-search-result`` element; elements for which ``extract_record``
    returns nothing are left out.
  """
  wd = web_driver_creator()
  try:
    search_url = generate_url(search_term, page_number)
    print(search_url)
    wd.get(search_url)
    soup = bs4.BeautifulSoup(wd.page_source, 'html.parser')
  finally:
    # Release the browser process even if the page could not be loaded.
    wd.quit()

  results = soup.find_all('div', {'data-component-type': 's-search-result'})

  # record = (item name, ASIN code)
  records = (extract_record(item) for item in results)
  return [record for record in records if record]

def get_text_stars_reviews(target_code, num_max_reviews):
  """Collect the reviews of one product from its review pages.

  Pages are requested one by one with a 3 second pause after each. The
  webdriver created for the job is always closed, including when a request
  or the parsing fails. Collection stops early at the first page that
  contains no review ratings.

  Args:
    target_code: Product code placed in the review-page URL.
    num_max_reviews: Maximum number of reviews wanted. Pages are treated as
      holding 10 reviews each, so ``int(num_max_reviews) // 10`` pages are
      requested at most.

  Returns:
    A list with one ``[user_name, stars, date, review_text]`` list per
    review. The fixed texts 'de 5 estrelas', 'Avaliado no Brasil em' and
    'Your browser does not support HTML5 video.' are removed from the
    stars, date and review fields respectively.
  """
  reviews_and_stars = []

  # 10 reviews per page
  last_page = int(num_max_reviews) // 10

  wd = web_driver_creator()
  try:
    for page in range(1, last_page + 1):
      base_url = 'https://www.amazon.com.br/product-reviews/{}/ref=cm_cr_getr_d_paging_btm_next_{}?pageNumber={}'.format(target_code,page,page)

      wd.get(base_url)
      soup = bs4.BeautifulSoup(wd.page_source, 'html.parser')
      all_user_names = soup.find_all('span', {'class' : 'a-profile-name'})
      all_reviews = soup.find_all('span', {'data-hook' : 'review-body'})
      all_dates = soup.find_all('span', {'class' : 'a-size-base a-color-secondary review-date'})
      all_stars = soup.find_all('i', {'data-hook' : 'review-star-rating'})

      # No ratings on this page: there are no more reviews to fetch.
      if not all_stars:
        break

      # zip stops at the shortest list, so a page on which the four lists
      # differ in length cannot raise IndexError.
      for user, stars, date, review in zip(all_user_names, all_stars,
                                           all_dates, all_reviews):
        try:
          reviews_and_stars.append([
            user.text,
            stars.text.replace('de 5 estrelas','').strip(),
            date.text.replace('Avaliado no Brasil em','').strip(),
            review.text.replace('Your browser does not support HTML5 video.','').strip(),
          ])
        except AttributeError:
          # Skip a malformed entry instead of discarding everything that
          # was already collected.
          continue
      sleep(3)
  finally:
    # Always release the browser process.
    wd.quit()

  return reviews_and_stars

def extract_empty_items(num_max_reviews, num_max_targets,
                        file_existing_items, results_file, target_name):
  """Collect reviews from scratch, when no item has been collected before.

  Search pages are walked item by item; the next page is loaded once every
  item of the current one has been used. After each item the results file
  is rewritten with everything collected so far, replacing any previous
  content. The names of the collected items are written to
  ``file_existing_items`` when the function ends, including when it ends
  because of an error or an interruption inside the collection loop.

  Args:
    num_max_reviews: Maximum number of reviews per item, passed to
      ``get_text_stars_reviews``.
    num_max_targets: Maximum number of items to collect.
    file_existing_items: Path of the JSON file that receives the list of
      collected item names.
    results_file: Path of the JSON file that receives the mapping
      ``item name -> reviews``.
    target_name: Search term passed to ``target_and_urls``.
  """
  list_collected_reviews = []
  data = {}
  page = 1
  index_list_target = 0

  results = target_and_urls(target_name, page)

  try:
    for index_max_targets in range(num_max_targets):
      # Every item of the current page was used: load the next page.
      if results and index_list_target >= len(results):
        page += 1
        try:
          results = target_and_urls(target_name, page)
        except Exception as error:
          print("Erro na coleta de nomes e códigos!", error)
          break
        print("--- Coletando na página:",page, "---")
        index_list_target = 0
        sleep(3)

      # An empty page means there is nothing (more) to collect.
      if not results:
        print("Erro na coleta de nomes e códigos!")
        break

      print("Coletando", index_max_targets + 1, "de", num_max_targets)
      name, code = results[index_list_target]

      try:
        data[name] = get_text_stars_reviews(code, num_max_reviews)
        # 'w' truncates the file, so no stale bytes from a longer previous
        # content can be left behind the freshly written JSON.
        with open(results_file, 'w', encoding='utf8') as json_file:
          json.dump(data, json_file, ensure_ascii=False)
      except Exception as error:
        print("Erro na coleta de reviews!", error)
        break

      list_collected_reviews.append(name)
      index_list_target += 1
  finally:
    # Record what was collected even if the loop was interrupted.
    with open(file_existing_items, 'w', encoding='utf8') as json_file:
      json.dump(list_collected_reviews, json_file, ensure_ascii=False)
    
def extract_existing_items(num_max_reviews, num_max_targets,
                           file_existing_items, results_file, target_name):
  """Collect reviews of new items, skipping the ones already collected.

  Items whose name is listed in ``file_existing_items`` are skipped and do
  not count towards ``num_max_targets``. Search pages are walked item by
  item; the next page is loaded once every item of the current one has
  been seen. After each new item the results file is rewritten with the
  previous results plus the new one. The updated list of collected names
  is written back to ``file_existing_items`` when the collection loop ends,
  including when it ends because of an error or an interruption.

  Args:
    num_max_reviews: Maximum number of reviews per item, passed to
      ``get_text_stars_reviews``.
    num_max_targets: Number of new items to collect.
    file_existing_items: Path of the JSON file holding the list of item
      names collected so far.
    results_file: Path of the JSON file holding the mapping
      ``item name -> reviews``.
    target_name: Search term passed to ``target_and_urls``.
  """
  # Holds the items that were already collected
  with open(file_existing_items, 'r', encoding='utf8') as json_file:
    # A set keeps the "already collected" lookup cheap
    set_target = set(json.load(json_file))

  try:
    # An empty results file (as created by find_reviews) is not valid JSON;
    # treat it as "no results yet" instead of failing on every run.
    if os.path.getsize(results_file) == 0:
      data = {}
    else:
      with open(results_file, 'r', encoding='utf8') as json_file:
        data = json.load(json_file)
  except (OSError, ValueError) as error:
    # Unreadable or corrupt results: stop without overwriting them.
    print("Erro na coleta de reviews!", error)
    return

  page = 1
  index_list_target = 0
  new_item_cont = 0
  results = target_and_urls(target_name, page)

  try:
    while new_item_cont < num_max_targets:
      # Every item of the current page was seen: load the next page.
      if results and index_list_target >= len(results):
        page += 1
        try:
          results = target_and_urls(target_name, page)
        except Exception as error:
          print("Erro na coleta de nomes e códigos!", error)
          break
        print("Coletando na página:",page)
        index_list_target = 0
        sleep(3)

      # An empty page means there is nothing (more) to collect.
      if not results:
        print("Erro na coleta de nomes e códigos!")
        break

      print("Coletando", new_item_cont + 1, "de", num_max_targets)
      name, code = results[index_list_target]
      index_list_target += 1

      if name in set_target:
        print("O item ->",name,"<- já foi coletado")
        continue

      try:
        data[name] = get_text_stars_reviews(code, num_max_reviews)
        # 'w' truncates the file, so no stale bytes can be left behind the
        # freshly written JSON.
        with open(results_file, 'w', encoding='utf8') as json_file:
          json.dump(data, json_file, ensure_ascii=False)
      except Exception as error:
        print("Erro na coleta de reviews!", error)
        break

      set_target.add(name)
      new_item_cont += 1
  finally:
    # Record what was collected even if the loop was interrupted.
    with open(file_existing_items, 'w', encoding='utf8') as json_file:
      json.dump(list(set_target), json_file, ensure_ascii=False)
    

def find_reviews(target_name, num_max_targets, num_max_reviews):
  """Collect reviews for a search term, resuming from earlier runs if any.

  Two files named after the search term are used: ``<target_name>_Items.json``
  and ``<target_name>_Results.json``. Missing files are created empty. When
  the items file is empty ``extract_empty_items`` does the collection,
  otherwise ``extract_existing_items`` does. ``--FIM--`` is printed at the end.

  Args:
    target_name: Search term, also used as prefix of the two file names.
    num_max_targets: Number of items to collect.
    num_max_reviews: Maximum number of reviews per item.
  """
  file_existing_items = target_name + '_Items.json'
  results_file = target_name + '_Results.json'

  # Make sure both files exist before they are inspected or opened.
  for path in (file_existing_items, results_file):
    if not os.path.isfile(path):
      open(path, "w+").close()

  # Empty items file -> first collection; otherwise continue a previous one.
  if os.stat(file_existing_items).st_size:
    collect = extract_existing_items
  else:
    collect = extract_empty_items

  collect(num_max_reviews, num_max_targets,
          file_existing_items, results_file, target_name)

  print("--FIM--")


# Mapping #
# Available searches: livros, eletronicos and brinquedos
# First index = target
# Second index = one review, holding its text and star rating
# Third index = review author [0], stars [1], date [2], review text [3]
# print(reviews_and_stars[0][0][3])

# Search for 50 books with at most 20 reviews each
# E.g.: find_reviews('livros', 50, 20)
# The number of reviews must be a multiple of 10

# Run the collection for each available search: 20 items per search and
# at most 50 reviews per item.
for department in ('livros', 'eletronicos', 'brinquedos'):
  find_reviews(department, 20, 50)