<a href="https://colab.research.google.com/github/ayoubbensakhria/finance_algo/blob/master/Product_Reviews_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Amazon UK Reviews Summarizer and Content Builder (CSV for WP All Import)

# 1. Install required packages and import dependencies

In [None]:
# Setting-up Environment & Importing necessary libraries
# Lines starting with "!" are IPython shell escapes, so this cell only runs inside a notebook.
# Install the Selenium Python bindings.
!pip install selenium
# Refresh the package index, then install the system Chromium driver package.
!apt-get update
!apt install -y chromium-chromedriver
# Copy the driver binary into /usr/bin (presumably so the bare "chromedriver"
# name used when the driver is created resolves - TODO confirm).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin


from selenium import webdriver
from bs4 import BeautifulSoup
from shutil import make_archive
from google.colab import drive
from lxml import html
from lxml.html.clean import clean_html, Cleaner
import pandas as pd
import json
import time
import random

# Setting-up web driver
# Chrome runs without a display (--headless), without its sandbox, and without
# relying on /dev/shm for shared memory.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Only `options` is passed: the positional driver-path argument is deprecated in
# Selenium 4 and no longer accepted by later 4.x releases. The chromedriver
# binary copied to /usr/bin is located by name instead.
driver = webdriver.Chrome(options=options)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting selenium
  Downloading selenium-4.2.0-py3-none-any.whl (983 kB)
[K     |████████████████████████████████| 983 kB 4.9 MB/s 
[?25hCollecting trio~=0.17
  Downloading trio-0.21.0-py3-none-any.whl (358 kB)
[K     |████████████████████████████████| 358 kB 57.5 MB/s 
[?25hCollecting urllib3[secure,socks]~=1.26
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 65.2 MB/s 
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting outcome
  Downloading outcome-1.1.0-py2.py3-none-any.whl (9.7 kB)
Collecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.1.0-py3-none-any.whl (24 kB)
Collecting pyOpenSSL>=0.14
  Downloading pyO

# 2. Scraping config and functions


In [None]:
# Mount drive
# The keyword list is read from, and the exports are written to, paths under
# /content/drive/MyDrive.
drive.mount('/content/drive/')
# Storefront root that the search URLs and product URLs are built from.
base_url = "https://www.amazon.co.uk"
# Accumulator for every scraped product row; the scraping loop adds each
# keyword's results to it.
dataframe = pd.DataFrame(columns=['product_title', 'customer_reviews_rate', 'offered_price','marked_price', 'is_prime_product', 
                                  'product_type', 'product_url', 'product_details','brand_img', 'table_features', 'faq', 'reviews', 'product_image', 'reviews_compilation'])

# Date range from which each row's random publication date is drawn
# (format: YYYY-MM-DD HH:MM:SS).
start_date = "2022-07-17 1:30:10"
end_date = "2022-08-16 13:30:10"

# Number of authors; each row gets a random author id between 1 and this value.
authors_count = 7

# Maximum number of products scraped from each search results page.
pdct_count = 4

# Base name (without the .csv extension) of the exported file.
filename = "amazon_pdcts_export_07"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
def str_time_prop(start, end, format, prop):
    """Return the time lying a given fraction of the way between two times.

    ``start`` and ``end`` are time strings written in ``format``
    (strftime-style) and bound the interval [start, end]. ``prop`` is the
    fraction of that interval to advance past ``start``. The result is
    rendered with the same ``format``.
    """
    lower, upper = (
        time.mktime(time.strptime(bound, format)) for bound in (start, end)
    )
    offset = prop * (upper - lower)
    return time.strftime(format, time.localtime(lower + offset))

def random_date(start, end, prop):
    """Return the timestamp at fraction ``prop`` between ``start`` and ``end``.

    Both bounds and the result use the 'YYYY-MM-DD HH:MM:SS' layout.
    """
    timestamp_layout = '%Y-%m-%d %H:%M:%S'
    return str_time_prop(start, end, timestamp_layout, prop)

# get product details
def _prettify_first(soup, name, attrs):
  """Return the prettified markup of the first matching tag, or '' when nothing matches."""
  tag = soup.find(name, attrs=attrs)
  return tag.prettify() if tag is not None else ''


def _image_src(tag):
  """Return the `src` attribute of an <img> tag, or '' when the tag or the attribute is missing."""
  if tag is None:
    return ''
  return tag.attrs.get("src", '')


def get_details(product_url):
  """Scrape the detail page of a single product.

  Loads `product_url` in the shared Selenium `driver` and extracts the brand
  logo, the feature table, the product details, the main image and the
  customer reviews. Scraping is best effort: a section that cannot be found
  keeps its empty default, and a page that cannot be loaded yields all
  defaults (the failure is printed).

  Returns:
    A tuple (brand_img, table_features, json_reviews, reviews_compilation,
    product_details, product_image). `json_reviews` is the JSON string built
    by `reviews_to_json`, or None when the reviews could not be parsed.
    `reviews_compilation` is capped at 1500 characters. All other items are
    strings ('' when missing).
  """
  # Defaults are assigned before any network access so the return statement
  # is valid even when the page fails to load.
  brand_img = ''
  table_features = ''
  reviews_compilation = ''
  product_details = ''
  product_image = ''
  json_reviews = None

  try:
    driver.get(product_url)
    webpage = driver.execute_script("return document.body.outerHTML;")
    soup = BeautifulSoup(webpage, "html.parser")

    brand_img = _image_src(soup.find("img", attrs={"id": "logoByLine"}))
    table_features = _prettify_first(soup, "table", {"class": "a-normal a-spacing-micro"})
    reviews = _prettify_first(soup, "div", {"class": "a-section a-spacing-large reviews-content filterable-reviews-content celwidget"})

    # Prefer the detail bullets and fall back to the product description.
    product_details = (
      _prettify_first(soup, "div", {"data-feature-name": "detailBullets"})
      or _prettify_first(soup, "div", {"data-feature-name": "productDescription"})
    )

    # The FAQ section is not part of the returned tuple, so it is not extracted.

    image_wrapper = soup.find("div", attrs={"class": "imgTagWrapper"})
    if image_wrapper is not None:
      product_image = _image_src(image_wrapper.find("img", attrs={"data-a-image-name": "landingImage"}))

    try:
      # The parser is named explicitly so parsing does not depend on which
      # parser libraries happen to be installed.
      json_reviews, compilation = reviews_to_json(BeautifulSoup(reviews, "html.parser"))
      reviews_compilation = compilation[:1500]
    except (AttributeError, KeyError, TypeError):
      # The review markup lacked an expected element; keep the defaults.
      pass
    print(product_image)
  except Exception as exc:
    # Best effort: one broken product page must not abort the whole run.
    print("Could not scrape {}: {}".format(product_url, exc))

  return brand_img, table_features, json_reviews, reviews_compilation, product_details, product_image

# get html from search result page
def _chain_text(node, *lookups, default=""):
  """Follow successive (tag_name, attrs) lookups from `node` and return the last tag's text.

  Returns `default` as soon as one lookup finds nothing.
  """
  for tag_name, attrs in lookups:
    node = node.find(tag_name, attrs=attrs)
    if node is None:
      return default
  return node.text


def _scrape_listing(product_detail):
  """Build the data row (a dict) for one search-result card, detail-page fields included."""
  product_title = _chain_text(
    product_detail,
    ("h2", {"class": "a-size-mini a-spacing-none a-color-base s-line-clamp-3"}),
  )

  link = product_detail.find("a", attrs={"class": "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"})
  href = link.attrs.get("href") if link is not None else None
  product_url = base_url + href if href else ""

  customer_reviews_rate = _chain_text(
    product_detail, ("div", {"class": "a-row a-size-small"}), default="Not Available"
  )

  # A missing price element and an empty price text are both "Not Available".
  offscreen = ("span", {"class": "a-offscreen"})
  offered_price = _chain_text(product_detail, ("span", {"class": "a-price"}), offscreen) or "Not Available"
  marked_price = _chain_text(product_detail, ("span", {"class": "a-price a-text-price"}), offscreen) or "Not Available"

  # find() returns None instead of raising when nothing matches, so the
  # result has to be tested explicitly.
  has_prime_badge = product_detail.find("span", attrs={"class": "aok-relative s-icon-text-medium s-prime"}) is not None
  has_sponsored_row = product_detail.find("div", attrs={"class": "a-row a-spacing-micro"}) is not None

  if product_url:
    details = get_details(product_url)
  else:
    # Without a URL there is no detail page to load.
    details = ("", "", None, "", "", "")
  brand_img, table_features, json_reviews, reviews_compilation, product_details, product_image = details

  return {
    'product_title': product_title,
    'customer_reviews_rate': customer_reviews_rate,
    'offered_price': offered_price,
    'marked_price': marked_price,
    'is_prime_product': "Yes" if has_prime_badge else "No",
    'product_type': "Sponsored" if has_sponsored_row else "Generic",
    'product_url': product_url,
    'product_details': product_details,
    'brand_img': brand_img,
    'table_features': table_features,
    'faq': '',
    'reviews': json_reviews,
    'product_image': product_image,
    'reviews_compilation': reviews_compilation,
  }


def get_results(keyword, numbers_of_pages, tags, category, brand):
  """Scrape the Amazon search results for `keyword` into a DataFrame.

  For each of the first `numbers_of_pages` result pages, at most `pdct_count`
  result cards are read, and the detail page of each product is scraped too.

  Args:
    keyword: Search phrase.
    numbers_of_pages: Number of result pages to visit, starting at page 1.
    tags, category, brand: Values copied onto every returned row.

  Returns:
    A DataFrame with one row per product. When nothing was found it is empty
    but still has all columns.
  """
  from urllib.parse import quote_plus

  columns = ['product_title', 'customer_reviews_rate', 'offered_price', 'marked_price', 'is_prime_product',
             'product_type', 'product_url', 'product_details', 'brand_img', 'table_features', 'faq', 'reviews',
             'product_image', 'reviews_compilation']
  rows = []
  # quote_plus encodes spaces as "+" and escapes reserved characters (e.g. "&")
  # that would otherwise corrupt the query string.
  query = quote_plus(keyword)
  per_page = max(pdct_count, 0)

  for page in range(1, numbers_of_pages + 1):
    try:
      search_url = base_url + "/s?k={}&page={}&ref=nb_sb_noss".format(query, page)
      driver.get(search_url)
      webpage = driver.execute_script("return document.body.outerHTML;")
      soup = BeautifulSoup(webpage, "html.parser")
      result_cards = soup.findAll("div", attrs={"data-component-type": "s-search-result"})
      for product_detail in result_cards[:per_page]:
        rows.append(_scrape_listing(product_detail))
    except ValueError as exc:
      # Print the error itself rather than the bare exception class.
      print(exc)
    # Pause between result pages.
    time.sleep(1)

  # Rows are collected in a list and the frame is built once; DataFrame.append
  # no longer exists in pandas 2.x and copied the whole frame on every call.
  results = pd.DataFrame(rows, columns=columns)
  results['tags'] = tags
  results['category'] = category
  results['brand'] = brand
  results['keyword'] = keyword

  return results

# reviews HTML to json
def reviews_to_json(SoupObj):
  """Convert review markup into a JSON string plus the concatenated review text.

  Every <div data-hook="review"> found in `SoupObj` becomes one record with
  the keys vote_statement, author, rating, date, title, form and review_body.
  A field whose element is missing is stored as '' (vote_statement as
  "Not specified"), so one incomplete review does not discard all the others.

  Args:
    SoupObj: Parsed BeautifulSoup tree containing the reviews.

  Returns:
    A tuple (json_string, reviews_compilation): the records serialised with
    json.dumps, and all review bodies concatenated in document order.
  """
  def text_of(node, name, attrs, default=''):
    tag = node.find(name, attrs=attrs)
    return default if tag is None else tag.text

  reviews = []
  bodies = []
  for ele in SoupObj.findAll("div", attrs={"data-hook": "review"}):
    raw_body = text_of(ele, "span", {"data-hook": "review-body"}).replace("Read more", "")
    # Drop blank lines but keep the line breaks of the remaining lines.
    review_body = "".join(
      line for line in raw_body.strip().splitlines(True) if line.strip()
    )
    bodies.append(review_body)
    reviews.append({
      'vote_statement': text_of(ele, "span", {"data-hook": "review-voting-widget"}, "Not specified"),
      'author': text_of(ele, "span", {"class": "a-profile-name"}),
      'rating': text_of(ele, "i", {"data-hook": "review-star-rating"}),
      'date': text_of(ele, "span", {"data-hook": "review-date"}),
      'title': text_of(ele, "a", {"data-hook": "review-title"}),
      'form': text_of(ele, "div", {"class": "a-row a-spacing-mini review-data review-format-strip"}),
      'review_body': review_body,
    })
  return json.dumps(reviews), "".join(bodies)

# Questions HTML to json
def questions_to_json(SoupObj):
  """Convert question-and-answer markup into a JSON string.

  Every <div class="a-fixed-left-grid a-spacing-base"> found in `SoupObj`
  becomes one record with the keys 'question' and 'answers'. A part whose
  element is missing is stored as ''.

  Args:
    SoupObj: Parsed BeautifulSoup tree containing the Q&A section.

  Returns:
    The list of records serialised with json.dumps ("[]" when none are found).
  """
  def text_of(node, name, attrs):
    # The filter must be passed as `attrs`; any other keyword is treated by
    # BeautifulSoup as an HTML attribute name to match.
    tag = node.find(name, attrs=attrs)
    return '' if tag is None else tag.text

  questions = [
    {
      'question': text_of(ele, "span", {"class": "a-declarative"}),
      'answers': text_of(ele, "a", {"class": "a-fixed-left-grid-col a-col-right"}),
    }
    for ele in SoupObj.findAll("div", attrs={"class": "a-fixed-left-grid a-spacing-base"})
  ]
  return json.dumps(questions)

# convert reviews json to custom html
def json_to_html(json_objs):
  """Render parsed review records as an HTML fragment.

  Args:
    json_objs: Iterable of dicts with the keys title, author, rating, form,
      date, vote_statement and review_body.

  Returns:
    The concatenated markup of all reviews ('' when there are none). The
    "Purchased Form/Model" paragraph is only emitted for a non-empty `form`.
    Field values are HTML-escaped because they are plain text and may contain
    characters such as "&" or "<".
  """
  # Imported locally: the module-level name `html` is bound to lxml.html in this file.
  from html import escape

  fragments = []
  for obj in json_objs:
    safe = {key: escape(str(value), quote=False) for key, value in obj.items()}
    fragments.append('<h3>{title}</h3>'.format(title=safe['title']))
    fragments.append('<h4>{author} has given ★ {rating}</h4>'.format(author=safe['author'], rating=safe['rating']))
    if obj['form']:
      fragments.append('<p>Purchased Form/Model: {form}</p>'.format(form=safe['form']))
    fragments.append('<p>Review date: <strong>{date}</strong> | Vote statement: <em>{vote_statement}</em></p>'.format(date=safe['date'], vote_statement=safe['vote_statement']))
    fragments.append('<p>{review_body}</p>'.format(review_body=safe['review_body']))
  return ''.join(fragments)

# 3. Scraping

In [None]:
# Load the keyword list from Drive; the scraping loop reads the keyword, tags,
# category and brand columns of each row.
df =  pd.read_csv("/content/drive/MyDrive/data/content/aplist.csv")
# Preview the first rows.
df.head()

Unnamed: 0,keyword,brand,tags,category
0,l citruline,-,,Skin care
1,stinger detox,STINGER THE BUZZ,"stinger detox, detox",Health
2,aveda dandruff treatment,Aveda,"aveda dandruff treatment, dandruff, aveda",Hair
3,loreal anti dandruff treatment,L'Oréal,"loreal anti dandruff treatment, L'Oréal, dandruff",Skin care
4,dht blocking shampoo,-,"dht blocking shampoo, dht, shampoo",Hair


In [None]:
# Scrape the first results page for every keyword, then merge all per-keyword
# frames into `dataframe` in a single step. pd.concat replaces DataFrame.append,
# which no longer exists in pandas 2.x and copied the whole frame on each call.
keyword_frames = [
  get_results(row['keyword'], 1, row['tags'], row['category'], row['brand'])
  for _, row in df.iterrows()
]
# Row labels are kept as produced by get_results (they restart at 0 for every keyword).
dataframe = pd.concat([dataframe, *keyword_frames])

https://m.media-amazon.com/images/I/41tZTjZYDmL._AC_.jpg
https://m.media-amazon.com/images/I/51XF7cwsFFL._AC_SY450_.jpg
https://m.media-amazon.com/images/I/71XTgb3aX+L._AC_SY879_.jpg
https://m.media-amazon.com/images/I/71dSmjvh0SL._AC_SY879_.jpg
https://m.media-amazon.com/images/I/71P3N19lbeL._AC_SY450_.jpg
https://m.media-amazon.com/images/I/412kdMzVEJL._AC_.jpg

https://m.media-amazon.com/images/I/61onNy1ZgKL._AC_SX425_.jpg
https://m.media-amazon.com/images/I/71wPig4adJL._AC_SX425_.jpg
https://m.media-amazon.com/images/I/81T17gFyANL._AC_SX425_.jpg
https://m.media-amazon.com/images/I/41Umm3sInAL._AC_SX425_.jpg
https://m.media-amazon.com/images/I/615GOwv-NrL._AC_SY450_.jpg
https://m.media-amazon.com/images/I/61CMvYaoDGL._AC_SY879_.jpg
https://m.media-amazon.com/images/I/51U06tvahEL._AC_SX425_.jpg
https://m.media-amazon.com/images/I/41DOBOXSj1L._AC_SX342_.jpg
https://m.media-amazon.com/images/I/61p3Vc0Np-L._AC_SX342_.jpg
https://m.media-amazon.com/images/I/51nhGqv2lHL._AC_SX425_PIbundle

In [None]:
# Display the scraped rows.
dataframe

Unnamed: 0,product_title,customer_reviews_rate,offered_price,marked_price,is_prime_product,product_type,product_url,product_details,brand_img,table_features,faq,reviews,product_image,reviews_compilation,tags,category,brand,keyword
0,Vita World 3 Pack L-Citrulline 500mg 360 Veget...,5.0 out of 5 stars 3,£39.90,Not Available,Yes,Sponsored,https://www.amazon.co.uk/gp/slredirect/picasso...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",,"[{""vote_statement"": ""\n\n\n\n\n\n H...",https://m.media-amazon.com/images/I/41tZTjZYDm...,First class product at very good price\n ...,,Skin care,-,l citruline
1,Vita World Pack of 3 L-Citrulline 1000 mg High...,2.0 out of 5 stars 1,£72.50,Not Available,Yes,Sponsored,https://www.amazon.co.uk/gp/slredirect/picasso...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",,"[{""vote_statement"": ""\n\n\n\n\n\n H...",https://m.media-amazon.com/images/I/51XF7cwsFF...,Tablets look smaller then 1000gram and to be h...,,Skin care,-,l citruline
2,NOW 750mg L-Citrulline 90 Capsules,4.4 out of 5 stars 70,£23.93,Not Available,Yes,Generic,https://www.amazon.co.uk/L-Citrulline-750-90-C...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",,"[{""vote_statement"": ""\n\n\n\n\n\n H...",https://m.media-amazon.com/images/I/71XTgb3aX+...,I have used 1 of these capsules together with ...,,Skin care,-,l citruline
3,"Now Foods L-Citrulline Pure Powder, 145 g",4.1 out of 5 stars 7,Not Available,Not Available,Yes,Generic,https://www.amazon.co.uk/L-Citrulline-100-Pure...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",,"[{""vote_statement"": ""\n\n\n\n\n\n H...",https://m.media-amazon.com/images/I/71dSmjvh0S...,DID NOT FEEL ENERGY SURGE,,Skin care,-,l citruline
0,Stinger 1-Hour Detox Liquid Drink 5x Strength ...,"3.9 out of 5 stars 1,923",£59.77,£75.35,Yes,Generic,https://www.amazon.co.uk/Stinger-1-Hour-Liquid...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",,[],https://m.media-amazon.com/images/I/71P3N19lbe...,,"stinger detox, detox",Health,STINGER THE BUZZ,stinger detox
1,Ortisan | Pure Plan Drainage - Apple Flavour |...,4.3 out of 5 stars 247,£15.25,Not Available,Yes,Generic,https://www.amazon.co.uk/Ortisan-Pure-Plan-Dra...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",,"[{""vote_statement"": ""\n\n\n 10 people ...",https://m.media-amazon.com/images/I/412kdMzVEJ...,Love it. I do this every 3 months.\n ...,"stinger detox, detox",Health,STINGER THE BUZZ,stinger detox
2,,Not Available,£9.03,Not Available,Yes,Generic,https://www.amazon.co.uk/Thriving-Blood-Sugar-...,,,,,[],,,"stinger detox, detox",Health,STINGER THE BUZZ,stinger detox
3,Liver Cleanse Detox and Gallbladder Repair Cle...,Not Available,£89.90,Not Available,Yes,Generic,https://www.amazon.co.uk/Liver-Cleanse-Gallbla...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",,[],https://m.media-amazon.com/images/I/61onNy1ZgK...,,"stinger detox, detox",Health,STINGER THE BUZZ,stinger detox
0,Neutrogena T/Gel Therapeutic Shampoo Treatment...,"4.6 out of 5 stars 23,606",£3.74,£6.49,Yes,Generic,https://www.amazon.co.uk/Neutrogena-Therapeuti...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",,"[{""vote_statement"": ""\n\n\n 818 people...",https://m.media-amazon.com/images/I/71wPig4adJ...,I am so thankful I tried this product!! I had ...,"aveda dandruff treatment, dandruff, aveda",Hair,Aveda,aveda dandruff treatment
1,"Australian Bodycare Scalp Serum 150ml for Dry,...",4.2 out of 5 stars 835,£12.99,Not Available,Yes,Generic,https://www.amazon.co.uk/Australian-Bodycare-D...,,,,,"[{""vote_statement"": ""\n\n\n 14 people ...",https://m.media-amazon.com/images/I/81T17gFyAN...,The media could not be loaded.\n I ha...,"aveda dandruff treatment, dandruff, aveda",Hair,Aveda,aveda dandruff treatment


# 4. Reviews compilation for summary and count

In [None]:
# count
def _rating_parts(rate_text):
  """Split 'X out of 5 stars N' into (score, count); return (0, 0) for any other text."""
  tokens = rate_text.split()
  if len(tokens) >= 6 and tokens[1:4] == ['out', 'of', '5']:
    return tokens[0], tokens[5]
  return 0, 0


def _reviews_markup(reviews_json):
  """Render a row's reviews JSON as HTML; return '' when the row has no reviews JSON."""
  if not isinstance(reviews_json, str):
    # get_details leaves the reviews as None when they could not be parsed.
    return ''
  # Escaped newlines are removed from the JSON text before parsing, which
  # strips line breaks from every field.
  return json_to_html(json.loads(reviews_json.replace(r'\n', '')))


rating_parts = dataframe['customer_reviews_rate'].map(_rating_parts)
dataframe['reviews_count'] = rating_parts.map(lambda parts: parts[1])
dataframe['reviews_score'] = rating_parts.map(lambda parts: parts[0])

# Strip scripts and styles/stylesheets from the scraped markup.
cleaner = Cleaner(javascript=True, style=True)

# Fill in blank reviews with a sentence built from the rating and the review count.
dataframe['reviews_compilation'] = [
  compilation or '{customer_reviews_rate} based on {count} reviews from verified customers'.format(customer_reviews_rate=rate, count=count)
  for compilation, rate, count in zip(dataframe['reviews_compilation'], dataframe['customer_reviews_rate'], dataframe['reviews_count'])
]

# Clean HTML (empty values stay empty)
for html_column in ('product_details', 'table_features'):
  dataframe[html_column] = dataframe[html_column].map(
    lambda markup: cleaner.clean_html(markup) if markup else ''
  )

# pub date: a random moment between start_date and end_date for every row
dataframe['pub_date'] = [random_date(start_date, end_date, random.random()) for _ in range(len(dataframe))]

# author: a random author id between 1 and authors_count for every row
dataframe['author'] = [random.randint(1, authors_count) for _ in range(len(dataframe))]

# Reviews HTML
dataframe['reviews_html'] = dataframe['reviews'].map(_reviews_markup)

In [None]:
# Preview the first 20 rows with the added columns.
dataframe.head(20)

Unnamed: 0,product_title,customer_reviews_rate,offered_price,marked_price,is_prime_product,product_type,product_url,product_details,brand_img,table_features,...,reviews_compilation,tags,category,brand,keyword,reviews_count,reviews_score,pub_date,author,reviews_html
0,Vita World 3 Pack L-Citrulline 500mg 360 Veget...,5.0 out of 5 stars 3,£39.90,Not Available,Yes,Sponsored,https://www.amazon.co.uk/gp/slredirect/picasso...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",...,First class product at very good price\n ...,,Skin care,-,l citruline,3,5.0,2022-07-25 22:24:36,4,<h3> Vita world is a 100% outfit ...
1,Vita World Pack of 3 L-Citrulline 1000 mg High...,2.0 out of 5 stars 1,£72.50,Not Available,Yes,Sponsored,https://www.amazon.co.uk/gp/slredirect/picasso...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",...,Tablets look smaller then 1000gram and to be h...,,Skin care,-,l citruline,1,2.0,2022-07-31 17:06:54,6,<h3> Not impressed </h3><h4> ...
2,NOW 750mg L-Citrulline 90 Capsules,4.4 out of 5 stars 70,£23.93,Not Available,Yes,Generic,https://www.amazon.co.uk/L-Citrulline-750-90-C...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",...,I have used 1 of these capsules together with ...,,Skin care,-,l citruline,70,4.4,2022-07-28 08:36:57,2,<h3> Boosts my Nitric Oxide </h3>...
3,"Now Foods L-Citrulline Pure Powder, 145 g",4.1 out of 5 stars 7,Not Available,Not Available,Yes,Generic,https://www.amazon.co.uk/L-Citrulline-100-Pure...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",...,DID NOT FEEL ENERGY SURGE,,Skin care,-,l citruline,7,4.1,2022-07-25 13:27:53,4,<h3> NOTHING TO SAY </h3><h4> ...
0,Stinger 1-Hour Detox Liquid Drink 5x Strength ...,"3.9 out of 5 stars 1,923",£59.77,£75.35,Yes,Generic,https://www.amazon.co.uk/Stinger-1-Hour-Liquid...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",...,"3.9 out of 5 stars 1,923 based on 1,923 revi...","stinger detox, detox",Health,STINGER THE BUZZ,stinger detox,1923,3.9,2022-08-10 12:46:33,6,
1,Ortisan | Pure Plan Drainage - Apple Flavour |...,4.3 out of 5 stars 247,£15.25,Not Available,Yes,Generic,https://www.amazon.co.uk/Ortisan-Pure-Plan-Dra...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",...,Love it. I do this every 3 months.\n ...,"stinger detox, detox",Health,STINGER THE BUZZ,stinger detox,247,4.3,2022-07-27 03:22:16,3,<h3> It's work if you do it right ...
2,,Not Available,£9.03,Not Available,Yes,Generic,https://www.amazon.co.uk/Thriving-Blood-Sugar-...,,,,...,Not Available based on 0 reviews from verified...,"stinger detox, detox",Health,STINGER THE BUZZ,stinger detox,0,0.0,2022-08-13 09:30:33,6,
3,Liver Cleanse Detox and Gallbladder Repair Cle...,Not Available,£89.90,Not Available,Yes,Generic,https://www.amazon.co.uk/Liver-Cleanse-Gallbla...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",...,Not Available based on 0 reviews from verified...,"stinger detox, detox",Health,STINGER THE BUZZ,stinger detox,0,0.0,2022-07-24 19:31:58,3,
0,Neutrogena T/Gel Therapeutic Shampoo Treatment...,"4.6 out of 5 stars 23,606",£3.74,£6.49,Yes,Generic,https://www.amazon.co.uk/Neutrogena-Therapeuti...,,,"<table class=""a-normal a-spacing-micro"">\n <tb...",...,I am so thankful I tried this product!! I had ...,"aveda dandruff treatment, dandruff, aveda",Hair,Aveda,aveda dandruff treatment,23606,4.6,2022-07-23 01:46:00,7,<h3> Amazing for Psoriasis and Dermatit...
1,"Australian Bodycare Scalp Serum 150ml for Dry,...",4.2 out of 5 stars 835,£12.99,Not Available,Yes,Generic,https://www.amazon.co.uk/Australian-Bodycare-D...,,,,...,The media could not be loaded.\n I ha...,"aveda dandruff treatment, dandruff, aveda",Hair,Aveda,aveda dandruff treatment,835,4.2,2022-07-30 20:20:23,5,<h3> Cooling and soothing. </h3><...


In [None]:
# Write the rows to Drive as a semicolon-separated CSV before the summarization step.
checkpoint_path = '/content/drive/MyDrive/data/content/{filename}.csv'.format(filename=filename)
dataframe.to_csv(checkpoint_path, sep=';')


# 5. Summarize reviews

In [None]:
## If the session was interrupted, uncomment the lines below to reload the CSV saved above
#import pandas as pd
#filename = "amazon_pdcts_export_07"
#dataframe = pd.read_csv('/content/drive/MyDrive/data/content/{filename}.csv'.format(filename=filename), sep = ';')

In [None]:
# summarization
# Install the transformers package (IPython shell escape), then build a
# summarization pipeline that uses the facebook/bart-large-cnn model.
!pip install transformers
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 6.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 32.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYA

Downloading:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
def _summary_for(row):
  """Summarize one row's review compilation and return the generated text."""
  generated = summarizer(
    row['reviews_compilation'],
    max_length=350,
    min_length=30,
    do_sample=False,
    truncation=True,
  )
  return generated[0]['summary_text']


# One summary per row, stored in the new reviews_summary column.
dataframe['reviews_summary'] = dataframe.apply(_summary_for, axis=1)

Your max_length is set to 350, but you input_length is only 64. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)
Your max_length is set to 350, but you input_length is only 22. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
Your max_length is set to 350, but you input_length is only 125. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)
Your max_length is set to 350, but you input_length is only 11. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
Your max_length is set to 350, but you input_length is only 25. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 350, but you input_length is only 11. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
Your max_length is set to 350, but you input_length is only 11. You might consider 

# 6. Save Data

In [None]:
# Save the final rows, summaries included, as a semicolon-separated CSV on Drive.
export_path = '/content/drive/MyDrive/data/content/{filename}.csv'.format(filename=filename)
dataframe.to_csv(export_path, sep=';')
