# Data collection

This notebook scrapes Amazon search results for a list of keywords, downloads each product page, and extracts the product descriptions.

## Setup

All functions and global parameters are defined here.

In [None]:
import pandas as pd
import random
import re
import requests
import time

from bs4 import BeautifulSoup

Set up the HTTP request headers (browser identification)

In [None]:
# HTTP headers sent with every request in this notebook; they identify the
# client as a desktop Safari browser.
headers = {
    "accept": "*/*",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
    "Accept-Language": "en-US,en;q=0.9,it;q=0.8,es;q=0.7",
    "referer": "https://www.google.com/"  # initial value; the download loop overwrites it per keyword
}

Take URLs from the search results and turn them into absolute URLs

In [None]:
def fix_url(url):
    """Return an absolute URL for a link taken from the search results.

    Links that already begin with ``https://`` are returned untouched.
    Any other link is treated as a site-relative path: everything from
    ``/ref=`` onwards is cut off and ``https://amazon.com`` is prepended.
    """
    already_absolute = url[:8] == "https://"
    if already_absolute:
        return url
    # Relative link: strip the "/ref=..." suffix, then add scheme and host.
    return "https://amazon.com" + re.sub(r"/ref=.*", "", url)


A function that downloads search results based on a keyword

In [None]:
def download_search(keyword, timeout=30):
    """Download one Amazon search-results page and list the products on it.

    Parameters
    ----------
    keyword : str
        Search query. It is appended verbatim to ``https://www.amazon.com/s?k=``,
        so it must already be in query-string form (e.g. ``"Glass+cleaner"``).
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up. Without
        a timeout a single stalled connection would block the scrape forever.

    Returns
    -------
    pandas.DataFrame
        One row per product link with the columns ``keyword``, ``name``
        (link text) and ``url`` (passed through ``fix_url``). Rows pointing
        at the ``picassoRedirect.html`` placeholder are removed and the
        index is renumbered from 0.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    time.sleep(0.5 * random.random())  # wait for random amount of time to avoid suspicion
    res = requests.get(
        "https://www.amazon.com/s?k=" + keyword,
        headers=headers,
        timeout=timeout,
    )
    amazon_search = BeautifulSoup(res.content, "html.parser")
    # Product title links: <a> tags with this class string and an href attribute.
    results = amazon_search.find_all(
        "a",
        {"class": "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal", "href": True}
    )
    products = pd.DataFrame(
        {
            "keyword": [keyword for product in results],
            "name": [product.text for product in results],
            "url": [product["href"] for product in results]
        }
    )
    products["url"] = products["url"].apply(fix_url)
    # These redirect links are dropped rather than followed.
    products = products[products["url"] != "https://amazon.com/gp/slredirect/picassoRedirect.html"]
    return products.reset_index(drop=True)

download_search("Glass+cleaner")  # Try on one
    

A function that scrapes product pages

In [None]:
def get_product_info(url, timeout=30):
    """Download a product page and return it as parsed HTML.

    Parameters
    ----------
    url : str
        Absolute URL of the product page.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up, so one
        stalled download cannot block the whole scrape.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    print(url)  # progress log: show which page is being fetched
    # The request is sent with the module-level `headers`; no session object
    # is needed for a single stateless GET.
    res = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(res.content, "html.parser")

A function that extracts the product description from a product page

In [None]:
def get_description(product_page):
    """Return the text of the page's ``feature-bullets`` block, or ``None``.

    Parameters
    ----------
    product_page
        Parsed product page exposing ``find(name, attrs)``.

    Returns
    -------
    str or None
        Text of the ``<div id="feature-bullets">`` element, or ``None`` when
        the lookup yields nothing to read text from.
    """
    try:
        return product_page.find("div", {"id": "feature-bullets"}).getText()
    except AttributeError:
        # `find` returned None (no such div). Only this case is treated as
        # "no description", so interrupts and real bugs still surface.
        return None

## Scrape search results

### Try for one keyword

Set keyword. Next, download the webpage based on the keyword.

In [None]:
# Query used for the single-keyword walkthrough, written the way it appears
# in the search URL.
keyword = "Dishwasher+detergent"

# Address of the search-results page for that query.
url = f"https://www.amazon.com/s?k={keyword}"

url

Download

In [None]:
# Fetch the search-results page for `keyword`, then parse the returned HTML.
res = requests.get(f"https://www.amazon.com/s?k={keyword}", headers=headers)
amazon_search = BeautifulSoup(markup=res.content, features="html.parser")

In [None]:
print(amazon_search.prettify())  # print html code

Process results: save product name and url

In [None]:
# Collect the product title links: <a> tags matching this class string that
# also carry an href attribute.
results = amazon_search.find_all(
    "a", 
    {"class": "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal", "href" : True}
)


In [None]:
# One row per search hit: the query, the link text and the raw href.
products = pd.DataFrame(
    dict(
        keyword=[keyword] * len(results),
        name=[hit.text for hit in results],
        url=[hit["href"] for hit in results],
    )
)

products

URLs need some more changes to work

In [None]:
# Example of a scraped link that is already absolute: it starts with
# "https://" and embeds a second amazon.com address further along.
url = "https://aax-us-iad.amazon.com/x/c/RLnNzrLhcqJGU3XXDsy4kDIAAAGHCUwvAwEAAAH2AQBvbm9fdHhuX2JpZDEgICBOL0EgICAgICAgICAgICBdhpY6/https://www.amazon.com/dp/B082V9BTWQ?pd_rd_i=B082V9BTWQ&amp;pf_rd_p=b000e0a0-9e93-480f-bf78-a83c8136dfcb&amp;pf_rd_r=09ARC8S8T28CW0SDNWVD&amp;pd_rd_wg=wLZcc&amp;pd_rd_w=5fHz1&amp;pd_rd_r=a8e67f00-f701-4d4e-bbaf-3199ecf28b89"

# First eight characters: the prefix that fix_url compares against "https://".
url[0:8]

In [None]:
# Pass every scraped link through fix_url so all rows hold absolute URLs.
products["url"] = products["url"].apply(fix_url)

In [None]:
# Show the URL in row 3 with everything from "/ref=" onwards removed.
re.sub(r"/ref=.*", "", products.url[3])

There are some urls which do not work. Remove those for now, investigate later.

In [None]:
# Keep every row except the redirect placeholder links, then renumber the
# remaining rows from 0.
products = (
    products[products["url"].ne("https://amazon.com/gp/slredirect/picassoRedirect.html")]
    .reset_index(drop=True)
)

## Scrape product descriptions

### Try it for one product

Download

In [None]:
products["url"][0]

In [None]:
# proxy = {"https": random.choice(proxies_list)}
# Download the first product's page and parse its HTML. The request is a
# single stateless GET, so no session object is created.
res = requests.get(products["url"][0], headers=headers)
product_page = BeautifulSoup(res.content, "html.parser")

In [None]:
products["url"][0]

In [None]:
product_page

Extract description

In [None]:
# Text of the "feature-bullets" <div>. get_description performs the same
# lookup but returns None when the div is missing.
product_page.find("div", {"id": "feature-bullets"}).getText()

### Download webpage for all products

Try for one product

In [None]:
get_product_info(products["url"][0])  # try the function for one url

Apply to all products

In [None]:
# Fetch and parse every product page: one HTTP request per row.
products["webpage"] = products["url"].apply(get_product_info)

Print one

### Extract descriptions

Apply to all products

In [None]:
# Extract the description from each parsed page (None where it is missing).
products["product_description"] = products["webpage"].apply(get_description)

Check one description

In [None]:
products["url"][0]

## Combine to one workflow

Setup a list of search keywords to be scraped

In [None]:
# Search queries to scrape. Each entry is appended verbatim to the search URL,
# hence the "+" between words.
keywords = [
    "Glass+cleaner",
    "Dishwasher+detergent", 
    "Machine+descaler",
    "Bleach+pods",
    "Window+spray",
    "Disinfectant+wipes",
    "Moisture+absorber",
    "Dishwasher+pods",
    "Laundry+sanitizer",
    "Washing+powder",
    "Fabric+softener",
    "Stain+remover",
    "Spot+remover",
    "Window+cleaner",
    "Bathroom+cleaner",
    "Bowl+cleaner",
    "toilet+bowl+cleaner",
    "Gel+cleaner",
    "Cleaning+powder",
    "Kitchen+cleaner",
    "Detergent+tablets",
    "Kettle+descaler",
    "bleach+tablets",
    "Disinfectant+spray",
    "Stain+eliminator",
    "Odor+lifter",
    "Soap+pods",
    "Laundry+sheets",
    "Laundry+powder"
]

Some more items generated by ChatGPT:

In [None]:
# Additional search queries, in the same "+"-separated form as `keywords`.
keywords_chatgpt = [
    "Toothpaste", "Toothbrush", "Mouthwash",
    "Deodorant", "Shampoo",
    "Conditioner", "Body+Wash", "Bar+Soap",
    "Floss", "Razor", "Shaving+Cream", "Face+Wash",
    "Moisturizer", "Lotion", "Sunscreen", "Hand+Sanitizer",
    "Bandages", "Hydrogen+Peroxide", "Rubbing+Alcohol",
    "Cotton+Swabs", "Contact+Lens+Solutions", "Reading+Glasses",
    "Prescription+Medications", "Vitamins", "Allergy+Medications",
    "Pain+Relievers", "Cough+Syrup", "Nasal+Spray", "Eye+Drops",
    "Thermometer", "Hot+Cold+Packs", "Insect+Repellent",
    "Antibiotic+Ointment", "Acne+Treatment", "Facial+Tissues",
    "Toilet+Paper", "Paper+Towels", "Cleaning+Spray", "Disinfecting+Wipes",
    "Laundry+Detergent",
    "Fabric+Softener", "Dish+Soap", "Sponges",
    "Trash+Bags", "Batteries",
    "Light+Bulbs", "Extension+Cords",
    "First+Aid+Kit", "Flashlight"]

# Add only the queries that `keywords` does not contain yet. A plain `+=`
# appends the whole list again every time this cell is re-run, which would
# make the download loop scrape each of these queries more than once.
keywords.extend([query for query in keywords_chatgpt if query not in keywords])

Download in a _for_ loop and save each keyword's results to a (compressed) csv file

In [None]:
# Scrape every keyword: the search page first, then each product page, then
# write one compressed CSV per keyword.
for keyword in keywords:
    wait = 50 * random.random()  # random pause of up to 50 seconds between keywords
    print(f"...waiting {wait} seconds...")
    time.sleep(wait)  # wait for random amount of time to avoid suspicion
    print(keyword)
    # `headers` is the module-level dict read by both download functions, so
    # the referer has to be set immediately before each kind of request.
    headers["referer"] = "https://www.google.com/search?q=" + keyword  # change referer for search
    products = download_search(keyword)  # download search results
    headers["referer"] = "https://www.amazon.com/s?k=" + keyword  # change referer for page download
    products["webpage"] = products["url"].apply(get_product_info)  # download and parse each product page
    products["product_description"] = products["webpage"].apply(get_description)
    # NOTE(review): files are written to the working directory, while the
    # "Combine data" section reads from "Raw data/" - confirm that the files
    # are moved there in between.
    products.to_csv(keyword + ".csv.gz")  # save results (including the row index) to csv

In [None]:
# products.to_csv(keyword + ".csv.gz")

## Combine data

Try loading one file

In [None]:
# pd.read_csv(f"Raw data/bleach+tablets.csv.gz", compression="gzip")

Combine csv to one dataframe

In [None]:
import os

# Read every per-keyword file from "Raw data/" and stack them into one frame.
# DataFrame.append was removed in pandas 2.0, so the frames are collected in a
# list and concatenated once, which also avoids re-copying a growing frame on
# every iteration.
keyword_frames = []
for keyword_data in sorted(os.listdir("Raw data/")):  # sorted: same row order on every run
    if not keyword_data.endswith(".csv.gz"):
        continue  # skip stray files that are not scrape results
    print(keyword_data)
    keyword_frames.append(pd.read_csv(f"Raw data/{keyword_data}", compression="gzip"))

# pd.concat raises on an empty list; keep the empty-frame result in that case.
combined_products = pd.concat(keyword_frames) if keyword_frames else pd.DataFrame()


Save as a new file

In [None]:
# Skip the first column before saving - presumably the row index that the
# per-keyword to_csv call stored and read_csv loaded back as a data column.
combined_products.iloc[:, 1:].to_csv("products2.csv.gz", index=False)

### Combine two datasets

In [None]:
import numpy as np

Load data

In [None]:
# NOTE(review): products1.csv.gz is not written by any cell shown in this
# notebook - presumably the output of an earlier scrape; confirm its origin.
products1 = pd.read_csv("products1.csv.gz")

In [None]:
# Combined scrape results saved in the "Combine data" section.
products2 = pd.read_csv("products2.csv.gz")

In [None]:
# Start from products1 and add only those products2 rows whose keyword does
# not occur in products1. pd.concat replaces DataFrame.append (removed in
# pandas 2.0) and returns a new frame, so no extra copy is needed; `~` is the
# boolean-mask negation operator.
products = pd.concat(
    [
        products1,
        products2[~products2["keyword"].isin(products1["keyword"].unique())],
    ]
)

Drop the `webpage` column (the full page HTML), which takes up a lot of memory

In [None]:
# Remove the parsed-page column. `errors="ignore"` keeps the cell re-runnable
# once the column is gone; drop already returns a new frame, so no copy is needed.
products = products.drop(columns="webpage", errors="ignore")

In [None]:
# Save the merged dataset without the row index.
products.to_csv("products.csv.gz", index=False)