In [None]:
import csv
import datetime as dt
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape historical reviews
def scrape_historical_reviews(url, timeout=10):
    """Download a product page and collect the rating/review pairs on it.

    Args:
        url: Address of the product page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned. Defaults to 10.

    Returns:
        A list of ``[product_name, url, "Historical", rating, review_text]``
        rows. The list is empty when the request fails or when the page has
        no ``product-page-title`` element. Errors are printed, never raised.
    """
    reviews = []
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')

        product_name_element = soup.find('div', class_='product-page-title')
        if product_name_element is not None:
            product_name = product_name_element.text.strip()

            for element in soup.find_all('div', class_='product-ratings'):
                # find() returns None when the child is absent; fall back to
                # an empty string so one incomplete element does not abort
                # the collection of every remaining review.
                rating_element = element.find('div', class_='product-rating')
                content_element = element.find('div', class_='shopee-product-rating__content')
                rating = rating_element.text.strip() if rating_element is not None else ""
                review_text = content_element.text.strip() if content_element is not None else ""
                reviews.append([product_name, url, "Historical", rating, review_text])

    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except Exception as e:
        # Top-level boundary: report and return whatever was collected so far.
        print(f"An error occurred: {e}")

    return reviews

# Function to scrape day-to-day new reviews
def scrape_new_reviews(url, timeout=10):
    """Download the ``/rating`` page for a product and collect its comments.

    Args:
        url: Address of the product page; ``'/rating'`` is appended to it
            before the request is made.
        timeout: Seconds to wait for the server before the request is
            abandoned. Defaults to 10.

    Returns:
        A list of ``[product_name, url, "New", "", review_text]`` rows (the
        rating column is always an empty string). The list is empty when the
        request fails or when the page has no ``product-page-title`` element.
        Errors are printed, never raised.
    """
    reviews = []
    try:
        # NOTE(review): '/rating' is concatenated onto the whole URL string,
        # so for a URL that carries a query string it ends up inside the
        # query rather than the path - confirm this is the intended address.
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url + '/rating', timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')

        product_name_element = soup.find('div', class_='product-page-title')
        if product_name_element is not None:
            product_name = product_name_element.text.strip()

            for element in soup.find_all('div', class_='shopee-product-comment'):
                # find() returns None when the child is absent; fall back to
                # an empty string so one incomplete element does not abort
                # the collection of every remaining comment.
                content_element = element.find('div', class_='shopee-product-comment__content')
                review_text = content_element.text.strip() if content_element is not None else ""
                reviews.append([product_name, url, "New", "", review_text])

    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except Exception as e:
        # Top-level boundary: report and return whatever was collected so far.
        print(f"An error occurred: {e}")

    return reviews

# Product pages to scrape; each one is fetched for historical and new reviews.
product_urls = [
    "https://shopee.tw/%E3%80%90%E9%80%A2%E7%94%B2FUZZY%E3%80%91Nike-Dunk-Low-%E9%BB%91%E7%99%BD-%E7%86%8A%E8%B2%93-DD1391-100-DD1503-101-CW1590-i.6783271.6874570040?sp_atk=a11997b1-2f60-484c-b2d1-bd2914d2bc9f&xptdk=a11997b1-2f60-484c-b2d1-bd2914d2bc9f/",
    # "https://shopee.tw/product/9241878/12687797847",
]

csv_filename = "shopee_reviews.csv"

with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["Radarly pid", "Radarly corpusId", "Radarly corpusName", "name", "url", "Id"])

    for url in product_urls:
        historical_reviews = scrape_historical_reviews(url)
        new_reviews = scrape_new_reviews(url)

        # Historical and new reviews share one row layout: only the product
        # name and URL of each review are written, historical rows first.
        csv_writer.writerows(
            [6127, 64143, "Feminine Care", review[0], review[1], ""]
            for review in historical_reviews + new_reviews
        )

print(f"Scraping completed. Data saved in '{csv_filename}'.")


In [None]:
# Read the exported CSV back into a DataFrame; the bare expression on the
# last line shows it as the notebook cell's output.
df = pd.read_csv("shopee_reviews.csv")
df

In [None]:
# ... (previous code)

def _build_review_record(review, timestamp=None):
    """Map one scraped review row onto the JSON record layout.

    ``review`` is read positionally at indexes 0 through 9. ``timestamp``,
    when given, replaces the raw ``review[6]`` value in the record.
    """
    # NOTE(review): the scrape_historical_reviews shown earlier in this file
    # returns 5-element rows, so indexes 5-9 raise IndexError against it.
    # This presumably targets a 10-field version of the scraper - confirm.
    return {
        "Radarly pid": 6127,
        "Radarly corpusId": 64143,
        "Radarly corpusName": "Feminine Care",
        "Product_Name": review[0],
        "Product_url": review[1],
        "Status": review[2],
        "Name": review[3],
        "Customer_Profile_Url": review[4],
        "Rating": review[5],
        "TimeStamps": review[6] if timestamp is None else timestamp,
        "Review": review[7],
        "Media_Url": review[8],
        "Likes": review[9],
        "Id": "",
    }


data = []
timestamp_last_scrape = "2023-10-01 16:29"
new_reviews = []  # Reviews dated after the last scrape

if timestamp_last_scrape is None:
    # First run: everything scraped is both the "past" and the "latest" set.
    for url in product_urls:
        historical_reviews = scrape_historical_reviews(url)
        data.extend(_build_review_record(review) for review in historical_reviews)

    with open("past_reviews.json", 'w', encoding='utf-8') as past_json_file:
        json.dump(data, past_json_file, ensure_ascii=False, indent=4)

    with open("latest_reviews.json", 'w', encoding='utf-8') as latest_json_file:
        json.dump(data, latest_json_file, ensure_ascii=False, indent=4)

    current_datetime = dt.datetime.now()
    timestamp_last_scrape = current_datetime.strftime("%Y-%m-%d %H:%M")

    print("Scraping completed. Data saved in 'past_reviews.json' and 'latest_reviews.json'.")
else:
    # The cutoff is stored as text; parse it once so it can be compared with
    # the parsed review timestamps (datetime > str raises TypeError).
    last_scrape_dt = dt.datetime.strptime(timestamp_last_scrape, '%Y-%m-%d %H:%M')

    with open("past_reviews.json", 'r', encoding='utf-8') as past_json_file:
        past_reviews_data = json.load(past_json_file)
    # NOTE(review): past_reviews_data is loaded but never consulted below;
    # "new" is decided purely by timestamp, not by comparing with this file.

    for url in product_urls:
        latest_reviews = scrape_historical_reviews(url)
        for review in latest_reviews:
            timestamp_match = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}', review[6])
            if not timestamp_match:
                continue  # No recognisable timestamp: cannot classify the review
            timestamp_text = timestamp_match.group(0)

            try:
                review_time = dt.datetime.strptime(timestamp_text, '%Y-%m-%d %H:%M')
            except ValueError:
                # The regex matches digit shapes only, e.g. month 13 gets here.
                print(f"Error parsing timestamp: {timestamp_text}")
                continue

            if review_time > last_scrape_dt:
                new_reviews.append(
                    _build_review_record(review, review_time.strftime("%Y-%m-%d %H:%M"))
                )

    with open("latest_reviews.json", 'w', encoding='utf-8') as latest_json_file:
        json.dump(new_reviews, latest_json_file, ensure_ascii=False, indent=4)

    print("Scraping completed. Latest data (not in past_reviews.json) saved in 'latest_reviews.json'.")


In [None]:
import os  # Import the os module to manage directories

# ... (previous code)

# Define a function to create a folder for a specific URL
def create_folder(url):
    """Create (if needed) a directory named after ``url`` and return its path.

    The directory name is ``url`` with every character other than word
    characters, whitespace and hyphens removed, and with spaces turned into
    underscores. It is created inside the current working directory.
    """
    sanitized = re.sub(r'[^\w\s-]', '', url).replace(' ', '_')
    target_dir = os.path.join(os.getcwd(), sanitized)
    # exist_ok=True makes repeated calls for the same URL harmless.
    os.makedirs(target_dir, exist_ok=True)
    return target_dir

def _build_review_record(review, timestamp=None):
    """Map one scraped review row onto the JSON record layout.

    ``review`` is read positionally at indexes 0 through 9. ``timestamp``,
    when given, replaces the raw ``review[6]`` value in the record.
    """
    # NOTE(review): the scrape_historical_reviews shown earlier in this file
    # returns 5-element rows, so indexes 5-9 raise IndexError against it.
    # This presumably targets a 10-field version of the scraper - confirm.
    return {
        "Radarly pid": 6127,
        "Radarly corpusId": 64143,
        "Radarly corpusName": "Feminine Care",
        "Product_Name": review[0],
        "Product_url": review[1],
        "Status": review[2],
        "Name": review[3],
        "Customer_Profile_Url": review[4],
        "Rating": review[5],
        "TimeStamps": review[6] if timestamp is None else timestamp,
        "Review": review[7],
        "Media_Url": review[8],
        "Likes": review[9],
        "Id": "",
    }


data = []  # Records from every URL processed in this run
timestamp_last_scrape = "2023-10-1 16:29"

# Decide the mode once, before the loop. Updating the timestamp inside the
# loop would push every URL after the first into the incremental branch.
is_first_run = timestamp_last_scrape is None
if not is_first_run:
    timestamp_last_scrape = dt.datetime.strptime(timestamp_last_scrape, '%Y-%m-%d %H:%M')

for url in product_urls:
    folder_path = create_folder(url)  # Create a folder for each URL
    # Collected per URL so a folder only receives its own product's reviews.
    url_records = []

    if is_first_run:
        historical_reviews = scrape_historical_reviews(url)
        url_records.extend(_build_review_record(review) for review in historical_reviews)

        # Save the data in the folder
        with open(os.path.join(folder_path, "past_reviews.json"), 'w', encoding='utf-8') as past_json_file:
            json.dump(url_records, past_json_file, ensure_ascii=False, indent=4)

        with open(os.path.join(folder_path, "latest_reviews.json"), 'w', encoding='utf-8') as latest_json_file:
            json.dump(url_records, latest_json_file, ensure_ascii=False, indent=4)

        print(f"Scraping completed for URL '{url}'. Data saved in '{folder_path}' folder.")
    else:
        with open(os.path.join(folder_path, "past_reviews.json"), 'r', encoding='utf-8') as past_json_file:
            past_reviews_data = json.load(past_json_file)
        # NOTE(review): past_reviews_data is loaded but never consulted below;
        # "new" is decided purely by timestamp, not by comparing with this file.

        # Scrape this URL here; the loop below previously read a
        # latest_reviews variable that this cell never assigned.
        latest_reviews = scrape_historical_reviews(url)

        for review in latest_reviews:
            timestamp_match = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}', review[6])
            if not timestamp_match:
                continue  # No recognisable timestamp: cannot classify the review
            timestamp_text = timestamp_match.group(0)

            try:
                review_time = dt.datetime.strptime(timestamp_text, '%Y-%m-%d %H:%M')
            except ValueError:
                # The regex matches digit shapes only, e.g. month 13 gets here.
                print(f"Error parsing timestamp: {timestamp_text}")
                continue

            if review_time > timestamp_last_scrape:
                url_records.append(
                    _build_review_record(review, review_time.strftime("%Y-%m-%d %H:%M"))
                )

        # Save the latest data in the folder
        with open(os.path.join(folder_path, "latest_reviews.json"), 'w', encoding='utf-8') as latest_json_file:
            json.dump(url_records, latest_json_file, ensure_ascii=False, indent=4)

        print(f"Scraping completed for URL '{url}'. Latest data (not in past_reviews.json) saved in '{folder_path}' folder.")

    data.extend(url_records)

if is_first_run:
    # Record the scrape time only after every URL has been handled.
    current_datetime = dt.datetime.now()
    timestamp_last_scrape = current_datetime.strftime("%Y-%m-%d %H:%M")
