# Webscrape

### Simple (initial) Version

In [2]:
# Simple Version
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.erowid.org/experiences/exp.cgi?S1=2')

data_list=[]

view_links = driver.find_elements(By.XPATH, "//a[contains(@href, 'exp.php?ID=')]")
for idx, view in enumerate(view_links):
    href = view.get_attribute('href')
    view_links[idx] = href

# For each link get the trip text and add the text and link to a list of dicts
for href in view_links:
    driver.get(href)
    trip_text = driver.find_elements(By.CLASS_NAME, 'report-text-surround')
    for trip in trip_text:
        text = trip.text.encode('utf-8').decode('utf-8')
        print(text)
        data_list.append({'Text': text, 'Link': href})

# Update the dataframe with the collected data
df = pd.DataFrame(data_list)
df["Drug"] = 'LSD'
df.to_csv("LSD_Trip_Reports.csv")

  DOSE:
T+ 0:00 200 ug sublingual LSD (blotter / tab)
  T+ 1:30 1 bowl smoked Cannabis (flowers)
  T+ 2:00 1 glass oral Coffee  
  T+ 0:00   smoked Tobacco - Cigarettes  
  T+ 3:00 2 bowls smoked Cannabis (flowers)
BODY WEIGHT: 190 lb
I had just received my package on Tuesday. I planned on tripping throughout an entire day, getting up in the early hours (5am) to dose one of the two hits I had purchased. I had trouble sleeping and I ended up only getting about 5 hours of sleep because of how excited I was. When I awoke at 5:30 I was more energetic and positive than ever, despite my lack of sleep. I grabbed some coffee, took some vitamin supplements and awaited for 6:00am because I figured it would be better to dose on the hour.

6:00am - Tasteless. Not even the slightest amount of any sort of bitterness or any other tastes. This was my first time ever trying the drug, but after extensive reading I knew this was a good thing. This put a smile on my face, I proceeded to take pictures of t

### Final Version
Works by showing all trip reports and processes the html with Beautiful Soup.

In [64]:
import pandas as pd
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from tqdm import tqdm
from selenium.common.exceptions import TimeoutException

# Initialize WebDriver
driver = webdriver.Safari()
driver.set_page_load_timeout(15)  # Set timeout limit

BASE_URL = "https://www.erowid.org/experiences/exp.cgi?ShowViews=0&Cellar=0&Start=0&Max=39877"
LINKS_FILE = "erowid_links.txt"
REPORTS_FILE = "Erowid_Trip_Reports.csv"
BATCH_SIZE = 100  # Save data in batches

def get_all_report_links(start_url):
    """Fetch all experience report links and save them to LINKS_FILE."""
    if os.path.exists(LINKS_FILE):
        print("Loading existing report links...")
        with open(LINKS_FILE, "r") as f:
            return list(set(f.read().splitlines()))
    
    driver.get(start_url)
    report_links = []
    
    start_time = time.time()
    print(f"Scraping index page: {driver.current_url}")
    page_links = driver.find_elements(By.XPATH, "//a[contains(@href, 'exp.php?ID=')]")
    report_links.extend([link.get_attribute('href') for link in page_links 
                         if link.get_attribute('href').startswith("https://www.erowid.org")])
    
    seconds = time.time() - start_time
    print(f"<<<<<<<<<< Finished gathering links >>>>>>>>>\nTook {seconds:.2f} seconds.")
    
    # Remove duplicates and save to file
    report_links = list(set(report_links))
    with open(LINKS_FILE, "w") as f:
        f.write("\n".join(report_links))
    
    return report_links

def scrape_erowid_reports(report_links):
    """Scrape trip reports and merge new data into REPORTS_FILE without duplicates."""
    all_data = []
    processed_count = 0

    # Load already scraped links from CSV if available
    if os.path.exists(REPORTS_FILE):
        existing_df = pd.read_csv(REPORTS_FILE)
        scraped_links = set(existing_df["Link"].dropna().tolist())
    else:
        scraped_links = set()

    # Remove duplicates from input list and filter out already scraped links
    report_links = list(set(report_links))
    report_links = [link for link in report_links if link not in scraped_links]
    print(f"Remaining reports to scrape: {len(report_links)}")

    start_time = time.time()

    for i, href in enumerate(tqdm(report_links, desc="Scraping Reports", unit="report")):
        # Safety check to ensure link is not processed twice
        if href in scraped_links:
            continue

        if not href.startswith("https://www.erowid.org"):
            print(f"Skipping invalid link: {href}")
            continue

        try:
            driver.get(href)
        except TimeoutException:
            print(f"Timeout error: Skipping {href}")
            continue
        
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")
        
        if "403 Forbidden: Your IP Address Has Been Blocked" in page_source:
            ip_address = soup.find("h2").text.split(": ")[-1].strip()
            print(f"🚨 IP BANNED: {ip_address} 🚨")
            print(f"Process stopped. Successfully scraped {processed_count} reports.")
            driver.quit()
            break
        
        if "reset.me" in driver.current_url or "wordpress.com" in driver.current_url:
            print(f"🚨 Redirected to external site: {driver.current_url}. Skipping... 🚨")
            continue

        try:
            title_element = soup.find("div", class_="title")
            title = title_element.text.strip() if title_element else "Unknown Title"
            
            substance_element = soup.find("div", class_="substance")
            substance = substance_element.text.strip() if substance_element else "Unknown Substance"
            
            author_element = soup.find("div", class_="author")
            author = author_element.text.replace("by", "").strip() if author_element else "Unknown"
            
            bodyweight_element = soup.find("td", class_="bodyweight-amount")
            bodyweight = bodyweight_element.text.strip() if bodyweight_element else "Unknown"
            
            dose_chart_entries = []
            for row in soup.select("table.dosechart tbody tr"):
                cols = row.find_all("td")
                if len(cols) == 5:
                    dose_chart_entries.append(
                        " | ".join([col.text.strip() for col in cols])
                    )
            dose_chart = "\n".join(dose_chart_entries) if dose_chart_entries else "No Dose Chart Available"
            
            report_text_element = soup.find("div", class_="report-text-surround")
            report_text = "".join(report_text_element.stripped_strings) if report_text_element else "No Text Available"
            
            all_data.append({
                "Title": title,
                "Substance": substance,
                "Author": author,
                "Bodyweight": bodyweight,
                "Dose Chart": dose_chart,
                "Report Text": report_text,
                "Link": href
            })
            processed_count += 1
            scraped_links.add(href)  # Mark as scraped
        except Exception as e:
            print(f"Error scraping {href}: {e}")
            continue

        # Save in batches
        if (i + 1) % BATCH_SIZE == 0 or (i + 1) == len(report_links):
            df_batch = pd.DataFrame(all_data)
            if os.path.exists(REPORTS_FILE):
                existing_df = pd.read_csv(REPORTS_FILE)
                combined_df = pd.concat([existing_df, df_batch], ignore_index=True)
                combined_df.drop_duplicates(subset="Link", inplace=True)
                combined_df.to_csv(REPORTS_FILE, index=False)
            else:
                df_batch.drop_duplicates(subset="Link", inplace=True)
                df_batch.to_csv(REPORTS_FILE, index=False)
            all_data.clear()
            print(f"Saved batch {(i + 1)} to {REPORTS_FILE}")

    seconds_total = time.time() - start_time
    print(f"Finished scraping {processed_count} reports in {seconds_total:.2f} seconds.")
    driver.quit()

def remove_duplicates_from_csv():
    """Clean the CSV file by removing any duplicate entries based on the Link column."""
    if os.path.exists(REPORTS_FILE):
        df = pd.read_csv(REPORTS_FILE)
        before = len(df)
        df.drop_duplicates(subset="Link", inplace=True)
        df.to_csv(REPORTS_FILE, index=False)
        after = len(df)
        print(f"Removed {before - after} duplicate entries from {REPORTS_FILE}.")

# Main execution steps
report_links = get_all_report_links(BASE_URL)
scrape_erowid_reports(report_links)
remove_duplicates_from_csv()

Loading existing report links...
Remaining reports to scrape: 35061


Scraping Reports:   0%|          | 1/35061 [00:04<42:37:04,  4.38s/report]

🚨 Redirected to external site: https://shivahaoma.wordpress.com/2012/02/27/mahashiva-ratri-night-of-the-great-shiva-ii/. Skipping... 🚨


Scraping Reports:   0%|          | 2/35061 [00:04<19:59:57,  2.05s/report]

🚨 Redirected to external site: https://reset.me/personal-story/how-i-completely-healed-my-panic-attacks-with-psilocybin-mushrooms/. Skipping... 🚨


Scraping Reports:   0%|          | 100/35061 [00:39<6:29:24,  1.50report/s]

Saved batch 100 to Erowid_Trip_Reports.csv


Scraping Reports:   1%|          | 201/35061 [01:13<3:56:52,  2.45report/s] 

Saved batch 200 to Erowid_Trip_Reports.csv


Scraping Reports:   1%|          | 301/35061 [01:37<3:14:41,  2.98report/s]

Saved batch 300 to Erowid_Trip_Reports.csv


Scraping Reports:   1%|          | 358/35061 [02:03<44:27:33,  4.61s/report]

Timeout error: Skipping https://www.erowid.org/experiences/exp.php?ID=27940


Scraping Reports:   1%|          | 401/35061 [02:15<3:07:49,  3.08report/s] 

Saved batch 400 to Erowid_Trip_Reports.csv


Scraping Reports:   1%|          | 435/35061 [02:37<45:04:56,  4.69s/report]

Timeout error: Skipping https://www.erowid.org/experiences/exp.php?ID=58371


Scraping Reports:   1%|▏         | 501/35061 [02:57<10:19:37,  1.08s/report]

Saved batch 500 to Erowid_Trip_Reports.csv


Scraping Reports:   2%|▏         | 601/35061 [03:19<3:39:14,  2.62report/s] 

Saved batch 600 to Erowid_Trip_Reports.csv


Scraping Reports:   2%|▏         | 701/35061 [03:45<3:43:19,  2.56report/s]

Saved batch 700 to Erowid_Trip_Reports.csv


Scraping Reports:   2%|▏         | 801/35061 [04:14<3:56:45,  2.41report/s] 

Saved batch 800 to Erowid_Trip_Reports.csv


Scraping Reports:   2%|▏         | 837/35061 [04:37<43:49:59,  4.61s/report]

Timeout error: Skipping https://www.erowid.org/experiences/exp.php?ID=9362


Scraping Reports:   3%|▎         | 901/35061 [04:59<5:19:22,  1.78report/s] 

Saved batch 900 to Erowid_Trip_Reports.csv


Scraping Reports:   3%|▎         | 1001/35061 [05:22<3:33:38,  2.66report/s]

Saved batch 1000 to Erowid_Trip_Reports.csv


Scraping Reports:   3%|▎         | 1020/35061 [05:26<3:01:37,  3.12report/s]


KeyboardInterrupt: 

Version with IP switching

In [69]:
import pandas as pd
import time
import os
import subprocess  # For running NordVPN CLI commands
import requests    # To fetch the current IP
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from tqdm import tqdm
from selenium.common.exceptions import TimeoutException

# Initialize WebDriver
driver = webdriver.Safari()
driver.set_page_load_timeout(15)  # Set timeout limit

BASE_URL = "https://www.erowid.org/experiences/exp.cgi?ShowViews=0&Cellar=0&Start=0&Max=39877"
LINKS_FILE = "erowid_links.txt"
REPORTS_FILE = "Erowid_Trip_Reports.csv"
BATCH_SIZE = 100  # Save data in batches

def get_current_ip():
    """
    Fetches and prints the current public IP address.
    """
    try:
        ip = requests.get("https://api.ipify.org").text
        print("Current IP:", ip)
        return ip
    except Exception as e:
        print("Error fetching current IP:", e)
        return None

def change_ip_via_vpn(server='us'):
    """
    Switch IP by disconnecting and reconnecting using NordVPN CLI.
    Make sure NordVPN CLI is installed and you're logged in.
    """
    print("Current IP before switching:")
    get_current_ip()
    print("Switching VPN connection...")
    subprocess.call(["nordvpn", "disconnect"])
    time.sleep(2)  # Short pause after disconnect
    subprocess.call(["nordvpn", "connect", server])
    time.sleep(10)  # Wait for the VPN connection to take effect
    print("VPN switched. New IP should be in effect:")
    get_current_ip()

def get_all_report_links(start_url):
    """Fetch all experience report links and save them to LINKS_FILE."""
    if os.path.exists(LINKS_FILE):
        print("Loading existing report links...")
        with open(LINKS_FILE, "r") as f:
            return list(set(f.read().splitlines()))
    
    driver.get(start_url)
    report_links = []
    
    start_time = time.time()
    print(f"Scraping index page: {driver.current_url}")
    page_links = driver.find_elements(By.XPATH, "//a[contains(@href, 'exp.php?ID=')]")
    report_links.extend([link.get_attribute('href') for link in page_links 
                         if link.get_attribute('href').startswith("https://www.erowid.org")])
    
    seconds = time.time() - start_time
    print(f"<<<<<<<<<< Finished gathering links >>>>>>>>>\nTook {seconds:.2f} seconds.")
    
    # Remove duplicates and save to file
    report_links = list(set(report_links))
    with open(LINKS_FILE, "w") as f:
        f.write("\n".join(report_links))
    
    return report_links

def scrape_erowid_reports(report_links):
    """Scrape trip reports and merge new data into REPORTS_FILE without duplicates."""
    global driver  # So we can reinitialize driver after VPN switching
    all_data = []
    processed_count = 0

    # Load already scraped links from CSV if available
    if os.path.exists(REPORTS_FILE):
        existing_df = pd.read_csv(REPORTS_FILE)
        scraped_links = set(existing_df["Link"].dropna().tolist())
    else:
        scraped_links = set()

    # Remove duplicates from input list and filter out already scraped links
    report_links = list(set(report_links))
    report_links = [link for link in report_links if link not in scraped_links]
    print(f"Remaining reports to scrape: {len(report_links)}")

    start_time = time.time()

    for i, href in enumerate(tqdm(report_links, desc="Scraping Reports", unit="report")):
        # Safety check to ensure link is not processed twice
        if href in scraped_links:
            continue

        if not href.startswith("https://www.erowid.org"):
            print(f"Skipping invalid link: {href}")
            continue

        # Attempt to load the page, with VPN-switching if IP is blocked
        max_attempts = 3
        attempts = 0
        page_loaded = False
        while attempts < max_attempts and not page_loaded:
            try:
                driver.get(href)
                page_source = driver.page_source
                if "403 Forbidden: Your IP Address Has Been Blocked" in page_source:
                    print(f"🚨 IP BLOCKED on {href}. Attempt {attempts+1} of {max_attempts}.")
                    change_ip_via_vpn('us')
                    driver.quit()
                    driver = webdriver.Safari()
                    driver.set_page_load_timeout(15)
                    attempts += 1
                else:
                    page_loaded = True  # Page loaded successfully
            except TimeoutException:
                print(f"Timeout error: Skipping {href}")
                break

        if not page_loaded:
            print(f"Skipping {href} after {max_attempts} attempts.")
            continue

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")
        
        if "reset.me" in driver.current_url or "wordpress.com" in driver.current_url:
            print(f"🚨 Redirected to external site: {driver.current_url}. Skipping... 🚨")
            continue

        try:
            title_element = soup.find("div", class_="title")
            title = title_element.text.strip() if title_element else "Unknown Title"
            
            substance_element = soup.find("div", class_="substance")
            substance = substance_element.text.strip() if substance_element else "Unknown Substance"
            
            author_element = soup.find("div", class_="author")
            author = author_element.text.replace("by", "").strip() if author_element else "Unknown"
            
            bodyweight_element = soup.find("td", class_="bodyweight-amount")
            bodyweight = bodyweight_element.text.strip() if bodyweight_element else "Unknown"
            
            dose_chart_entries = []
            for row in soup.select("table.dosechart tbody tr"):
                cols = row.find_all("td")
                if len(cols) == 5:
                    dose_chart_entries.append(
                        " | ".join([col.text.strip() for col in cols])
                    )
            dose_chart = "\n".join(dose_chart_entries) if dose_chart_entries else "No Dose Chart Available"
            
            report_text_element = soup.find("div", class_="report-text-surround")
            report_text = "".join(report_text_element.stripped_strings) if report_text_element else "No Text Available"
            
            all_data.append({
                "Title": title,
                "Substance": substance,
                "Author": author,
                "Bodyweight": bodyweight,
                "Dose Chart": dose_chart,
                "Report Text": report_text,
                "Link": href
            })
            processed_count += 1
            scraped_links.add(href)  # Mark as scraped
        except Exception as e:
            print(f"Error scraping {href}: {e}")
            continue

        # Save in batches
        if (i + 1) % BATCH_SIZE == 0 or (i + 1) == len(report_links):
            df_batch = pd.DataFrame(all_data)
            if os.path.exists(REPORTS_FILE):
                existing_df = pd.read_csv(REPORTS_FILE)
                combined_df = pd.concat([existing_df, df_batch], ignore_index=True)
                combined_df.drop_duplicates(subset="Link", inplace=True)
                combined_df.to_csv(REPORTS_FILE, index=False)
            else:
                df_batch.drop_duplicates(subset="Link", inplace=True)
                df_batch.to_csv(REPORTS_FILE, index=False)
            all_data.clear()
            print(f"Saved batch {(i + 1)} to {REPORTS_FILE}")

    seconds_total = time.time() - start_time
    print(f"Finished scraping {processed_count} reports in {seconds_total:.2f} seconds.")
    driver.quit()

def remove_duplicates_from_csv():
    """Clean the CSV file by removing any duplicate entries based on the Link column."""
    if os.path.exists(REPORTS_FILE):
        df = pd.read_csv(REPORTS_FILE)
        before = len(df)
        df.drop_duplicates(subset="Link", inplace=True)
        df.to_csv(REPORTS_FILE, index=False)
        after = len(df)
        print(f"Removed {before - after} duplicate entries from {REPORTS_FILE}.")

# Main execution steps
print("Starting Erowid scraper. Initial IP:")
get_current_ip()
report_links = get_all_report_links(BASE_URL)
scrape_erowid_reports(report_links)
remove_duplicates_from_csv()

Starting Erowid scraper. Initial IP:
Current IP: 185.207.249.100
Loading existing report links...
Remaining reports to scrape: 33776


Scraping Reports:   0%|          | 1/33776 [00:12<119:01:28, 12.69s/report]

🚨 Redirected to external site: https://shivahaoma.wordpress.com/2012/02/27/mahashiva-ratri-night-of-the-great-shiva-ii/. Skipping... 🚨


Scraping Reports:   0%|          | 2/33776 [00:13<52:49:29,  5.63s/report] 

🚨 Redirected to external site: https://reset.me/personal-story/how-i-completely-healed-my-panic-attacks-with-psilocybin-mushrooms/. Skipping... 🚨


Scraping Reports:   0%|          | 3/33776 [00:28<93:00:22,  9.91s/report]

Timeout error: Skipping https://www.erowid.org/experiences/exp.php?ID=27940
Skipping https://www.erowid.org/experiences/exp.php?ID=27940 after 3 attempts.


Scraping Reports:   0%|          | 4/33776 [00:43<111:52:47, 11.93s/report]

Timeout error: Skipping https://www.erowid.org/experiences/exp.php?ID=58371
Skipping https://www.erowid.org/experiences/exp.php?ID=58371 after 3 attempts.


Scraping Reports:   0%|          | 5/33776 [00:58<122:18:40, 13.04s/report]

Timeout error: Skipping https://www.erowid.org/experiences/exp.php?ID=9362
Skipping https://www.erowid.org/experiences/exp.php?ID=9362 after 3 attempts.


Scraping Reports:   0%|          | 100/33776 [01:39<8:16:21,  1.13report/s]

Saved batch 100 to Erowid_Trip_Reports.csv


Scraping Reports:   1%|          | 201/33776 [02:16<5:17:52,  1.76report/s] 

Saved batch 200 to Erowid_Trip_Reports.csv


Scraping Reports:   1%|          | 301/33776 [02:43<3:54:23,  2.38report/s]

Saved batch 300 to Erowid_Trip_Reports.csv


Scraping Reports:   1%|          | 401/33776 [03:01<3:47:01,  2.45report/s]

Saved batch 400 to Erowid_Trip_Reports.csv


Scraping Reports:   1%|▏         | 501/33776 [03:26<3:38:53,  2.53report/s] 

Saved batch 500 to Erowid_Trip_Reports.csv


Scraping Reports:   2%|▏         | 601/33776 [03:46<3:59:43,  2.31report/s]

Saved batch 600 to Erowid_Trip_Reports.csv


Scraping Reports:   2%|▏         | 700/33776 [04:04<5:04:18,  1.81report/s]

Saved batch 700 to Erowid_Trip_Reports.csv


Scraping Reports:   2%|▏         | 800/33776 [04:29<5:12:30,  1.76report/s] 

Saved batch 800 to Erowid_Trip_Reports.csv


Scraping Reports:   3%|▎         | 901/33776 [04:53<4:21:42,  2.09report/s]

Saved batch 900 to Erowid_Trip_Reports.csv


Scraping Reports:   3%|▎         | 1001/33776 [05:18<4:17:49,  2.12report/s]

Saved batch 1000 to Erowid_Trip_Reports.csv


Scraping Reports:   3%|▎         | 1019/33776 [05:38<42:03:08,  4.62s/report]

Timeout error: Skipping https://www.erowid.org/experiences/exp.php?ID=15012
Skipping https://www.erowid.org/experiences/exp.php?ID=15012 after 3 attempts.


Scraping Reports:   3%|▎         | 1101/33776 [05:54<3:49:14,  2.38report/s] 

Saved batch 1100 to Erowid_Trip_Reports.csv


Scraping Reports:   4%|▎         | 1201/33776 [06:18<3:53:06,  2.33report/s] 

Saved batch 1200 to Erowid_Trip_Reports.csv


Scraping Reports:   4%|▍         | 1301/33776 [06:36<3:41:20,  2.45report/s]

Saved batch 1300 to Erowid_Trip_Reports.csv


Scraping Reports:   4%|▍         | 1401/33776 [06:57<3:49:42,  2.35report/s]

Saved batch 1400 to Erowid_Trip_Reports.csv


Scraping Reports:   4%|▍         | 1500/33776 [07:37<41:27:59,  4.63s/report]

Timeout error: Skipping https://www.erowid.org/experiences/exp.php?ID=17851
Skipping https://www.erowid.org/experiences/exp.php?ID=17851 after 3 attempts.


Scraping Reports:   5%|▍         | 1601/33776 [07:58<4:19:37,  2.07report/s] 

Saved batch 1600 to Erowid_Trip_Reports.csv


Scraping Reports:   5%|▌         | 1701/33776 [08:23<3:56:59,  2.26report/s] 

Saved batch 1700 to Erowid_Trip_Reports.csv


Scraping Reports:   5%|▌         | 1801/33776 [08:42<4:12:35,  2.11report/s]

Saved batch 1800 to Erowid_Trip_Reports.csv


Scraping Reports:   6%|▌         | 1901/33776 [09:01<3:58:42,  2.23report/s]

Saved batch 1900 to Erowid_Trip_Reports.csv


Scraping Reports:   6%|▌         | 2001/33776 [09:31<4:39:39,  1.89report/s] 

Saved batch 2000 to Erowid_Trip_Reports.csv


Scraping Reports:   6%|▌         | 2101/33776 [09:51<3:57:18,  2.22report/s]

Saved batch 2100 to Erowid_Trip_Reports.csv


Scraping Reports:   7%|▋         | 2201/33776 [10:18<3:56:04,  2.23report/s] 

Saved batch 2200 to Erowid_Trip_Reports.csv


Scraping Reports:   7%|▋         | 2301/33776 [10:37<3:54:37,  2.24report/s]

Saved batch 2300 to Erowid_Trip_Reports.csv


Scraping Reports:   7%|▋         | 2401/33776 [10:55<4:04:51,  2.14report/s]

Saved batch 2400 to Erowid_Trip_Reports.csv


Scraping Reports:   7%|▋         | 2501/33776 [11:22<3:58:38,  2.18report/s] 

Saved batch 2500 to Erowid_Trip_Reports.csv


Scraping Reports:   8%|▊         | 2601/33776 [11:41<4:06:35,  2.11report/s]

Saved batch 2600 to Erowid_Trip_Reports.csv


Scraping Reports:   8%|▊         | 2701/33776 [12:00<4:20:08,  1.99report/s]

Saved batch 2700 to Erowid_Trip_Reports.csv


Scraping Reports:   8%|▊         | 2801/33776 [12:28<4:05:08,  2.11report/s] 

Saved batch 2800 to Erowid_Trip_Reports.csv


Scraping Reports:   9%|▊         | 2901/33776 [12:51<4:26:36,  1.93report/s]

Saved batch 2900 to Erowid_Trip_Reports.csv


Scraping Reports:   9%|▉         | 3001/33776 [13:08<4:12:49,  2.03report/s]

Saved batch 3000 to Erowid_Trip_Reports.csv


Scraping Reports:   9%|▉         | 3101/33776 [13:34<4:06:39,  2.07report/s] 

Saved batch 3100 to Erowid_Trip_Reports.csv


Scraping Reports:   9%|▉         | 3201/33776 [13:56<4:06:58,  2.06report/s]

Saved batch 3200 to Erowid_Trip_Reports.csv


Scraping Reports:  10%|▉         | 3301/33776 [14:21<4:46:43,  1.77report/s] 

Saved batch 3300 to Erowid_Trip_Reports.csv


Scraping Reports:  10%|█         | 3400/33776 [14:39<5:37:17,  1.50report/s]

Saved batch 3400 to Erowid_Trip_Reports.csv


Scraping Reports:  10%|█         | 3501/33776 [14:59<4:08:53,  2.03report/s]

Saved batch 3500 to Erowid_Trip_Reports.csv


Scraping Reports:  11%|█         | 3600/33776 [15:25<5:23:00,  1.56report/s] 

Saved batch 3600 to Erowid_Trip_Reports.csv


Scraping Reports:  11%|█         | 3701/33776 [15:43<4:27:00,  1.88report/s]

Saved batch 3700 to Erowid_Trip_Reports.csv


Scraping Reports:  11%|█▏        | 3800/33776 [16:05<6:09:25,  1.35report/s]

Saved batch 3800 to Erowid_Trip_Reports.csv


Scraping Reports:  12%|█▏        | 3901/33776 [16:42<4:25:03,  1.88report/s] 

Saved batch 3900 to Erowid_Trip_Reports.csv


Scraping Reports:  12%|█▏        | 3957/33776 [17:13<38:15:26,  4.62s/report]

Timeout error: Skipping https://www.erowid.org/experiences/exp.php?ID=35118
Skipping https://www.erowid.org/experiences/exp.php?ID=35118 after 3 attempts.


Scraping Reports:  12%|█▏        | 4001/33776 [17:31<4:18:31,  1.92report/s] 

Saved batch 4000 to Erowid_Trip_Reports.csv


Scraping Reports:  12%|█▏        | 4008/33776 [17:47<39:18:57,  4.75s/report]

Timeout error: Skipping https://www.erowid.org/experiences/exp.php?ID=24551
Skipping https://www.erowid.org/experiences/exp.php?ID=24551 after 3 attempts.


Scraping Reports:  12%|█▏        | 4101/33776 [18:04<4:28:35,  1.84report/s] 

Saved batch 4100 to Erowid_Trip_Reports.csv


Scraping Reports:  12%|█▏        | 4201/33776 [18:30<4:38:54,  1.77report/s] 

Saved batch 4200 to Erowid_Trip_Reports.csv


Scraping Reports:  13%|█▎        | 4301/33776 [18:53<4:19:01,  1.90report/s]

Saved batch 4300 to Erowid_Trip_Reports.csv


Scraping Reports:  13%|█▎        | 4401/33776 [19:12<4:27:25,  1.83report/s]

Saved batch 4400 to Erowid_Trip_Reports.csv


Scraping Reports:  13%|█▎        | 4448/33776 [19:29<2:08:32,  3.80report/s] 

🚨 IP BLOCKED on https://www.erowid.org/experiences/exp.php?ID=87669. Attempt 1 of 3.
Current IP before switching:
Current IP: 185.207.249.100
Switching VPN connection...





FileNotFoundError: [Errno 2] No such file or directory: 'nordvpn'

### Clean up any failed Reports

In [None]:
# remove no text avaliable
trip_reports = pd.read_csv("Erowid_Trip_Reports.csv")
# Filter out Failed Extractions
trip_reports = trip_reports[trip_reports["Report Text"] != "No Text Available"]
trip_reports = trip_reports.drop_duplicates()
# trip_reports.to_csv("Erowid_Trip_Reports.csv")


10494