In [1]:
import pandas as pd
import requests

In [None]:
 
# Exploratory call to the Webshare proxy-list endpoint; the decoded JSON body
# is the cell's displayed result.
# NOTE(review): `page` is supplied twice — page=4 in the URL query string and
# page=1 in the params mapping. Confirm which value is intended.
proxy_list_url = "https://proxy.webshare.io/api/v2/proxy/list/?page=4&page_size=10"
auth_headers = {"Authorization": "Token MY_TOKEN"}
query = {"mode": "direct", "page": 1}

response = requests.get(proxy_list_url, headers=auth_headers, params=query)

response.json()

In [None]:
import requests
import csv
import time
from tqdm import tqdm
import concurrent.futures

# Scheme and host of the target site; concatenated with `endpoint` to form the request URL.
base_url = "https://zinc.docking.org"
# Path of the JSON substance listing that is requested one page at a time.
endpoint = "/catalogs/chembl30/substances.json"
# Output file; it is opened in append mode below, so rows accumulate across runs.
csv_file = "zinc_data_2.csv"

# Define a function to make a request using a proxy
def fetch_data(page, proxy):
    """Download one page of substance records through the given proxy.

    Args:
        page: Page number sent as the ``page`` query parameter (together with
            ``count=1000``).
        proxy: Mapping with ``username``, ``password``, ``proxy_address`` and
            ``port`` entries describing the proxy to route the request through.

    Returns:
        The decoded JSON body when the server answers with status 200.
        ``None`` for any other status code, or when the request or the JSON
        decoding raises; failures are reported via ``print`` instead of being
        propagated.
    """
    # Only the "https" scheme is routed through the proxy.
    proxy_url = f"http://{proxy['username']}:{proxy['password']}@{proxy['proxy_address']}:{proxy['port']}"
    try:
        reply = requests.get(
            base_url + endpoint,
            params={'count': 1000, 'page': page},
            proxies={"https": proxy_url},
            timeout=30,
        )
        if reply.status_code != 200:
            print(f"Error on page {page} with proxy {proxy['proxy_address']}: {reply.status_code}")
            return None
        return reply.json()
    except Exception as e:
        # Best-effort scraping: any failure is logged and the page is skipped.
        print(f"Error on page {page} with proxy {proxy['proxy_address']}: {e}")
        return None

# Fetch the proxy list from the Webshare API.
# NOTE(review): `page` is supplied twice — page=4 in the URL query string and
# page=1 in `params`. Confirm which value is intended.
response = requests.get(
    "https://proxy.webshare.io/api/v2/proxy/list/?page=4&page_size=10",
    headers={"Authorization": "Token MY_TOKEN"},
    params={"mode": "direct", "page": 1},
    timeout=30,  # fail instead of hanging indefinitely on an unresponsive API
)
# Surface HTTP errors (e.g. a rejected token) here rather than as an opaque
# KeyError on 'results' below.
response.raise_for_status()
proxies = response.json()['results']
if not proxies:
    # The round-robin assignment further down indexes with `i % len(proxies)`,
    # which would otherwise fail with ZeroDivisionError.
    raise RuntimeError("Proxy list is empty; cannot distribute requests across proxies.")

# Append the scraped rows to the CSV file.
with open(csv_file, 'a', newline='') as out_file:
    row_writer = csv.writer(out_file)
    # When starting from a fresh file, write the header row once first:
    # row_writer.writerow(["zinc_id", "smiles"])

    # Pair pages 1..2000 with proxies in round-robin order.
    pages = range(1, 2001)
    pool_size = len(proxies)
    assigned = [proxies[idx % pool_size] for idx in range(len(pages))]

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # map() hands results back in submission (page) order.
        page_results = executor.map(fetch_data, pages, assigned)
        for substances in tqdm(page_results, total=len(pages)):
            if substances:
                row_writer.writerows(
                    [entry["zinc_id"], entry["smiles"]] for entry in substances
                )
            # Intended to avoid making too many requests in a short time.
            # NOTE(review): executor.map submits every call up front, so this
            # pause paces result consumption, not the worker threads — confirm
            # whether real request throttling is needed.
            time.sleep(1)

print("Data fetching completed!")


In [None]:
# Parse the file to extract proxy details
def parse_proxy_line(line):
    """Parse one ``address:port:username:password`` line into a proxy dict.

    Args:
        line: A single proxy entry; the caller is expected to have stripped
            surrounding whitespace already.

    Returns:
        dict with string values under the keys ``'proxy_address'``,
        ``'port'``, ``'username'`` and ``'password'``.

    Raises:
        ValueError: If the line has fewer than four ':'-separated fields.
    """
    # Split at most three times so a ':' inside the password is kept intact
    # instead of silently truncating the password at the first colon.
    parts = line.split(':', 3)
    if len(parts) != 4:
        # The line itself is not echoed: it may contain credentials.
        raise ValueError(
            f"Expected 'address:port:username:password', got {len(parts)} field(s)"
        )
    address, port, username, password = parts
    return {
        'proxy_address': address,
        'port': port,
        'username': username,
        'password': password,
    }

# Load the proxy list from disk, one entry per line.
with open("Webshare 50 proxies.txt", 'r') as proxy_file:
    # Skip blank lines (e.g. empty lines at the end of the file) instead of
    # failing while parsing them.
    proxies = [
        parse_proxy_line(stripped)
        for stripped in (raw.strip() for raw in proxy_file)
        if stripped
    ]

# Preview the first entries with passwords masked, so credentials are not
# stored in the notebook's saved output.
[{**entry, 'password': '***'} for entry in proxies[:5]]

In [None]:
import requests
import csv
import time
from tqdm import tqdm
import concurrent.futures
import random

# Scheme and host of the target site; concatenated with `endpoint` to form the request URL.
base_url = "https://zinc.docking.org"
# Path of the JSON substance listing that is requested one page at a time.
endpoint = "/catalogs/chembl30/substances.json"
# Output file; it is opened in append mode below, so rows accumulate across runs.
csv_file = "zinc_data_2.csv"

def fetch_data(page, proxy, proxies, max_retries=3, used_proxies=None):
    """Download one page of substance records, rotating proxies on failure.

    The request is first sent through ``proxy``. On a non-200 status or any
    exception, another proxy whose address has not been tried yet is picked at
    random from ``proxies`` and the request is repeated, up to ``max_retries``
    additional attempts.

    Args:
        page: Page number sent as the ``page`` query parameter (together with
            ``count=1000``).
        proxy: Mapping with ``username``, ``password``, ``proxy_address`` and
            ``port`` entries; the proxy used for the first attempt.
        proxies: Sequence of proxy mappings to draw replacement proxies from.
        max_retries: Maximum number of additional attempts after the first.
        used_proxies: Optional set of proxy addresses that must not be picked
            for retries. It is updated in place with every address tried,
            including the initial one.

    Returns:
        The decoded JSON body of the first attempt answered with status 200,
        or ``None`` when the retries or the pool of untried proxies are
        exhausted. Failures are reported via ``print`` rather than raised.
    """
    if used_proxies is None:
        used_proxies = set()

    params = {
        'count': 1000,
        'page': page
    }
    retries_left = max_retries

    while True:
        # Record every proxy that is tried — including the first one — so a
        # retry can never re-select a proxy that has already failed.
        used_proxies.add(proxy['proxy_address'])
        proxies_dict = {
            "https": f"http://{proxy['username']}:{proxy['password']}@{proxy['proxy_address']}:{proxy['port']}",
        }
        try:
            response = requests.get(base_url + endpoint, params=params, proxies=proxies_dict, timeout=30)
            if response.status_code == 200:
                return response.json()
            detail = response.status_code
        except Exception as e:
            # Best-effort scraping: log the failure and fall through to retry.
            detail = e
        print(f"Error on page {page} with proxy {proxy['proxy_address']}: {detail}")

        if retries_left <= 0:
            return None

        candidates = [p for p in proxies if p['proxy_address'] not in used_proxies]
        if not candidates:
            # random.choice() raises IndexError on an empty sequence; give up
            # cleanly instead of aborting the whole scraping run.
            print(f"No unused proxies left for page {page}; giving up.")
            return None

        proxy = random.choice(candidates)
        retries_left -= 1



# Append the scraped rows to the CSV file, covering pages 148..2000.
with open(csv_file, 'a', newline='') as out_file:
    row_writer = csv.writer(out_file)

    # Each job is (page, starting proxy chosen round-robin, full proxy list);
    # the full list is what fetch_data draws replacement proxies from.
    jobs = [
        (page, proxies[position % len(proxies)], proxies)
        for position, page in enumerate(range(148, 2001))
    ]

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        outcomes = executor.map(lambda job: fetch_data(*job), jobs)
        for substances in tqdm(outcomes, total=len(jobs)):
            # A failed page yields None and contributes no rows.
            for record in substances or ():
                row_writer.writerow([record["zinc_id"], record["smiles"]])
            # Intended to avoid making too many requests in a short time.
            # NOTE(review): executor.map submits every call up front, so this
            # pause paces result consumption, not the worker threads — confirm
            # whether real request throttling is needed.
            time.sleep(1)

print("Data fetching completed!")
