In [2]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import time

def count_string_in_url(url, target_string, timeout=60):
    try:
        # Send an HTTP GET request to the URL with a timeout
        response = requests.get(url, timeout=timeout)
        
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Get the content of the web page
            page_content = response.text
            
            # Count the occurrences of the target string
            count = page_content.count(target_string)
            
            return count
        elif response.status_code == 403:
            print(f"URL '{url}' returned a '403 Forbidden' status code. Skipping.")
            return 0  # Return 0 to indicate that the URL was processed but no occurrences found
        else:
            print(f"Failed to fetch the URL '{url}'. Status code: {response.status_code}")
            return -1  # Return -1 to indicate an error
    except requests.Timeout:
        print(f"Timeout occurred while processing the URL '{url}'. Skipping.")
        return -1  # Return -1 to indicate a timeout error
    except Exception as e:
        print(f"An error occurred while processing the URL '{url}': {str(e)}")
        return -1  # Return -1 to indicate an error

# Path to the CSV file containing URLs (replace with your input CSV file path)
input_csv_file_path = "/Users/rentsher/Desktop/test_file.csv"

# Read the CSV file into a DataFrame
try:
    df = pd.read_csv(input_csv_file_path)
except Exception as e:
    print(f"An error occurred while reading the CSV file: {str(e)}")
    df = pd.DataFrame()

if not df.empty:
    target_string = "https://i.vimeo"
    
    # Create a list to store the results
    results = []

    # Replace "YourColumnName" with the actual column name containing the URLs
    urls = df["Domain"]

    # Create a ThreadPoolExecutor for concurrent processing
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        
        # Wrap the loop with tqdm for progress monitoring
        for url in tqdm(urls, desc="Processing URLs"):
            start_time = time.time()
            future = executor.submit(count_string_in_url, url, target_string)
            futures.append((url, future))
        
        for url, future in futures:
            count = future.result()
            if count is not None:
                result_entry = {
                    "URL": url,
                    "Count": count
                }
                results.append(result_entry)

    # Create a DataFrame from the results
    df_results = pd.DataFrame(results)

    # Path to the output CSV file (replace with your desired output file path)
    output_csv_file_path = "/Users/rentsher/Desktop/output_results_with_counts_1.csv"

    # Save the DataFrame with results to the output CSV file
    df_results.to_csv(output_csv_file_path, index=False)

    print(f"Results saved to '{output_csv_file_path}'.")
else:
    print("Empty DataFrame. No processing or saving was performed.")


Processing URLs: 100%|██████████████████████| 802/802 [00:00<00:00, 4364.18it/s]


URL 'https://www.clearedjobs.net' returned a '403 Forbidden' status code. Skipping.
Failed to fetch the URL 'https://www.diamondchallenge.org'. Status code: 406
URL 'https://www.aaregistry.org' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.acstechnologies.com' returned a '403 Forbidden' status code. Skipping.
An error occurred while processing the URL 'https://www.naacpldf.org': HTTPSConnectionPool(host='www.naacpldf.org', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))
URL 'https://www.werner.com' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.art21.org' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.callnorthwest.com' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.hidglobal.com' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.

URL 'https://www.fastersolutions.com' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.jirikdds.com' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.grandviewlending.com' returned a '403 Forbidden' status code. Skipping.
An error occurred while processing the URL 'https://www.lacornueusa.com': ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
URL 'https://www.atlantajewelryshow.com' returned a '403 Forbidden' status code. Skipping.
Failed to fetch the URL 'https://www.atiwa.com'. Status code: 503
Failed to fetch the URL 'https://www.awlcs.org'. Status code: 503
Failed to fetch the URL 'https://www.medcgroup.com'. Status code: 503
URL 'https://www.fortune-auto.com' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.insearchofliberty.com' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.owens-minor.com' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.branso

URL 'https://www.cityvistaapts.com' returned a '403 Forbidden' status code. Skipping.
Failed to fetch the URL 'https://www.impactreptheatre.org'. Status code: 429
Failed to fetch the URL 'https://www.accessalaska.org'. Status code: 429
Timeout occurred while processing the URL 'https://www.imagedermatology.com'. Skipping.
URL 'https://www.blastec.com' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.elanvillagecourt.com' returned a '403 Forbidden' status code. Skipping.URL 'https://www.elanscrippsterrace.com' returned a '403 Forbidden' status code. Skipping.

URL 'https://www.elansolanapointe.com' returned a '403 Forbidden' status code. Skipping.
URL 'https://www.interservlp.com' returned a '403 Forbidden' status code. Skipping.
Timeout occurred while processing the URL 'https://www.salon-concepts.com'. Skipping.
Results saved to '/Users/rentsher/Desktop/output_results_with_counts_1.csv'.


In [12]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import time

def count_string_in_url(url, target_string, uppercase=False, timeout=60):
    try:
        # Send an HTTP GET request to the URL with a timeout
        response = requests.get(url, timeout=timeout)
        
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Get the content of the web page
            page_content = response.text
            
            # Optionally capitalize or uppercase the HTML content
            if uppercase:
                page_content = page_content.upper()
            
            # Count the occurrences of the target string
            count = page_content.count(target_string.upper())
            
            return count
        elif response.status_code == 403:
            print(f"URL '{url}' returned a '403 Forbidden' status code. Skipping.")
            return 0  # Return 0 to indicate that the URL was processed but no occurrences found
        else:
            print(f"Failed to fetch the URL '{url}'. Status code: {response.status_code}")
            return -1  # Return -1 to indicate an error
    except requests.Timeout:
        print(f"Timeout occurred while processing the URL '{url}'. Skipping.")
        return -1  # Return -1 to indicate a timeout error
    except Exception as e:
        print(f"An error occurred while processing the URL '{url}': {str(e)}")
        return -1  # Return -1 to indicate an error

# Path to the CSV file containing URLs (replace with your input CSV file path)
input_csv_file_path = "/Users/rentsher/Desktop/test_file.csv"  # Replace with the actual input path

# Read the CSV file into a DataFrame
try:
    df = pd.read_csv(input_csv_file_path)
except Exception as e:
    print(f"An error occurred while reading the CSV file: {str(e)}")
    df = pd.DataFrame()

if not df.empty:
    target_string = "Imagekit.io"
    
    # Create a list to store the results
    results = []

    # Replace "YourColumnName" with the actual column name containing the URLs
    urls = df["Domain"]

    # Configure tqdm with the total number of URLs for accurate progress tracking
    total_urls = len(urls)
    with tqdm(total=total_urls, desc="Processing URLs") as pbar:
        # Create a ThreadPoolExecutor for concurrent processing
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            
            # Wrap the loop for progress monitoring
            for url in urls:
                start_time = time.time()
                future = executor.submit(count_string_in_url, url, target_string, uppercase=True)
                future.add_done_callback(lambda x: pbar.update(1))  # Update progress bar
                futures.append((url, future))
            
            for url, future in futures:
                count = future.result()
                if count is not None:
                    result_entry = {
                        "URL": url,
                        "Count": count
                    }
                    results.append(result_entry)

    # Create a DataFrame from the results
    df_results = pd.DataFrame(results)

    # Path to the output CSV file (replace with your desired output file path)
    output_csv_file_path = "/Users/rentsher/Desktop/output_results_with_counts_4.csv"  # Replace with the actual output path

    # Save the DataFrame with results to the output CSV file
    df_results.to_csv(output_csv_file_path, index=False)

    print(f"Results saved to '{output_csv_file_path}'.")
else:
    print("Empty DataFrame. No processing or saving was performed.")


Processing URLs:   1%|▏                         | 2/254 [00:00<00:57,  4.38it/s]

URL 'https://www.vonage.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:   4%|▉                        | 10/254 [00:03<01:49,  2.23it/s]

URL 'https://www.springct.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:   7%|█▋                       | 17/254 [00:04<00:47,  5.04it/s]

An error occurred while processing the URL 'https://www.knowlarity.com': HTTPSConnectionPool(host='www.knowlarity.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))


Processing URLs:  10%|██▌                      | 26/254 [00:06<00:27,  8.16it/s]

An error occurred while processing the URL 'https://www.hentland.com': HTTPSConnectionPool(host='www.hentland.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fe98a838ee0>: Failed to establish a new connection: [Errno 61] Connection refused'))


Processing URLs:  11%|██▊                      | 29/254 [00:07<00:35,  6.39it/s]

Failed to fetch the URL 'https://www.pentagon.co.in'. Status code: 500


Processing URLs:  13%|███▏                     | 33/254 [00:08<00:42,  5.14it/s]

URL 'https://www.kumarassociates.co.uk' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  15%|███▋                     | 37/254 [00:09<01:14,  2.91it/s]

Failed to fetch the URL 'https://www.theapplabb.com'. Status code: 406


Processing URLs:  21%|█████▏                   | 53/254 [00:14<00:32,  6.13it/s]

An error occurred while processing the URL 'https://www.www.nic.in': HTTPSConnectionPool(host='www.www.nic.in', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fe98a83f670>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


Processing URLs:  22%|█████▌                   | 56/254 [00:14<00:43,  4.60it/s]

An error occurred while processing the URL 'https://www.visionetsystems.com': HTTPSConnectionPool(host='www.visionetsystems.com', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'www.visionetsystems.com' doesn't match either of '*.sucuri.net', 'sucuri.net'")))


Processing URLs:  25%|██████▏                  | 63/254 [00:16<00:53,  3.58it/s]

An error occurred while processing the URL 'https://www.dvois.com': HTTPSConnectionPool(host='www.dvois.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))


Processing URLs:  26%|██████▌                  | 67/254 [00:17<00:39,  4.77it/s]

URL 'https://www.skyboxsecurity.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  30%|███████▍                 | 76/254 [00:19<00:27,  6.46it/s]

An error occurred while processing the URL 'https://www.vivobio.com': HTTPSConnectionPool(host='www.vivobio.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1129)')))


Processing URLs:  31%|███████▊                 | 79/254 [00:19<00:35,  4.92it/s]

URL 'https://www.servicenow.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  33%|████████▎                | 84/254 [00:22<01:18,  2.16it/s]

Failed to fetch the URL 'https://www.predoole.com'. Status code: 406


Processing URLs:  37%|█████████▎               | 94/254 [00:24<00:41,  3.86it/s]

Failed to fetch the URL 'https://www.technoexponent.com'. Status code: 406


Processing URLs:  39%|█████████▋               | 98/254 [00:27<01:29,  1.73it/s]

URL 'https://www.rafter.one' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  40%|█████████▌              | 101/254 [00:28<01:33,  1.64it/s]

URL 'https://www.prolifics.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  42%|██████████              | 106/254 [00:30<00:45,  3.25it/s]

URL 'https://www.teikametrics.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  46%|███████████▏            | 118/254 [00:35<00:41,  3.31it/s]

URL 'https://www.mindtree.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  49%|███████████▋            | 124/254 [00:41<02:02,  1.06it/s]

URL 'https://www.hypermine.in' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  50%|████████████            | 127/254 [00:43<01:48,  1.17it/s]

An error occurred while processing the URL 'https://www.tachouse.com': HTTPSConnectionPool(host='www.tachouse.com', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'www.tachouse.com' doesn't match either of '*.specialservers.com', 'specialservers.com'")))


Processing URLs:  53%|████████████▊           | 135/254 [00:47<00:39,  2.98it/s]

An error occurred while processing the URL 'https://www.nitie.ac.in': HTTPSConnectionPool(host='www.nitie.ac.in', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))


Processing URLs:  55%|█████████████▏          | 139/254 [00:52<01:44,  1.10it/s]

Failed to fetch the URL 'https://www.capricot.com'. Status code: 406


Processing URLs:  56%|█████████████▌          | 143/254 [00:54<00:53,  2.08it/s]

An error occurred while processing the URL 'https://www.primefocus.com': HTTPSConnectionPool(host='www.primefocus.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))


Processing URLs:  60%|██████████████▎         | 152/254 [00:59<00:53,  1.92it/s]

URL 'https://www.investisdigital.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  61%|██████████████▌         | 154/254 [01:00<00:50,  2.00it/s]

An error occurred while processing the URL 'https://www.strattoncommunications.com': HTTPSConnectionPool(host='www.strattoncommunications.com', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'www.strattoncommunications.com' doesn't match either of '*.webhostbox.net', 'webhostbox.net'")))


Processing URLs:  61%|██████████████▋         | 156/254 [01:01<00:33,  2.96it/s]

URL 'https://www.monsterindia.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  62%|██████████████▉         | 158/254 [01:01<00:27,  3.45it/s]

An error occurred while processing the URL 'https://www.msit.edu.in': HTTPSConnectionPool(host='www.msit.edu.in', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'www.msit.edu.in' doesn't match either of 'technoindiagroup.in', 'careers.technoindiagroup.in', 'msit.edu.in', 'satyamrc.com', 'satyamroychowdhuryfoundation.com', 'technoglobalhospital.com', 'technojunior.in', 'theretreatdarjeeling.com', 'theretreatkolkata.com', 'thotshop.in', 'tib.edu.in'")))


Processing URLs:  66%|███████████████▊        | 168/254 [01:06<00:48,  1.77it/s]

Timeout occurred while processing the URL 'https://www.hpe.com'. Skipping.


Processing URLs:  68%|████████████████▎       | 173/254 [01:08<00:36,  2.19it/s]

An error occurred while processing the URL 'https://www.fxschool.in': HTTPSConnectionPool(host='www.fxschool.in', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1129)')))


Processing URLs:  70%|████████████████▊       | 178/254 [01:11<00:28,  2.67it/s]

Failed to fetch the URL 'https://www.knitpro.co.in'. Status code: 404


Processing URLs:  71%|█████████████████       | 180/254 [01:12<00:26,  2.76it/s]

Failed to fetch the URL 'https://www.taxila.in'. Status code: 406



Processing URLs:  72%|█████████████████▎      | 183/254 [01:12<00:17,  4.11it/s]

URL 'https://www.whatfix.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  74%|█████████████████▋      | 187/254 [01:15<00:31,  2.11it/s]

URL 'https://www.emids.com' returned a '403 Forbidden' status code. Skipping.



Processing URLs:  74%|█████████████████▊      | 188/254 [01:16<00:37,  1.77it/s]

Failed to fetch the URL 'https://www.bravensinc.com'. Status code: 406


Processing URLs:  75%|██████████████████      | 191/254 [01:17<00:27,  2.31it/s]

An error occurred while processing the URL 'https://www.infino.co': HTTPSConnectionPool(host='www.infino.co', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1129)')))


Processing URLs:  78%|██████████████████▋     | 198/254 [01:19<00:16,  3.46it/s]

Timeout occurred while processing the URL 'https://www.hcltech.com'. Skipping.
URL 'https://www.yourstory.com' returned a '403 Forbidden' status code. Skipping.



Processing URLs:  78%|██████████████████▊     | 199/254 [01:19<00:15,  3.54it/s]

An error occurred while processing the URL 'https://www.uou-edu.com': ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


Processing URLs:  85%|████████████████████▍   | 216/254 [01:25<00:11,  3.42it/s]

URL 'https://www.techment.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  86%|████████████████████▋   | 219/254 [01:25<00:08,  4.14it/s]

URL 'https://www.shipbob.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  89%|█████████████████████▎  | 225/254 [01:26<00:06,  4.59it/s]

Timeout occurred while processing the URL 'https://www.maersk.com'. Skipping.



Processing URLs:  89%|█████████████████████▎  | 226/254 [01:26<00:06,  4.26it/s]

An error occurred while processing the URL 'https://www.srsl.in': HTTPSConnectionPool(host='www.srsl.in', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'www.srsl.in' doesn't match 'blackpanther.dnsracks.com'")))
An error occurred while processing the URL 'https://www.jaraware.com': HTTPSConnectionPool(host='www.jaraware.com', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'www.jaraware.com' doesn't match either of 'jaraware.com', 'preview.jaraware.com'")))



Processing URLs:  91%|█████████████████████▋  | 230/254 [01:27<00:04,  5.24it/s]

URL 'https://www.tigeranalytics.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  93%|██████████████████████▏ | 235/254 [01:29<00:04,  3.84it/s]

URL 'https://www.joytechnologies.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  93%|██████████████████████▍ | 237/254 [01:29<00:04,  3.93it/s]

URL 'https://www.srmist.edu.in' returned a '403 Forbidden' status code. Skipping.


Processing URLs:  98%|███████████████████████▌| 250/254 [01:32<00:01,  3.49it/s]

URL 'https://www.dream11.com' returned a '403 Forbidden' status code. Skipping.


Processing URLs: 100%|███████████████████████▉| 253/254 [02:05<00:07,  7.74s/it]

Timeout occurred while processing the URL 'https://www.kanpuruniversity.org'. Skipping.


Processing URLs: 100%|████████████████████████| 254/254 [02:29<00:00,  1.70it/s]

Timeout occurred while processing the URL 'https://www.alterego-technology.com'. Skipping.
Results saved to '/Users/rentsher/Desktop/output_results_with_counts_4.csv'.



