In [1]:
import random
from newspaper import Article
from tqdm import tqdm
# Dependencies: pip install newspaper3k lxml lxml-html-clean

In [2]:
def get_user_agent():
    """Return one of the known browser User-Agent strings, picked at random."""
    available = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0',
    )
    # random.choice draws uniformly from the sequence without manual index math.
    return random.choice(available)

In [3]:
# Headers sent with every outgoing request in this notebook.
# get_user_agent() is called once, when this cell runs, so the same
# User-Agent is reused until the cell is executed again.
default_headers = {
    "User-Agent": get_user_agent(),
    "Accept": "text/html",
    "Accept-Language": "en-US",
    "Connection": "keep-alive",
    "Referer": "https://www.google.com",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
}

In [4]:
from functools import lru_cache

@lru_cache(maxsize=64)
def get_text(link: str):
    """Download and parse the page at ``link``.

    Results are memoized for the 64 most recently used links, so asking for
    the same link again does not trigger another download.

    Args:
        link: URL of the page to fetch.

    Returns:
        A ``(text, tags)`` tuple taken from the parsed ``Article``.
    """
    article = Article(link, headers=default_headers, fetch_images=False)
    article.download()
    article.parse()
    extracted = (article.text, article.tags)
    return extracted

In [43]:
import requests
from urllib.parse import urlparse

from bs4 import BeautifulSoup

# Domains whose links are dropped from the search results.
# Entries are bare host names: no scheme and no leading "www.".
exclusion = [
    'youtube.com'
]

def __exclude(link: str):
    """Return True when ``link`` points at a blacklisted domain.

    The comparison is made on the host name only, so the scheme
    (``http``/``https``), an explicit port, letter case and a leading
    ``www.`` do not matter. Subdomains of a blacklisted domain
    (e.g. ``m.youtube.com``) are excluded as well.

    Args:
        link: Absolute or relative URL taken from a search result.

    Returns:
        True if the link's host is listed in ``exclusion`` (or is a
        subdomain of a listed entry), False otherwise. Links without a
        host, such as relative URLs, are never excluded.
    """
    # hostname is already lower-cased and stripped of port and credentials.
    host = urlparse(link).hostname
    if not host:
        return False

    if host.startswith('www.'):
        host = host[len('www.'):]

    for domain in exclusion:
        domain = domain.lower()
        # The dot before the suffix keeps "notyoutube.com" from matching
        # "youtube.com".
        if host == domain or host.endswith('.' + domain):
            return True
    return False


def google_search(search_query, results=3, timeout: int = 3):
    """Run a Google search and return the links found on the result page.

    Args:
        search_query: Text to search for.
        results: Number of results requested from Google (sent as ``num``).
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of unique result URLs in the order they appear on the page,
        with blacklisted domains removed. An empty list is returned when the
        request fails, so callers can always iterate or take ``len()``.
    """
    try:
        response = requests.get(
            url="https://www.google.com/search",
            headers=default_headers,
            params={
                "q": search_query,
                "num": results,
                "start": 0,
                "safe": "active",
            },
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.RequestException as req_err:
        # RequestException is the base class of HTTPError, Timeout and
        # ConnectionError, so network failures are reported the same way
        # as bad status codes instead of escaping as a traceback.
        print(f'[!] Error: {req_err}')
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # A dict keeps insertion order, so duplicates are dropped without
    # shuffling the order in which the results appear on the page.
    links = {}
    for result in soup.find_all("div", attrs={"class": "g"}):
        anchor = result.find("a", href=True)
        if anchor and not __exclude(anchor["href"]):
            links[anchor["href"]] = None
    return list(links)

In [44]:
# Run a sample search; the resulting links are used by the cells below.
links = google_search(search_query='Apache Log Poisoning')

In [45]:
# Display the links collected by the search.
links

['https://www.hackingarticles.in/apache-log-poisoning-through-lfi/',
 'https://attackdefense.com/challengedetails?cid=916']

In [9]:
from concurrent.futures import ThreadPoolExecutor, as_completed

In [10]:
# Fetch every link with up to three downloads in flight at a time.
# The return values are discarded here; get_text memoizes its recent calls.
pending_links = links or []  # tolerate a search that produced no links

# Using tqdm as a context manager closes the bar even if a download raises.
with tqdm(total=len(pending_links), unit="step") as progress_bar:
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(get_text, lnk) for lnk in pending_links]
        for future in as_completed(futures):
            # result() re-raises any exception raised inside the worker.
            future.result()
            progress_bar.update(1)
            progress_bar.refresh()

100%|██████████| 1409/1409 [11:13<00:00,  2.09step/s]
