In [7]:
import requests

def get_wikipedia_pages(keyword, limit=500000):
    base_url = "https://en.wikipedia.org/w/api.php"
    pages = []
    sroffset = 0

    while len(pages) < limit:
        params = {
            "action": "query",
            "list": "search",
            "srsearch": keyword,
            "format": "json",
            "srlimit": 50,  # Maximum allowed per request
            "sroffset": sroffset,
        }

        response = requests.get(base_url, params=params)
        data = response.json()

        # Check if the 'query' key exists in the response
        if 'query' not in data or 'search' not in data['query']:
            print("No more results or API limit exceeded.")
            break

        # Extract page titles and create full links
        for page in data['query']['search']:
            page_title = page['title'].replace(" ", "_")
            page_url = f"https://en.wikipedia.org/wiki/{page_title}"
            pages.append(page_url)

        # Update the offset for the next batch
        sroffset += 50

        # Stop if there are no more results
        if len(data['query']['search']) < 50:
            print("No more pages to fetch.")
            break

    return pages

# Get Wikipedia pages related to "Lesbian"
pages = get_wikipedia_pages("Transexual", limit=5000)

# Print the links
for i, page in enumerate(pages, 1):
    print(f"{i}. {page}")

No more pages to fetch.
1. https://en.wikipedia.org/wiki/Transsexual
2. https://en.wikipedia.org/wiki/The_Transexual_Menace
3. https://en.wikipedia.org/wiki/Miriam_Rivera
4. https://en.wikipedia.org/wiki/Quentin_Dupieux
5. https://en.wikipedia.org/wiki/Violence_against_transgender_people
6. https://en.wikipedia.org/wiki/Carmen_Moore_(American_actress)
7. https://en.wikipedia.org/wiki/Rosa_von_Praunheim
8. https://en.wikipedia.org/wiki/Gender-affirming_hormone_therapy
9. https://en.wikipedia.org/wiki/Transgender_sexuality
10. https://en.wikipedia.org/wiki/Causes_of_gender_incongruence
11. https://en.wikipedia.org/wiki/Blanchard's_transsexualism_typology
12. https://en.wikipedia.org/wiki/Be_Like_Others
13. https://en.wikipedia.org/wiki/Abril_Zamora
14. https://en.wikipedia.org/wiki/Transgender
15. https://en.wikipedia.org/wiki/Transgender_pornography
16. https://en.wikipedia.org/wiki/Non-binary_gender
17. https://en.wikipedia.org/wiki/List_of_people_killed_for_being_transgender
18. https

In [21]:
import requests
from bs4 import BeautifulSoup

def get_bing_search_results(query, limit=50):
    base_url = "https://www.bing.com/search"
    results = []
    offset = 0

    while len(results) < limit:
        params = {
            "q": query,
            "first": offset
        }

        response = requests.get(base_url, params=params)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all search result links
        links = soup.find_all('a', href=True)
        found_links = False

        for link in links:
            href = link['href']
            if href.startswith('http'):
                results.append(href)
                found_links = True
                if len(results) >= limit:
                    break
        
        # If no more results or less than expected results are found, break
        if not found_links or len(links) < 10:
            print("No more pages to fetch or less results found.")
            break

        offset += 10  # Move to the next page of results

    return results

# Get search results related to "Lesbian"
links = get_bing_search_results("Lesbian", limit=50000)

# Print the links
for i, link in enumerate(links, 1):
    print(f"{i}. {link}")


ConnectionError: HTTPSConnectionPool(host='www.bing.com', port=443): Max retries exceeded with url: /search?q=Lesbian&first=1410 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fd9cedb2f40>: Failed to resolve 'www.bing.com' ([Errno -2] Name or service not known)"))

In [11]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def get_bing_search_results(query, limit=50):
    base_url = "https://www.bing.com/search"
    results = []
    offset = 0

    # Initialize the tqdm progress bar
    with tqdm(total=limit, desc="Fetching results", unit="link") as pbar:
        while len(results) < limit:
            params = {
                "q": query,
                "first": offset
            }

            response = requests.get(base_url, params=params)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all search result links
            links = soup.find_all('a', href=True)
            found_links = False

            for link in links:
                href = link['href']
                if href.startswith('http'):
                    results.append(href)
                    found_links = True
                    pbar.update(1)  # Update progress bar
                    if len(results) >= limit:
                        break
            
            # If no more results or fewer than expected results are found, break
            if not found_links or len(links) < 10:
                print("No more pages to fetch or fewer results found.")
                break

            offset += 10  # Move to the next page of results

    return results

# Get search results related to "Gay"
links = get_bing_search_results("Gay", limit=20000)

# Print the links
for i, link in enumerate(links, 1):
    print(f"{i}. {link}")


Fetching results: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [08:57<00:00, 37.19link/s]

1. https://en.m.wikipedia.org/wiki/Gay
2. https://en.m.wikipedia.org/wiki/Gay
3. https://en.m.wikipedia.org/wiki/Homosexuality
4. https://en.m.wikipedia.org/wiki/Homosexuality
5. https://en.m.wikipedia.org/wiki/LGBTQ
6. https://en.m.wikipedia.org/wiki/LGBTQ
7. https://www.britannica.com/topic/homosexuality
8. https://www.britannica.com/topic/homosexuality
9. https://www.britannica.com/topic/Gay-Pride
10. https://www.britannica.com/topic/Gay-Pride
11. https://gaycenter.org/community/lgbtq/
12. https://gaycenter.org/community/lgbtq/
13. https://www.apa.org/topics/lgbtq/history
14. https://www.apa.org/topics/lgbtq/history
15. https://www.bbc.com/culture/article/20210614-the-hidden-gay-lives-finally-being-uncovered
16. https://www.bbc.com/culture/article/20210614-the-hidden-gay-lives-finally-being-uncovered
17. https://www.merriam-webster.com/dictionary/gay
18. https://www.merriam-webster.com/dictionary/gay
19. https://www.britannica.com/topic/lesbian-gay-bisexual-transgender-and-queer-com




In [39]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def get_bing_search_results(query, limit=50):
    base_url = "https://www.bing.com/search"
    results = []
    offset = 0

    # Initialize the tqdm progress bar
    with tqdm(total=limit, desc="Fetching results", unit="link") as pbar:
        while len(results) < limit:
            params = {
                "q": query,
                "first": offset
            }

            response = requests.get(base_url, params=params)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all search result links
            links = soup.find_all('a', href=True)
            found_links = False

            for link in links:
                href = link['href']
                if href.startswith('http'):
                    results.append(href)
                    found_links = True
                    pbar.update(1)  # Update progress bar
                    if len(results) >= limit:
                        break
            
            # If no more results or fewer than expected results are found, break
            if not found_links or len(links) < 10:
                print("No more pages to fetch or fewer results found.")
                break

            offset += 10  # Move to the next page of results

    return results

# Get search results related to "Gay"
links = get_bing_search_results("Bisexual", limit=9000)

# Print the links
for i, link in enumerate(links, 1):
    print(f"{i}. {link}")


Fetching results: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9000/9000 [03:56<00:00, 38.12link/s]

1. https://en.wikipedia.org/wiki/Bisexuality
2. https://en.wikipedia.org/wiki/Bisexuality
3. https://www.healthline.com/health/what-is-bisexual
4. https://www.healthline.com/health/what-is-bisexual
5. https://www.healthline.com/health/am-i-bisexual
6. https://www.healthline.com/health/am-i-bisexual
7. https://www.webmd.com/sex/what-is-bisexual
8. https://www.webmd.com/sex/what-is-bisexual
9. https://lgbtqiacounseling.com/bisexual/types-of-bisexuality/
10. https://lgbtqiacounseling.com/bisexual/types-of-bisexuality/
11. https://www.apa.org/pi/lgbt/resources/bisexual
12. https://www.apa.org/pi/lgbt/resources/bisexual
13. https://www.britannica.com/topic/bisexuality-human-behavior
14. https://www.britannica.com/topic/bisexuality-human-behavior
15. https://helloclue.com/articles/lgbt/bisexuality-101
16. https://helloclue.com/articles/lgbt/bisexuality-101
17. https://www.thetrevorproject.org/resources/article/understanding-bisexuality/
18. https://www.thetrevorproject.org/resources/article/




In [7]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, Timeout, RequestException
from tqdm import tqdm

def get_bing_search_results(query, limit=100):
    base_url = "https://www.bing.com/search"
    results = []
    offset = 0
    timeout_duration = 5  # Timeout set to 5 seconds

    # Initialize the tqdm progress bar
    with tqdm(total=limit, desc="Fetching Links", unit="link") as pbar:
        while len(results) < limit:
            params = {
                "q": query,
                "first": offset
            }

            try:
                # Try sending a request to Bing
                response = requests.get(base_url, params=params, timeout=timeout_duration)
                response.raise_for_status()  # Check if the request was successful (HTTP status code 200)

                # Parse the page content
                soup = BeautifulSoup(response.text, 'html.parser')

                # Find all search result links
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith('http'):
                        results.append(href)
                        pbar.update(1)  # Update the tqdm progress bar
                        if len(results) >= limit:
                            break

                # Increment offset for pagination
                offset += 10

            except (ConnectionError, Timeout) as conn_err:
                # If connection issues occur, skip to the next page of results
                print(f"Connection error: {conn_err}. Skipping this offset {offset}.")
                offset += 10  # Move on to the next page
                continue

            except RequestException as req_err:
                # If other types of errors occur, skip to the next page
                print(f"Request failed: {req_err}. Skipping this offset {offset}.")
                offset += 10  # Move on to the next page
                continue

    return results

# Example usage
query = "Queer"
links = get_bing_search_results(query, limit=50000)

# Print the links
for i, link in enumerate(links, 1):
    print(f"{i}. {link}")


Fetching Links:  18%|█████████████████████████▊                                                                                                                      | 8965/50000 [03:54<16:15, 42.05link/s]

Connection error: HTTPSConnectionPool(host='www.bing.com', port=443): Read timed out. (read timeout=5). Skipping this offset 3920.


Fetching Links:  34%|████████████████████████████████████████████████                                                                                               | 16813/50000 [07:55<12:50, 43.06link/s]

Connection error: HTTPSConnectionPool(host='www.bing.com', port=443): Read timed out. (read timeout=5). Skipping this offset 7400.


Fetching Links:  52%|██████████████████████████████████████████████████████████████████████████▏                                                                    | 25925/50000 [13:45<10:50, 36.98link/s]

Connection error: HTTPSConnectionPool(host='www.bing.com', port=443): Read timed out. (read timeout=5). Skipping this offset 11390.


Fetching Links:  56%|████████████████████████████████████████████████████████████████████████████████▋                                                              | 28209/50000 [15:04<12:29, 29.06link/s]

Connection error: HTTPSConnectionPool(host='www.bing.com', port=443): Read timed out. (read timeout=5). Skipping this offset 12360.


Fetching Links:  69%|██████████████████████████████████████████████████████████████████████████████████████████████████                                             | 34293/50000 [18:37<06:09, 42.55link/s]

Connection error: HTTPSConnectionPool(host='www.bing.com', port=443): Read timed out. (read timeout=5). Skipping this offset 15030.


Fetching Links:  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████                                | 38829/50000 [21:38<06:46, 27.45link/s]

Connection error: HTTPSConnectionPool(host='www.bing.com', port=443): Read timed out. (read timeout=5). Skipping this offset 17030.


Fetching Links: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [29:24<00:00, 28.34link/s]


1. https://en.wikipedia.org/wiki/Queer
2. https://en.wikipedia.org/wiki/Queer
3. https://www.dictionary.com/e/queer-vs-gay/
4. https://www.dictionary.com/e/queer-vs-gay/
5. https://dictionary.cambridge.org/dictionary/english/queer
6. https://dictionary.cambridge.org/dictionary/english/queer
7. https://www.medicalnewstoday.com/articles/what-does-queer-mean
8. https://www.medicalnewstoday.com/articles/what-does-queer-mean
9. https://www.them.us/story/what-does-queer-mean
10. https://www.them.us/story/what-does-queer-mean
11. https://www.merriam-webster.com/dictionary/queer
12. https://www.merriam-webster.com/dictionary/queer
13. https://new.lgbtqia.wiki/wiki/Queer
14. https://new.lgbtqia.wiki/wiki/Queer
15. https://gaycenter.org/community/lgbtq/
16. https://gaycenter.org/community/lgbtq/
17. https://www.identiversity.org/topics/lgbtq-identities/the-q-in-lgbtq/
18. https://www.identiversity.org/topics/lgbtq-identities/the-q-in-lgbtq/
19. https://en.wikipedia.org/wiki/LGBTQ
20. https://en.

In [2]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, Timeout, RequestException
from tqdm import tqdm

def get_bing_search_results(query, limit=100):
    base_url = "https://www.google.com/search"
    results = []
    offset = 0
    timeout_duration = 5  # Timeout set to 5 seconds
    failed_offsets = set()  # Set to keep track of failed offsets

    # Initialize the tqdm progress bar
    with tqdm(total=limit, desc="Fetching Links", unit="link") as pbar:
        while len(results) < limit:
            # Skip failed offsets
            if offset in failed_offsets:
                offset += 10
                continue

            params = {
                "q": query,
                "first": offset
            }

            try:
                # Try sending a request to Bing
                response = requests.get(base_url, params=params, timeout=timeout_duration)
                response.raise_for_status()  # Check if the request was successful (HTTP status code 200)

                # Parse the page content
                soup = BeautifulSoup(response.text, 'html.parser')

                # Find all search result links
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith('http'):
                        results.append(href)
                        pbar.update(1)  # Update the tqdm progress bar
                        if len(results) >= limit:
                            break

                # Increment offset for pagination
                offset += 10

            except (ConnectionError, Timeout) as conn_err:
                # If connection issues occur, add offset to failed_offsets and skip
                print(f"Connection error: {conn_err}. Skipping this offset {offset}.")
                failed_offsets.add(offset)
                offset += 10
                continue

            except RequestException as req_err:
                # If other types of errors occur, add offset to failed_offsets and skip
                print(f"Request failed: {req_err}. Skipping this offset {offset}.")
                failed_offsets.add(offset)
                offset += 10
                continue

    return results

# Example usage
query = "Queer"
links = get_bing_search_results(query, limit=100)

# Print the links
for i, link in enumerate(links, 1):
    print(f"{i}. {link}")


Fetching Links:   0%|                                                                         | 0/100 [00:00<?, ?link/s]

Request failed: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DQueer%26first%3D0&q=EgQOi8VCGJT_sbcGIjDjZooTs-1cuIuqO2Hd_oGBLBBIRPfFtKKmhjiwPH95ffWMn4n9Ni1cjZ5Up2TkRhkyAXJaAUM. Skipping this offset 0.
Request failed: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DQueer%26first%3D10&q=EgQOi8VCGJb_sbcGIjC_3s_1yCDdvGVDRKltKlMaRiWqnVJ2fi7eoncbHha1eFbfRBYmQwZfkHmG1XuyBFkyAXJaAUM. Skipping this offset 10.
Request failed: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DQueer%26first%3D20&q=EgQOi8VCGJf_sbcGIjC6UxnJ_07V6Gn9rncxbTC-nWeW4B4TONK59w6k34XZPYIVahyD2qqlKC3P6stEfhYyAXJaAUM. Skipping this offset 20.
Request failed: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DQueer%26first%3D30&q=

Fetching Links:   0%|                                                                         | 0/100 [02:40<?, ?link/s]


KeyboardInterrupt: 

Trying out Akshit Sir's Code for finding site wise data 

In [9]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Function to collect all unique URLs from the search results of a given website and keyword
def get_britannica_search_results(search_query):
    # Create the search URL for Britannica (with the appropriate path-based query)
    base_url = "https://www.britannica.com"
    search_url = f"https://www.britannica.com/search?query={search_query}"
    
    try:
        # Get the search results page
        response = requests.get(search_url)
        response.raise_for_status()  # Check for request errors
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []

    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all <a> tags with href attributes
    links = set()
    for a_tag in soup.find_all('a', href=True):
        link = urljoin(base_url, a_tag['href'])
        # Only collect unique and valid URLs within the same domain
        if link.startswith(base_url):
            links.add(link)

    return links

if __name__ == "__main__":
    # Define the keyword to search on Britannica
    keyword = "Lesbian"  # Change this to your desired search term
    
    # Get the unique links from the Britannica search results
    links = get_britannica_search_results(keyword)
    
    if links:
        print(f"\nUnique URLs found for '{keyword}' on Britannica:")
        for idx, link in enumerate(links, 1):
            print(f"{idx}. {link}")
    else:
        print(f"No links found for '{keyword}' on Britannica.")



Unique URLs found for 'Lesbian' on Britannica:
1. https://www.britannica.com/browse/Geography-Travel
2. https://www.britannica.com/topic/Parents-Families-and-Friends-of-Lesbians-and-Gays
3. https://www.britannica.com/browse/World-History
4. https://www.britannica.com/Science-Tech
5. https://www.britannica.com/stories/demystified
6. https://www.britannica.com/chatbot
7. https://www.britannica.com/
8. https://www.britannica.com/History-Society
9. https://www.britannica.com/stories/the-forum
10. https://www.britannica.com/topic/lesbian-feminism
11. https://www.britannica.com/new-articles
12. https://www.britannica.com/money
13. https://www.britannica.com/study/infographics
14. https://www.britannica.com/explore/100women/
15. https://www.britannica.com/topic/International-Lesbian-Gay-Bisexual-Trans-and-Intersex-Association
16. https://www.britannica.com/contributor/Genny-Beemyn/9536811
17. https://www.britannica.com/biography/Natalie-Barney
18. https://www.britannica.com/browse/Plants
19.

In [12]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Function to collect all unique URLs from the search results of a given website and keyword
def get_britannica_search_results(search_query):
    # Create the search URL for Britannica (with the appropriate path-based query)
    base_url = "https://www.britannica.com"
    search_url = f"https://www.britannica.com/search?query={search_query}"
    
    try:
        # Get the search results page
        response = requests.get(search_url)
        response.raise_for_status()  # Check for request errors
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []

    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all <a> tags with href attributes
    links = set()
    for a_tag in soup.find_all('a', href=True):
        link = urljoin(base_url, a_tag['href'])
        # Only collect unique and valid URLs within the same domain
        if link.startswith(base_url):
            links.add(link)

    return links

if __name__ == "__main__":
    # Define the keyword to search on Britannica
    keyword = "Gay"  # Change this to your desired search term
    
    # Get the unique links from the Britannica search results
    links = get_britannica_search_results(keyword)
    
    if links:
        print(f"\nUnique URLs found for '{keyword}' on Britannica:")
        for idx, link in enumerate(links, 1):
            print(f"{idx}. {link}")
    else:
        print(f"No links found for '{keyword}' on Britannica.")



Unique URLs found for 'Gay' on Britannica:
1. https://www.britannica.com/browse/Geography-Travel
2. https://www.britannica.com/browse/World-History
3. https://www.britannica.com/place/Esfahan
4. https://www.britannica.com/Science-Tech
5. https://www.britannica.com/biography/Roxane-Gay
6. https://www.britannica.com/stories/demystified
7. https://www.britannica.com/chatbot
8. https://www.britannica.com/science/Gay-Lussacs-law-of-combining-volumes
9. https://www.britannica.com/
10. https://www.britannica.com/History-Society
11. https://www.britannica.com/stories/the-forum
12. https://www.britannica.com/list/5-important-places-in-global-lgbtqia-history
13. https://www.britannica.com/biography/Ross-Gay
14. https://www.britannica.com/one-good-fact/why-is-june-gay-pride-month
15. https://www.britannica.com/new-articles
16. https://www.britannica.com/biography/Gay-Talese
17. https://www.britannica.com/money
18. https://www.britannica.com/study/infographics
19. https://www.britannica.com/explo