In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Generate a dataframe with websites

In [None]:
# Get the good links from main wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_hedge_funds"

# Send GET request
response = requests.get(url)
response.raise_for_status()  # Ensure the request was successful

# Parse HTML content
soup = BeautifulSoup(response.text, "html.parser")

# Extract all <a> tags and collect their href attributes
links = []
for a_tag in soup.find_all("a", href=True):
    href = a_tag["href"]
    # Wikipedia internal links often start with "/wiki/"
    if href.startswith("/wiki/") or href.startswith("http"):
        full_link = requests.compat.urljoin(url, href)
        links.append(full_link)

# Remove duplicates while preserving order
unique_links = list(dict.fromkeys(links))

# Remopve all the useless links
unique_links = unique_links[59:192]

In [None]:
# Get the links to every website

def get_website(urls):
    res = []

    for k, url in enumerate(urls):
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()

        # Parse the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the <th> with the text 'Website'
        website_th = soup.find("th", class_="infobox-label", string="Website")

        if website_th:
            # Get the next <td> after this <th>
            website_td = website_th.find_next_sibling("td")
            if website_td:
                a_tag = website_td.find("a", href=True)
                if a_tag:
                    website_link = a_tag["href"]
                    res.append(website_link)
                else:
                    res.append(False)
                    print(k)
                    print("No link found in Website field.", url)
            else:
                res.append(False)
                print("No <td> found after Website label.", url)
        else:
            res.append(False)
            print("No Website label found.", url)


    return res

res = get_website(unique_links)
res[128] = 'https://www.rokoscapital.com/'

No Website label found. https://en.wikipedia.org/wiki/Archegos_Capital_Management
No Website label found. https://en.wikipedia.org/wiki/Baker_Brothers_Advisors
No Website label found. https://en.wikipedia.org/wiki/D1_Capital_Partners
No Website label found. https://en.wikipedia.org/wiki/Discovery_Capital_Management
No Website label found. https://en.wikipedia.org/wiki/Ellington_Management_Group
No Website label found. https://en.wikipedia.org/wiki/ESL_Investments
No Website label found. https://en.wikipedia.org/wiki/Glenview_Capital_Management
No Website label found. https://en.wikipedia.org/wiki/GoldenTree_Asset_Management
No Website label found. https://en.wikipedia.org/wiki/Paulson_%26_Co.
No Website label found. https://en.wikipedia.org/wiki/Quantum_Fund
No Website label found. https://en.wikipedia.org/wiki/Touradji_Capital_Management
128
No link found in Website field. https://en.wikipedia.org/wiki/Rokos_Capital_Management
No Website label found. https://en.wikipedia.org/wiki/Scip

In [None]:
# Clean and save the final dataframe

df = pd.DataFrame([], columns = ['wiki', 'website'])
df['wiki'] = unique_links
df['website'] = res
df['company'] = df['wiki'].str.split('/wiki/').apply(lambda x: x[-1])
df.to_csv('company_websites.csv')

# Extract the data we need

In [2]:
df = pd.read_csv('company_websites.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,wiki,website,company
0,0,https://en.wikipedia.org/wiki/Acadian_Asset_Ma...,http://www.acadian-asset.com,Acadian_Asset_Management
1,1,https://en.wikipedia.org/wiki/Adage_Capital_Ma...,http://www.adagecapital.com,Adage_Capital_Management
2,2,https://en.wikipedia.org/wiki/Alphadyne_Asset_...,http://adyne.com,Alphadyne_Asset_Management
3,3,https://en.wikipedia.org/wiki/AlphaSimplex_Group,http://www.alphasimplex.com,AlphaSimplex_Group
4,4,https://en.wikipedia.org/wiki/Altimeter_Capital,http://altimeter.com,Altimeter_Capital


In [4]:
import requests
from bs4 import BeautifulSoup
import re

def fetch_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_emails_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    emails = set()

    # Look for mailto links
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('mailto:'):
            email = href[7:]  # Remove 'mailto:'
            emails.add(email)

    # Also look in the plain text
    text_content = soup.get_text()
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    emails.update(re.findall(email_pattern, text_content, re.IGNORECASE))

    return list(emails)

def get_emails_from_url(url):
    html_content = fetch_page(url)
    if html_content is None:
        return []
    emails = extract_emails_from_html(html_content)
    return list(set(emails))  # Remove duplicates

def get_emails_from_urls(urls):
    email_dict = {}
    for url in urls:
        print(f"Processing: {url}")
        emails = get_emails_from_url(url)
        email_dict[url] = emails
    return email_dict

def read_urls_from_file(file_path):
    with open(file_path, 'r') as file:
        urls = [line.strip() for line in file if line.strip()]
    return urls

def write_emails_to_file(email_dict, file_path):
    with open(file_path, 'w') as file:
        for url, emails in email_dict.items():
            file.write(f"URL: {url}\n")
            file.write("Emails found: " + ", ".join(emails) + "\n\n")

# if __name__ == "__main__":
#     # Example usage with file input/output
#     input_file = "urls.txt"  # File containing URLs, one per line
#     output_file = "emails.txt"  # File to save the results

#     urls = read_urls_from_file(input_file)
#     emails = get_emails_from_urls(urls)
#     write_emails_to_file(emails, output_file)
#     print(f"Results saved to {output_file}")


In [6]:
emails = get_emails_from_urls(df['website'].iloc[:10])

Processing: http://www.acadian-asset.com
Processing: http://www.adagecapital.com
Processing: http://adyne.com
Processing: http://www.alphasimplex.com
Processing: http://altimeter.com
Processing: http://angelogordon.com
Error fetching http://angelogordon.com: 403 Client Error: Forbidden for url: http://angelogordon.com/
Processing: http://amlp.com
Error fetching http://amlp.com: HTTPConnectionPool(host='amlp.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x000001E2DE3FB020>: Failed to resolve 'amlp.com' ([Errno 11001] getaddrinfo failed)"))
Processing: http://aqr.com
Processing: False
Error fetching False: Invalid URL 'False': No scheme supplied. Perhaps you meant https://False?
Processing: https://web.archive.org/web/20230619183139/https://www.assuredinvestmentmanagement.com/


In [7]:
emails

{'http://www.acadian-asset.com': [],
 'http://www.adagecapital.com': [],
 'http://adyne.com': ['LegalCompliance@adyne.com'],
 'http://www.alphasimplex.com': [],
 'http://altimeter.com': ['IR@altimeter.com',
  'press@altimeter.com',
  'info@altimetercapital.com'],
 'http://angelogordon.com': [],
 'http://amlp.com': [],
 'http://aqr.com': [],
 'False': [],
 'https://web.archive.org/web/20230619183139/https://www.assuredinvestmentmanagement.com/': []}