<a href="https://colab.research.google.com/github/YasserBad/IBBA.org-Scraper/blob/main/IBBA_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installing the needed libraries
!pip install bs4 pandas

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... done
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1256 sha256=2b611de19dcb82f05bdae5701b8c95b9286cac0eed49275fe108d8a410ee95b3
  Stored in directory: /root/.cache/pip/wheels/25/42/45/b773edc52acb16cd2db4cf1a0b47117e2f69bb4eb300ed0e70
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [3]:
#Imports
import requests
from bs4 import BeautifulSoup
import re
import concurrent.futures
import json
import pandas as pd


# Available states in IBBA.org

# URL slugs of the state pages this scraper accepts, in alphabetical order.
# NOTE(review): "rhode-island" is not in this list - presumably the site had no
# page for it when the list was written; confirm before adding it.
available_states_list = [
    "alabama", "alaska", "arizona", "arkansas",
    "california", "colorado", "connecticut",
    "delaware", "district-of-columbia",
    "florida",
    "georgia",
    "hawaii",
    "idaho", "illinois", "indiana", "iowa",
    "kansas", "kentucky",
    "louisiana",
    "maine", "maryland", "massachusetts", "michigan",
    "minnesota", "mississippi", "missouri", "montana",
    "nebraska", "nevada", "new-hampshire", "new-jersey",
    "new-mexico", "new-york", "north-carolina", "north-dakota",
    "ohio", "oklahoma", "oregon",
    "pennsylvania",
    "south-carolina", "south-dakota",
    "tennessee", "texas",
    "utah",
    "vermont", "virgin-islands", "virginia",
    "washington", "west-virginia", "wisconsin", "wyoming",
]

# Progress marker printed once the configuration above has been loaded.
print("Started")


# Emails on IBBA.org use Cloudflare's Email Obfuscation. This little function decodes the obfuscated email.
def cfDecodeEmail(encodedString):
    """Decode a Cloudflare-obfuscated email address.

    The input is a hex string: the first byte is an XOR key and every
    following byte is one byte of the address XOR-ed with that key.

    Args:
        encodedString: The hex payload (the ``data-cfemail`` attribute value).

    Returns:
        The decoded address, or an empty string when the payload is empty.

    Raises:
        ValueError: If the payload contains non-hexadecimal characters.
    """
    if not encodedString:
        # Nothing to decode; callers treat a falsy result as "not available".
        return ''
    key = int(encodedString[:2], 16)
    raw = bytes(
        int(encodedString[i:i + 2], 16) ^ key
        for i in range(2, len(encodedString), 2)
    )
    # The XOR-ed payload is a byte sequence, so decode it as UTF-8 rather than
    # mapping each byte to a code point (which garbles non-ASCII addresses).
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the byte-per-character interpretation.
        return raw.decode('latin-1')


# Here, we check if the state exists in the site.
def check_if_state_exists(state_name):
    """Return True when ``state_name`` maps to a known state slug.

    The name is lower-cased, stripped of surrounding whitespace and has its
    spaces replaced by hyphens before being looked up in
    ``available_states_list``.
    """
    slug = state_name.lower().strip().replace(' ', '-')
    return slug in available_states_list

# Here, We process the names of the states in case the user makes a mistake, or puts on extra whitespaces or w/e.
def preprocess_state_name(state_name):
    """Turn a free-form state name into a lower-case, hyphen-separated slug.

    Leading, trailing and repeated whitespace is collapsed, so
    ``"  New   York "`` becomes ``"new-york"``.
    """
    words = state_name.split()
    hyphenated = "-".join(words)
    return hyphenated.lower()

# Here, we use the concurrent.futures library to send the requests to the website simultaneously. I've checked, and the site isn't using any protection
# against multiple requests from the same IP, so I haven't implemented any proxy / IP rotation.
def fetch_detailed_info(card_links, count):
    """Fetch the details of up to ``count`` profile links concurrently.

    Args:
        card_links: Sequence of profile page URLs.
        count: Maximum number of links to fetch. Values larger than the
            number of links fetch everything; values below zero fetch nothing.

    Returns:
        A list with one entry per fetched link, in the same order as the
        links. Each entry is whatever ``fetch_one_detailed_info`` returns
        for that link.
    """
    # Slicing already copes with ``count`` exceeding the list length; clamping
    # stops a negative ``count`` from silently meaning "all but the last N".
    links_to_fetch = card_links[:max(count, 0)]
    if not links_to_fetch:
        # Nothing to do, so avoid spinning up a thread pool.
        return []

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # ``map`` yields results in input order, which keeps the output
        # reproducible regardless of which request finishes first.
        return list(executor.map(fetch_one_detailed_info, links_to_fetch))


# This is the function that scrapes the link of the profile of each user, returning the name, address, city ...... If it doesn't find a value, it returns "N/A"
def fetch_one_detailed_info(card_link):
    """Scrape one broker profile page and return its contact details.

    Args:
        card_link: URL of the profile page.

    Returns:
        On an HTTP 200 response, a dict with the keys ``title``, ``image``,
        ``city``, ``address``, ``phone_number`` and ``email``. Any value that
        cannot be found on the page is ``"N/A"``. On any other status code
        the string ``"Error <status_code>"`` is returned instead.
    """
    not_available = "N/A"

    print(f"Fetching {card_link}")
    # Without a timeout a stalled connection would block its worker forever.
    response = requests.get(card_link, timeout=30)
    if response.status_code != 200:
        return f"Error {response.status_code}"

    soup = BeautifulSoup(response.content, "html.parser")

    # Every lookup below tolerates a missing element so that one oddly
    # formatted profile cannot abort the whole scrape.
    title_tag = soup.find('h1', attrs={"class": "brokers__profile--informationName"})
    title = (title_tag.text.strip() if title_tag else "") or not_available

    image_box = soup.find('div', attrs={"class": "brokers__profile--image"})
    figure = image_box.figure if image_box else None
    image_tag = figure.img if figure else None
    image = (image_tag.get('src') if image_tag else None) or not_available

    city_name = not_available
    address = not_available
    phone_number = not_available
    email = not_available

    information_card = soup.find('div', attrs={"class": "brokers__profile--left"})
    if information_card:
        city = information_card.find('div', attrs={"class": "brokers__profile--leftCity"})
        if city:
            # Collapse runs of whitespace/newlines inside the city block.
            city_name = re.sub(r'\s+', ' ', city.text).strip() or not_available

        address_element = information_card.find('div', attrs={"class": "brokers__profile--leftAddress"})
        if address_element and address_element.span:
            address = address_element.span.text or not_available

        # Phone and email share the same CSS class; the link target tells them apart.
        contact_rows = information_card.find_all("div", attrs={"class": "brokers__profile--leftPhone"})
        for element in contact_rows:
            anchor = element.a
            href = (anchor.get('href') if anchor else None) or ""
            if "tel" in href:
                parts = href.split(':')
                number = parts[1] if len(parts) > 1 else ""
                phone_number = number or not_available
            elif "email" in href:
                span = anchor.span
                encoded = span.get('data-cfemail') if span else None
                email = (cfDecodeEmail(encoded) if encoded else "") or not_available

    return {
        'title': title,
        'image': image,
        'city': city_name,
        'address': address,
        'phone_number': phone_number,
        'email': email,
    }



# This is the main function that scrapes all profiles by STATE NAME, You can also set the number of results you want.
# If you want them all, Just set it to a very large number, something like 10000 for example.
def fetch_brokers_in_state(state_name, count = 50):
    """Scrape the broker profiles listed on a state's page.

    Args:
        state_name: Name of the state; it is normalized with
            ``preprocess_state_name`` before use.
        count: Maximum number of profiles to fetch (default 50).

    Returns:
        The list produced by ``fetch_detailed_info`` for the profile links
        found on the state page, or an error message string when the state is
        not in the supported list or the state page does not return HTTP 200.
    """
    processed_state_name = preprocess_state_name(state_name)
    if not check_if_state_exists(processed_state_name):
        return "This state is not available."

    url = f"https://www.ibba.org/state/{processed_state_name}/"
    # Without a timeout a stalled connection would hang the script forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return f"Error fetching {url} - ERROR : {response.status_code}"

    soup = BeautifulSoup(response.content, "html.parser")
    card_links = []
    for card in soup.find_all('div', attrs={"class": "brokers__box"}):
        card_title = card.find('h4', attrs={"class": "brokers__item--topTitle"})
        anchor = card_title.find('a') if card_title else None
        href = anchor.get('href') if anchor else None
        link = href.strip() if href else ""
        # Skip cards that carry no usable profile link instead of crashing.
        if link:
            card_links.append(link)

    return fetch_detailed_info(card_links, count)


# This is the main function that writes all the results into an excel file.
def main():
    """Scrape the Florida brokers and write them to ``output_brokers.xlsx``.

    If the scrape as a whole fails, the error message is printed and no file
    is written. Individual profiles that could not be fetched are reported
    and left out of the spreadsheet.
    """
    # The second argument caps the number of results; a huge value means "all".
    data = fetch_brokers_in_state("florida", 10000000)

    # fetch_brokers_in_state signals failure by returning a message string,
    # which pd.DataFrame cannot turn into a table.
    if isinstance(data, str):
        print(data)
        return

    # Per-profile failures come back as strings mixed in with the dict rows.
    rows = [entry for entry in data if isinstance(entry, dict)]
    for entry in data:
        if not isinstance(entry, dict):
            print(f"Skipped a profile: {entry}")

    # Create a Pandas DataFrame
    df = pd.DataFrame(rows)
    # Save the DataFrame to an Excel file
    excel_filename = "output_brokers.xlsx"
    df.to_excel(excel_filename, index=False)
    print(f"Data saved to '{excel_filename}'")



# Run the scraper only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
	main()

Started
Fetching https://www.ibba.org/broker-profile/florida/clermont/joseph-shemansky/
Fetching https://www.ibba.org/broker-profile/florida/winter-park/jon-franz/
Fetching https://www.ibba.org/broker-profile/florida/jacksonville/terri-sherman/
Fetching https://www.ibba.org/broker-profile/florida/orlando/simon-harrison/
Fetching https://www.ibba.org/broker-profile/florida/saint-augustine/tracey-burke/
Fetching https://www.ibba.org/broker-profile/florida/sarasota/rick-gardner/
Fetching https://www.ibba.org/broker-profile/florida/cocoa-beach/kent-cooper/
Fetching https://www.ibba.org/broker-profile/florida/fort-lauderdale/andrew-cagnetta/
Fetching https://www.ibba.org/broker-profile/florida/cooper-city/russell-cohen/
Fetching https://www.ibba.org/broker-profile/florida/wellington/brad-coffman/
Fetching https://www.ibba.org/broker-profile/florida/fort-myers/bruce-pockrandt/
Fetching https://www.ibba.org/broker-profile/florida/boca-raton/baris-guler/
Fetching https://www.ibba.org/broker-pr