In [4]:
%%capture
%pip install firecrawl-py
%pip install PyPDF2
%pip install itables

# !rm -r github_broker_address
# !git clone https://github.com/auskfullex/github_broker_address.git

In [5]:
import requests
import json
from typing import List, Dict, Any, Optional
import time
import re

import logging
# from google.colab import userdata

# FireCrawl user info
headers = {
            # "Authorization": f"Bearer {userdata.get('firecrawl_api_key')} ",
            "Authorization": f"Bearer fc-df28f822ed684a03a69cb23d5d560487",
            "Content-Type": "application/json"
        }

def remove_duplicates_frozenset(list_of_dicts):
    unique_dicts = []
    seen = set()
    
    for d in list_of_dicts:
        # print(d.items())
        # Convert dict to a frozenset of its items
        items = frozenset(d.items())
        if items not in seen:
            seen.add(items)
            unique_dicts.append(d)
    
    return unique_dicts

def search_broker_via_api():

    # Constructing query to include individuals (excluding firms) and the search term
    identified_count = 100
    identified_individuals = []
    existing_crds = set()

    try:

        # Reiterate the firecrawl response until no more matching is found
        # i = 1
        while identified_count != 0:
            
            base_query = f'site:brokercheck.finra.org inurl:files inurl:individual "{analys_name}"'  
            
            # Updating the query that excludes searched results so far.
            query = ' '.join([base_query,'-'+' -'.join(existing_crds)]) if existing_crds else base_query
            # print(f"Round {i} starting. Query for this round is:{query}")

            payload = {
                "query": query,
                "limit": 50
            }

            response = requests.post(
                "https://api.firecrawl.dev/v1/search",
                headers=headers,
                json=payload
            )

            results = response.json().get('data','')

            # Adding new results
            results_w_crd = []
            new_crds = set()
            for result in results:
                crd = re.search(r'(?:individual_|individual/summary/)(\d+)',result.get('url', ""), re.IGNORECASE)
                if crd:
                    result['crd'] = crd.group(1)
                    results_w_crd.append(result)
                    new_crds.add(crd.group(1))

            identified_individuals.extend(results_w_crd)

            existing_crds_old = existing_crds.copy() 
            existing_crds.update(new_crds)
            identified_count = len(existing_crds.difference(existing_crds_old))
            
            # print(f"Round {i} completed. Among {len(new_crds)} analyst(s) searched, {identified_count} are newly added.")
            # print(existing_crds)
            # i += 1

        return remove_duplicates_frozenset(identified_individuals)

    except requests.RequestException as e:
        print(f"API request error: {e}")
        return []
    except json.JSONDecodeError:
        print("Error parsing API response")
        return []
    except Exception as e:
        print(f"Unexpected error: {e}")
        return []

def filtering_identified_data(results):
    # Process the results to find BrokerCheck URLs
    matched_names = []

    for result in results:
        url = result.get("url", "")
        title = result.get("title", "").replace("[PDF] BrokerCheck Report - ","").replace("[PDF]","")
        crd = result.get("crd", "")

        # Extract name with suffix from the title
        name_w_suffix = title.split(" - ")[0].upper().strip()  if " - " in title else title.upper().strip()

        # Take out suffix from name
        name = ' '.join(name_w_suffix.split(" ")[:-1]) if name_w_suffix.split(" ")[-1] in \
                                                          ["JR","JR.","Jr","Jr.","SR","SR.","Sr","Sr.",\
                                                           "JUNIOR","Junior","SENIOR","Senior",\
                                                           "I","II","III","IV","V","1","11","111"] \
                                                       else  name_w_suffix

        # Test if the two last name matche
        last_name_match = analys_name.split(' ')[0] == name.split(' ')[-1]

        # Test if the two first name initial matches
        first_name_initial_match = [name_not_last[0] == analys_name.split(' ')[-1] for name_not_last in name.split(" ")[:-1]]

        if last_name_match and sum(first_name_initial_match) >= 1:
            matched_names.append({
                "name": name,
                "url": f'https://files.brokercheck.finra.org/individual/individual_{crd}.pdf',
                "crd": crd
            })

    matched_names_nodup = []
    for matched_name in matched_names:
        if matched_name not in matched_names_nodup:
            matched_names_nodup.append(matched_name)
    count_matches = len(matched_names_nodup)
    print(f"{count_matches} unique analyst names are matched.")
    
    if count_matches>0: 
        return matched_names_nodup
    else: 
        matched_names_nodup.append({
                "name": "Unavailable",
                "url": None,
                "crd": "Unavailable"
            })
        return matched_names_nodup

In [None]:
import re
from PyPDF2 import PdfReader
from typing import Dict, Any, List, Tuple
import io
from io import BytesIO

def extract_brokercheck_data(name, url: str, crd: str) -> Dict[str, Any]:
    # Initialize result dictionary
    result = {
        "analys": analys,
        "analys_name": analys_name,
        "name": name,
        "url": None,
        "crd": crd,
        "registration_history": [],
        "exams": [],
        "professional_designations": None
    }
  
    print(f"\tExtracting data for {name} (CRD#: {crd})...")
    try:
        with requests.get(url, stream=True) as response:
            print(f"\tRequesting the URL...")
            response.raise_for_status()
            print(f"\tReceiving the response from the request...")
            
            

            # Open and read the PDF file
            pdf_file = BytesIO(response.content)
            print(f"\tReading pdf using BytesIO...")

            pdf_reader = PdfReader(pdf_file)
            print(f"\tReading by PdfReader...")

            # Extract text from all pages
            text_all = " ".join([pdf_reader.pages[i].extract_text() for i in range(len(pdf_reader.pages))])
            print(f"\tReading the entire text...")
            # print(text_all)

            # Full name
            result["name"] = name

            # CRD number
            result["crd"] = crd
            result["url"] = url

            # Summary page
            summary_page_text = pdf_reader.pages[2].extract_text()
            print(f"\tReading summary page...")
            # print(summary_page_text)

            # Current Broker
            current_brokers_block = re.search(r'Currently employed by and registered with the\nfollowing Firm\(s\)\:\n((?:.+\n)+Registered with this firm since: \d{2}/\d{2}/\d{4}(?:B|IA))Report',
                                            summary_page_text)
            print(f"\tWorking on the current broker...")
            # print(current_brokers_block.group(1))

            if current_brokers_block:
                current_brokers = re.findall(r'((?:.+\n){4,7}(?:.+\d{2}/\d{2}/\d{4}(?:B|IA)))\n?',
                                            current_brokers_block.group(1))

                if current_brokers:
                    for current_broker in current_brokers:
                        current_broker_items = re.search(r'((?:.+\n)*?)((?:\d+.+\n)(?:.+\n)*?)(.+),\s*([A-Z]{2})\s*\d{5}\n(?:CRD#\s*(\d+)\n)(?:.+)(\d{2}/\d{2}/\d{4})(B|IA)',
                                                current_broker)

                        if current_broker_items:
                            result["registration_history"].append({
                                "beg_date": current_broker_items.group(6).strip(),
                                "end_date": "",
                                "broker": current_broker_items.group(1).strip().replace('\n', ' '),
                                "crd_firm": current_broker_items.group(5).strip(),
                                "role_class": current_broker_items.group(7).strip(),
                                "street": current_broker_items.group(2).strip().replace('\n', ' '),
                                "city": current_broker_items.group(3).strip(),
                                "state": current_broker_items.group(4).strip()
                            })


            # Former Broker
            fmr_brokers_block = re.search(r'securities firm\(s\):\n((?:.+\n)+.+)www\.finra\.org',
                                        summary_page_text)
            print(f"\tWorking on the former broker...")

            if fmr_brokers_block:
                fmr_brokers = re.findall(r'(?:.+\n){3,6}\d{2}/\d{4}\s*-\s*\d{2}/\d{4}(?:B|IA)',
                                        fmr_brokers_block.group(1))

                if fmr_brokers:
                    for fmr_broker in fmr_brokers:
                        fmr_broker_items = re.search(r'((?:.+\n){1,4})CRD#\s*(\d+)\n(.+),\s*([A-Z]{2})\n(\d{2}/\d{4})\s*-\s*(\d{2}/\d{4})(B|IA)',
                                                    fmr_broker)

                        if fmr_broker_items:
                            result["registration_history"].append({
                                    "beg_date": fmr_broker_items.group(5).strip(),
                                    "end_date": fmr_broker_items.group(6).strip(),
                                    "broker": fmr_broker_items.group(1).strip().replace('\n', ' '),
                                    "crd_firm": fmr_broker_items.group(2).strip(),
                                    "street": "",
                                    "role_class": fmr_broker_items.group(7).strip(),
                                    "city": fmr_broker_items.group(3).strip(),
                                    "state": fmr_broker_items.group(4).strip()
                            })

            # Exam table
            exams_table = re.search(r'SRO or state\nregistration\.\n(.+\n(?:.+\n)+)Additional information about the above',
                                    text_all)
            print(f"\tWorking on the exam part...")
            # print(exams_table.group(1))

            if exams_table: 
                # Exam tables: extract each group and each line
                exams_by_type_all = re.search(r'Principal/Supervisory Exams\n((?:.+\n)*?)Exam Category DateGeneral Industry/Product Exams\n((?:.+\n)*?)Exam Category DateState Securities Law Exams\n((?:.+\n?)*?)(?=\n|$)', exams_table.group(1))
                # print(exams_by_type_all.group(3))

                if exams_by_type_all
                    for i, exams_by_type in enumerate(exams_by_type_all.groups()):
                        if i == 0: exam_type = "Principal/Supervisory Exams"
                        elif i == 1: exam_type = "General Industry/Product Exams"
                        else: exam_type = "State Securities Law Exams"

                        # Exams by each group
                        exams = re.findall(r'((?:.*?\n?)*?\d{2}/\d{2}/\d{4}.*?(?:BIA|B|IA))',exams_by_type)
                        # print(exams)

                        if exams:
                            for exam in exams:
                                exam_items = re.search(r'((?:.*?\n?)*?)(\d{2}/\d{2}/\d{4})(.*?)(BIA|B|IA)', exam)
                                if exam_items:
                                    result["exams"].append({"exam_type": exam_type,
                                                            "exam_title": exam_items.group(1).strip().replace('\n', ' '),
                                                            "exam_category": exam_items.group(3).strip().replace('\n', ' '),
                                                            "exam_date": exam_items.group(2).strip(),
                                                            "exam_class": exam_items.group(4).strip()
                                                            })
                        else:
                            result["exams"].append({"exam_type": exam_type,
                                                    "exam_title": None,
                                                    "exam_category": None,
                                                    "exam_date": None,
                                                    "exam_class": None
                                                    })

                    # Professional Designations from page 4
                    designation_match = re.search(r'This section details that the representative has reported (\d+) professional designation\(s\)\.(.*?)\s+This representative holds or did hold (\d+) professional designation',
                                                text_all, re.DOTALL)
                    print(f"\tWorking on the professional designations part...")
                    if designation_match:
                        result["professional_designations"] = designation_match.group(2).strip()

                    return result

    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF: {e}")
        return result
# a = extract_brokercheck_data(pdf_file_path='github_broker_address/analyst_pdf_files/2565381.pdf')
# a

In [6]:
import logging
# from google.colab import userdata

def collecting_and_returning_json():
    logging.getLogger('PyPDF2').setLevel(logging.ERROR)
    print(f'Searching for an IBES analyst "{analys_name}"...')

    search_results = search_broker_via_api()
    filtered_results = filtering_identified_data(search_results)

    final_results = []
    for filtered_result in filtered_results:
        final_results.append(extract_brokercheck_data(**filtered_result))
        
    return final_results

In [None]:
from tqdm.notebook import tqdm
import csv

file_path = 'C:\\Queens College Dropbox\\Jangwon Suh\\Research\\WhyDisagree\\programs\\'

# FireCrawl user info
headers = {
            # "Authorization": f"Bearer {userdata.get('firecrawl_api_key')} ",
            "Authorization": f"Bearer fc-df28f822ed684a03a69cb23d5d560487",
            "Content-Type": "application/json"
        }

results = []
with open(f'{file_path}Analyst_data_to_scrape.csv', 'r', encoding='utf-8-sig') as file:
    reader = csv.reader(file)
    for analys, analys_name in reader:
        results.extend(collecting_and_returning_json())
        # if analys == "125": break

In [None]:
%pip install firecrawl-py --quiet
%pip install PyPDF2 --quiet

import re
from PyPDF2 import PdfReader
from typing import Dict, Any, List, Tuple
import io
from io import BytesIO
import logging
import requests
import json

def extract_brokercheck_data(name, url: str, crd: str) -> Dict[str, Any]:
    # Initialize result dictionary
    result = {
        "analys": analys,
        "analys_name": analys_name,
        "name": name,
        "url": None,
        "crd": crd,
        "registration_history": [],
        "exams": [],
        "professional_designations": None
    }
  
    print(f"\tExtracting data for {name} (CRD#: {crd})...")
    try:
        with requests.get(url, stream=True) as response:
            print(f"\tRequesting the URL...")
            response.raise_for_status()
            print(f"\tReceiving the response from the request...")
            
            

            # Open and read the PDF file
            pdf_file = BytesIO(response.content)
            print(f"\tReading pdf using BytesIO...")

            pdf_reader = PdfReader(pdf_file)
            print(f"\tReading by PdfReader...")

            # Extract text from all pages
            text_all = " ".join([pdf_reader.pages[i].extract_text() for i in range(len(pdf_reader.pages))])
            print(f"\tReading the entire text...")
            # print(text_all)

            # Full name
            result["name"] = name

            # CRD number
            result["crd"] = crd
            result["url"] = url

            # Summary page
            summary_page_text = pdf_reader.pages[2].extract_text()
            print(f"\tReading summary page...")
            # print(summary_page_text)

            # Current Broker
            current_brokers_block = re.search(r'Currently employed by and registered with the\nfollowing Firm\(s\)\:\n((?:.+\n)+Registered with this firm since: \d{2}/\d{2}/\d{4}(?:B|IA))Report',
                                            summary_page_text)
            print(f"\tWorking on the current broker...")
            # print(current_brokers_block.group(1))

            if current_brokers_block:
                current_brokers = re.findall(r'((?:.+\n){4,7}(?:.+\d{2}/\d{2}/\d{4}(?:B|IA)))\n?',
                                            current_brokers_block.group(1))

                if current_brokers:
                    for current_broker in current_brokers:
                        current_broker_items = re.search(r'((?:.+\n)*?)((?:\d+.+\n)(?:.+\n)*?)(.+),\s*([A-Z]{2})\s*\d{5}\n(?:CRD#\s*(\d+)\n)(?:.+)(\d{2}/\d{2}/\d{4})(B|IA)',
                                                current_broker)

                        if current_broker_items:
                            result["registration_history"].append({
                                "beg_date": current_broker_items.group(6).strip(),
                                "end_date": "",
                                "broker": current_broker_items.group(1).strip().replace('\n', ' '),
                                "crd_firm": current_broker_items.group(5).strip(),
                                "role_class": current_broker_items.group(7).strip(),
                                "street": current_broker_items.group(2).strip().replace('\n', ' '),
                                "city": current_broker_items.group(3).strip(),
                                "state": current_broker_items.group(4).strip()
                            })


            # Former Broker
            fmr_brokers_block = re.search(r'securities firm\(s\):\n((?:.+\n)+.+)www\.finra\.org',
                                        summary_page_text)
            print(f"\tWorking on the former broker...")

            if fmr_brokers_block:
                fmr_brokers = re.findall(r'(?:.+\n){3,6}\d{2}/\d{4}\s*-\s*\d{2}/\d{4}(?:B|IA)',
                                        fmr_brokers_block.group(1))

                if fmr_brokers:
                    for fmr_broker in fmr_brokers:
                        fmr_broker_items = re.search(r'((?:.+\n){1,4})CRD#\s*(\d+)\n(.+),\s*([A-Z]{2})\n(\d{2}/\d{4})\s*-\s*(\d{2}/\d{4})(B|IA)',
                                                    fmr_broker)

                        if fmr_broker_items:
                            result["registration_history"].append({
                                    "beg_date": fmr_broker_items.group(5).strip(),
                                    "end_date": fmr_broker_items.group(6).strip(),
                                    "broker": fmr_broker_items.group(1).strip().replace('\n', ' '),
                                    "crd_firm": fmr_broker_items.group(2).strip(),
                                    "street": "",
                                    "role_class": fmr_broker_items.group(7).strip(),
                                    "city": fmr_broker_items.group(3).strip(),
                                    "state": fmr_broker_items.group(4).strip()
                            })

            # Exam table
            exams_table = re.search(r'SRO or state\nregistration\.\n(.+\n(?:.+\n)+)Additional information about the above',
                                    text_all)
            print(f"\tLine #116: Working on the exam part...")
            # print(exams_table.group(1))

            # Exam tables: extract each group and each line
            if exams_table: 
                footnote = f"�2025 FINRA. All rights reserved. Report about {name}. www.finra.org/brokercheck"
                title = r"User GuidanceBroker Qualifications\nIndustry Exams this Broker has Passed, continued"
                exams_table_wo_footnote = exams_table.group(1).replace(footnote, '')
                exams_by_type_all = re.search(r'Principal/Supervisory Exams\n((?:.+\n)*?)Exam Category DateGeneral Industry/Product Exams\n((?:.+\n)*?)Exam Category DateState Securities Law Exams\n((?:.+\n?)*?)(?=\n|$)', exams_table_wo_footnote)
                print(f"\tLine #122: Extracting the exams table...")
                print(exams_by_type_all.group(3))

            #     if exams_by_type_all:
            #         for i, exams_by_type in enumerate(exams_by_type_all.groups()):
            #             if i == 0: exam_type = "Principal/Supervisory Exams"
            #             elif i == 1: exam_type = "General Industry/Product Exams"
            #             else: exam_type = "State Securities Law Exams"

            #             # Exams by each group
            #             exams = re.findall(r'((?:.*?\n?)*?\d{2}/\d{2}/\d{4}.*?(?:BIA|B|IA))',exams_by_type)
            #             print(f"\tExtracting the exams by each group {i}...")
            #             # print(exams)

                #     if exams:
                #         for exam in exams:
                #             exam_items = re.search(r'((?:.*?\n?)*?)(\d{2}/\d{2}/\d{4})(.*?)(BIA|B|IA)', exam)
                #             if exam_items:
                #                 result["exams"].append({"exam_type": exam_type,
                #                                         "exam_title": exam_items.group(1).strip().replace('\n', ' '),
                #                                         "exam_category": exam_items.group(3).strip().replace('\n', ' '),
                #                                         "exam_date": exam_items.group(2).strip(),
                #                                         "exam_class": exam_items.group(4).strip()
                #                                         })
                #     else:
                #         result["exams"].append({"exam_type": exam_type,
                #                                 "exam_title": None,
                #                                 "exam_category": None,
                #                                 "exam_date": None,
                #                                 "exam_class": None
                #                                 })

                # # Professional Designations from page 4
                # designation_match = re.search(r'This section details that the representative has reported (\d+) professional designation\(s\)\.(.*?)\s+This representative holds or did hold (\d+) professional designation',
                #                             text_all, re.DOTALL)
                # print(f"\tWorking on the professional designations part...")
                # if designation_match:
                #     result["professional_designations"] = designation_match.group(2).strip()

    #     return result

    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF: {e}")
        return result

analys = "125"
analys_name = "COHEN J"
logging.getLogger('PyPDF2').setLevel(logging.ERROR)


filtered_result = {'name': 'BERNARD J COHEN', 'url': 'https://files.brokercheck.finra.org/individual/individual_49772.pdf', 'crd': '49772'}

extract_brokercheck_data(**filtered_result)



Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
	Extracting data for BERNARD J COHEN (CRD#: 49772)...
	Requesting the URL...
	Receiving the response from the request...
	Reading pdf using BytesIO...
	Reading by PdfReader...
	Reading the entire text...
	Reading summary page...
	Working on the current broker...
	Working on the former broker...
	Line #116: Working on the exam part...
	Line #122: Extracting the exams table...
Uniform Investment Adviser Law Examination 05/04/2005 Series 65 IA
Uniform Securities Agent State Law Examination 04/03/1981 Series 63 B6 �2025 FINRA. All rights reserved. Report about BERNARD J. COHEN. www.finra.org/brokercheckUser GuidanceBroker Qualifications
Industry Exams this Broker has Passed, continued



In [7]:
text_all

'BrokerCheck ReportBERNARD J COHENSection TitleReport SummaryBroker QualificationsRegistration and Employment HistoryDisclosure EventsCRD# 49772\n1\n2 - 7\n9 - 10\n11Page(s)\nWhen communicating online or investing with any professional, make sure you know who you\'re dealing with.\nPlease contact FINRA with any concerns.i Imposters might link to sites like BrokerCheck\nfrom phishing or similar scam websites, or through social media , trying to steal your personal information or your money. About BrokerCheck®\nBrokerCheck offers information on all current, and many former, registered securities brokers, and all current and former\nregistered securities firms. FINRA strongly encourages investors to use BrokerCheck to check the background of\nsecurities brokers and brokerage firms before deciding to conduct, or continue to conduct, business with them.\n·What is included in a BrokerCheck report?\n· BrokerCheck reports for individual brokers include information such as employment history, p

<h1><b>Trash hereafter.</b> </h1>


---



In [None]:
import re
import PyPDF2
from typing import Dict, Any, List, Tuple
import io
import pandas as pd

def extract_brokercheck_data(pdf_file_path: str) -> Dict[str, Any]:
    """
    Extract specific data from a FINRA BrokerCheck PDF report.

    Args:
        pdf_file_path: Path to the PDF file

    Returns:
        Dictionary containing the extracted data
    """
    # Initialize result dictionary
    result = {
        "full_name": None,
        "crd": None,
        "exams": [],
        "professional_designations": [],
        "registration_history": []
    }

    # Open and read the PDF file
    with open(pdf_file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        # Extract information from cover page (page 1)
        cover_page_text = pdf_reader.pages[0].extract_text()

        # Extract full name
        name_match = re.search(r'Report(.*?)Section TitleReport SummaryBroker', cover_page_text).group(1).strip()
        if name_match:
            result["full_name"] = name_match

        # Extract CRD number
        crd_match = re.search(r'CRD#\s+(\d+)', cover_page_text)
        if crd_match:
            result["crd"] = crd_match.group(1)

        # Extract exam information from page 3
        exams_page_text = pdf_reader.pages[4].extract_text()

        # Extract exam tables
        ps_header = r'Exam Category DatePrincipal/Supervisory Exams'
        gip_header = r'Exam Category DateGeneral Industry/Product Exams'
        ssl_header = r'Exam Category DateState Securities Law Exams'

        tables_pattern = ps_header+r'\n(.*?)\n'+gip_header+r'\n(.*?)\n'+ssl_header+r'\n(.*?)\nAdditional information'
        exams_tables = re.search(tables_pattern, exams_page_text, re.DOTALL)

        if exams_tables:
            for exams_table in exams_tables.groups():
                for exam in exams_table.split("\n"):
                    exam_items = re.search(r'(.*?)\s+(\d{2}/\d{2}/\d{4})\s+(.*?)\s+([B|IA])', exam)
                    # print(exam_items.group(4))
                    result["exams"].append({
                                            "type": exam_items.group(4),
                                            "exam_title": exam_items.group(1),
                                            "exam_category": exam_items.group(3),
                                            "exam_date": exam_items.group(2)
                                            })
    return result


In [None]:
def scrape_broker_page(url: str) -> Dict[str, Any]:
    """
    Scrape a broker's page for detailed information

    Args:
        url: The URL of the broker's page

    Returns:
        Dictionary containing broker details
    """
    # Endpoint for the scrape API
    endpoint = f"{base_url}/v1/scrape"

    # Prepare the payload
    payload = {
        "url": url,
        "formats": ["markdown"],
        "waitFor": 5000  # Wait 5 seconds for page to load
    }

    try:
        # Make the request to Firecrawl
        response = requests.post(
            endpoint,
            headers=headers,
            json=payload
        )
        response.raise_for_status()

        # Return the scraped content
        return response.json()

    except requests.RequestException as e:
        print(f"API request error: {e}")
        return {}
    except json.JSONDecodeError:
        print("Error parsing API response")
        return {}
    except Exception as e:
        print(f"Unexpected error: {e}")
        return {}

url = 'https://brokercheck.finra.org/individual/summary/2744439'
scrape_result = scrape_broker_page(url)
print(scrape_result['data']['markdown'])

[![broker-check-logo](https://brokercheck.finra.org/assets/images/bc_logo_large.png)](https://brokercheck.finra.org/)

[![broker-check-logo](https://brokercheck.finra.org/assets/images/bc_logo_large.png)](https://brokercheck.finra.org/)

[FEEDBACK](mailto:BrokerCheck@finra.org)

BrokerCheck Help Line (800) 289-9999

Schedule a Call

[FINRA Home](https://www.finra.org/)

[![broker-check-logo](https://brokercheck.finra.org/assets/images/bc_logo_large.png)](https://brokercheck.finra.org/)

- Individual

- Firm

- By clicking the SEARCH button or otherwise using BrokerCheck, I agree to BrokerCheck Terms of Use


Individual Name/CRD# (required)

at

Firm Name or CRD/SEC# (optional)

in

City, State or ZIP (optional)

Search

Firm Name or CRD/SEC# (optional)

in

City, State or ZIP (optional)

Search

By clicking the SEARCH button or otherwise using BrokerCheck,  I agree to BrokerCheck Terms of Use

When communicating online or investing with any professional, make sure you know who you’re d

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re
import logging


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

search_term = "BOSSHARD E"


base_url = "https://brokercheck.finra.org"
search_url = f"{base_url}/search/results"
headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://brokercheck.finra.org/",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1"
        }

encoded_search = requests.utils.quote(search_term)
search_url = f"{search_url}?searchValue={encoded_search}"

search_url
# Create a Session instance
# session = requests.Session()

# response = session.get(url = search_url, headers=headers)
# response.raise_for_status()

# if response.status_code == 200:
#     print("Search request successful")
#     html_content = response.text
# else:
#     logger.error(f"Search request failed with status code: {response.status_code}")

# results = []
# soup = BeautifulSoup(html_content, 'html.parser')

# result_elements = soup.select('div.search-result-item')

# if not result_elements:
#     print("No result elements found or selector needs updating")

#     broker_links = soup.find_all('a', href=re.compile(r'/individual/summary/\d+'))

#     for link in broker_links:
#           broker_id = link['href'].split('/')[-1]
#           broker_name = link.get_text(strip=True)

#           if broker_name and broker_id:
#               results.append({
#                   'name': broker_name,
#                   'id': broker_id,
#                   'url': f"{self.base_url}{link['href']}"
#               })

# else:
#       # Parse structured result elements
#     for element in result_elements:
#         name_elem = element.select_one('.broker-name')
#         id_elem = element.select_one('.broker-id')

#         if name_elem and id_elem:
#             broker_name = name_elem.get_text(strip=True)
#             broker_id = id_elem.get_text(strip=True).replace('CRD#', '').strip()
#             broker_url = f"{self.base_url}/individual/summary/{broker_id}"

#             results.append({
#                 'name': broker_name,
#                 'id': broker_id,
#                 'url': broker_url
#             })
# logger.info(f"Found {len(results)} results")




'https://brokercheck.finra.org/search/results?searchValue=BOSSHARD%20E'