<a href="https://colab.research.google.com/github/amien1410/amien-scrapers/blob/main/Procore_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import json
import pandas as pd

def _extract_company_info(company):
    """Flatten one raw search-result record into a dict of plain values.

    Every field is read defensively: a key that is missing, ``None`` or an
    empty list yields an empty string (or ``None`` for pass-through fields)
    instead of raising.

    Args:
        company (dict): One entry from the ``results`` list of a search
            response.

    Returns:
        dict: The flattened row, with list-valued fields joined by ``', '``.
    """
    # Use the first address as the primary one. `addresses` can be absent,
    # None, or an empty list, so indexing it directly is not safe.
    addresses = company.get('addresses') or [{}]
    address = addresses[0] or {}

    # Keep the fixed five-slot "street, city, province, postal, country"
    # layout so the column format stays stable even when parts are missing.
    address_fields = ('address1', 'city', 'province', 'postalCode1', 'countryCode')
    full_address = ', '.join(str(address.get(field) or '') for field in address_fields)

    # Upper-case the state code of each coverage area, skipping blank entries.
    states = [
        (area.get('admin1') or '').upper()
        for area in company.get('coverageAreas') or []
    ]
    coverage_areas = ', '.join(code for code in states if code)

    # Split provided services into trades and specialties in a single pass,
    # turning keys such as "rough_carpentry" into "rough carpentry".
    trades = []
    specialties = []
    for service in company.get('providedServices') or []:
        key = service.get('key')
        if not key:
            continue
        label = key.replace('_', ' ')
        level = service.get('level')
        if level == 'trade':
            trades.append(label)
        elif level == 'specialty':
            specialties.append(label)

    return {
        'companyId': company.get('companyId'),
        'companyName': company.get('name'),
        'fullAddress': full_address,
        'phone': company.get('phone'),
        'website': company.get('website'),
        'specialties': ', '.join(specialties),
        'trades': ', '.join(trades),
        'constructionSectors': ', '.join(company.get('constructionSectors') or []),
        'coverageAreas': coverage_areas,
        'updatedAt': company.get('updatedAt'),
        'companySize': company.get('companySize'),
    }


def fetch_company_data(company_name, state='', page_size=1000,
                       output_path='companies.json', session=None, timeout=30):
    """
    Fetch company data based on company name and state.

    Pages through the search endpoint until a page comes back with fewer
    than ``page_size`` results, flattens every record, optionally dumps the
    rows to a JSON file, and returns them as a DataFrame.

    Args:
        company_name (str): The name of the company to search for.
        state (str): The state code to filter by (optional).
        page_size (int): Number of results requested per page; also the
            threshold used to detect the last page.
        output_path (str | None): Where to write the JSON dump. Pass
            ``None`` to skip writing a file.
        session: Optional object exposing ``get(url, params=..., timeout=...)``
            (for example a ``requests.Session``). Defaults to the
            ``requests`` module itself.
        timeout (float | None): Per-request timeout in seconds.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the company data.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
        requests.HTTPError: If a response has an error status code (raised
            by ``raise_for_status`` when the default ``requests`` client or
            a ``requests.Session`` is used).
    """
    if page_size < 1:
        # A non-positive page size could never satisfy the stop condition.
        raise ValueError("page_size must be a positive integer")

    search_url = "https://pcn.procore.com/api/search"
    http = requests if session is None else session

    all_companies = []
    page = 1
    while True:
        # Pass the query as `params` so the client URL-encodes the values;
        # names containing spaces, '&' or braces are then sent intact.
        params = {
            'q': company_name,
            'state': state,
            'country': 'USA',
            'sort': 'relevance',
            'page': page,
            'pageSize': page_size,
        }
        res = http.get(search_url, params=params, timeout=timeout)
        # Fail loudly on HTTP errors instead of parsing an error body.
        res.raise_for_status()
        data = res.json()

        companies = data.get('results') or []
        all_companies.extend(_extract_company_info(company) for company in companies)

        # A short page means there is nothing left to fetch.
        if len(companies) < page_size:
            break

        page += 1

    # Save the results in a JSON file (skipped when output_path is None)
    if output_path is not None:
        with open(output_path, 'w', encoding='utf-8') as json_file:
            json.dump(all_companies, json_file, indent=4)

    return pd.DataFrame(all_companies)

In [None]:
# Search for companies matching "Ozark" with no state filter; as the last
# expression in the cell, the returned DataFrame is displayed below.
fetch_company_data(company_name="Ozark", state="")

Unnamed: 0,companyId,companyName,fullAddress,phone,website,specialties,trades,constructionSectors,coverageAreas,updatedAt,companySize
0,598134300000000.0,Ozark Mechanical,"286 Ridgewood Dr, Lake Ozark, MO, 65049, US",+1-636-317-1234,https://ozarkmechanical.com/,,heating ventilating and air conditioning hvac,"commercial, industrial_and_energy, residential","MO, MO, MO, MO, MO",2023-05-02T20:07:21.280Z,
1,598134300000000.0,OZARK CONSTRUCTION GROUP,"1350 Scenic Hwy N, Snellville, GA, 30078, US",+14043846486,,,rough carpentry,"commercial, residential","GA, TN, AL, NC, SC",2024-01-11T23:02:36.161Z,
2,598134300000000.0,Ozark Fire Sprinkler,"2312 Industrial Dr, Columbia, MO, 65202, US",+1-573-682-4465,,,fire suppression,"commercial, healthcare, industrial_and_energy,...","MO, MO, MO, MO, MO, MO, MO, IL, MO, MO, MO, MO...",2023-08-02T18:04:49.750Z,
3,598134300000000.0,Ozark Bldg Materials,"2833 Breckenridge Industrial Ct, St. Louis, MO...",+13143731003,http://www.bmcenterprises.com,,concrete,"commercial, healthcare, industrial_and_energy,...","MO, IL",2023-06-16T21:11:41.629Z,
4,562950000000000.0,Ozark Fire Sprinkler,"19098 Dwyer Rd, Warsaw, MO, 65355, US",+1-660-438-5701,https://www.ozarkfs.com,,fire suppression,"commercial, healthcare, industrial_and_energy,...","MO, AR, IL, KS",2022-11-21T18:39:12.705Z,
5,598134300000000.0,Ozark Mountain Glass,"2231 Lowell Rd, Springdale, AR, 72764, US",+14794195551,,,project management and coordination,"commercial, healthcare, industrial_and_energy,...",AR,2024-03-06T22:06:13.196Z,
6,598134300000000.0,OZARK CONSTRUCTION GROUP,"1350 Scenic Hwy N, Snellville, GA, 30078, US",+14043846486,,,rough carpentry,"commercial, residential","GA, TN, AL, NC, SC",2024-02-17T17:00:45.437Z,"{'start': 51, 'end': 100}"
7,598134300000000.0,Ozark Ready Mix,"4400 College Blvd, Overland Park, KS, 66211, US",+19137883165,https://ozarkreadymix.com/,,concrete,commercial,"MO, KS, AR",2023-08-30T16:01:09.668Z,
8,598134300000000.0,Ozarks Millworks,"1469 W Skyline Ave, Ozark, MO, 65721, US",+14178613193,,,rough carpentry,commercial,"MO, AR, OK, KS",2023-12-19T22:06:30.662Z,
9,598134300000000.0,Ozark Roofing and Construction,"6368 49th St N, Pinellas Park, FL, 33781, US",+17272221863,,,roofing,"commercial, residential",FL,2024-03-07T17:24:42.763Z,
