In [1]:
!pip install requests beautifulsoup4 pandas openpyxl




In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the URL of the Yellow Pages search result
url = 'https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms=New+York%2C+NY'

# Send a request to the Yellow Pages website
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Initialize lists to store the scraped data
    last_names = []
    first_names = []
    job_titles = []
    work_phones = []
    emails = []  # Most likely won't be available
    companies = []
    addresses = []
    valid_status = []
    contact_accuracy_scores = []

    # Find all business listings on the page
    businesses = soup.find_all('div', class_='result')

    # Iterate through each business listing and extract details
    for business in businesses:
        # Extract Name - for now just one field, Yellow Pages usually does not have split names
        name = business.find('a', class_='business-name')
        if name:
            name_parts = name.text.strip().split(' ')
            first_names.append(name_parts[0] if len(name_parts) > 0 else 'N/A')
            last_names.append(name_parts[-1] if len(name_parts) > 1 else 'N/A')
        else:
            first_names.append('N/A')
            last_names.append('N/A')

        # Extract Job Title (can be company-related field in YP listings)
        job_title = business.find('div', class_='categories')
        job_titles.append(job_title.text.strip() if job_title else 'N/A')

        # Work Phone Number
        work_phone = business.find('div', class_='phones phone primary')
        work_phones.append(work_phone.text.strip() if work_phone else 'N/A')

        # Email (Yellow Pages likely won't have this)
        emails.append('N/A')

        # Company Information
        company = business.find('span', class_='business-name')
        companies.append(company.text.strip() if company else 'N/A')

        # Address
        address = business.find('div', class_='street-address')
        addresses.append(address.text.strip() if address else 'N/A')

        # Assume valid if listed
        valid_status.append(True)

        # Placeholder for contact accuracy score (assume default or calculate later)
        contact_accuracy_scores.append(90)  # Default value

    # Create a DataFrame to organize the data
    data = {
        'FirstName': first_names,
        'LastName': last_names,
        'JobTitle': job_titles,
        'WorkPhoneNumber': work_phones,
        'Email': emails,
        'Company': companies,
        'Address': addresses,
        'Valid': valid_status,
        'ContactAccuracyScore': contact_accuracy_scores,
    }

    df = pd.DataFrame(data)

    # Save the DataFrame to an Excel file
    df.to_excel('yellow_pages_contacts.xlsx', index=False)

    print('Scraping completed. Data has been saved to yellow_pages_contacts.xlsx.')
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Scraping completed. Data has been saved to yellow_pages_contacts.xlsx.


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display, HTML
from time import sleep

# Define headers to mimic a real browser request to avoid blocking
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Define a list of locations you want to scrape
locations = ['New York, NY', 'Los Angeles, CA', 'Chicago, IL', 'Houston, TX', 'Miami, FL']

# Base URL of Yellow Pages
base_url = 'https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms='

# Initialize lists to store the scraped data across multiple locations
first_names, last_names, job_titles, work_phones, emails, companies, addresses, valid_status, contact_accuracy_scores = [], [], [], [], [], [], [], [], []

# Function to scrape a single page
def scrape_page(soup):
    businesses = soup.find_all('div', class_='result')

    for business in businesses:
        # Extract Name
        name = business.find('a', class_='business-name')
        if name:
            name_parts = name.text.strip().split(' ')
            first_names.append(name_parts[0] if len(name_parts) > 0 else 'N/A')
            last_names.append(name_parts[-1] if len(name_parts) > 1 else 'N/A')
        else:
            first_names.append('N/A')
            last_names.append('N/A')

        # Extract Job Title
        job_title = business.find('div', class_='categories')
        job_titles.append(job_title.text.strip() if job_title else 'N/A')

        # Work Phone Number
        work_phone = business.find('div', class_='phones phone primary')
        work_phones.append(work_phone.text.strip() if work_phone else 'N/A')

        # Email (Yellow Pages likely won't have this)
        emails.append('N/A')

        # Company Information
        company = business.find('span', class_='business-name')
        companies.append(company.text.strip() if company else 'N/A')

        # Address
        address = business.find('div', class_='street-address')
        addresses.append(address.text.strip() if address else 'N/A')

        # Assume valid if listed
        valid_status.append(True)

        # Placeholder for contact accuracy score (assume default or calculate later)
        contact_accuracy_scores.append(90)  # Default value

# Function to scrape a location
def scrape_location(location):
    page = 1
    while True:
        # Construct the URL for the current page
        url = f'{base_url}{location.replace(" ", "+")}&page={page}'
        print(f"Scraping {url}...")

        # Send a request to the website
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Scrape the current page
            scrape_page(soup)

            # Check if there's a "Next" button (to determine if more pages exist)
            next_button = soup.find('a', class_='next ajax-page')
            if next_button:
                page += 1  # Go to the next page
                sleep(2)  # Add delay between requests to avoid getting blocked
            else:
                break  # No more pages, exit loop
        else:
            print(f"Failed to retrieve page {page} for location {location}. Status code: {response.status_code}")
            break

# Loop through all the locations
for location in locations:
    scrape_location(location)

# Create a DataFrame with all the scraped data
data = {
    'FirstName': first_names,
    'LastName': last_names,
    'JobTitle': job_titles,
    'WorkPhoneNumber': work_phones,
    'Email': emails,
    'Company': companies,
    'Address': addresses,
    'Valid': valid_status,
    'ContactAccuracyScore': contact_accuracy_scores,
}

df = pd.DataFrame(data)

# Save the DataFrame to an Excel file in Google Colab's environment
excel_file = '/content/yellow_pages_contacts_full.xlsx'
df.to_excel(excel_file, index=False)

print(f'Scraping completed. Data has been saved to {excel_file}.')

# Display the DataFrame in the Colab output
display(HTML(df.to_html()))

# Optionally: Display a download link for the Excel file
from google.colab import files
files.download(excel_file)


Scraping https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms=New+York,+NY&page=1...
Scraping https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms=New+York,+NY&page=2...
Scraping https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms=New+York,+NY&page=3...
Scraping https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms=New+York,+NY&page=4...
Scraping https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms=New+York,+NY&page=5...
Scraping https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms=New+York,+NY&page=6...
Scraping https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms=New+York,+NY&page=7...
Scraping https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms=New+York,+NY&page=8...
Scraping https://www.yellowpages.com/search?search_terms=consultants&geo_location_terms=New+York,+NY&page=9...
S

Unnamed: 0,FirstName,LastName,JobTitle,WorkPhoneNumber,Email,Company,Address,Valid,ContactAccuracyScore
0,OvationMR,,Internet Marketing & AdvertisingInternet Products & ServicesAdvertising Specialties,(347) 756-5145,,,39 Broadway,True,90
1,Barefoot,Inc,Business Coaches & ConsultantsMusical Instruments,(833) 840-0774,,,451 37th Street,True,90
2,Wilder's,Wordprocessing,Communications ServicesDesktop Publishing ServiceBusiness Documents & Records-Storage & Management,(844) 302-3103,,,"500 W End Ave, Ste 4E",True,90
3,Ali,Makeup,Wedding Supplies & ServicesTheatrical Make-UpBeauty Salons,(707) 200-8612,,,,True,90
4,Off,Hotel,Business Coaches & ConsultantsBusiness ManagementLodging,(212) 353-0860,,,11 Rivington St,True,90
5,Jonathan,Inc,Business Coaches & Consultants,(212) 924-9691,,,250 W 19th St Apt 15c,True,90
6,Weiser,Consulting,Business Coaches & ConsultantsMarketing ConsultantsAccountants-Certified Public,(212) 375-6565,,,1 Penn Plz,True,90
7,Geneva,Inc,Business Coaches & Consultants,(212) 643-9530,,,1350 Broadway Rm 403,True,90
8,Meadowbrook,LTD,Business Coaches & Consultants,(212) 233-5688,,,277 Broadway,True,90
9,Hunt,Consulting,Business Coaches & Consultants,(212) 340-1137,,,244 5th Ave,True,90


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
!git clone https://github.com/balachikkala006/web_scrapping.git


Cloning into 'web_scrapping'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [6]:
%cd web_scrapping


/content/web_scrapping


In [7]:
!cp /content/web_scraping.ipynb ./


cp: cannot stat '/content/web_scraping.ipynb': No such file or directory


In [8]:
!ls /content


sample_data  web_scrapping  yellow_pages_contacts_full.xlsx  yellow_pages_contacts.xlsx


In [12]:
!cp /content/web_scrapping ./


cp: -r not specified; omitting directory '/content/web_scrapping'


In [13]:
!git add .


In [14]:
!git commit -m "Scrapping is done for the website"


Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@5e70094f2e62.(none)')


In [15]:
!git push origin main  # Change 'main' to 'master' if that's your branch


fatal: could not read Username for 'https://github.com': No such device or address


In [16]:
# Clone your repository
!git clone https://github.com/balachikkala006/web_scrapping.git

# Navigate to the repository directory
%cd web_scrapping

# Copy your Colab files to the repo
!cp /content/web_scraping.ipynb ./

# Add changes
!git add .

# Commit changes
!git commit -m "Scrapping is done for the website"

# Push to GitHub
!git push origin main  # or master if that's your default branch


Cloning into 'web_scrapping'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.
/content/web_scrapping/web_scrapping
cp: cannot stat '/content/web_scraping.ipynb': No such file or directory
Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@5e70094f2e62.(none)')
fatal: could not read Username for 'https://github.com': No such device or address


In [19]:
!git config --global user.email "balachikkala.01@gmail.com"
!git config --global user.name "balachikkala006"

In [20]:
# Commit changes
!git commit -m "Scrapping is done for the website"

# Push to GitHub
!git push origin main  # or master if that's your default branch

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
fatal: could not read Username for 'https://github.com': No such device or address
