In [33]:
import os
import pandas as pd
import requests
import zipfile 
import re
from bs4 import BeautifulSoup

In [34]:
r = requests.get('https://download.cms.gov/nppes/NPI_Files.html')

In [35]:
r.text

'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3c.org/TR/1999/REC-html401-19991224/loose.dtd">\n<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">\n<style type="text/css">\n\t.pStyle{\n\t\tpadding-left: 20px; margin: 0px;\n\t}\n    a:hover {\n\tcolor: rgb(255, 120, 0); text-decoration: underline;\n    }\n    .mainbox {\n\tmargin: 0px 0.5em; padding: 0px; border: 1px solid currentColor; font-family: "Lucida Grande", Verdana, Arial, Helvetica, sans-serif; font-size: 1em;\n    }\n    .mainbox h2 {\n\tpadding-left: 5px; font-size: 1em; margin:5px 0px 30px 0px;\n\t}\n    .header-background {\n\tbackground-image: url(\'images/Top_Banner1A.png\'); background-position: center; background-size: contain; height: 44px; align-items: center; display: flex; justify-content: flex-start; margin-top: 15px;\n\t}\n    .header-background h1 {\n\tfont-size: 2em; color: white; font-family: "Georgia"; font-weight: normal;\n    

In [36]:
soup = BeautifulSoup(r.text, 'html.parser')

# Find all zips
links = soup.find_all(href=True)
href_values = [os.path.basename(link['href']) for link in links if link['href'][-3:] == 'zip']

for href in href_values:
    print(href)

NPPES_Data_Dissemination_March_2025.zip
NPPES_Deactivated_NPI_Report_031025.zip
NPPES_Data_Dissemination_030325_030925_Weekly.zip
NPPES_Data_Dissemination_031025_031625_Weekly.zip
NPPES_Data_Dissemination_031725_032325_Weekly.zip
NPPES_Data_Dissemination_March_2025_V2.zip
NPPES_Deactivated_NPI_Report_031025_V2.zip
NPPES_Data_Dissemination_030325_030925_Weekly_V2.zip
NPPES_Data_Dissemination_031025_031625_Weekly_V2.zip
NPPES_Data_Dissemination_031725_032325_Weekly_V2.zip


In [46]:
base_url = 'https://download.cms.gov/nppes/'

# now concat them to basepath and configurte output folder
outdir = '../data/'
if not os.path.isdir(outdir):
    os.mkdir(outdir)


In [47]:
# grab the last one as a test and save it to outidr. just loop through and save them.
# again not sure what struture you want and also at this point the files inside the zips could prob be condensed and reformmatted.
# The monthly file is huge so might need to do some delta reconcilation as you don't want to download the monthly file every day or week
r2 = requests.get(os.path.join(base_url, href_values[0]))
filename = os.path.join(outdir, href_values[0])
with open(filename, 'wb') as f:  
    f.write(r2.content)

In [48]:
# Extract the zip file
with zipfile.ZipFile(filename, 'r') as zip_ref:
    # List all files in the zip
    extracted_files = zip_ref.namelist()

    # Filter for CSV files
    csv_files = [file for file in extracted_files if file.endswith('.csv')]

    # Find the largest CSV file by comparing file sizes
    largest_csv_file = max(csv_files, key=lambda file: zip_ref.getinfo(file).file_size)

In [51]:
# Load the largest CSV into a pandas DataFrame
csv_path = os.path.join(outdir, largest_csv_file)
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extract(largest_csv_file, outdir)  # Extract the largest CSV file

# Define dtype for the phone number column as string to avoid scientific notation
dtype_spec = {
    'Provider Business Mailing Address Telephone Number': str
}


# df = pd.read_csv(csv_path)


In [55]:
import psutil
# Read a small sample
df_sample = pd.read_csv(csv_path, nrows=10000)

# Calculate memory usage per row
memory_usage_per_row = df_sample.memory_usage(deep=True).sum() / len(df_sample)
print(f"Estimated memory usage per row: {memory_usage_per_row:.2f} bytes")

# Get available system memory
available_memory = psutil.virtual_memory().available  # In bytes

# Allow using about 30% of available memory
max_memory_usage = available_memory * 0.3  

# Estimate the best chunk size
chunksize = int(max_memory_usage / memory_usage_per_row)
print(f"Recommended chunksize: {chunksize:,} rows")


  df_sample = pd.read_csv(csv_path, nrows=10000)


Estimated memory usage per row: 7885.36 bytes
Recommended chunksize: 92,866 rows


In [56]:
# Process in chunks
chunk_size = 100000  # Adjust based on available RAM
filtered_chunks = []

for chunk in pd.read_csv(csv_path, dtype=dtype_spec, usecols=columns_needed, chunksize=chunk_size):
    # Remove invalid phone numbers
    chunk = chunk[chunk['Provider Business Mailing Address Telephone Number'].str.strip() != 'nan']

    # Filter states
    chunk = chunk[chunk['Provider Business Mailing Address State Name'].isin(valid_states)]

    # Drop rows with missing values in key columns
    chunk = chunk.dropna(subset=['Provider Last Name (Legal Name)', 'Provider Business Mailing Address Telephone Number'])

    # Append processed chunk to list
    filtered_chunks.append(chunk)

# Concatenate all chunks (this avoids loading entire CSV at once)
df_filtered = pd.concat(filtered_chunks, ignore_index=True)



In [62]:
# Save the filtered data to a new CSV
df_filtered.to_csv(os.path.join(outdir, 'output.csv'), index=False)

In [63]:
output_csv_path = os.path.join(outdir, 'output.csv')
chunksize = 62_500  # Split into 1 million rows per file

# Write in chunks
for i, chunk in enumerate(pd.read_csv(output_csv_path, chunksize=chunksize)):
    chunk_file = os.path.join(outdir, f'output_part_{i+1}.csv')
    chunk.to_csv(chunk_file, index=False)
    print(f"Saved: {chunk_file} with {len(chunk):,} rows")

Saved: ../data/output_part_1.csv with 62,500 rows
Saved: ../data/output_part_2.csv with 62,500 rows
Saved: ../data/output_part_3.csv with 62,500 rows
Saved: ../data/output_part_4.csv with 62,500 rows
Saved: ../data/output_part_5.csv with 62,500 rows
Saved: ../data/output_part_6.csv with 62,500 rows
Saved: ../data/output_part_7.csv with 62,500 rows
Saved: ../data/output_part_8.csv with 62,500 rows
Saved: ../data/output_part_9.csv with 62,500 rows
Saved: ../data/output_part_10.csv with 62,500 rows
Saved: ../data/output_part_11.csv with 62,500 rows
Saved: ../data/output_part_12.csv with 62,500 rows
Saved: ../data/output_part_13.csv with 62,500 rows
Saved: ../data/output_part_14.csv with 62,500 rows
Saved: ../data/output_part_15.csv with 62,500 rows
Saved: ../data/output_part_16.csv with 62,500 rows
Saved: ../data/output_part_17.csv with 62,500 rows
Saved: ../data/output_part_18.csv with 62,500 rows
Saved: ../data/output_part_19.csv with 62,500 rows
Saved: ../data/output_part_20.csv with 6

In [32]:
import random


business_types = [
    "chiropractor", "massage therapist", "acupuncturist", "physical therapist",
    "speech therapist", "dentist", "family doctor", "pediatrician", "psychologist",
    "psychiatrist", "licensed counselor", "home healthcare provider", "midwife",
    "personal trainer", "nutritionist", "lawyer", "attorney", "notary public",
    "accountant", "CPA", "tax preparer", "financial advisor", "insurance broker",
    "mortgage broker", "property manager", "home inspector",
    "plumber", "electrician", "carpenter", "general contractor", "HVAC technician",
    "roofing contractor", "landscaper", "painter", "handyman", "pool maintenance technician",
    "welder", "locksmith", "barber", "hair stylist", "nail technician", "esthetician",
    "makeup artist", "tattoo artist", "spa owner", "eyelash technician", "coffee shop owner",
    "bakery owner", "food truck owner", "restaurant owner", "caterer", "butcher",
    "boutique owner", "florist", "truck driver", "courier", "moving company owner",
    "freelance graphic designer", "marketing consultant", "social media manager",
    "content creator", "copywriter", "editor", "SEO specialist", "photographer",
    "videographer", "IT consultant", "web developer", "software developer",
    "UX/UI designer", "event planner", "wedding planner", "DJ", "live musician",
    "private tutor", "music teacher", "dance instructor", "business consultant",
    "motivational speaker", "language instructor", "martial arts instructor",
    "dog walker", "pet groomer", "dog trainer", "travel agent", "tour guide",
    "interior designer", "professional organizer", "house cleaner",
    "carpet cleaner", "pest control technician", "junk removal specialist",
    "home security consultant", "auto mechanic", "car detailing specialist",
    "auto body repair specialist", "tow truck operator", "driving instructor",
    "rideshare driver", "farmer", "beekeeper", "greenhouse owner", "arborist",
    "optometrist", "occupational therapist", "speech-language pathologist", "dermatologist",
    "orthodontist", "naturopathic doctor", "podiatrist", "veterinary doctor", "audiologist",
    "clinical laboratory technician", "radiologic technologist", "phlebotomist",
    "medical transcriptionist", "medical billing & coding specialist", "hypnotherapist",
    "paralegal", "court reporter", "mediator", "estate planner", "wealth manager",
    "credit repair specialist", "real estate investor", "home stager",
    "structural engineer", "surveyor", "solar panel installer", "property appraiser",
    "excavation contractor", "masonry contractor", "flooring installer", "tile setter",
    "drywall installer", "ironworker", "heavy equipment operator", "auto glass repair technician",
    "RV technician", "motorcycle mechanic", "car auctioneer", "long-haul trucker",
    "bicycle mechanic", "cosmetic dentist", "permanent makeup artist", "hair extension specialist",
    "cosmetology instructor", "image consultant", "cybersecurity consultant", "ethical hacker",
    "database administrator", "AI/ML engineer", "blockchain developer", "virtual reality (VR) developer",
    "augmented reality (AR) developer", "game developer", "robotics engineer",
    "podcaster", "life coach", "career coach", "test prep tutor",
    "college admissions consultant", "homeschool consultant", "online course creator",
    "mobile notary", "drone operator", "forensic scientist", "urban planner", "woodworker", "dog breeder"
]


valid_states = {
    'AL': ['Birmingham', 'Montgomery', 'Huntsville'],
    'AR': ['Little Rock', 'Fort Smith', 'Fayetteville'],
    'CO': ['Denver', 'Colorado Springs', 'Aurora'],
    'DE': ['Wilmington', 'Dover', 'Newark'],
    'FL': ['Jacksonville', 'Miami', 'Tampa'],
    'GA': ['Atlanta', 'Augusta', 'Columbus'],
    'IA': ['Des Moines', 'Cedar Rapids', 'Davenport'],
    'IL': ['Chicago', 'Aurora', 'Naperville'],
    'IN': ['Indianapolis', 'Fort Wayne', 'Evansville'],
    'KS': ['Wichita', 'Overland Park', 'Kansas City'],
    'KY': ['Louisville', 'Lexington', 'Bowling Green'],
    'LA': ['New Orleans', 'Baton Rouge', 'Shreveport'],
    'MD': ['Baltimore', 'Columbia', 'Germantown'],
    'MO': ['Kansas City', 'St. Louis', 'Springfield'],
    'MI': ['Detroit', 'Grand Rapids', 'Warren'],
    'MT': ['Billings', 'Missoula', 'Great Falls'],
    'NC': ['Charlotte', 'Raleigh', 'Greensboro'],
    'NE': ['Omaha', 'Lincoln', 'Bellevue'],
    'NV': ['Las Vegas', 'Henderson', 'Reno'],
    'OH': ['Columbus', 'Cleveland', 'Cincinnati'],
    'OK': ['Oklahoma City', 'Tulsa', 'Norman'],
    'SC': ['Charleston', 'Columbia', 'North Charleston'],
    'SD': ['Sioux Falls', 'Rapid City', 'Aberdeen'],
    'TN': ['Nashville', 'Memphis', 'Knoxville'],
    'TX': ['Houston', 'San Antonio', 'Dallas'],
    'UT': ['Salt Lake City', 'West Valley City', 'Provo'],
    'WI': ['Milwaukee', 'Madison', 'Green Bay'],
    'WV': ['Charleston', 'Huntington', 'Morgantown'],
    'WY': ['Cheyenne', 'Casper', 'Laramie']
}

# business_types = [
#     "financial advisor", "wealth manager", "estate planner",
#     "real estate investor", "mortgage broker", "property manager",
#     "general contractor", "plumber", "electrician", "HVAC technician",
#     "dentist", "dermatologist", "optometrist", "psychiatrist", "family doctor",
#     "lawyer", "attorney",
#     "truck driver", "long-haul trucker",
#     "entrepreneur", "e-commerce store owner"
# ]

# valid_states = {
#     'FL': ['Jacksonville', 'Miami', 'Tampa'],
#     'TX': ['Houston', 'San Antonio', 'Dallas'],
#     'CA': ['Los Angeles', 'San Diego', 'San Jose'],
#     'IA': ['Des Moines', 'Cedar Rapids', 'Davenport'],
#     'SC': ['Charleston', 'Columbia', 'North Charleston'],
#     'GA': ['Atlanta', 'Augusta', 'Columbus'],
#     'MD': ['Baltimore', 'Columbia', 'Germantown'],
#     'VA': ['Virginia Beach', 'Chesapeake', 'Arlington']
# }

# Generate and print the queries
for state, cities in valid_states.items():
    query = f"{random.choice(business_types)};{random.choice(cities)}"
    print(query)


    # valid_states = {
#     'AL': ['Birmingham', 'Montgomery', 'Huntsville', 'Mobile', 'Tuscaloosa', 'Hoover', 'Auburn'],
#     'AR': ['Little Rock', 'Fort Smith', 'Fayetteville', 'Springdale', 'Jonesboro', 'North Little Rock', 'Conway'],
#     'CO': ['Denver', 'Colorado Springs', 'Aurora', 'Fort Collins', 'Lakewood', 'Thornton', 'Arvada'],
#     'DE': ['Wilmington', 'Dover', 'Newark', 'Middletown', 'Bear', 'Brookside', 'Glasgow'],
#     'FL': ['Jacksonville', 'Miami', 'Tampa', 'Orlando', 'St. Petersburg', 'Hialeah', 'Tallahassee'],
#     'GA': ['Atlanta', 'Augusta', 'Columbus', 'Macon', 'Savannah', 'Athens', 'Sandy Springs'],
#     'IA': ['Des Moines', 'Cedar Rapids', 'Davenport', 'Sioux City', 'Iowa City', 'Waterloo', 'Ames'],
#     'IL': ['Chicago', 'Aurora', 'Naperville', 'Joliet', 'Rockford', 'Springfield', 'Elgin'],
#     'IN': ['Indianapolis', 'Fort Wayne', 'Evansville', 'South Bend', 'Carmel', 'Bloomington', 'Fishers'],
#     'KS': ['Wichita', 'Overland Park', 'Kansas City', 'Olathe', 'Topeka', 'Lawrence', 'Shawnee'],
#     'KY': ['Louisville', 'Lexington', 'Bowling Green', 'Owensboro', 'Covington', 'Richmond', 'Georgetown'],
#     'LA': ['New Orleans', 'Baton Rouge', 'Shreveport', 'Lafayette', 'Lake Charles', 'Bossier City', 'Monroe'],
#     'MD': ['Baltimore', 'Columbia', 'Germantown', 'Silver Spring', 'Waldorf', 'Ellicott City', 'Frederick'],
#     'MO': ['Kansas City', 'St. Louis', 'Springfield', 'Columbia', 'Independence', 'Lee’s Summit', 'O’Fallon'],
#     'MI': ['Detroit', 'Grand Rapids', 'Warren', 'Sterling Heights', 'Ann Arbor', 'Lansing', 'Flint'],
#     'MT': ['Billings', 'Missoula', 'Great Falls', 'Bozeman', 'Butte', 'Helena', 'Kalispell'],
#     'NC': ['Charlotte', 'Raleigh', 'Greensboro', 'Durham', 'Winston-Salem', 'Fayetteville', 'Cary'],
#     'NE': ['Omaha', 'Lincoln', 'Bellevue', 'Grand Island', 'Kearney', 'Fremont', 'Hastings'],
#     'NV': ['Las Vegas', 'Henderson', 'Reno', 'North Las Vegas', 'Sparks', 'Carson City', 'Elko'],
#     'OH': ['Columbus', 'Cleveland', 'Cincinnati', 'Toledo', 'Akron', 'Dayton', 'Parma'],
#     'OK': ['Oklahoma City', 'Tulsa', 'Norman', 'Broken Arrow', 'Edmond', 'Lawton', 'Moore'],
#     'SC': ['Charleston', 'Columbia', 'North Charleston', 'Mount Pleasant', 'Rock Hill', 'Greenville', 'Summerville'],
#     'SD': ['Sioux Falls', 'Rapid City', 'Aberdeen', 'Brookings', 'Watertown', 'Mitchell', 'Yankton'],
#     'TN': ['Nashville', 'Memphis', 'Knoxville', 'Chattanooga', 'Clarksville', 'Murfreesboro', 'Franklin'],
#     'TX': ['Houston', 'San Antonio', 'Dallas', 'Austin', 'Fort Worth', 'El Paso', 'Arlington'],
#     'UT': ['Salt Lake City', 'West Valley City', 'Provo', 'West Jordan', 'Orem', 'Sandy', 'St. George'],
#     'WI': ['Milwaukee', 'Madison', 'Green Bay', 'Kenosha', 'Racine', 'Appleton', 'Waukesha'],
#     'WV': ['Charleston', 'Huntington', 'Morgantown', 'Parkersburg', 'Wheeling', 'Weirton', 'Fairmont'],
#     'WY': ['Cheyenne', 'Casper', 'Laramie', 'Gillette', 'Rock Springs', 'Sheridan', 'Green River']
# }




radiologic technologist;Montgomery
bakery owner;Little Rock
occupational therapist;Colorado Springs
heavy equipment operator;Newark
licensed counselor;Miami
locksmith;Augusta
professional organizer;Cedar Rapids
tax preparer;Aurora
travel agent;Evansville
moving company owner;Kansas City
web developer;Bowling Green
florist;Shreveport
UX/UI designer;Germantown
cybersecurity consultant;Springfield
hair extension specialist;Warren
psychiatrist;Missoula
language instructor;Charlotte
food truck owner;Lincoln
auto glass repair technician;Henderson
hair stylist;Cleveland
motivational speaker;Tulsa
plumber;North Charleston
private tutor;Sioux Falls
auto body repair specialist;Nashville
caterer;Dallas
social media manager;Salt Lake City
image consultant;Green Bay
food truck owner;Huntington
urban planner;Laramie


In [6]:
import requests, re
from bs4 import BeautifulSoup

regex = r"[0-9]+(?:\.[0-9]+){3}:[0-9]+"
c = requests.get("https://spys.me/proxy.txt")
test_str = c.text
a = re.finditer(regex, test_str, re.MULTILINE)
with open("proxies_list.txt", 'w') as file:
    for i in a:
       print(i.group(),file=file)
        
d = requests.get("https://free-proxy-list.net/")
soup = BeautifulSoup(d.content, 'html.parser')
td_elements = soup.select('.fpl-list .table tbody tr td')
ips = []
ports = []
for j in range(0, len(td_elements), 8):
    ips.append(td_elements[j].text.strip())
    ports.append(td_elements[j + 1].text.strip())
with open("proxies_list.txt", "a") as myfile:
    for ip, port in zip(ips, ports):
        proxy = f"{ip}:{port}"
        print(proxy, file=myfile)