In [118]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

### Script to automate the extraction and processing of industry-level employment data from locally saved HTML files, one per state. Each file (a saved web page) contains links to the data for each industry in that state.

### Using BeautifulSoup, the code below first reads each HTML file from a local directory and, based on the pattern shared by the industry URLs, collects the links whose href contains 'timeseries'. It then constructs the full URLs to scrape data from.

### Next, the script creates a state-specific folder (named after the HTML filename) in the output directory. Each industry URL is then processed by a function called process_url, which scrapes the relevant data, converts it into a DataFrame, and saves it as a CSV file.

### Output: 51 state-specific folders are created, each containing 12 industry-specific employment data CSV files.


In [132]:

def _to_numeric_or_keep(column):
    """Return *column* converted to a numeric dtype, or unchanged if that fails."""
    # Same per-column semantics as pd.to_numeric(errors='ignore'), which is
    # deprecated in recent pandas releases.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _label_value(soup, label, url):
    """Return the stripped text of the first <td> that follows the text *label*."""
    label_node = soup.find(string=label)
    value_cell = label_node.find_next('td') if label_node is not None else None
    if value_cell is None:
        raise ValueError(f"Could not find a value for {label!r} on {url}")
    return value_cell.get_text(strip=True)


def process_url(url, state_folder, timeout=30):
    """Scrape the data table at *url*, save it as a CSV and return it.

    The page is expected to contain a ``<table id="table0"
    class="regular-data">`` with a ``<thead>`` and a ``<tbody>``, plus
    'State:' and 'Industry:' labels that are each followed by a ``<td>``
    holding the value. The CSV is written to
    ``<state_folder>/<state>_<industry>.csv`` (lower-cased, spaces in the
    industry name replaced by underscores).

    Args:
        url: Address of the page to download.
        state_folder: Existing directory the CSV file is written into.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The scraped table as a DataFrame; columns that parse cleanly as
        numbers are converted, all others stay as text.

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
        ValueError: If the expected table or labels are missing.
    """
    # A timeout keeps one unresponsive request from stalling the whole run,
    # and raise_for_status stops an error page from being parsed as data.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'id': 'table0', 'class': 'regular-data'})
    if table is None:
        raise ValueError(f"No 'table0' data table found on {url}")

    thead = table.find('thead')
    tbody = table.find('tbody')
    if thead is None or tbody is None:
        raise ValueError(f"Data table on {url} is missing <thead> or <tbody>")

    header = [th.get_text(strip=True) for th in thead.find_all('th')]

    # Body rows may mix <th> (row label) and <td> (value) cells; keep both.
    data = [
        [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
        for row in tbody.find_all('tr')
    ]

    df = pd.DataFrame(data, columns=header)
    df = df.apply(_to_numeric_or_keep)

    state = _label_value(soup, 'State:', url).lower()
    industry = _label_value(soup, 'Industry:', url).replace(' ', '_').lower()

    csv_file_name = os.path.join(state_folder, f"{state}_{industry}.csv")
    df.to_csv(csv_file_name, index=False)

    return df

# Source directory of the saved per-state HTML pages and the root directory
# that receives one sub-folder of CSV files per state.
html_files_directory = '/Users/alks/Downloads/DDA13/Python/capstone/data/raw_files/industry_htmls/'
output_directory = '/Users/alks/Downloads/DDA13/Python/capstone/data/cleaned_files/industry_state/'

os.makedirs(output_directory, exist_ok=True)

for html_file in os.listdir(html_files_directory):
    # Skip anything in the directory that is not a saved HTML page.
    if not html_file.endswith('.html'):
        continue

    file_path = os.path.join(html_files_directory, html_file)
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    links = soup.find_all('a', href=True)

    # Collect every 'timeseries' link; site-relative hrefs are made absolute.
    hrefs = [link['href'] for link in links]
    urls = [
        f'https://www.bls.gov{href}' if href.startswith('/') else href
        for href in hrefs
        if 'timeseries' in href
    ]

    # Drop the first four matches, then de-duplicate while keeping order.
    unique_urls = list(dict.fromkeys(urls[4:]))

    # The folder name is the part of the file name before the first '_'.
    state_name = html_file.split('_')[0].lower()
    state_folder = os.path.join(output_directory, state_name)
    os.makedirs(state_folder, exist_ok=True)

    for url in unique_urls:
        df = process_url(url, state_folder)
            


  # NOTE(review): these two statements search the module-level `soup` for the
  # 'State:' and 'Industry:' text nodes, but nothing visible here uses the
  # results, and the 2-space indent matches no enclosing block -- presumably a
  # leftover notebook fragment; confirm before running this file as a script.
  state_row = soup.find(text='State:')
  industry_row = soup.find(text='Industry:')
