In [1]:
from dataclasses import dataclass
from typing import List

import requests
from bs4 import BeautifulSoup

In [20]:
# Root of the 2022 NAICS pages. It keeps its trailing slash so a code can be
# appended directly to form a page URL.
BASE_URL = "https://naics.askkodiak.com/naics/2022/"
# BASE_URL already ends with "/", so only the bare sector code is appended;
# a leading "/" here would produce ".../2022//11".
example_url = BASE_URL + "11"

In [3]:
# Fetch the example sector page. The timeout stops a stalled connection from
# blocking the kernel indefinitely (requests applies no timeout by default).
response = requests.get(example_url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page.
response.raise_for_status()

In [4]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [5]:
# Every paragraph inside the page's "overview subsection" element.
overview = soup.find(attrs={"class": "overview subsection"}).find_all("p")
# Flatten to a single line: newlines inside a paragraph and the breaks between
# paragraphs all become single spaces.
overview_text = " ".join(p.get_text().replace("\n", " ") for p in overview)

# Displayed only; the stripped value is not stored back into overview_text.
overview_text.strip()

'The Agriculture, Forestry, Fishing and Hunting sector comprises establishments primarily engaged in growing crops, raising animals, harvesting timber, and harvesting fish and other animals from a farm, ranch, or their natural habitats.                                   The establishments in this sector are often described as farms, ranches, dairies, greenhouses, nurseries, orchards, or hatcheries.  A farm may consist of a single tract of land or a number of separate tracts which may be held under different tenures.  For example, one tract may be owned by the farm operator and another rented.  It may be operated by the operator alone or with the assistance of members of the household or hired employees, or it may be operated by a partnership, corporation, or other type of organization. When a landowner has one or more tenants, renters, croppers, or managers, the land operated by each is considered a farm.                                   The sector distinguishes two basic activities: 

In [6]:
# Locate the "descendants subsection" element, then collect its list items.
descendants_section = soup.find(attrs={"class": "descendants subsection"})
descendants = descendants_section.find_all("li")

In [7]:
# For each list item, take its link target and keep only the last path
# segment, which is the descendant's code.
all_href = [item.a["href"].rsplit("/", 1)[-1] for item in descendants]

all_href

['111',
 '1111',
 '11111',
 '111110',
 '11112',
 '111120',
 '11113',
 '111130',
 '11114',
 '111140',
 '11115',
 '111150',
 '11116',
 '111160',
 '11119',
 '111191',
 '111199',
 '1112',
 '11121',
 '111211',
 '111219',
 '1113',
 '11131',
 '111310',
 '11132',
 '111320',
 '11133',
 '111331',
 '111332',
 '111333',
 '111334',
 '111335',
 '111336',
 '111339',
 '1114',
 '11141',
 '111411',
 '111419',
 '11142',
 '111421',
 '111422',
 '1119',
 '11191',
 '111910',
 '11192',
 '111920',
 '11193',
 '111930',
 '11194',
 '111940',
 '11199',
 '111991',
 '111992',
 '111998',
 '112',
 '1121',
 '11211',
 '112111',
 '112112',
 '11212',
 '112120',
 '11213',
 '112130',
 '1122',
 '11221',
 '112210',
 '1123',
 '11231',
 '112310',
 '11232',
 '112320',
 '11233',
 '112330',
 '11234',
 '112340',
 '11239',
 '112390',
 '1124',
 '11241',
 '112410',
 '11242',
 '112420',
 '1125',
 '11251',
 '112511',
 '112512',
 '112519',
 '1129',
 '11291',
 '112910',
 '11292',
 '112920',
 '11293',
 '112930',
 '11299',
 '112990',
 '113'

In [11]:
@dataclass
class BaseNaics:
    """A NAICS page plus the codes of every entry listed as its descendant.

    Attributes:
        code: Code used to build the page URL (e.g. "11" or "31-33").
        title: Text of the page's first ``<h1>`` element.
        description: Text of the overview paragraphs, joined with newlines and
            stripped of leading/trailing whitespace.
        descendants_code: Last URL path segment of each link in the page's
            descendants list, in page order.
    """

    code: str
    title: str
    description: str
    descendants_code: List[str]

    @classmethod
    def from_code(cls, code: str, timeout: float = 30.0) -> "BaseNaics":
        """Download the page for ``code`` and build an instance from it.

        Args:
            code: Code appended to ``BASE_URL`` to form the page URL.
            timeout: Seconds to wait on the server before giving up. Without
                one, ``requests`` can block forever on a stalled connection.

        Returns:
            A new instance populated from the page contents.

        Raises:
            requests.HTTPError: The server replied with an error status.
            requests.Timeout: The server did not answer within ``timeout``.
            AttributeError: The page lacks the ``<h1>``, the overview section
                or the descendants section (``find`` returns ``None`` for a
                missing element).
        """
        response = requests.get(f"{BASE_URL}{code}", timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        title = soup.find("h1").get_text()

        overview = soup.find(attrs={"class": "overview subsection"}).find_all("p")
        description = "\n".join(p.get_text() for p in overview).strip()

        descendant_items = soup.find(
            attrs={"class": "descendants subsection"}
        ).find_all("li")
        # Each link ends with the descendant's code; keep only that segment.
        descendants_code = [
            item.a["href"].split("/")[-1] for item in descendant_items
        ]

        return cls(code, title, description, descendants_code)

In [28]:
@dataclass
class Naics:
    """A single NAICS page: its code, title and overview text.

    Attributes:
        code: Code used to build the page URL.
        title: Text of the page's first ``<h1>`` element.
        description: Text of the overview paragraphs, joined with newlines and
            stripped of leading/trailing whitespace.
    """

    code: str
    title: str
    description: str

    @classmethod
    def from_code(cls, code: str, timeout: float = 30.0) -> "Naics":
        """Download the page for ``code`` and build an instance from it.

        Args:
            code: Code appended to ``BASE_URL`` to form the page URL.
            timeout: Seconds to wait on the server before giving up. Without
                one, ``requests`` can block forever on a stalled connection.

        Returns:
            A new instance populated from the page contents.

        Raises:
            requests.HTTPError: The server replied with an error status.
            requests.Timeout: The server did not answer within ``timeout``.
            AttributeError: The page lacks the ``<h1>`` or the overview
                section (``find`` returns ``None`` for a missing element).
        """
        response = requests.get(f"{BASE_URL}{code}", timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        title = soup.find("h1").get_text()

        overview = soup.find(attrs={"class": "overview subsection"}).find_all("p")
        description = "\n".join(p.get_text() for p in overview).strip()

        return cls(code, title, description)

In [13]:
# Fetch the index page. The timeout stops a stalled connection from blocking
# the kernel indefinitely (requests applies no timeout by default).
response = requests.get(BASE_URL, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page.
response.raise_for_status()

In [14]:
# Parse the downloaded index page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [15]:
import pandas as pd
from io import StringIO

In [16]:
# read_html yields one DataFrame per table found at the URL; keep the first.
tables = pd.read_html(BASE_URL)
df = tables[0]

In [17]:
# Show the sector table as this cell's output.
df

Unnamed: 0,Sector,Description
0,11,"Agriculture, Forestry, Fishing and Hunting"
1,21,"Mining, Quarrying, and Oil and Gas Extraction"
2,22,Utilities
3,23,Construction
4,31-33,Manufacturing
5,42,Wholesale Trade
6,44-45,Retail Trade
7,48-49,Transportation and Warehousing
8,51,Information
9,52,Finance and Insurance


In [18]:
# Sector codes as a plain Python list, printed on one line.
sector_codes = df["Sector"].to_list()
print(sector_codes)

['11', '21', '22', '23', '31-33', '42', '44-45', '48-49', '51', '52', '53', '54', '55', '56', '61', '62', '71', '72', '81', '92']


In [21]:
all_base_naics: List[BaseNaics] = []

# One page download per sector code; results are appended in table order.
all_base_naics.extend(
    BaseNaics.from_code(sector_code) for sector_code in df["Sector"].to_list()
)

In [22]:
# Show the scraped sector records as this cell's output.
all_base_naics

[BaseNaics(code='11', title='Agriculture, Forestry, Fishing and Hunting', description='The Agriculture, Forestry, Fishing and Hunting sector comprises establishments primarily engaged in growing crops, raising animals, harvesting timber, and harvesting fish and other animals from a farm, ranch, or their natural habitats.\n              \n\n\n\n                The establishments in this sector are often described as farms, ranches, dairies, greenhouses, nurseries, orchards, or hatcheries.  A farm may consist of a single tract of land or a number of separate tracts which may be held under different tenures.  For example, one tract may be owned by the farm operator and another rented.  It may be operated by the operator alone or with the assistance of members of the household or hired employees, or it may be operated by a partnership, corporation, or other type of organization. When a landowner has one or more tenants, renters, croppers, or managers, the land operated by each is considere

In [23]:
# One row per BaseNaics record; the dataclass fields become the columns.
base_naics_df = pd.DataFrame(data=all_base_naics)

In [24]:
# Show the sector-level table as this cell's output.
base_naics_df

Unnamed: 0,code,title,description,descendants_code
0,11,"Agriculture, Forestry, Fishing and Hunting","The Agriculture, Forestry, Fishing and Hunting...","[111, 1111, 11111, 111110, 11112, 111120, 1111..."
1,21,"Mining, Quarrying, and Oil and Gas Extraction","The Mining, Quarrying, and Oil and Gas Extract...","[211, 2111, 21112, 211120, 21113, 211130, 212,..."
2,22,Utilities,The Utilities sector comprises establishments ...,"[221, 2211, 22111, 221111, 221112, 221113, 221..."
3,23,Construction,The Construction sector comprises establishmen...,"[236, 2361, 23611, 236115, 236116, 236117, 236..."
4,31-33,Manufacturing,The Manufacturing sector comprises establishme...,"[311, 3111, 31111, 311111, 311119, 3112, 31121..."
5,42,Wholesale Trade,The Wholesale Trade sector comprises establish...,"[423, 4231, 42311, 423110, 42312, 423120, 4231..."
6,44-45,Retail Trade,The Retail Trade sector comprises establishmen...,"[441, 4411, 44111, 441110, 44112, 441120, 4412..."
7,48-49,Transportation and Warehousing,The Transportation and Warehousing sector incl...,"[481, 4811, 48111, 481111, 481112, 4812, 48121..."
8,51,Information,The Information sector comprises establishment...,"[512, 5121, 51211, 512110, 51212, 512120, 5121..."
9,52,Finance and Insurance,The Finance and Insurance sector comprises est...,"[521, 5211, 52111, 521110, 522, 5221, 52211, 5..."


In [30]:
all_naics: List[Naics] = []
# Codes whose page could not be fetched or parsed, kept for retry/inspection.
failed_codes: List[str] = []

# One page download per descendant code of every sector.
for base in all_base_naics:
    for desc_code in base.descendants_code:
        try:
            all_naics.append(Naics.from_code(desc_code))
        # Catch `Exception`, not a bare `except:`, so Ctrl-C (KeyboardInterrupt)
        # still stops this long crawl instead of being logged as a failed code.
        except Exception as exc:
            failed_codes.append(desc_code)
            # Include the cause so failures can be told apart (HTTP error,
            # timeout, missing page section, ...).
            print(f"Failed to get {desc_code}: {exc!r}")

In [34]:
# One row per Naics record; the dataclass fields become the columns.
naics_df = pd.DataFrame(data=all_naics)

In [36]:
# Show the descendant-level table as this cell's output.
naics_df

Unnamed: 0,code,title,description
0,111,Crop Production,Industries in the Crop Production subsector gr...
1,1111,Oilseed and Grain Farming,This industry group comprises establishments p...
2,11111,Soybean Farming,See industry description for 111110.\n ...
3,111110,Soybean Farming,This industry comprises establishments primari...
4,11112,Oilseed (except Soybean) Farming,See industry description for 111120.\n ...
...,...,...,...
2100,9281,National Security and International Affairs,No description provided for this industry grou...
2101,92811,National Security,See industry description for 928110.\n ...
2102,928110,National Security,This industry comprises government establishme...
2103,92812,International Affairs,See industry description for 928120.\n ...


In [48]:
# Stack the sector rows on top of the descendant rows and renumber the index
# from 0. naics_df has no descendants_code column, so those rows get missing
# values there.
frames = [base_naics_df, naics_df]
all_naics_df = pd.concat(frames, ignore_index=True)

In [49]:
# Persist the combined table to all_naics.parquet (path is relative to the
# current working directory).
all_naics_df.to_parquet("all_naics.parquet")

In [52]:
# Also export as CSV; index=False leaves the row index out of the file.
# NOTE(review): descendants_code holds lists, which CSV presumably stores as
# their string form -- confirm that consumers can parse it.
all_naics_df.to_csv("all_naics.csv", index=False)

In [50]:
# Read the Parquet file back in. Note that this rebinds `df`, which earlier
# held the sector table returned by read_html.
df = pd.read_parquet("all_naics.parquet")

In [51]:
# Show the reloaded table as this cell's output.
df

Unnamed: 0,code,title,description,descendants_code
0,11,"Agriculture, Forestry, Fishing and Hunting","The Agriculture, Forestry, Fishing and Hunting...","[111, 1111, 11111, 111110, 11112, 111120, 1111..."
1,21,"Mining, Quarrying, and Oil and Gas Extraction","The Mining, Quarrying, and Oil and Gas Extract...","[211, 2111, 21112, 211120, 21113, 211130, 212,..."
2,22,Utilities,The Utilities sector comprises establishments ...,"[221, 2211, 22111, 221111, 221112, 221113, 221..."
3,23,Construction,The Construction sector comprises establishmen...,"[236, 2361, 23611, 236115, 236116, 236117, 236..."
4,31-33,Manufacturing,The Manufacturing sector comprises establishme...,"[311, 3111, 31111, 311111, 311119, 3112, 31121..."
...,...,...,...,...
2120,9281,National Security and International Affairs,No description provided for this industry grou...,
2121,92811,National Security,See industry description for 928110.\n ...,
2122,928110,National Security,This industry comprises government establishme...,
2123,92812,International Affairs,See industry description for 928120.\n ...,
