In [None]:
'''
Aaron Chen
aaron.chen@spc.ox.ac.uk

SIC code dictionary scraper

Scrapes the SIC category table from the companies house website. Returns a Dataframe 'category_df.csv'. 
'''

In [23]:
import requests
import pandas as pd

In [3]:
URL = 'https://resources.companieshouse.gov.uk/sic/'
page = requests.get(URL)
print(page.text)

<!DOCTYPE html>
<!--[if lt IE 9]><html class="lte-ie8" lang=""><![endif]-->
<!--[if gt IE 8]><!-->
<html class=" js no-touch" lang="en">
  <!--<![endif]-->
  <head>
    <meta http-equiv="content-type" content="text/html; charset=UTF-8">

<title>Nature of business: Standard Industrial Classification (SIC) codes</title>
<!--[if gt IE 8]><!--><link href="/dashboard/companies-house_files/govuk-template.css" media="screen" rel="stylesheet" type="text/css"><!--<![endif]-->
<!--[if IE 6]><link href="/dashboard/companies-house_files/govuk-template-ie6.css" media="screen" rel="stylesheet" type="text/css" /><![endif]-->
<!--[if IE 7]><link href="/dashboard/companies-house_files/govuk-template-ie7.css" media="screen" rel="stylesheet" type="text/css" /><![endif]-->
<!--[if IE 8]><link href="/dashboard/companies-house_files/govuk-template-ie8.css" media="screen" rel="stylesheet" type="text/css" /><![endif]-->
<link href="/dashboard/companies-house_files/govuk-template-print.css" media="print" rel="

In [4]:
from scrapy import Selector

In [8]:
sel = Selector(text = page.text)

In [22]:
sel.xpath('//table/tbody/tr').extract_first()

'<tr>\n        <td style="width:15%"><strong>Section A</strong></td>\n        <td><strong>Agriculture, Forestry and Fishing</strong></td>\n      </tr>'

In [24]:
for line in sel.xpath('//table/tbody/tr').extract(): 
    print(line.find(class_='strong'))

TypeError: find() takes no keyword arguments

In [25]:
from bs4 import BeautifulSoup

In [26]:
soup = BeautifulSoup(page.text, 'html.parser')
print(soup.title)

<title>Nature of business: Standard Industrial Classification (SIC) codes</title>


In [37]:
table = soup.find('table' ,attrs={'class':'generalTable'})
table.find_all('tbody')

[<tbody>
 <tr>
 <td style="width:15%"><strong>Section A</strong></td>
 <td><strong>Agriculture, Forestry and Fishing</strong></td>
 </tr>
 <tr>
 <td>01110</td>
 <td>Growing of cereals (except rice), leguminous crops and oil seeds </td>
 </tr>
 <tr>
 <td>01120</td>
 <td>Growing of rice </td>
 </tr>
 <tr>
 <td>01130</td>
 <td>Growing of vegetables and melons, roots and tubers </td>
 </tr>
 <tr>
 <td>01140</td>
 <td>Growing of sugar cane </td>
 </tr>
 <tr>
 <td>01150</td>
 <td>Growing of tobacco </td>
 </tr>
 <tr>
 <td>01160</td>
 <td>Growing of fibre crops </td>
 </tr>
 <tr>
 <td>01190</td>
 <td>Growing of other non-perennial crops </td>
 </tr>
 <tr>
 <td>01210</td>
 <td>Growing of grapes </td>
 </tr>
 <tr>
 <td>01220</td>
 <td>Growing of tropical and subtropical fruits </td>
 </tr>
 <tr>
 <td>01230</td>
 <td>Growing of citrus fruits </td>
 </tr>
 <tr>
 <td>01240</td>
 <td>Growing of pome fruits and stone fruits </td>
 </tr>
 <tr>
 <td>01250</td>
 <td>Growing of other tree and bush fruit

In [43]:
tbody = table.find('tbody')

In [44]:
trs = tbody.find_all('tr')

In [46]:
for tr in trs: 
    tds = tr.find_all('td')
    for td in tds: 
        print(td.text)

Section A
Agriculture, Forestry and Fishing
01110
Growing of cereals (except rice), leguminous crops and oil seeds 
01120
Growing of rice 
01130
Growing of vegetables and melons, roots and tubers 
01140
Growing of sugar cane 
01150
Growing of tobacco 
01160
Growing of fibre crops 
01190
Growing of other non-perennial crops 
01210
Growing of grapes 
01220
Growing of tropical and subtropical fruits 
01230
Growing of citrus fruits 
01240
Growing of pome fruits and stone fruits 
01250
Growing of other tree and bush fruits and nuts 
01260
Growing of oleaginous fruits 
01270
Growing of beverage crops 
01280
Growing of spices, aromatic, drug and pharmaceutical crops 
01290
Growing of other perennial crops 
01300
Plant propagation 
01410
Raising of dairy cattle 
01420
Raising of other cattle and buffaloes 
01430
Raising of horses and other equines 
01440
Raising of camels and camelids 
01450
Raising of sheep and goats 
01460
Raising of swine/pigs 
01470
Raising of poultry 
01490
Raising of oth

In [128]:
code = []
description = []
section = []
section_codes = []


for tr in trs: 
    tds = tr.find_all('td')
    if tds[0].find("strong"): 
        section_code = tds[0].text.split(' ')[1]
        current_section = tds[1]
    else: 
        code.append(tds[0].text)
        description.append(tds[1].text)
        section.append(current_section.text)
        section_codes.append(section_code)


In [129]:
section

['Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agriculture, Forestry and Fishing',
 'Agricultur

In [130]:
category_df = pd.DataFrame()
category_df['section_code'], category_df['section'], category_df['SIC_code'], category_df['description'] = [section_codes, section, code, description]
category_df

Unnamed: 0,section_code,section,SIC_code,description
0,A,"Agriculture, Forestry and Fishing",01110,"Growing of cereals (except rice), leguminous c..."
1,A,"Agriculture, Forestry and Fishing",01120,Growing of rice
2,A,"Agriculture, Forestry and Fishing",01130,"Growing of vegetables and melons, roots and tu..."
3,A,"Agriculture, Forestry and Fishing",01140,Growing of sugar cane
4,A,"Agriculture, Forestry and Fishing",01150,Growing of tobacco
...,...,...,...,...
726,T,Activities of households as employers; undiffe...,98000,Residents property management
727,T,Activities of households as employers; undiffe...,98100,Undifferentiated goods-producing activities of...
728,T,Activities of households as employers; undiffe...,98200,Undifferentiated service-producing activities ...
729,U,Activities of extraterritorial organisations a...,99000,Activities of extraterritorial organisations a...


In [137]:
category_df.section_code.value_counts()

section_code
C    259
G    103
N     44
A     40
K     34
H     33
M     33
J     32
F     25
S     19
R     17
B     16
I     13
Q     13
P     12
E      9
O      9
D      8
L      6
T      4
U      2
Name: count, dtype: int64

In [136]:
category_df.to_csv('category_df.csv', index=False)