In [1]:
import requests as req
from bs4 import BeautifulSoup
import pandas as pd
from slugify import slugify

In [2]:
def parse_html_table(table):
    n_columns = 0
    n_rows = 0
    column_names = []

    # Find number of rows and columns
    # we also find the column titles if we can
    for row in table.find_all('tr'):

        # Determine the number of rows in the table
        td_tags = row.find_all('td')
        if len(td_tags) > 0:
            n_rows+=1
            if n_columns == 0:
                # Set the number of columns for our table
                n_columns = len(td_tags)

        # Handle column names if we find them
        th_tags = row.find_all('th') 
        if len(th_tags) > 0 and len(column_names) == 0:
            for th in th_tags:
                column_names.append(th.get_text())

    # Safeguard on Column Titles
    if len(column_names) > 0 and len(column_names) != n_columns:
        raise Exception("Column titles do not match the number of columns")

    columns = column_names if len(column_names) > 0 else range(0,n_columns)
    df = pd.DataFrame(columns = columns,
                      index= range(0,n_rows))
    row_marker = 0
    for row in table.find_all('tr'):
        column_marker = 0
        columns = row.find_all('td')
        for column in columns:
            df.iat[row_marker,column_marker] = column.get_text()
            column_marker += 1
        if len(columns) > 0:
            row_marker += 1

    # Convert to float if possible
    for col in df:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            pass

    return df

In [3]:
host = 'https://en.wikipedia.org'
head_route = '/wiki'

In [4]:
tail_route = '/List_of_physical_quantities'

route = head_route + tail_route
css_query = "#mw-content-text > div.mw-parser-output > table"

r = req.get(host+route)
soup = BeautifulSoup(r.text, 'html.parser')

tables = []
for el in soup.select(css_query):
  tables.append(el)

tables = [ tables[0], tables[2] ]

In [5]:
df_tables = []

for i in range(len(tables)):
    table = tables[i]
    table_tmp = parse_html_table(table).replace(r'\n',' ', regex=True).replace(r'→',' ', regex=True)
    table_tmp.columns = [ column.replace('\n', '') for column in table_tmp.columns ]
    
    df_tables.append(table_tmp) 

In [7]:
si_units = df_tables[0];

dimensions = list(si_units['Dimension'])
units = list(si_units['SI base unit'])

dimension_units = [
    {
        'dimension': dimension,
        'unit': unit.split(' ')[1].replace(')', '').replace('(', '')
    }
    for (dimension, unit) in zip(dimensions, units)
]

print(dimension_units)

[{'dimension': 'N ', 'unit': 'mol'}, {'dimension': 'L ', 'unit': 'm'}, {'dimension': 'T ', 'unit': 's'}, {'dimension': 'M ', 'unit': 'kg'}, {'dimension': 'Θ ', 'unit': 'K'}, {'dimension': 'I ', 'unit': 'A'}, {'dimension': 'J ', 'unit': 'cd'}]


In [10]:
for table in df_tables:
    table.columns = [ column.replace('\n', '') for column in table.columns ]

In [11]:
dimension_exponents = {
    dimension: 0 for dimension in dimensions
}

physical_quantities = df_tables[1]

quantities_dict = []
for _, quantity in physical_quantities.iterrows():
    dimension = quantity['Dimension']
    dimension_split = dimension.split(' ')
    dimension_split.remove('')
    
    curr_dimension_exp = dict(dimension_exponents);
    
    for component in dimension_split:
        if(len(component) == 1):
            symbol = component[0]
            curr_dimension_exp[symbol] = 1
        else:
            symbol = component[0]
            exponent = component[1:].replace('−', '-')
            curr_dimension_exp[symbol] = int(exponent)
    
    quantities_dict.append({
        "name": slugify(quantity['Derived quantity'].lower()),
        "symbol": quantity['Symbol'].strip(),
        "analysis": curr_dimension_exp
    })

{'N ': 0, 'L ': 0, 'T ': 0, 'M ': 0, 'Θ ': 0, 'I ': 0, 'J ': 0}


In [12]:
quantities_dict

[{'name': 'absement',
  'symbol': 'A',
  'analysis': {'N ': 0,
   'L ': 0,
   'T ': 0,
   'M ': 0,
   'Θ ': 0,
   'I ': 0,
   'J ': 0,
   'L': 1,
   'T': 1}},
 {'name': 'absorbed-dose-rate',
  'symbol': '',
  'analysis': {'N ': 0,
   'L ': 0,
   'T ': 0,
   'M ': 0,
   'Θ ': 0,
   'I ': 0,
   'J ': 0,
   'L': 2,
   'T': -3}},
 {'name': 'acceleration',
  'symbol': 'a',
  'analysis': {'N ': 0,
   'L ': 0,
   'T ': 0,
   'M ': 0,
   'Θ ': 0,
   'I ': 0,
   'J ': 0,
   'L': 1,
   'T': -2}},
 {'name': 'action',
  'symbol': 'S',
  'analysis': {'N ': 0,
   'L ': 0,
   'T ': 0,
   'M ': 0,
   'Θ ': 0,
   'I ': 0,
   'J ': 0,
   'L': 2,
   'M': 1,
   'T': -1}},
 {'name': 'angular-acceleration',
  'symbol': 'ωa',
  'analysis': {'N ': 0,
   'L ': 0,
   'T ': 0,
   'M ': 0,
   'Θ ': 0,
   'I ': 0,
   'J ': 0,
   'T': -2}},
 {'name': 'angular-momentum',
  'symbol': 'L',
  'analysis': {'N ': 0,
   'L ': 0,
   'T ': 0,
   'M ': 0,
   'Θ ': 0,
   'I ': 0,
   'J ': 0,
   'L': 2,
   'M': 1,
   'T': -1}}