In [1]:
from bs4 import BeautifulSoup
import chardet
import datetime
import duckdb
import io
import pandas as pd
import requests

In [2]:
DATA_PATH = '../data/'
PATH_TO_CSV = DATA_PATH+'csv/'                # input
PATH_TO_ONS = PATH_TO_CSV+'ons/'              # input
FILENAME_ONS_LIST = \
[
    "12063_E10000008_AllCountiesInCountry_England_latest.csv",
    "23125_E10000011_AllCountiesInCountry_England_latest.csv",
    "23125_E10000011_AllLaInUK_latest.csv",
    "3676_E10000008_AllCountiesInCountry_England_latest.csv",
    "6233_E10000008_AllCountiesInCountry_England_latest.csv",
    "6233_E10000008_AllLaInUK_latest.csv",
    "6234_E10000008_AllCountiesInCountry_England_latest.csv",
    "6234_E10000008_AllLaInUK_latest.csv",
    "6238_E10000008_AllCountiesInCountry_England_latest.csv",
    "6238_E10000008_AllLaInUK_latest.csv",
    "6605_E10000008_AllCountiesInCountry_England_latest.csv",
    "6605_E10000008_AllLaInUK_latest.csv",
]
FILENAME_OTHERS_LIST = \
[
    "bar-617-chart-5.csv",
    "local_authority_data_for_corporate_and_finance.csv",
    "searches.csv",
    "series-040525.csv",
    "series-050525 copy.csv",
    "series-050525-2.csv",
    "series-050525-3.csv",
    "series-050525-4.csv",
    "series-050525.csv",
    "sparql_result.csv",
    "Table-617-chart-8.csv",
]

In [11]:
filename.split('.')[0]

'6605_E10000008_AllLaInUK_latest'

In [3]:
def read_header_ons_county_data(filepath: str, desc_df: pd.DataFrame, nrows: int=6):
    f"""
    reads the header of a data file, assumed to be in the ons county by county dataset format
    """

    filepath = PATH_TO_ONS + filename

    # header part of the csv:
    temp = pd.read_csv(filepath, nrows=nrows, header=None)
    metric                = temp[3].iloc[0]
    dataset_id            = temp[3].iloc[1]
    applicable_date_range = temp[3].iloc[2]
    collection_date       = temp[3].iloc[3]
    dataset_unit          = temp[3].iloc[4]
    data_kind             = temp[3].iloc[5]
    
    desc_df.loc[len(desc_df)] = [
        dataset_id,
        metric,
        applicable_date_range,
        collection_date,
        dataset_unit,
        data_kind,
        filename,
        filepath
    ]
    return desc_df



In [4]:
ons_description_df = pd.DataFrame(
    columns=[
        'id',
        'metric',
        'date_range',
        'date_collected',
        'unit',
        'kind', 
        'filename',
        'filepath'
    ]
)
ons_description_df

Unnamed: 0,id,metric,date_range,date_collected,unit,kind,filename,filepath


# ons data overview

In [5]:
for filename in FILENAME_ONS_LIST:
    read_header_ons_county_data(filename, ons_description_df, nrows=6)

In [6]:
ons_description_df

Unnamed: 0,id,metric,date_range,date_collected,unit,kind,filename,filepath
0,12063,Percentage of physically active adults (aged 19+),2022/23,fin_2022_23,%,raw,12063_E10000008_AllCountiesInCountry_England_l...,../data/csv/ons/12063_E10000008_AllCountiesInC...
1,23125,"Total expenditure on arts, tourism, & the hist...",2023/24,fin_2023_24,GBP per person,raw,23125_E10000011_AllCountiesInCountry_England_l...,../data/csv/ons/23125_E10000011_AllCountiesInC...
2,23125,"Total expenditure on arts, tourism, & the hist...",2023/24,fin_2023_24,GBP per person,raw,23125_E10000011_AllLaInUK_latest.csv,../data/csv/ons/23125_E10000011_AllLaInUK_late...
3,3676,% of children at expected level in Expressive ...,2023/24 (academic),sch_2023_24,%,raw,3676_E10000008_AllCountiesInCountry_England_la...,../data/csv/ons/3676_E10000008_AllCountiesInCo...
4,6233,Employees - Arts development and support,2023/24,fin_2023_24,GBP (000),raw,6233_E10000008_AllCountiesInCountry_England_la...,../data/csv/ons/6233_E10000008_AllCountiesInCo...
5,6233,Employees - Arts development and support,2023/24,fin_2023_24,GBP (000),raw,6233_E10000008_AllLaInUK_latest.csv,../data/csv/ons/6233_E10000008_AllLaInUK_lates...
6,6234,Running expenses - Arts development and support,2023/24,fin_2023_24,GBP (000),raw,6234_E10000008_AllCountiesInCountry_England_la...,../data/csv/ons/6234_E10000008_AllCountiesInCo...
7,6234,Running expenses - Arts development and support,2023/24,fin_2023_24,GBP (000),raw,6234_E10000008_AllLaInUK_latest.csv,../data/csv/ons/6234_E10000008_AllLaInUK_lates...
8,6238,Total Income - Arts development and support,2023/24,fin_2023_24,GBP (000),raw,6238_E10000008_AllCountiesInCountry_England_la...,../data/csv/ons/6238_E10000008_AllCountiesInCo...
9,6238,Total Income - Arts development and support,2023/24,fin_2023_24,GBP (000),raw,6238_E10000008_AllLaInUK_latest.csv,../data/csv/ons/6238_E10000008_AllLaInUK_lates...


In [15]:
for idx, row in ons_description_df.iterrows():
    tablename  ='table_' + row['filename'].split('.')[0]
    filepath   =row['filepath']
    metric     =row['metric']
    unit       =row['unit']
    date_range =row['date_range']
    
    df = (
        pd.read_csv(filepath, skiprows=5, header=0)
        .rename(columns={'raw':metric})
        .assign(unit=unit, date_range=date_range)
    )
    display(df.head(3))
    
    with duckdb.connect(database = "../data/creatives.duckdb", read_only = False) as con:
        con.sql(f"DROP TABLE IF EXISTS {tablename}; CREATE TABLE {tablename} AS SELECT * FROM df;")
        con.sql(f"SELECT COUNT(*) FROM {tablename};")

Unnamed: 0,area label,area long label,area,Percentage of physically active adults (aged 19+),unit,date_range
0,Devon,Devon (County),E10000008,74.3,%,2022/23
1,Cambridgeshire,Cambridgeshire (County),E10000003,71.3,%,2022/23
2,Derbyshire,Derbyshire (County),E10000007,67.9,%,2022/23


Unnamed: 0,area label,area long label,area,"Total expenditure on arts, tourism, & the historic environment per head",unit,date_range
0,East Sussex,East Sussex (County),E10000011,0,GBP per person,2023/24
1,Cambridgeshire,Cambridgeshire (County),E10000003,1,GBP per person,2023/24
2,Derbyshire,Derbyshire (County),E10000007,2,GBP per person,2023/24


Unnamed: 0,area label,area long label,area,"Total expenditure on arts, tourism, & the historic environment per head",unit,date_range
0,East Sussex,East Sussex (County),E10000011,0,GBP per person,2023/24
1,Adur,Adur (District),E07000223,14,GBP per person,2023/24
2,Amber Valley,Amber Valley (District),E07000032,1,GBP per person,2023/24


Unnamed: 0,area label,area long label,area,% of children at expected level in Expressive arts and design,unit,date_range
0,Devon,Devon (County),E10000008,88.8,%,2023/24 (academic)
1,Cambridgeshire,Cambridgeshire (County),E10000003,86.3,%,2023/24 (academic)
2,Derbyshire,Derbyshire (County),E10000007,85.3,%,2023/24 (academic)


Unnamed: 0,area label,area long label,area,Employees - Arts development and support,unit,date_range
0,Devon,Devon (County),E10000008,0,GBP (000),2023/24
1,Cambridgeshire,Cambridgeshire (County),E10000003,0,GBP (000),2023/24
2,Derbyshire,Derbyshire (County),E10000007,58,GBP (000),2023/24


Unnamed: 0,area label,area long label,area,Employees - Arts development and support,unit,date_range
0,Devon,Devon (County),E10000008,0,GBP (000),2023/24
1,Adur,Adur (District),E07000223,0,GBP (000),2023/24
2,Amber Valley,Amber Valley (District),E07000032,0,GBP (000),2023/24


Unnamed: 0,area label,area long label,area,Running expenses - Arts development and support,unit,date_range
0,Devon,Devon (County),E10000008,0,GBP (000),2023/24
1,Cambridgeshire,Cambridgeshire (County),E10000003,0,GBP (000),2023/24
2,Derbyshire,Derbyshire (County),E10000007,69,GBP (000),2023/24


Unnamed: 0,area label,area long label,area,Running expenses - Arts development and support,unit,date_range
0,Devon,Devon (County),E10000008,0,GBP (000),2023/24
1,Adur,Adur (District),E07000223,0,GBP (000),2023/24
2,Amber Valley,Amber Valley (District),E07000032,0,GBP (000),2023/24


Unnamed: 0,area label,area long label,area,Total Income - Arts development and support,unit,date_range
0,Devon,Devon (County),E10000008,0,GBP (000),2023/24
1,Cambridgeshire,Cambridgeshire (County),E10000003,0,GBP (000),2023/24
2,Derbyshire,Derbyshire (County),E10000007,43,GBP (000),2023/24


Unnamed: 0,area label,area long label,area,Total Income - Arts development and support,unit,date_range
0,Devon,Devon (County),E10000008,0,GBP (000),2023/24
1,Adur,Adur (District),E07000223,0,GBP (000),2023/24
2,Amber Valley,Amber Valley (District),E07000032,0,GBP (000),2023/24


Unnamed: 0,area label,area long label,area,"Total expenditure on arts, tourism and historic environment",unit,date_range
0,Devon,Devon (County),E10000008,112,GBP (000),2023/24
1,Cambridgeshire,Cambridgeshire (County),E10000003,737,GBP (000),2023/24
2,Derbyshire,Derbyshire (County),E10000007,1735,GBP (000),2023/24


Unnamed: 0,area label,area long label,area,"Total expenditure on arts, tourism and historic environment",unit,date_range
0,Devon,Devon (County),E10000008,112,GBP (000),2023/24
1,Adur,Adur (District),E07000223,911,GBP (000),2023/24
2,Amber Valley,Amber Valley (District),E07000032,186,GBP (000),2023/24


12063_E10000008_AllCountiesInCountry_England_latest


In [16]:
import datetime
datetime.datetime.now().strftime('%Y-%m-%d %H:%M')


'2025-05-09 02:03'