# IDS Integrated Data Asset (IDA) Linkage demo

This script is broadly intended to demonstrate how easily RDMF indexed IDAs can be linked and thus prepared for analysts to use.

In [2]:
# import needed packages (not many!)
from google.cloud import bigquery
import pandas as pd

In [130]:
df = pd.DataFrame({'geography_entry_id_region_code': [500000000000008, 500000000000009, 500000000000010, 500000000000011,
                                                     500000000000012, 500000000000013, 500000000000014, 500000000000015,
                                                     500000000000016],
                  'geography_layer_code': ['E12000001', 'E12000002', 'E12000003', 'E12000004', 'E12000005', 'E12000006',
                                          'E12000007', 'E12000008', 'E12000009'],
                  'region_name': ['North East', 'North West', 'Yorkshire and The Humber', 'East Midlands', 'West Midlands',
                                  'East of England', 'London', 'South East', 'South West']
                  })

In [132]:
df

Unnamed: 0,geography_entry_id_region_code,geography_layer_code,region_name
0,500000000000008,E12000001,North East
1,500000000000009,E12000002,North West
2,500000000000010,E12000003,Yorkshire and The Humber
3,500000000000011,E12000004,East Midlands
4,500000000000012,E12000005,West Midlands
5,500000000000013,E12000006,East of England
6,500000000000014,E12000007,London
7,500000000000015,E12000008,South East
8,500000000000016,E12000009,South West


In [138]:
df.to_gbq('demo_wip_notebook.geography_index_lookup', 
                 'ons-ids-analysis-prod',
                 chunksize=None, # I have tried with several chunk sizes, it runs faster when it's one big chunk (at least for me)
                 )

100%|██████████| 1/1 [00:00<00:00, 5924.16it/s]


In [137]:
import pandas-gbq

SyntaxError: invalid syntax (3409532940.py, line 1)

In [120]:
# joining OSPOS/LRPP linked data with EPC data
# may need to exclude columns before the join:
# EXCEPT (geography_entry_id_postcode, uprn_hashed, postcode_hashed, guid, property_type, address_entry_id_uprn)

query = ("""

SELECT
  geography_entry_id_region_code
FROM
  `ons-ids-analysis-prod.demo_wip_notebook.ospos_lrpp_linked`
   
""")

query_job = client.query(
    query,
    location="europe-west2",
)

linked_data = query_job.to_dataframe()

In [129]:
pd.merge(df, linked_data, on = 'geography_entry_id_region_code', how = 'inner')

Unnamed: 0,geography_entry_id_region_code,geography_layer_code
0,500000000000008,E12000001
1,500000000000008,E12000001
2,500000000000008,E12000001
3,500000000000008,E12000001
4,500000000000008,E12000001
...,...,...
24612807,500000000000016,E12000009
24612808,500000000000016,E12000009
24612809,500000000000016,E12000009
24612810,500000000000016,E12000009


In [123]:
linked_data.geography_entry_id_region_code.value_counts()

500000000000015.000    4487211
500000000000014.000    3213019
500000000000009.000    3124714
500000000000013.000    3009032
500000000000016.000    2811756
500000000000010.000    2339805
500000000000012.000    2320427
500000000000011.000    2188952
500000000000008.000    1117896
Name: geography_entry_id_region_code, dtype: int64

In [3]:
# initiate client for bigquery
client = bigquery.Client(location="europe-west2")

First, we need to link together the `std` and `georef` versions of the required assets, so that we have all of the data's attributes, along with it's geographic information that will link back to the address/geography indexes. These each have `guid` columns this can be done with.

First, let's do this for the Energy Performance Certificates data, which will give us insights into housing energy efficiency:

In [12]:
# linking EPC std and georef data
query = ("""

SELECT
 *
FROM
  `some_path_epc_georef` t1
JOIN
  `some_path_epc_std` t2
ON
  t1.id = t2.guid

""")

table_ref = 'ons-ids-analysis-prod.demo_wip_notebook.EPC_linked' # table name reference for saving output

job_config = bigquery.QueryJobConfig(
    destination= table_ref
) # specify table destination in job config

query_job = client.query(
    query,
    location="europe-west2",
    job_config = job_config
)  # API request - starts the query

query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7fec2c1be890>

Next, let's do the same for the Land Registry Price Paid (LRPP) data, which will give us house sale price information:

In [47]:
# joining georef and std LRPP tables
query = ("""

SELECT
  *
FROM
  `ingest_ida_land_registry_price_paid.prices_paid_georef` t1
JOIN
  `ingest_ida_land_registry_price_paid.prices_paid_std` t2
ON
  t1.id = t2.guid

""")
table_ref = 'ons-ids-analysis-prod.demo_wip_notebook.lrpp_linked'

job_config = bigquery.QueryJobConfig(
    destination= table_ref
)

query_job = client.query(
    query,
    location="europe-west2",
    job_config=job_config
)


query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7fec26702b90>

Finally, and this is the bit where the Integrated Data Asset version of these datasets comes in handy, we can link all of these sources together on their `address_entry_id` variables. This is the lookup to the RDMF, specifically the address index, and is a unique identifier in this case.

In [103]:
# joining OSPOS and (newly joined) LRPP tables
query = ("""

SELECT
  geography_entry_id_lsoa_code,
  geography_entry_id_oa_code,
  geography_entry_id_lad_code,
  geography_entry_id_msoa_code,
  geography_entry_id_country_code,
  geography_entry_id_region_code,
  lsoa_code_hashed,
  oa_code_hashed,
  lad_code_hashed,
  msoa_code_hashed,
  country_code_hashed,
  region_code_hashed,
  class,
  voa_ct_record,
  ruc_code,
  ruc,
  ur2fold,
  ur3fold,
  ur6fold,
  ur8fold,
  building_flat_count,
  private_outdoor_space,
  private_outdoor_space_area,
  t1.address_entry_id_uprn,
  t2.address_entry_id_uprn AS t2_address_entry_id_uprn,
  t2.id,
  apiversion,
  confidencescore,
  epoch,
  underlyingscore,
  t2.geography_entry_id_postcode,
  t2.postcode_hashed,
  t2.guid,
  t2.transaction_uid,
  price,
  date_of_transfer,
  t2.property_type,
  old_new,
  duration,
  locality,
  ppd_category_type,
  derived_year
FROM
  `ons-ids-data-prod.ingest_ida_ordnance_survey_private_outside_space.april_gb_private_outdoor_space_std` AS t1
JOIN
  `ons-ids-analysis-prod.demo_wip_notebook.lrpp_linked` AS t2
ON
  t1.address_entry_id_uprn = t2.address_entry_id_uprn
  
""")

table_ref = 'ons-ids-analysis-prod.demo_wip_notebook.ospos_lrpp_linked'

job_config = bigquery.QueryJobConfig(
    destination= table_ref
)

job_config.write_disposition = "WRITE_TRUNCATE"

query_job = client.query(
    query,
    location="europe-west2",
    job_config=job_config
)

query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x7f5c033f0910>

In [None]:
# joining OSPOS/LRPP linked data with EPC data
# may need to exclude columns before the join:
# EXCEPT (geography_entry_id_postcode, uprn_hashed, postcode_hashed, guid, property_type, address_entry_id_uprn)

query = ("""

SELECT
  *
FROM
  `ons-ids-analysis-prod.demo_wip_notebook.ospos_lrpp_linked` t1
JOIN
  `ons-ids-analysis-prod.demo_wip_notebook.EPC_linked` t2
ON
  t1.address_entry_id_uprn = t2.address_entry_id_uprn;
    
""")

table_ref = 'ons-ids-analysis-prod.demo_wip_notebook.ospos_lrpp_epc_linked'

job_config = bigquery.QueryJobConfig(
    destination= table_ref
)

query_job = client.query(
    query,
    location="europe-west2",
    job_config=job_config
)

query_job.result()

## Non-matches and bias

In [None]:
# joining georef and std LRPP tables
query = ("""
SELECT 

 address_entry_id_uprn

FROM 
 `ons-ids-analysis-prod.demo_wip_notebook.lrpp_linked`

""")

query_job = client.query(
    query,
    location="europe-west2",
)  # API request - starts the query


df = query_job.to_dataframe()

For cases where records did not match to an RDMF index, for example the address index, their `address_entry_id` will be null. This allows identification of the non-matching records

In [112]:
unmatched  = len(df[df.address_entry_id_uprn.isnull() == True])
matched  = len(df[df.address_entry_id_uprn.isnull() == False])

print("the match rate is:", ((matched-unmatched)/matched)*100)

the match rate is: 99.98994303084024


The analysts can then easily explore the characteristics of the non-linking data. For instance, exploring the trends in matches vs non-matches by region:

In [10]:
# read in geography index lookup, join on geography_entry_region_code

# hashed values - can aggregate on these hashed values to explore bias (i.e. same hashed uprn for same hh)

# match rate by region

26886828

In [None]:
# exploring LRPP join
query = ("""
SELECT 
COUNT(*) as numRecords

FROM `ingest_ida_land_registry_price_paid.prices_paid_georef` t1
JOIN `ingest_ida_land_registry_price_paid.prices_paid_std` t2

ON t1.id = t2.guid;
""")

query_job = client.query(
    query,
    location="europe-west2",
)  # API request - starts the query

df  = query_job.to_dataframe()

In [30]:
# all 26889532 records join
df

Unnamed: 0,numRecords
0,26889532


In [16]:
# exploring LRPP columns, querying where a false condition is true to just return column names
query = ("""
SELECT * 

FROM `ingest_ida_land_registry_price_paid.prices_paid_std`

WHERE 1 = 0
""")

query_job = client.query(
    query,
    location="europe-west2",
)  # API request - starts the query

df  = query_job.to_dataframe()


In [17]:
df.columns.tolist()

['geography_entry_id_postcode',
 'postcode_hashed',
 'guid',
 'transaction_uid',
 'price',
 'date_of_transfer',
 'property_type',
 'old_new',
 'duration',
 'locality',
 'ppd_category_type',
 'derived_year']