# Experimenting with possible approaches

In [22]:
!pip install -r requirements.txt

Collecting libpysal (from -r requirements.txt (line 3))
  Downloading libpysal-4.12.0-py3-none-any.whl.metadata (4.8 kB)
Downloading libpysal-4.12.0-py3-none-any.whl (2.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: libpysal
Successfully installed libpysal-4.12.0
[0m

## Imports

In [23]:
import numpy as np
import pandas as pd
import pymc as pm
import geopy.distance
import geopandas as gpd
import matplotlib.pyplot as plt
import libpysal as ps

## 1. Data compilation

In [None]:
# assumptions
# source - https://www.icaew.com/insights/viewpoints-on-the-news/2022/sept-2022/chart-of-the-week-energy-price-cap-update
GAS_PRICE_PER_KWH = 3.3
ELECTRIC_PRICE_PER_KWH = 19.0 

In [None]:
# look at the headline dataset of consumption by LSOA
main_data = pd.read_csv("data/LSOA Energy Consumption Data.csv")

In [None]:
main_data.shape

In [None]:
main_data['Lower Layer Super Output Area (LSOA) Code'].nunique()

In [None]:
# Library to work with netCDF files
from netCDF4 import Dataset

file_name = "data/tas_hadukgrid_uk_60km_ann_202101-202112.nc"
file_id = Dataset(file_name)

latitude = file_id.variables["latitude"][:,:]
longitude = file_id.variables["longitude"][:,:]
temps = file_id.variables["tas"][:,:]

lats = [np.mean(x) for x in latitude]
longs = [np.mean(x) for x in longitude] 
ts = [np.mean(x) for x in temps[0]]
temp_data = pd.DataFrame({"latitude": lats,
                          "longitude": longs,
                          "temperature": ts}
                        )

temp_data = temp_data[temp_data.temperature > 0]

### Combining and generating features

In [None]:
# feature generation
main_data["pct_electric"] = main_data['Electricity Consumption (kWh)'] / main_data['Total Energy Consumption (kWh)']
main_data["coords"] = [(lat, long) for lat, long in zip(main_data.Latitude, main_data.Longitude)]

df = main_data[['Local Authority Name', 'Local Authority Code', 'MSOA Name',
       'Middle Layer Super Output Area (MSOA) Code', 'LSOA Name',
       'Lower Layer Super Output Area (LSOA) Code', 'coords',
       'pct_electric', 'Average Energy Consumption per Person (kWh)']]

df.columns = ['LA_name', 'LA', 'MSOA_ame',
       'MSOA', 'LSOA_name',
       'LSOA', 'coords',
       'pct_electric', 'energy_consumption_per_person']

In [None]:
list(zip(lats, longs))

In [None]:
# add temperature data
coords =  [(lat, long) for lat, long in zip(temp_data.latitude, temp_data.longitude)]
temp_dict = {co:t for co,t in zip(coords, temp_data.temperature)}

def find_closest_temp_measurement(this_point):
    return temp_dict[min(temp_dict.keys(), key=lambda x: geopy.distance.geodesic(this_point, x))]

df["temperature"] = [find_closest_temp_measurement(x) for x in df.coords]

In [None]:
# compute energy cost
df["energy_cost"] = [ELECTRIC_PRICE_PER_KWH * x + GAS_PRICE_PER_KWH * (1-x) for x in df["pct_electric"]]

In [None]:
# add income data
income_data = pd.read_csv("data/net_income_after_housing_costs.csv")
income_data = income_data[["MSOA code", "Net annual income after housing costs (£)"]].copy()
income_data.columns = ["MSOA", "net_income"]
df = df.merge(income_data, on="MSOA", how="left")

In [None]:
income_data

In [None]:
df.shape

In [None]:
# add green data
voting_data = pd.read_csv("data/CBP09228_detailed_results_England_elections.csv")
voting_data["pct_green"] = voting_data["Green"] / voting_data["Total"]
voting_data["green_council"] = voting_data["pct_green"] >= 0.1
voting_data = voting_data[["ONS code", "green_council"]].copy()
voting_data.columns = ["LA", "politically_green"]
df = df.merge(voting_data, on="LA", how="left")

In [None]:
df.shape

In [None]:
# add employment status
economic_activity = pd.read_csv("data/economic_activity.csv")
economic_activity = economic_activity[["Area code", "Economically active: \nIn employment \n(including full-time students), \n2021\n(percent)"]]
economic_activity.columns = ["LA", "pct_economically_active"]
df = df.merge(economic_activity, on="LA", how="left")

In [None]:
df.shape

In [None]:
# add in home occupancy data
households = pd.read_csv("data/RM202-Household-Size-By-Number-Of-Rooms-2021-lsoa-ONS.csv")
households.rename(columns={"Lower layer Super Output Areas Code": "LSOA"}, inplace=True)
households["pct_home_occupancy"] = households["Household size (5 categories) Code"] / households["Number of rooms (Valuation Office Agency) (6 categories) Code"]
households["pct_home_occupancy_x_obs"] = households["pct_home_occupancy"] * households["Observation"]
households["home_size_x_obs"] = households["Number of rooms (Valuation Office Agency) (6 categories) Code"] * households["Observation"]
totals = households.groupby("LSOA")[["pct_home_occupancy_x_obs", "home_size_x_obs", "Observation"]].sum().reset_index()
totals["home_size"] = totals["home_size_x_obs"] / totals["Observation"]
totals["pct_home_occupancy"] = totals["pct_home_occupancy_x_obs"] / totals["Observation"]
totals = totals[["LSOA", "home_size", "pct_home_occupancy"]]
df = df.merge(totals, on="LSOA", how="left")

In [None]:
df.shape

In [None]:
# add in building type - go for pct detatched
buildings1 = pd.read_csv("data/CTSOP_3_1_2021.csv")
buildings1 = buildings1[(buildings1.geography == "LSOA") & (buildings1.band == "All")]
buildings1 = buildings1[["ecode", "bungalow_total", "flat_mais_total", "house_terraced_total",
                         "house_semi_total", "house_detached_total", "all_properties"]]
buildings1 = buildings1.replace("-","0")

# num exposed surfaces
exposed_surfaces_per_type = {
    "bungalow_total": 5,
    "flat_mais_total": 2,
    "house_terraced_total": 3,
    "house_semi_total": 4,
    "house_detached_total": 5
}

buildings1[list(exposed_surfaces_per_type.keys())] = buildings1[exposed_surfaces_per_type.keys()].astype(int)
total_exposed_surfaces = buildings1[list(exposed_surfaces_per_type.keys())].mul(exposed_surfaces_per_type).sum(axis=1)
buildings1["home_exposed_surfaces"]  = [x / int(y) for x,y in zip(total_exposed_surfaces,  buildings1["all_properties"])]
buildings1 = buildings1[["ecode", "home_exposed_surfaces"]]
buildings1.columns = ["LSOA", "home_exposed_surfaces"]

df = df.merge(buildings1, on="LSOA", how="left")

In [None]:
df.shape

In [None]:
# add in building age
buildings2 = pd.read_csv("data/CTSOP_4_1_2021.csv")
buildings2 = buildings2[(buildings2.geography == "LSOA") & (buildings2.band == "All")]
buildings2 = buildings2.replace("-","0")

build_dates = {
    'bp_pre_1900': 1900,
    'bp_1900_1918': 1910, 
    'bp_1919_1929': 1925, 
    'bp_1930_1939': 1935, 
    'bp_1945_1954': 1950,
    'bp_1955_1964': 1960, 
    'bp_1965_1972': 1969, 
    'bp_1973_1982': 1978, 
    'bp_1983_1992': 1988,
    'bp_1993_1999': 1996, 
    'bp_2000_2008': 2004, 
    'bp_2009': 2009, 
    'bp_2010': 2010, 
    'bp_2011': 2011,
    'bp_2012': 2012, 
    'bp_2013': 2013, 
    'bp_2014': 2014, 
    'bp_2015': 2015, 
    'bp_2016': 2016, 
    'bp_2017': 2017,
    'bp_2018': 2018,
    'bp_2019': 2019,
    'bp_2020': 2020,
    'bp_2021': 2021,
    'bp_2022_2023': 2021,
    'bp_unkw': 1900 # assume if unknown then likely very old
}

buildings2[list(build_dates.keys())] = buildings2[build_dates.keys()].astype(int)
build_year = buildings2[list(build_dates.keys())].mul(build_dates).sum(axis=1)
totals = buildings2[list(build_dates.keys())].sum(axis=1)
buildings2["home_age"]  = [2021-(x / y) for x,y in zip(build_year,  totals)]
buildings2 = buildings2[["ecode", "home_age"]]
buildings2.columns = ["LSOA", "home_age"]

df = df.merge(buildings2, on="LSOA", how="left")

In [None]:
df.shape

In [None]:
# write clean file
df.columns
final_columns = ['LSOA', 'temperature','energy_cost', 'net_income', 'politically_green',
       'pct_economically_active', 'home_size', 'pct_home_occupancy',
       'home_exposed_surfaces', 'home_age', 'energy_consumption_per_person']
df = df[final_columns]
# df.to_csv("compiled_data.csv")

## 2. Analysis and data viz

In [6]:
df = pd.read_csv("compiled_data.csv")
df.shape

(33811, 13)

In [7]:
# data cleaning
df["politically_green"] = [1 if x == True else 0 for x in df.politically_green]

In [36]:
# plot map
las = gpd.GeoDataFrame.from_file("data/LAD_DEC_2021_UK_BFC.shp")
las.rename(columns={"LAD21CD":"LA"}, inplace=True)
df_energy = df.groupby("LA")["energy_consumption_per_person"].mean().reset_index()
las = las.merge(df_energy, on="LA", how="left")

In [37]:
las.energy_consumption_per_person.value_counts(dropna=F

Unnamed: 0,LA,LAD21NM,BNG_E,BNG_N,LONG,LAT,GlobalID,geometry,energy_consumption_per_person
0,E06000001,Hartlepool,447160,531474,-1.27018,54.6761,ca7f3a60-3c61-4e35-9294-bb4bf30aa27e,"MULTIPOLYGON (((450154.599 525938.201, 450140....",7204.567376
1,E06000002,Middlesbrough,451141,516887,-1.21099,54.5447,e05c5c16-f8ff-4e8e-a972-8b2821f4256a,"MULTIPOLYGON (((446854.7 517192.7, 446854.281 ...",6906.235661
2,E06000003,Redcar and Cleveland,464361,519597,-1.00608,54.5675,9946aa82-eadc-40ce-bc31-966c7f64227f,"MULTIPOLYGON (((451747.397 520561.1, 451792.20...",7136.487751
3,E06000004,Stockton-on-Tees,444940,518183,-1.30664,54.5569,a7b036a9-1a06-4206-956c-5519c3a8dd66,"MULTIPOLYGON (((447177.704 517811.797, 447176....",7101.935820
4,E06000005,Darlington,428029,515648,-1.56835,54.5353,89ab5e88-63b6-4f7b-b8dc-7618eee2867b,"POLYGON ((423496.602 524724.299, 423497.204 52...",7640.931522
...,...,...,...,...,...,...,...,...,...
369,W06000020,Torfaen,327459,200480,-3.05101,51.6984,ed5461b6-cdb4-46b1-8049-0e9216b9d00f,"POLYGON ((323898.201 211287.499, 324115.698 21...",6729.757660
370,W06000021,Monmouthshire,337812,209231,-2.90280,51.7783,a7feffe9-3fac-401c-b162-f853252c26f8,"MULTIPOLYGON (((345897.698 180999.599, 345884....",6539.809934
371,W06000022,Newport,337897,187432,-2.89769,51.5823,b457647a-1151-426a-be0b-57d4ed9d1687,"MULTIPOLYGON (((334186.001 192669.398, 334201....",6562.066450
372,W06000023,Powys,302329,273255,-3.43531,52.3486,90ff5771-3f0e-4095-96d1-e5263096c992,"MULTIPOLYGON (((270499 297829.35, 270658 29772...",5407.210594


In [None]:
las.plot(column="energy_consumption_per_person", cmap="OrRd", edgecolor="k", legend=True)

## 3. Modelling

In [None]:
# data preparation - cleaning

In [None]:
# data preparation - normalisation

In [None]:
# model

In [None]:
# inference

In [None]:
# results