# Experimenting with possible approaches

In [1]:
!pip install -r ../requirements.txt

[0m

## Imports

In [2]:
import numpy as np
import pandas as pd
import pymc as pm
import geopy.distance

## 1. Data compilation

In [3]:
# assumptions
# source - https://www.icaew.com/insights/viewpoints-on-the-news/2022/sept-2022/chart-of-the-week-energy-price-cap-update
GAS_PRICE_PER_KWH = 3.3
ELECTRIC_PRICE_PER_KWH = 19.0 

In [4]:
# look at the headline dataset of consumption by LSOA
main_data = pd.read_csv("../data/raw/LSOA Energy Consumption Data.csv")

In [5]:
main_data.shape

(33811, 21)

In [6]:
main_data['Lower Layer Super Output Area (LSOA) Code'].nunique()

33811

In [7]:
# look at household size data
household_size = pd.read_csv("../data/raw/RM202-Household-Size-By-Number-Of-Rooms-2021-lsoa-ONS.csv")

In [8]:
household_size["Number of rooms (Valuation Office Agency) (6 categories) Code"].unique()

array([1, 2, 3, 4, 5, 6])

In [9]:
household_size[household_size["Number of rooms (Valuation Office Agency) (6 categories) Code"] == 4]["Household size (5 categories) Code"].unique()

array([0, 1, 2, 3, 4])

In [10]:
building_age = pd.read_csv("../data/raw/CTSOP_4_1_2021.csv")

In [11]:
# Library to work with netCDF files
from netCDF4 import Dataset

file_name = "../data/raw/tas_hadukgrid_uk_60km_ann_202101-202112.nc"
file_id = Dataset(file_name)

latitude = file_id.variables["latitude"][:,:]
longitude = file_id.variables["longitude"][:,:]
temps = file_id.variables["tas"][:,:]

lats = [np.mean(x) for x in latitude]
longs = [np.mean(x) for x in longitude] 
ts = [np.mean(x) for x in temps[0]]
temp_data = pd.DataFrame({"latitude": lats,
                          "longitude": longs,
                          "temperature": ts}
                        )

temp_data = temp_data[temp_data.temperature > 0]

### Combining and generating features

In [12]:
# feature generation
main_data["pct_electric"] = main_data['Electricity Consumption (kWh)'] / main_data['Total Energy Consumption (kWh)']
main_data["coords"] = [(lat, long) for lat, long in zip(main_data.Latitude, main_data.Longitude)]

In [13]:
df = main_data[['Local Authority Name', 'Local Authority Code', 'MSOA Name',
       'Middle Layer Super Output Area (MSOA) Code', 'LSOA Name',
       'Lower Layer Super Output Area (LSOA) Code', 'coords',
       'pct_electric', 'Average Energy Consumption per Person (kWh)']]

In [14]:
df.columns = ['LA_name', 'LA', 'MSOA_ame',
       'MSOA', 'LSOA_name',
       'LSOA', 'coords',
       'pct_electric', 'energy_consumption_per_person']

In [15]:
# add temperature data
coords =  [(lat, long) for lat, long in zip(temp_data.latitude, temp_data.longitude)]
temp_dict = {co:t for co,t in zip(coords, temp_data.temperature)}

def find_closest_temp_measurement(this_point):
    return temp_dict[min(temp_dict.keys(), key=lambda x: geopy.distance.geodesic(this_point, x))]

df["temperature"] = [find_closest_temp_measurement(x) for x in df.coords]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["temperature"] = [find_closest_temp_measurement(x) for x in df.coords]


In [16]:
# compute energy cost
df["energy_cost"] = [ELECTRIC_PRICE_PER_KWH * x + GAS_PRICE_PER_KWH * (1-x) for x in df["pct_electric"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["energy_cost"] = [ELECTRIC_PRICE_PER_KWH * x + GAS_PRICE_PER_KWH * (1-x) for x in df["pct_electric"]]


In [17]:
# add income data
income_data = pd.read_csv("../data/raw/net_income_after_housing_costs.csv")
income_data = income_data[["MSOA code", "Net annual income after housing costs (£)"]].copy()
income_data.columns = ["MSOA", "net_income"]
df = df.merge(income_data, on="MSOA", how="left")

In [18]:
df.shape

(33811, 12)

In [19]:
# add green data
voting_data = pd.read_csv("../data/raw/CBP09228_detailed_results_England_elections.csv")
voting_data["pct_green"] = voting_data["Green"] / voting_data["Total"]
voting_data["green_council"] = voting_data["pct_green"] >= 0.1
voting_data = voting_data[["ONS code", "green_council"]].copy()
voting_data.columns = ["LA", "politically_green"]
df = df.merge(voting_data, on="LA", how="left")

In [20]:
df.shape

(33811, 13)

In [21]:
# add employment status
economic_activity = pd.read_csv("../data/raw/economic_activity.csv")
economic_activity = economic_activity[["Area code", "Economically active: \nIn employment \n(including full-time students), \n2021\n(percent)"]]
economic_activity.columns = ["LA", "pct_economically_active"]
df = df.merge(economic_activity, on="LA", how="left")

In [22]:
df.shape

(33811, 14)

In [23]:
# add in building data
households = pd.read_csv("../data/raw/RM202-Household-Size-By-Number-Of-Rooms-2021-lsoa-ONS.csv")

In [24]:
households.rename(columns={"Lower layer Super Output Areas Code": "LSOA"}, inplace=True)

In [26]:
buildings1 = pd.read_csv("../data/raw/CTSOP_3_1_2021.csv")

In [27]:
buildings1

Unnamed: 0,geography,ba_code,ecode,area_name,band,bungalow_1,bungalow_2,bungalow_3,bungalow_4,bungalow_5,...,house_detached_3,house_detached_4,house_detached_5,house_detached_6,house_detached_unkw,house_detached_total,annexe,caravan_houseboat_mobilehome,unknown,all_properties
0,ENGWAL,,K04000001,ENGLAND AND WALES,All,282020,1201560,803420,144070,22830,...,1542460,1919150,405390,130620,21240,4214440,47540,120150,191250,26336420
1,ENGWAL,,K04000001,ENGLAND AND WALES,A,180470,144340,17590,870,80,...,6430,1760,390,300,350,16960,41120,114850,55490,6208070
2,ENGWAL,,K04000001,ENGLAND AND WALES,B,62550,270780,50690,3510,350,...,37800,4510,690,390,410,62790,4090,3990,27680,5171280
3,ENGWAL,,K04000001,ENGLAND AND WALES,C,26520,404230,201400,17000,1970,...,336180,51200,3830,1080,1410,437110,1390,880,29840,5755600
4,ENGWAL,,K04000001,ENGLAND AND WALES,D,8170,246320,263150,29970,3520,...,522920,377270,17870,3210,3090,979590,580,200,24780,4105070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392153,UNMD,,UNMATCHED,UNMATCHED,E,-,20,120,10,-,...,150,230,40,10,-,450,0,0,20,860
392154,UNMD,,UNMATCHED,UNMATCHED,F,0,10,30,20,-,...,60,190,70,10,-,340,0,0,20,550
392155,UNMD,,UNMATCHED,UNMATCHED,G,-,0,10,10,-,...,30,90,90,30,-,240,0,0,10,350
392156,UNMD,,UNMATCHED,UNMATCHED,H,0,0,0,-,-,...,-,10,10,30,-,50,0,0,10,80


In [28]:
buildings2 = pd.read_csv("../data/raw/CTSOP_4_1_2021.csv")

In [30]:
buildings2.columns

Index(['geography', 'ba_code', 'ecode', 'area_name', 'band', 'bp_pre_1900',
       'bp_1900_1918', 'bp_1919_1929', 'bp_1930_1939', 'bp_1945_1954',
       'bp_1955_1964', 'bp_1965_1972', 'bp_1973_1982', 'bp_1983_1992',
       'bp_1993_1999', 'bp_2000_2008', 'bp_2009', 'bp_2010', 'bp_2011',
       'bp_2012', 'bp_2013', 'bp_2014', 'bp_2015', 'bp_2016', 'bp_2017',
       'bp_2018', 'bp_2019', 'bp_2020', 'bp_2021', 'bp_2022_2023', 'bp_unkw',
       'all_properties'],
      dtype='object')

## 2. Analysis

## 3. Modelling