# Process to assign EV

In [1]:
import numpy as np
import tensorflow as tf
import keras
import pandas as pd

In [6]:
from utils import process_from_census_data, TRS

### Load the trained model

In [7]:
# Load the saved Keras model from the working directory; it is used below via model.predict.
model = keras.models.load_model("cnn_20.keras")

### Compute the predicted EV total for each SA1
The model selected here is the one whose predicted total across all SA1s was closest to the actual total of 7819.

In [11]:

# Known total that the rounded per-SA1 predictions are later forced to sum to.
EXPECTED_SUM = 7819

# Options forwarded unchanged to the census loader from utils.
census_options = {
    "geo_lev": "SA1",
    "normalise": True,
    "boxcox": False,
    "keep_same": True,
    "return_tot": True,
}
# The loader returns two objects: the per-SA1 frame and a companion
# (presumably the per-SA1 totals, given return_tot=True -- confirm in utils).
sa1_data, sa1_tot = process_from_census_data(**census_options)

In [12]:
# Predict from the feature columns only. This cell (and a later one) writes
# prediction columns back into sa1_data; excluding them here keeps the cell
# re-runnable, otherwise a second run would feed the model extra columns.
feature_cols = [
    col for col in sa1_data.columns
    if col not in ("pred_num_EV_percen", "pred_num_EV")
]
predicted = model.predict(sa1_data[feature_cols])
# Collapse the model output to one value per row so it can be stored as a column.
predicted = predicted.flatten()
sa1_data["pred_num_EV_percen"] = predicted
sa1_data["pred_num_EV_percen"]



20101100101    0.003368
20101100102    0.003313
20101100105    0.003313
20101100106    0.003377
20101100107    0.003313
                 ...   
21704148034    0.003413
21704148035    0.003313
29797979991    0.014472
29797979992    0.014472
29797979993    0.014472
Name: pred_num_EV_percen, Length: 15481, dtype: float32

In [15]:
# Scale each SA1's predicted share by its total to get an expected EV count.
# NaN products are set to 0 (original note: those SA1s have no one in them).
predicted_num = (sa1_data["pred_num_EV_percen"] * sa1_tot).fillna(0)

# TRS is given the known total; presumably it rounds to whole numbers while
# preserving that total -- confirm against utils.TRS.
predicted_rounded = TRS(predicted_num, know_tot=EXPECTED_SUM)

# Raw sum, rounded sum, and the target they should be compared against.
print(predicted_num.sum(), predicted_rounded.sum(), EXPECTED_SUM)

sa1_data["pred_num_EV"] = predicted_rounded

# Sanity check: no SA1 should end up with a negative count.
print(f"""
Check whether this value has negative:
{True in (sa1_data["pred_num_EV"]<0).unique()}
""")

9899.301355244243 7819 7819

Check whether this value has negative
False



## Combine the per-SA1 EV totals with the per-row EV predictions

### Process data

In [16]:
# Directory holding the per-row EV prediction CSVs read below.
file_loc = "output/EV_pred"
# Geography level; used both in the CSV file name and as the zone column name.
geo_lev = "SA1"

In [33]:
# Read the zone code as text from the start. Parsing it as a number and
# converting afterwards would turn codes into "123.0"-style strings if the
# column were ever read as float (e.g. with a missing value) and would drop
# leading zeros, so the codes would no longer match the keys built from sa1_data.
df_all = pd.read_csv(
    f"{file_loc}/{geo_lev}_EV_pred_all.csv",
    dtype={geo_lev: str},
)

In [68]:
# Lookup from SA1 code (as a string, to match df_all) to its assigned EV count.
dict_num_ev = (
    sa1_data["pred_num_EV"]
    .set_axis(sa1_data.index.astype(str))
    .to_dict()
)

In [69]:
# Every column whose name contains "EV_pred" is treated as one model's score column.
ls_cols_ev_pred = [col_name for col_name in df_all.columns if "EV_pred" in col_name]

In [70]:
# Keep only the score columns plus the zone code, in that order.
combine_fin_df = df_all.loc[:, [*ls_cols_ev_pred, geo_lev]]

## Assigning EV

In [58]:
# Display the score columns together with the zone code.
combine_fin_df

Unnamed: 0,EV_pred_forest,EV_pred_baye,EV_pred_lr,EV_pred_GraBoost,EV_pred_Pearsons,SA1
0,0.003988,0.007249,-0.006780,0.001441,0.128262,20302104732
1,0.002178,-0.002708,-0.001990,0.001441,-0.101160,20302104732
2,0.001904,-0.006483,-0.037701,0.001441,-0.239749,20302104732
3,0.003604,0.003209,0.010351,0.001441,-0.179596,20302104732
4,0.004735,0.001560,-0.013556,0.001441,-0.087372,20302104732
...,...,...,...,...,...,...
1902754,0.001483,-0.000362,0.000023,0.001441,-0.076286,21402159223
1902755,0.001297,0.003203,0.001848,0.001441,-0.042461,21402159223
1902756,0.004118,0.000006,-0.023657,0.001441,0.089287,21402159223
1902757,0.002197,0.003857,0.001653,0.001441,0.265632,21402159223


In [55]:
# Compare the number of SA1s with an EV count against the number of SA1s present in df_all.
# The first number is expected to be higher: some SA1s have no households, so they do not appear in df_all.
print(len(dict_num_ev), len(df_all[geo_lev].unique()))

15481 11964


In [80]:
# For every zone and every score column, flag the top `num_ev` rows (by score)
# as "YES" and the rest as "NO", and record the lowest score that still got "YES".
#
# Outputs:
#   dict_thres[zone][score_col] -> cut-off score (NaN when the zone gets no EV)
#   ls_df                       -> one frame of check_* columns per zone; each keeps
#                                  the original row index so it can be re-aligned later
dict_thres = {}
ls_df = []

# One pass over the frame instead of re-filtering all rows for every zone.
# sort=False keeps zones in order of first appearance.
zone_groups = combine_fin_df.groupby(geo_lev, sort=False)

for zone_pos, (zone, sub_df) in enumerate(zone_groups):
    # Periodic progress only; one line per zone floods the notebook output.
    if zone_pos % 1000 == 0:
        print(f"DOING {zone} ({zone_pos}/{zone_groups.ngroups})")

    n = len(sub_df)
    # Clamp to [0, n]: a zone cannot receive more EVs than it has rows, and an
    # oversized count would otherwise make the flag list longer than the index.
    num_ev = min(max(int(dict_num_ev[zone]), 0), n)

    dict_thres[zone] = {}
    ls_series = []
    for ev_pred in ls_cols_ev_pred:
        # Stable sort so tied scores are broken by original row order, reproducibly.
        sort_df = sub_df.sort_values(ev_pred, ascending=False, kind="stable")

        arr_val = ["YES"] * num_ev + ["NO"] * (n - num_ev)
        seri_ev_count = pd.Series(arr_val, index=sort_df.index, name=f"check_{ev_pred}")
        ls_series.append(seri_ev_count)

        # Cut-off is the score of the last "YES" row. With no EV in the zone there
        # is no such row; indexing at -1 would wrongly report the lowest score.
        if num_ev > 0:
            thres = sort_df[ev_pred].iat[num_ev - 1]
        else:
            thres = np.nan
        dict_thres[zone][ev_pred] = thres

    df_zone = pd.concat(ls_series, axis=1)  # aligned on the original row index
    ls_df.append(df_zone)

In [85]:
# Stack the per-zone flag frames, then put the rows back in index order.
ev_as_df = pd.concat(ls_df).sort_index()
ev_as_df

Unnamed: 0,check_EV_pred_forest,check_EV_pred_baye,check_EV_pred_lr,check_EV_pred_GraBoost,check_EV_pred_Pearsons
0,NO,NO,NO,NO,NO
1,NO,NO,NO,NO,NO
2,NO,NO,NO,NO,NO
3,NO,NO,NO,NO,NO
4,NO,NO,NO,NO,NO
...,...,...,...,...,...
1902754,NO,NO,NO,NO,NO
1902755,NO,NO,NO,NO,NO
1902756,NO,NO,NO,NO,NO
1902757,NO,NO,NO,NO,NO


In [86]:
# Column-wise concat aligns on the row index, so each row's check_* flags
# are attached next to that row's scores and zone code.
final_df_all = pd.concat([combine_fin_df, ev_as_df], axis=1)
final_df_all

Unnamed: 0,EV_pred_forest,EV_pred_baye,EV_pred_lr,EV_pred_GraBoost,EV_pred_Pearsons,SA1,check_EV_pred_forest,check_EV_pred_baye,check_EV_pred_lr,check_EV_pred_GraBoost,check_EV_pred_Pearsons
0,0.003988,0.007249,-0.006780,0.001441,0.128262,20302104732,NO,NO,NO,NO,NO
1,0.002178,-0.002708,-0.001990,0.001441,-0.101160,20302104732,NO,NO,NO,NO,NO
2,0.001904,-0.006483,-0.037701,0.001441,-0.239749,20302104732,NO,NO,NO,NO,NO
3,0.003604,0.003209,0.010351,0.001441,-0.179596,20302104732,NO,NO,NO,NO,NO
4,0.004735,0.001560,-0.013556,0.001441,-0.087372,20302104732,NO,NO,NO,NO,NO
...,...,...,...,...,...,...,...,...,...,...,...
1902754,0.001483,-0.000362,0.000023,0.001441,-0.076286,21402159223,NO,NO,NO,NO,NO
1902755,0.001297,0.003203,0.001848,0.001441,-0.042461,21402159223,NO,NO,NO,NO,NO
1902756,0.004118,0.000006,-0.023657,0.001441,0.089287,21402159223,NO,NO,NO,NO,NO
1902757,0.002197,0.003857,0.001653,0.001441,0.265632,21402159223,NO,NO,NO,NO,NO


In [87]:
# Columns holding the YES/NO assignment flags, one per score column.
check_cols = [col_name for col_name in final_df_all.columns if "check_" in col_name]

# Print the YES/NO tally for each flag column.
for c in check_cols:
    flag_counts = final_df_all[c].value_counts()
    print(flag_counts)

# The YES total is below the expected total: some zones are missing, most likely
# because the population synthesis did not synthesize them.

check_EV_pred_forest
NO     1896708
YES       6051
Name: count, dtype: int64
check_EV_pred_baye
NO     1896708
YES       6051
Name: count, dtype: int64
check_EV_pred_lr
NO     1896708
YES       6051
Name: count, dtype: int64
check_EV_pred_GraBoost
NO     1896708
YES       6051
Name: count, dtype: int64
check_EV_pred_Pearsons
NO     1896708
YES       6051
Name: count, dtype: int64


In [88]:
# Write the scores, zone code and YES/NO flags to CSV in the working directory.
# index=False: the row index is not written to the file.
final_df_all.to_csv("SA1_FIN_EV_assignment.csv", index=False)

# Mapping to get the final plot

In [None]:
import matplotlib.pyplot as plt