# *Personal thermal comfort models using digital twins: Preference prediction with BIM-extracted spatial-temporal proximity data from Build2Vec*

Mahmoud Abdelrahman, Adrian Chong, Clayton Miller 2021

In [38]:
# imports
import pandas as pd
import networkx as nx
from build2vec import Build2Vec
from sklearn import model_selection
from sklearn.model_selection import train_test_split

#Import Random Forest Model
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')




# 1. Build2vec graph embedding processing

In [39]:
all_spatial_data_filename = "./dataset/all_spatial_data.csv"
spatial_dataframe = pd.read_csv(all_spatial_data_filename)
df_nodes = spatial_dataframe[spatial_dataframe["id"].notna()]
df_nodes.head()

Unnamed: 0,_id,_labels,diameter_m,embedding,id,labels,location,n_levels,name,nodeType,space,space_hvac,space_type,type,x,y,_start,_end,_type
0,0.0,:Cell,,"[-0.6188916563987732,-0.37859076261520386,-0.0...",C001952-S0304,,"{""crs"":""cartesian"",""x"":158.038162231,""y"":29.19...",,,,S0304,,,cell,158.038162,29.195639,,,
1,1.0,:Cell,,"[0.342293381690979,-0.12481476366519928,-0.369...",C001953-S0304,,"{""crs"":""cartesian"",""x"":158.038162231,""y"":27.20...",,,,S0304,,,cell,158.038162,27.209225,,,
2,2.0,:Cell,,"[-0.8712330460548401,0.265268474817276,-0.4187...",C001954-S0304,,"{""crs"":""cartesian"",""x"":158.038162231,""y"":25.22...",,,,S0304,,,cell,158.038162,25.222813,,,
3,3.0,:Cell,,"[-0.9969497919082642,0.5201520323753357,0.8882...",C001955-S0304,,"{""crs"":""cartesian"",""x"":158.038162231,""y"":23.23...",,,,S0304,,,cell,158.038162,23.236399,,,
4,4.0,:Cell,,"[0.15037411451339722,-0.5603922605514526,-0.35...",C001956-S0304,,"{""crs"":""cartesian"",""x"":158.038162231,""y"":21.24...",,,,S0304,,,cell,158.038162,21.249987,,,


In [4]:
df_links_graph = pd.read_csv('./dataset/graph.csv')
emb_dimensions = 10
# Create a graph
graph = nx.from_pandas_edgelist(df_links_graph)
# Initiate build2vec
build2vec = Build2Vec(graph, dimensions=emb_dimensions, walk_length=50, num_walks=50, workers=8)
# Embed building data
model = build2vec.fit(window=50, min_count=1, batch_words=10) 

Computing transition probabilities: 100%|██████████| 4877/4877 [01:00<00:00, 80.62it/s]
Generating walks (CPU: 3): 100%|██████████| 6/6 [01:22<00:00, 13.74s/it]
Generating walks (CPU: 4): 100%|██████████| 6/6 [01:23<00:00, 13.92s/it]
Generating walks (CPU: 5): 100%|██████████| 6/6 [01:23<00:00, 13.99s/it]
Generating walks (CPU: 6): 100%|██████████| 6/6 [01:24<00:00, 14.01s/it]
Generating walks (CPU: 7): 100%|██████████| 6/6 [01:23<00:00, 13.95s/it]
Generating walks (CPU: 1): 100%|██████████| 7/7 [01:34<00:00, 13.56s/it]
Generating walks (CPU: 8): 100%|██████████| 6/6 [01:23<00:00, 13.91s/it]
Generating walks (CPU: 2): 100%|██████████| 7/7 [01:35<00:00, 13.60s/it]


In [9]:
#Save and load the embedding_vector
embedding_vector_ids = []
embedding_vector_labels = []
model.wv.save_word2vec_format("./dataset/embedding_vector")
with open("./dataset/embedding_vector", "r") as f:
    embedding_vectors_file = f.read().replace(" ", "\t")
    embedding_vector_ids = [x.split(" ")[0] for x in f.readlines()]
    with open("./dataset/embedding_vector.tsv", 'w') as file_2:
        file_2.write(embedding_vectors_file)
with open("./dataset/embedding_vector", "r") as f3:
    embedding_vector_ids = [x.split(" ")[0] for x in f3.readlines()]
for i in embedding_vector_ids:
    try:
        embedding_vector_labels.append(df_nodes[df_nodes["_id"]==float(i)][["_labels"]].reset_index(drop=True).loc[0]["_labels"])
    except:
        embedding_vector_labels.append("others")

In [10]:
emb_labels_dataframe = pd.DataFrame({"id": embedding_vector_ids, "label":embedding_vector_labels})
emb_labels_dataframe.to_csv("emb_labels.tsv", index=False,sep="\t")
emb_labels_dataframe.head()

Unnamed: 0,id,label
0,4877,:Cell
1,7585,others
2,859,:Space
3,857,:Space
4,7586,others


In [14]:
# example of extracting most similar object : 
# the node label should be in a string format, 
# top-n : the most similar 20 
model.wv.most_similar("4877",topn=20) 

[('348', 0.9992238879203796),
 ('4019', 0.9983894228935242),
 ('4648', 0.9982032179832458),
 ('386', 0.9981722831726074),
 ('376', 0.9979477524757385),
 ('4808', 0.9976978898048401),
 ('4539', 0.9975460767745972),
 ('4901', 0.997485339641571),
 ('3899', 0.997295618057251),
 ('349', 0.99728924036026),
 ('371', 0.9972671866416931),
 ('350', 0.9972372055053711),
 ('4538', 0.9971767067909241),
 ('4298', 0.9971574544906616),
 ('4178', 0.9969540238380432),
 ('4624', 0.996926486492157),
 ('4878', 0.9969204664230347),
 ('4179', 0.9967103600502014),
 ('375', 0.9966771602630615),
 ('391', 0.9966452121734619)]

# 2.Thermal comfort prediction

In [24]:
#1. load the thermal comfort dataframe
tc_dfs = pd.read_csv('./dataset/thermal_comfort_dataframe.csv')
tc_dfs.head()


Unnamed: 0,cell,skin_temp,heart_rate,thermal_vote
0,C000048-S0303,28.5,81.0,11.0
1,C000064-S0303,28.5,81.0,11.0
2,C000267-S0303,28.5,81.0,11.0
3,C000268-S0303,28.5,81.0,11.0
4,C000269-S0303,28.5,81.0,11.0


In [25]:
#2. Adding the embedding vector to each thermal comfort cell 
tc_dfs["cell_id"] = tc_dfs["cell"].apply(lambda x :df_nodes[df_nodes["id"]==x].reset_index(drop=True).loc[0]["_id"])
tc_dfs["embedding_vector"] = tc_dfs["cell_id"].apply(lambda x : model.wv.get_vector(str(int(x))))
tc_dfs.head()


Unnamed: 0,cell,skin_temp,heart_rate,thermal_vote,cell_id,embedding_vector
0,C000048-S0303,28.5,81.0,11.0,2886.0,"[-0.3900575, 0.19745928, -0.3005991, -0.621634..."
1,C000064-S0303,28.5,81.0,11.0,2902.0,"[-0.39784405, 0.16728921, -0.26784313, -0.5639..."
2,C000267-S0303,28.5,81.0,11.0,3105.0,"[-0.26166543, 0.15754583, -0.16189277, -0.5910..."
3,C000268-S0303,28.5,81.0,11.0,3106.0,"[-0.26133, 0.14430004, -0.1788513, -0.62128407..."
4,C000269-S0303,28.5,81.0,11.0,3107.0,"[-0.28050834, 0.12316196, -0.17429301, -0.5691..."


In [26]:
#3. genertae dataframe from the embedding vector of each cell
emb_df = pd.DataFrame(tc_dfs['embedding_vector'].to_list(), columns=["d"+str(i) for i in range(emb_dimensions)])
emb_df.head()


Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9
0,-0.390058,0.197459,-0.300599,-0.621634,-0.310233,0.318528,0.169181,-0.614044,-0.138116,0.715371
1,-0.397844,0.167289,-0.267843,-0.563916,-0.361983,0.377299,0.142795,-0.656179,-0.062375,0.698136
2,-0.261665,0.157546,-0.161893,-0.591034,-0.318658,0.511619,0.358967,-0.689103,0.006741,0.730071
3,-0.26133,0.1443,-0.178851,-0.621284,-0.377991,0.439737,0.244312,-0.677054,0.087656,0.765027
4,-0.280508,0.123162,-0.174293,-0.569121,-0.387678,0.485195,0.286489,-0.654261,-0.010063,0.767149


In [28]:
# 4. Preparing the final dataset 
#   . only the sking_temp and heart_rate + the embedding vector are used
dataset = tc_dfs
dataset[["d"+str(i) for i in range(emb_dimensions)]] = pd.DataFrame(dataset.embedding_vector.tolist(), index= dataset.index)


dataset = dataset[['skin_temp', 'heart_rate']+["d"+str(i) for i in range(emb_dimensions)]+['thermal_vote']][:900].sample(frac=1.0)
print(dataset["thermal_vote"].value_counts())
dataset.head()

10.0    486
11.0    371
9.0      43
Name: thermal_vote, dtype: int64


Unnamed: 0,skin_temp,heart_rate,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,thermal_vote
650,30.125,90.0,-0.591306,0.138198,-0.238531,-0.495438,-0.314748,0.244555,0.055083,-0.561186,-0.05092,0.69771,10.0
702,28.75,79.0,-0.77425,-0.254935,-0.020091,0.074677,-0.082758,-0.206936,0.400616,-0.610259,-0.091381,0.174616,10.0
724,32.25,66.0,-0.280839,-0.101759,-0.359239,-0.002332,-0.52333,0.492943,0.320687,-0.836349,0.000966,0.657948,10.0
750,31.75,86.0,-0.591102,0.223223,-0.316688,-0.653245,-0.351054,-0.038464,-0.117624,-0.519971,-0.330923,0.749631,11.0
866,30.125,85.0,-0.496121,0.346574,0.121872,0.021669,-0.057709,-0.375087,0.223664,-0.831292,-0.254106,0.759715,10.0


In [36]:
# 5. Feature/targets 
X=dataset[['skin_temp', 'heart_rate']+["d"+str(i) for i in range(emb_dimensions)]]  # Features
y=dataset[['thermal_vote']]  # Labels

print(X.head(2))
print(y.head(2))

     skin_temp  heart_rate        d0        d1        d2        d3        d4  \
650     30.125        90.0 -0.591306  0.138198 -0.238531 -0.495438 -0.314748   
702     28.750        79.0 -0.774250 -0.254935 -0.020091  0.074677 -0.082758   

           d5        d6        d7        d8        d9  
650  0.244555  0.055083 -0.561186 -0.050920  0.697710  
702 -0.206936  0.400616 -0.610259 -0.091381  0.174616  
     thermal_vote
650          10.0
702          10.0


In [40]:
# 6. Run 30 K-fold cross validation
kfold = model_selection.KFold(n_splits=30, random_state=3, shuffle=True)
rfcls=RandomForestClassifier(n_estimators=200, max_depth=220)
scoring = 'accuracy'
results = model_selection.cross_val_score(rfcls, X, y, cv=kfold, scoring=scoring)
for i in results:
    print(i)
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))

0.9
0.9333333333333333
0.9666666666666667
1.0
0.8333333333333334
0.9666666666666667
0.9
0.9333333333333333
0.8
0.9666666666666667
0.8666666666666667
0.8666666666666667
0.9333333333333333
1.0
0.9333333333333333
0.9333333333333333
0.9666666666666667
0.9
1.0
0.9333333333333333
0.9666666666666667
0.9333333333333333
0.9333333333333333
0.9333333333333333
0.8666666666666667
0.9
0.9
0.9333333333333333
0.9333333333333333
0.9
Accuracy: 0.924 (0.046)
