# Creating input files for hctsa using metadata

In [11]:
import pandas as pd
import seaborn as sns
import numpy as np
import os


In [12]:
%matplotlib inline

In [13]:
# Root of the local checkout of the temporal-features-for-buildings repository.
# Every data file read below is resolved relative to this directory.
# The TEMPORAL_FEATURES_REPO environment variable overrides the location so the
# notebook can run on another machine; when it is unset the original path is used.
repos_path = os.environ.get("TEMPORAL_FEATURES_REPO", "/Users/Clayton/temporal-features-for-buildings/")

In [14]:
# Building metadata keyed by `uid`; `datastart` and `dataend` are parsed as
# day-first dates.
meta_csv = os.path.join(repos_path, "data/raw/meta_open_withperformanceclasses.csv")
meta = pd.read_csv(
    meta_csv,
    index_col='uid',
    parse_dates=["datastart", "dataend"],
    dayfirst=True
)

In [15]:
# Time-indexed table read from temp_open_utc_complete.csv. The `timestamp`
# column is parsed into the index, which is then localized to UTC.
temp_csv = os.path.join(repos_path, "data/interim/temp_open_utc_complete.csv")
temp = pd.read_csv(temp_csv, index_col='timestamp', parse_dates=True)
temp = temp.tz_localize('utc')

In [16]:
# Preview the first five metadata rows.
meta.head(n=5)

Unnamed: 0_level_0,dataend,datastart,energystarscore,heatingtype,industry,mainheatingtype,numberoffloors,occupants,primaryspaceusage,rating,...,sqm,subindustry,timezone,yearbuilt,nickname,primaryspaceuse_abbrev,newweatherfilename,dailymeancons,usagecategory,operationsgroup
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PrimClass_Everett,2012-12-31 23:00:00,2012-01-01,,,Education,,,,Primary/Secondary Classroom,,...,9804.05359,Primary/Secondary School,America/New_York,,Everett,PrimClass,weather12.csv,0.005935,Low,Misc
UnivClass_Clifford,2015-12-31 23:00:00,2015-01-01,,,Education,,,,College Classroom,,...,5292.591007,College/University,America/New_York,1967.0,Clifford,UnivClass,weather2.csv,0.006464,Low,Group3
Office_Elizabeth,2012-12-31 23:00:00,2012-01-01,,,Commercial Property,,,,Office,,...,27373.96185,Commercial Real Estate,America/Los_Angeles,,Elizabeth,Office,weather22.csv,0.007931,Low,Misc
Office_Ellie,2012-12-31 23:00:00,2012-01-01,,,Commercial Property,,,,Office,,...,46127.91885,Bank/Financial Services,America/Los_Angeles,,Ellie,Office,weather28.csv,0.008237,Low,Misc
PrimClass_Elisabeth,2012-12-31 23:00:00,2012-01-01,,,Education,,,,Primary/Secondary Classroom,,...,21652.15899,Primary/Secondary School,America/New_York,,Elisabeth,PrimClass,weather23.csv,0.008597,Low,Misc


# First we make the time series input files

In [17]:
# One comma-separated keyword string per building:
# "<space-use abbreviation>,<usage category>,<operations group>".
# Adding differently named Series leaves the result unnamed, so the single
# column of the resulting frame is labelled 0.
keywords = meta["primaryspaceuse_abbrev"] + "," + meta["usagecategory"] + "," + meta["operationsgroup"]
meta_tooutput = pd.DataFrame(keywords)

In [18]:
# Preview the keyword strings for the first five buildings.
meta_tooutput.head(n=5)

Unnamed: 0_level_0,0
uid,Unnamed: 1_level_1
PrimClass_Everett,"PrimClass,Low,Misc"
UnivClass_Clifford,"UnivClass,Low,Group3"
Office_Elizabeth,"Office,Low,Misc"
Office_Ellie,"Office,Low,Misc"
PrimClass_Elisabeth,"PrimClass,Low,Misc"


In [19]:
# Name of the per-building data file: the building uid plus a ".dat" suffix.
dat_names = meta_tooutput.index + ".dat"
meta_tooutput['outputfile'] = dat_names

In [20]:
# Key the rows by the .dat file name; the `outputfile` column itself is kept.
meta_tooutput = meta_tooutput.set_index('outputfile', drop=False)

In [21]:
# Preview the re-indexed table (index is now the .dat file name).
meta_tooutput.head(n=5)

Unnamed: 0_level_0,0,outputfile
outputfile,Unnamed: 1_level_1,Unnamed: 2_level_1
PrimClass_Everett.dat,"PrimClass,Low,Misc",PrimClass_Everett.dat
UnivClass_Clifford.dat,"UnivClass,Low,Group3",UnivClass_Clifford.dat
Office_Elizabeth.dat,"Office,Low,Misc",Office_Elizabeth.dat
Office_Ellie.dat,"Office,Low,Misc",Office_Ellie.dat
PrimClass_Elisabeth.dat,"PrimClass,Low,Misc",PrimClass_Elisabeth.dat


In [22]:
# Keep only the keyword column (label 0) as a Series indexed by file name.
# This rebinds `meta_tooutput` from a DataFrame to a Series.
meta_tooutput = meta_tooutput.loc[:, 0]

In [23]:
#meta_tooutput

In [24]:
# Write one line per building: the index (the .dat file name) and the keyword
# string, separated by a single space, with no header row.
inp_timeseries_path = "/Users/Clayton/hctsa/TimeSeries/INP_buildinggenome.txt"
meta_tooutput.to_csv(inp_timeseries_path, header=False, sep=" ")

Make the per-building time series (`.dat`) files from `temp` (the export loop below is currently commented out)

In [25]:
# Plain Python list of every building uid, in metadata order.
buildinglist = meta.index.tolist()

In [26]:
# for buildingname in buildinglist:
#     print buildingname
#     temp[buildingname].dropna().to_csv("/Users/Clayton/hctsa/TimeSeries/"+buildingname+".dat",header=False, index=False)

# Make files that are similar to the output files from hctsa, so that the same visualizations can be created

In [52]:
# Per-feature labels, keyed by `feature_name`.
labelled_features_csv = os.path.join(repos_path, "data/processed/ALLFEATURES_meta_labelled.CSV")
features = pd.read_csv(labelled_features_csv, index_col="feature_name")

In [53]:
# Summarize the label table: row count, columns, dtypes and non-null counts.
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 315 entries, BG_all_meanvs95_max to BG_consumpstats_t90t
Data columns (total 4 columns):
feature_type                315 non-null object
usetype_applicable          315 non-null object
constype_applicable         315 non-null object
operationtype_applicable    315 non-null object
dtypes: object(4)
memory usage: 12.3+ KB


In [54]:
# Show the feature names. The columns of the transposed frame are simply the
# frame's own index, so the index is displayed directly.
features.index

Index([u'BG_all_meanvs95_max', u'BG_all_meanvs95_mean', u'BG_all_meanvs95_min',
       u'BG_all_meanvs95_std', u'BG_all_meanvsmax_max',
       u'BG_all_meanvsmax_mean', u'BG_all_meanvsmax_min',
       u'BG_all_meanvsmax_std', u'BG_all_minvs95_max', u'BG_all_minvs95_mean',
       ...
       u'BG_consumpstats_kwtotalMar', u'BG_consumpstats_kwtotalMay',
       u'BG_consumpstats_kwtotalNov', u'BG_consumpstats_kwtotalOct',
       u'BG_consumpstats_kwtotalSep', u'BG_consumpstats_kwvarsummer',
       u'BG_consumpstats_kwvarwinter', u'BG_consumpstats_maxhrdate',
       u'BG_consumpstats_t10t', u'BG_consumpstats_t90t'],
      dtype='object', name=u'feature_name', length=315)

In [55]:
# Start the operations table as a single `feature_name` column built from the
# feature index (the transpose's columns are that same index).
operations = pd.DataFrame(features.index)

In [56]:
# Preview the first five operation names.
operations.head(n=5)

Unnamed: 0,feature_name
0,BG_all_meanvs95_max
1,BG_all_meanvs95_mean
2,BG_all_meanvs95_min
3,BG_all_meanvs95_std
4,BG_all_meanvsmax_max


In [57]:
# Same summary of the label table as shown earlier; `features` has not been
# modified in between.
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 315 entries, BG_all_meanvs95_max to BG_consumpstats_t90t
Data columns (total 4 columns):
feature_type                315 non-null object
usetype_applicable          315 non-null object
constype_applicable         315 non-null object
operationtype_applicable    315 non-null object
dtypes: object(4)
memory usage: 12.3+ KB


In [58]:
# Share the feature-name index with `features` so the keyword strings built
# below line up row by row when assigned.
operations.index = features.index

# Join the four label columns into one comma-separated keyword string.
keyword_columns = ["feature_type", "usetype_applicable", "constype_applicable", "operationtype_applicable"]
keywords = features[keyword_columns[0]]
for column in keyword_columns[1:]:
    keywords = keywords + "," + features[column]
operations['keywords'] = keywords

In [59]:
# Check that both `feature_name` and `keywords` are fully populated.
operations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 315 entries, BG_all_meanvs95_max to BG_consumpstats_t90t
Data columns (total 2 columns):
feature_name    315 non-null object
keywords        315 non-null object
dtypes: object(2)
memory usage: 7.4+ KB


In [60]:
# operations = pd.DataFrame(operations.str.replace('[^\w\s]',''))

In [64]:
#features.feature_type+","+features.usetype_applicable+","+features.constype_applicable+","+features.operationtype_applicable

In [67]:
# The index is still named "feature_name", which collides with the existing
# `feature_name` column, so it is renamed before being moved into a column.
# NOTE(review): the index was assigned from `features.index`, so this in-place
# rename may also rename the index of `features` — confirm if that matters.
operations.index.name = "index"
# Move the feature names into a regular "index" column; rows get a default
# integer index. Columns are now: index, feature_name, keywords.
operations = operations.reset_index()

In [68]:
# Preview the three-column operations table (index, feature_name, keywords).
operations.head(n=5)

Unnamed: 0,index,feature_name,keywords
0,BG_all_meanvs95_max,BG_all_meanvs95_max,"dailyratios,usetype,constype,operationtype"
1,BG_all_meanvs95_mean,BG_all_meanvs95_mean,"dailyratios,usetype,constype,operationtype"
2,BG_all_meanvs95_min,BG_all_meanvs95_min,"dailyratios,usetype,constype,operationtype"
3,BG_all_meanvs95_std,BG_all_meanvs95_std,"dailyratios,usetype,constype,operationtype"
4,BG_all_meanvsmax_max,BG_all_meanvsmax_max,"dailyratios,usetype,constype,operationtype"


In [69]:
# Export the operations table as space-separated columns, without a header row
# and without the integer row labels.
ops_path = "/Users/Clayton/hctsa/Database/INP_allbg_ops.txt"
operations.to_csv(ops_path, sep=" ", index=False, header=False)

In [70]:
# Name the integer row index "features". The master-operations cells below
# rely on this name: their first reset_index() turns it into a "features"
# column, which is later dropped by name.
operations.index.name = "features"

In [71]:
# Start the master-operations table from the feature names alone; the row
# index (and its name) is carried over from `operations`.
masteroperations = operations["feature_name"].to_frame()

In [72]:
# Two resets in a row: the first moves the named row index into a column,
# the second moves the fresh, unnamed integer index into a column called
# "index". That "index" column holds each row's position.
masteroperations = masteroperations.reset_index().reset_index()

In [73]:
def _master_code(position):
    """Return the master-operation call string for the given row position."""
    return "AllBGOperations(y," + str(position) + ")"


# Column 0 holds the call string built from each row's position.
masteroperations[0] = masteroperations["index"].apply(_master_code)

In [74]:
#masteroperations

In [75]:
# The two position columns were only needed to build the call strings.
helper_columns = ['features', 'index']
masteroperations = masteroperations.drop(helper_columns, axis=1)

In [76]:
# Put the call string first and the feature name second.
masteroperations = masteroperations.loc[:, [0, 'feature_name']]

In [77]:
# Preview the first five master operations.
masteroperations.head(n=5)

Unnamed: 0,0,feature_name
0,"AllBGOperations(y,0)",BG_all_meanvs95_max
1,"AllBGOperations(y,1)",BG_all_meanvs95_mean
2,"AllBGOperations(y,2)",BG_all_meanvs95_min
3,"AllBGOperations(y,3)",BG_all_meanvs95_std
4,"AllBGOperations(y,4)",BG_all_meanvsmax_max


In [78]:
# Export the master operations as "<call string> <feature name>" lines,
# space-separated, without a header row or row labels.
mops_path = "/Users/Clayton/hctsa/Database/INP_allbg_mops.txt"
masteroperations.to_csv(mops_path, sep=" ", index=False, header=False)

# Create a dummy `TS_Quality` matrix

In [144]:
# All-zero placeholder matrix. Its 507 x 315 shape matches the feature matrix
# reported by `features_data.info()` further down (507 rows, 315 columns).
n_rows, n_cols = 507, 315
TS_Quality = pd.DataFrame(0, index=np.arange(n_rows), columns=np.arange(n_cols))

In [157]:
# Preview the first five rows of the placeholder matrix.
TS_Quality.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,305,306,307,308,309,310,311,312,313,314
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [146]:
# Export the placeholder matrix as bare comma-separated values, with no header
# row and no row labels.
quality_path = "/Users/Clayton/hctsa/TS_QualityDummy.txt"
TS_Quality.to_csv(quality_path, index=False, header=False)

# Add Features Matrix to hctsa home folder

In [149]:
# Numeric feature matrix. The first, unnamed CSV column becomes the row index.
features_csv = os.path.join(repos_path, "data/processed/ALLFEATURES.CSV")
features_data = pd.read_csv(features_csv, index_col='Unnamed: 0')

In [152]:
# Summarize the feature matrix: row and column counts and dtypes.
features_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 507 entries, Office_Abbey to UnivLab_Tracy
Columns: 315 entries, BG_all_meanvs95_max to BG_weekend_minvsmax_std
dtypes: float64(315)
memory usage: 1.2+ MB


In [153]:
# Preview the first five rows of the feature matrix.
features_data.head(n=5)

Unnamed: 0,BG_all_meanvs95_max,BG_all_meanvs95_mean,BG_all_meanvs95_min,BG_all_meanvs95_std,BG_all_meanvsmax_max,BG_all_meanvsmax_mean,BG_all_meanvsmax_min,BG_all_meanvsmax_std,BG_all_minvs95_max,BG_all_minvs95_mean,...,BG_weekend_meanvsmax_min,BG_weekend_meanvsmax_std,BG_weekend_minvs95_max,BG_weekend_minvs95_mean,BG_weekend_minvs95_min,BG_weekend_minvs95_std,BG_weekend_minvsmax_max,BG_weekend_minvsmax_mean,BG_weekend_minvsmax_min,BG_weekend_minvsmax_std
Office_Abbey,0.991726,0.830303,0.553791,0.087306,0.989658,0.813128,0.549331,0.083906,0.981362,0.467952,...,0.657953,0.074334,0.844232,0.548659,0.346082,0.112044,0.819727,0.529841,0.341704,0.107213
Office_Abigail,0.980135,0.681228,0.416167,0.110767,0.975904,0.633933,0.352372,0.114302,0.949884,0.463794,...,0.371951,0.109452,0.949884,0.576688,0.298951,0.171996,0.945783,0.538858,0.278862,0.171697
Office_Al,0.976661,0.736467,0.504575,0.098615,0.953896,0.719658,0.497427,0.085458,0.899811,0.51581,...,0.633915,0.068949,0.899811,0.682016,0.503914,0.100446,0.872625,0.653404,0.486708,0.103666
Office_Alannah,0.967693,0.67626,0.399966,0.109274,0.956716,0.633928,0.3242,0.119918,0.783483,0.364965,...,0.40311,0.125101,0.783483,0.369298,0.116253,0.117151,0.780458,0.353199,0.107352,0.113517
Office_Aliyah,0.99113,0.905865,0.813399,0.042771,0.989721,0.899784,0.807132,0.042884,0.964982,0.780943,...,0.905349,0.013881,0.914196,0.872756,0.738899,0.025416,0.908976,0.867061,0.737593,0.025648


In [154]:
# The matrix is written without header or row labels, so rows and columns are
# identified by position only. ALLFEATURES.CSV is stored in a different order
# from the label files written earlier in this notebook:
#   - rows:    the file starts at Office_Abbey, whereas INP_buildinggenome.txt
#              follows `meta` order (PrimClass_Everett first);
#   - columns: the file ends at BG_weekend_minvsmax_std, whereas the
#              INP_allbg_ops/mops files follow `features` order (ending at
#              BG_consumpstats_t90t).
# The matrix is therefore realigned to `meta` / `features` order before export.
# NOTE(review): assumes hctsa matches this matrix to the input files purely by
# position — confirm against the hctsa import step.
row_order = meta.index
col_order = features.index

# Fail loudly rather than export a matrix with missing rows or columns.
missing_rows = row_order.difference(features_data.index)
missing_cols = col_order.difference(features_data.columns)
if len(missing_rows) > 0 or len(missing_cols) > 0:
    raise ValueError(
        "features_data is missing %d building(s) and %d feature(s) listed in the input files"
        % (len(missing_rows), len(missing_cols))
    )

features_data.loc[row_order, col_order].to_csv("/Users/Clayton/hctsa/ALLFEATURES.txt", header=False, index=False)

In [156]:
# Show the column (feature) order of the matrix as stored in ALLFEATURES.CSV.
features_data.columns

Index([u'BG_all_meanvs95_max', u'BG_all_meanvs95_mean', u'BG_all_meanvs95_min',
       u'BG_all_meanvs95_std', u'BG_all_meanvsmax_max',
       u'BG_all_meanvsmax_mean', u'BG_all_meanvsmax_min',
       u'BG_all_meanvsmax_std', u'BG_all_minvs95_max', u'BG_all_minvs95_mean',
       ...
       u'BG_weekend_meanvsmax_min', u'BG_weekend_meanvsmax_std',
       u'BG_weekend_minvs95_max', u'BG_weekend_minvs95_mean',
       u'BG_weekend_minvs95_min', u'BG_weekend_minvs95_std',
       u'BG_weekend_minvsmax_max', u'BG_weekend_minvsmax_mean',
       u'BG_weekend_minvsmax_min', u'BG_weekend_minvsmax_std'],
      dtype='object', length=315)

In [158]:
# Show only the first ten master operations; a bare `masteroperations` would
# render the whole table into the notebook output.
masteroperations.head(10)

Unnamed: 0,0,feature_name
0,"AllBGOperations(y,0)",BG_all_meanvs95_max
1,"AllBGOperations(y,1)",BG_all_meanvs95_mean
2,"AllBGOperations(y,2)",BG_all_meanvs95_min
3,"AllBGOperations(y,3)",BG_all_meanvs95_std
4,"AllBGOperations(y,4)",BG_all_meanvsmax_max
5,"AllBGOperations(y,5)",BG_all_meanvsmax_mean
6,"AllBGOperations(y,6)",BG_all_meanvsmax_min
7,"AllBGOperations(y,7)",BG_all_meanvsmax_std
8,"AllBGOperations(y,8)",BG_all_minvs95_max
9,"AllBGOperations(y,9)",BG_all_minvs95_mean
