# Concatenating data and creating subsets

### Step 04
### Create Dataset

Here we create our complete dataset as well as the subsets.

Every CSV file in the folder of cleaned data is added. To exclude unwanted data, remove the respective files from that folder or change their file format so that they are no longer picked up as CSV.

In [None]:
import glob
import pandas as pd
import numpy as np
import os

# Global variables
debug = 0  # for all (1) OR condensed output (0)

###### INPUTS
path = 'S:/Andreas/FH/Technikum/BA/'  #including slash at the end!
get = '40_Prep/'
put = '50_Datasets/'
#path = r'C:/Users/andre/Data'
###### ######

# Get data file names.
# glob returns matches in arbitrary order; sorting keeps the column order of the
# concatenated frame reproducible (later cells address columns by position).
temp=path+get
all_files = sorted(glob.glob(os.path.join(temp, "*.csv")))
if not all_files:
    # Fail early with the offending folder instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f'No CSV files found in "{temp}"')

if debug:
    print(all_files)

print(f'> Fetching data...')
# Concatenate all files side by side (axis=1) into one DataFrame
all_data = pd.concat((pd.read_csv(file) for file in all_files), ignore_index=False, axis=1)
print(f'...done')
if debug:
    print(all_data.tail())
    print(all_data.dtypes)

print(f'\n> Deleting unnecessary columns...')
# Dropping all duplicate columns (e.g. UTC); the first occurrence is kept
all_data = all_data.loc[:, ~all_data.columns.duplicated()]
# errors='ignore': like the other OA_* columns below, OA_station may be absent
all_data = all_data.drop('OA_station', axis=1, errors='ignore')
print(f'...done')

print(f'\n> Converting types and creating new columns...')
# Convert all datetimes which are imported as object into datetime64(ns)
all_data['UTC'] = pd.to_datetime(all_data['UTC'])

# Make sure these columns are seen as numbers
for col in ('OA_DD', 'OA_RF'):
    if col in all_data.columns:
        all_data[col] = all_data[col].astype(float)
if 'OA_FFAM' in all_data.columns:
    # Cast first so the scaling also works if the column was read as text
    all_data['OA_FFAM'] = all_data['OA_FFAM'].astype(float) * 3.6  # convert to km/h

# Create new columns (differences of each room temperature to HF_T);
# the order of this tuple defines the order of the appended columns
for room in ('CR', 'CC', 'KE', 'KW', 'SR'):
    all_data[room + '-HF'] = all_data[room + '_T'] - all_data['HF_T']

print(f'...done')

if debug:
    print(all_data.tail())
all_data.info()

### (optional) Basic statistics on all float data

In [1]:
# Using describe-method:

#example:

#percentile list
#perc = [.20, .40, .60, .80]
#dataframe.describe(perc, include, exclude, datetime_is_numeric)

# List of dtypes to include
include = ['float'] #['object', 'float', 'int']

print(f'> Describe all data columns with type(s): '+str(include))
# Format every statistic with two decimals for a compact table
print(all_data.describe(include=include).map(lambda x: f"{x:0.2f}"))  


# Not possible with Date as datetime:
'''
#stats = all_data.describe()
stats.loc['var'] = all_data.var().tolist()
stats.loc['skew'] = all_data.skew().tolist()
stats.loc['kurt'] = all_data.kurtosis().tolist()
print(stats)
#or
skewness = all_data.skew()
kurtosis = all_data.kurtosis()
skewness_df = pd.DataFrame({'skewness':skewness}).T
kurtosis_df = pd.DataFrame({'kurtosis':kurtosis}).T
print(skewness_df)
'''

# Alternatives to describe:
from summarytools import dfSummary
# Only the last expression of a cell is rendered automatically, so the summary
# has to be displayed explicitly or its result is silently discarded.
display(dfSummary(all_data))

# Not possible with timestamps
'''
from scipy.stats import describe
describe(all_data, axis=0)
'''

from ydata_profiling import ProfileReport
profile = ProfileReport(all_data, title="Profiling Report")
display(profile)

> Describe all data columns with type(s): ['float']


NameError: name 'all_data' is not defined

### (optional) Basic statistics on cross-check of weather data

In [18]:
# Format floating point values
#pd.set_option('display.float_format', lambda x: '%.2f' % x)

# NOTE: dfSummary is imported in the optional statistics cell on all float data.
# Each summary is passed to display() explicitly: a bare expression in the
# middle of a cell is not rendered, only the last one would be.

# Run report on temperature 
print(all_data[['OA_TL', 'OG_T']].describe().map(lambda x: f"{x:0.2f}"))
display(dfSummary(all_data[['OA_TL', 'OG_T']]))
print('\n')

# Run report on wind (speed)
print(all_data[['OA_FFAM', 'OG_W']].describe().map(lambda x: f"{x:0.2f}"))
display(dfSummary(all_data[['OA_FFAM', 'OG_W']]))

# Check also sunshine vs. brightness?
#print(all_data[['OA_SO', 'OG_B']].describe().applymap(lambda x: f"{x:0.2f}")) 

           OA_TL       OG_T
count  359580.00  359580.00
mean       11.72      11.53
std         8.92       9.22
min       -16.10     -16.40
25%         4.60       4.10
50%        11.30      11.13
75%        18.70      18.33
max        38.30      38.40


         OA_FFAM       OG_W
count  359580.00  359580.00
mean       11.62      14.94
std         7.91      11.60
min         0.00       0.00
25%         5.40       5.52
50%         9.72      12.06
75%        16.20      22.03
max        70.92     107.30


No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,OA_FFAM [float64],Mean (sd) : 11.6 (7.9) min < med < max: 0.0 < 9.7 < 70.9 IQR (CV) : 10.8 (1.5),178 distinct values,,0 (0.0%)
2,OG_W [float64],Mean (sd) : 14.9 (11.6) min < med < max: 0.0 < 12.1 < 107.3 IQR (CV) : 16.5 (1.3),"43,268 distinct values",,0 (0.0%)


#### Excursus:
**From personal experience**, the room `CR (Corridor - reading corner)` is probably the room where the fewest temperature changes caused by malfunctioning seals occur (as there is no window to open). An installed controlled living room ventilation system running 365 days/year makes manual ventilation unnecessary, and in practice windows are opened once a year for cleaning.
This room (and its window) is well sheltered from the wind (predominantly blowing from the North-West) due to its location towards the East. That's why the difference between this room and `HF (Hall - front door)` - the front door of the building, which is the most used part of the house and the one most exposed to wind and outside temperatures - is used for analysis.

Alternatives - as there are windows to open but quite far away from the sensor:
> `KE (Kids East)` \
> `KW (Kids West)` \
> `SR (Sleeping room)` \
> `CC (Corridor - closet)` might be too near to bathroom, where temperature levels may be strongly intertwined with usage.

### Step 05
### Feature engineering/encoding

#### #01 - Finding Coldest months for subset "cold season"
In order to find subsets of the total dataset with similar features, the data shall be clustered/binned into categories. For a time series analysis of weather data - both expected and, over the long run, realized - timestamps are a useful basis for categorisation, as seasonal patterns should emerge.  

A potential problem with the seals is easier to find when the temperature difference between indoors and outdoors is highest. To filter the data accordingly, a category "month" is created and the mean outside temperature is calculated for each of the 12 months. The three coldest months of the year were then chosen for this subset. To validate this categorisation, a clustering by coldest days was also performed. An average daily outside temperature below 5°C (with a few readings slightly above this value) was seen after day 326 and before day 67 (of the year), confirming that this timespan selection is useful.

In [17]:
# Check if full dataset
t = range(all_data.index.size)
t 

###### outside temperature
###### INPUTS
c = 32      #32 = oa_tl, 38 = og_t
###### ######

cn = all_data.iloc[:, c].name
# Updating dataframe to have only one column as all other columns are of no use for us at the moment 
# using .to_frame() to convert pandas series into dataframe.
#all_data[cn] = pd.to_numeric(all_data[cn])
df_short = all_data[cn].to_frame()

# Set date column (first column of all_data) as index
df_short = df_short.set_index(pd.DatetimeIndex(pd.to_datetime(all_data.iloc[:, 0])))
df_short.sort_index(inplace=True)

#'''
# Real copy for the validation below (plain assignment would alias df_short, so 'Mth' would leak into it)
df_short_day = df_short.copy()

# Create mean temperature per month (building a category on the index)
df_short['Mth'] = df_short.index.month.astype('category')
print(df_short.groupby('Mth', sort=False, observed=False).mean())

'''
# Validate Mth with check of days below approx. 5 degrees ("toggle" multiline comment in line 20 and 28)
df_short_day['Day'] = df_short_day.index.dayofyear.astype('category')
#pd.set_option('display.max_rows', df_short_day.shape[0]+1)
print(df_short_day.groupby('Day', sort=False, observed=False).mean().head(365))
'''
# Add feature to dataframe
###### INPUTS - Based on Datetime only
mth_start = 12
mth_end = 2
###### ######
# CS = 1 for rows whose month is >= mth_start or <= mth_end (the range wraps over the turn of the year), else 0
all_data['CS'] = np.where((all_data['UTC'].dt.month >= mth_start) | (all_data['UTC'].dt.month <= mth_end), 1, 0)
all_data['CS'] = all_data['CS'].astype(int)

         OA_TL
Mth           
11    6.264059
12    2.522248
1     1.352080
2     3.606060
3     6.694790
4    10.917057
5    15.198115
6    21.077391
7    22.007012
8    21.909604
9    16.769162
10   11.892156


#### #02 - Finding the windiest months
The same methodology used for finding the coldest months was also applied to wind. The hypothesis is that stronger wind puts more pressure on the front door, increasing potential air leakage. 

Average wind speeds are highest between December and April in the used data set. But MoM (month on month) differences of average wind speed and wind direction do not change very much. Additionally this subset filtered on wind contains the coldest months, too. That's why further use of this potential subset was dismissed.

In [34]:
# Sanity check: how many rows does the full dataset hold?
t = range(all_data.index.size)
t

# Wind
###### INPUTS
c = 40     #OG_W = 40; FFAM (m/s) = 24; DD = 22
###### ######

# Translate the positional index into the column label
cn = all_data.columns[c]
#all_data[cn] = pd.to_numeric(all_data[cn])

# Timestamps come from the first column of the full dataset
timestamps = pd.DatetimeIndex(pd.to_datetime(all_data.iloc[:, 0]))

# Keep only the selected column (as a one-column DataFrame), index it by
# timestamp and order it chronologically
df_short = (
    all_data[cn]
    .to_frame()
    .set_index(timestamps)
    .sort_index()
)

# Month of each timestamp as a categorical grouping key
df_short['Mth'] = df_short.index.month.astype('category')
monthly_mean = df_short.groupby('Mth', sort=False, observed=False).mean()
print(monthly_mean)

          OG_W
Mth           
11   14.913922
12   16.834112
1    16.709767
2    17.800649
3    16.206456
4    17.045428
5    15.095884
6    13.390035
7    12.813406
8    11.898219
9    12.215368
10   14.231702


#### #03 - (optional) Creating additional columns
Not a strictly necessary step, but potentially useful for future in-depth analyses.

In [18]:
print(f'> Creating additional columns...')
# Each entry: (new column, minuend column, subtrahend column).
# The tuple order defines the order in which the columns are appended.
for new_col, minuend, subtrahend in (
    ('OG-OA', 'OG_T', 'OA_TL'),
    ('OG-OA_W', 'OG_W', 'OA_FFAM'),
):
    all_data[new_col] = all_data[minuend] - all_data[subtrahend]
print(f'...done')

> Creating additional columns...
...done


#### #04 - Difference normalization and adding column
Observed temperature differences of specific locations in the house serve as an indicator for airtightness. To find a trend more easily (at a later stage), the mean difference over the first year of observations shall serve as the "neutral level" to start from. Therefore we calculate this mean and subtract it from every observation in the dataset, creating a new column "..._n".

In [26]:
###### INPUTS for all difference columns, e.g. KE-HF
c = 55          # positional index of the column to normalize
period = 52596  # number of leading rows used as baseline
                # NOTE(review): 52596 = 365.25 * 144, presumably one year of 10-minute readings - confirm
###### ######

cn = all_data.iloc[:, c].name
# Updating dataFrame to have only one column as all other columns are of no use for us at the moment 
# Using .to_frame() to convert pandas series into dataframe.
all_data[cn] = pd.to_numeric(all_data[cn])
df_short = all_data[cn].to_frame()

# Baseline: mean over the first `period` rows.
# NOTE(review): this is "the first year" only if all_data is in chronological order - confirm.
# Kept as a one-element Series so the printed output stays unchanged.
m = df_short.head(period).mean()
print(m)
cnn = cn + '_n'
# Subtract the scalar baseline in one vectorised step. Applying a lambda that
# subtracts the Series `m` would build one Series per row and is far slower.
all_data[cnn] = all_data[cn] - m[cn]
print(f'> New column "'+cnn+'" added!')
print(all_data[cnn])

OG-OA_W    4.440702
dtype: float64
> New column "OG-OA_W_n" added!
0         2.761298
1        -0.153702
2         4.470298
3         6.315298
4         5.828298
            ...   
359575   -0.556302
359576    9.487698
359577    6.071298
359578    6.179298
359579    5.035298
Name: OG-OA_W_n, Length: 359580, dtype: float64


#### Excursus: List of columns if necessary

In [None]:
# List of columns
# Uncomment the loop below to print every column name prefixed with its
# positional index (the position is what the `c = ...` inputs in the cells above refer to).
#i = 0
#for col in all_data.columns:
#    print(str(i)+': '+col)
#    i += 1

# Print the summary of the complete dataset, including all columns added so far
all_data.info()

### Step 06 
### Creating subsets

In [27]:
from datetime import datetime

# Create subsets
print(f'> Creating subsets:')

###### INPUTS - Various Parameters
#wind direction in degrees, speed, temperature
wind_h = 320.0 #max, lower than...
wind_l = 220.0 #min, higher than...
windspeed_l = 0.0 #min, higher than in m/s
temp_h = 5.0 #max, lower than...
# taking out missing data 
time_l = datetime(2018, 12, 1, 0, 10) #'12/01/2018 00:10'
time_h = datetime(2018, 11, 18, 9, 30) #'11/18/2018 09:30'
###### ######

# Subset "co" is currently disabled (kept as a string literal); the parameters
# above are only used by it.
'''
df_co = all_data[(all_data['OA_DD'] >= wind_l) & (all_data['OA_DD'] <= wind_h) & (all_data['OA_FFAM'] >= windspeed_l) & (all_data['OA_TL'] <= temp_h) & ((all_data.iloc[:, 0] <= time_h) | (all_data.iloc[:, 0] >= time_l))]
print(f'Subset "co" created with ' +str(df_co.shape[0])+ ' rows')
if debug:
    print(f'Describe Subset "co" data...')
    print(df_co.describe(include=include))

#subset1a = all_data[(all_data['AG_Temp'] <= temp_h)]
#print(f'Subset 1a created with ' +str(subset1a.shape[0])+ ' rows')
'''
# Subset "cs": rows whose month is >= mth_start or <= mth_end (both set in the
# cold-season cell). The timestamp is selected by name, as in the other cells.
# .copy() makes the subset independent of all_data, so later in-place edits
# neither raise SettingWithCopyWarning nor touch the full dataset.
cs_month = all_data['UTC'].dt.month
df_cs = all_data[(cs_month >= mth_start) | (cs_month <= mth_end)].copy()
print(f'Subset "cs" created with ' +str(df_cs.shape[0])+ ' rows')
if debug:
    # `include` is only defined by the optional statistics cell; fall back to
    # float columns if that cell was skipped.
    describe_types = globals().get('include', ['float'])
    print(f'Describe Subset "cs" data...')
    print(df_cs.describe(include=describe_types))

> Creating subsets:
Subset "co" created with 32884 rows
Subset "cs" created with 90864 rows


In [28]:
df_cs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 90864 entries, 2357 to 330820
Data columns (total 63 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   UTC            90864 non-null  datetime64[ns]
 1   BA_T           90864 non-null  float64       
 2   BA_T_FLAG      90864 non-null  int64         
 3   CC_T           90864 non-null  float64       
 4   CC_T_FLAG      90864 non-null  int64         
 5   CR_T           90864 non-null  float64       
 6   CR_T_FLAG      90864 non-null  int64         
 7   DR_T           90864 non-null  float64       
 8   DR_T_FLAG      90864 non-null  int64         
 9   HB_T           90864 non-null  float64       
 10  HB_T_FLAG      90864 non-null  int64         
 11  HF_T           90864 non-null  float64       
 12  HF_T_FLAG      90864 non-null  int64         
 13  KE_T           90864 non-null  float64       
 14  KE_T_FLAG      90864 non-null  int64         
 15  KW_T           90864

### Save total or subsets of data

In [None]:
# Switch between datasets
df = all_data  #all_data  #df_co  #df_cs

# Potentially reduce columns
#df = df.loc[:,~df.columns.str.startswith('OA')]  #with copy
#df.drop(list(df.filter(regex = '_FLAG')), axis = 1, inplace = True) #without copy


# A frame with fewer rows than the full dataset is a subset and gets its row
# count in the file name; otherwise the fixed name for the complete dataset is
# used. The comparison uses the actual size of all_data instead of a hard-coded
# row count, so it stays correct when the input data changes.
if len(df.index) < len(all_data.index):
    file = path+put+'df_cx_'+str(df.shape[0])+'.csv'
else:
    file = path+put+'df_al.csv'
# Always export `df`, so that a column reduction applied above is not
# silently dropped when all rows are kept.
df.to_csv(file, sep=',', index=False, encoding='utf-8')
print(f'> Export to \'' + file + '\' successful')