In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 13 11:45:27 2022

@author: asimt
"""
# libraries
channel_paths = 'House_2/'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime
from datetime import timedelta, date
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split 
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
def apply_kmeans(column):
    """
    This method takes channel readings column as input and applies K-Means clustering algorithm
    with 2 clusters - On/Off.

    Input:
    column = 1-d array of readings

    Output:
    x = original column but reshaped
    km = kmeans object

    """
    x = np.array(column)
    km = KMeans(n_clusters=2)
    res = km.fit(x.reshape(-1, 1))
    return x, km


def get_clusters(x, km, timeindex):
    """
    This method returns clusters resulted from the K-Means algorithm.

    Input:
    x = Readings array
    km = K-Means algo object
    timeindex = list of timestamps

    Output:
    cluster_1 = Cluster of timestamps when device is Off
    cluster_2 = Cluster of timestamps when device is On
    times = Array of On/Off sequence for an appliance

    """
    times_1 = []
    times_2 = []
    cluster_1 = []
    cluster_2 = []
    for i in range(len(km.labels_)):
        if km.labels_[i] == 0:
            cluster_1.append(x[i])
            times_1.append("0")
            times_2.append(str(timeindex[i]))
        else:
            cluster_2.append(x[i])
            times_1.append(str(timeindex[i]))
            times_2.append("0")

    if cluster_1[0] < cluster_2[0]:
        return cluster_1, cluster_2, times_1
    if cluster_1[0] > cluster_2[0]:
        return cluster_2, cluster_1, times_2



def get_ONOFF_data(channel_paths):
    channel_status_dict = dict()
    cp = []
    for p in os.listdir(channel_paths):
        if 'channel' in p and p!= 'channel_1.dat':
            p = 'House_2/' + p
            channel = pd.read_csv(p,sep="\\s+",names=["Timestamp", "Readings"],parse_dates=["Timestamp"],header=0,)
            channel = channel.set_index('Timestamp')
            channel.index = pd.to_datetime(channel.index,unit='s')
            # resample
            channel = channel.resample('2min')
            channel = channel.mean()
            channel = channel.reset_index()
            # removing any nulls
            channel = channel.fillna(0)
            # get on off data
            x, km = apply_kmeans(channel["Readings"])
            # gets dates for the devices are on, zero means device is OFF
            cluster_1, cluster_2, times = get_clusters(
            x, km, channel.Timestamp)       
            # get channel name
            channel_name = p.split("/")[-1][:-4]
            channel_status_dict[channel_name] = times
    return channel_status_dict
            
        
# read main data
def get_mains_resampled_data(path):
    mains_df = pd.read_csv(
        path,
        sep="\\s+",
        names=["Timestamp", "Reading_1", "Reading_2", "Reading_3"],
        parse_dates=["Timestamp"],
        header=0,
    )
    mains_df = mains_df.set_index("Timestamp")
    mains_df.index = pd.to_datetime(mains_df.index, unit="s")
    mains_df = mains_df.resample('2min')
    mains_df = mains_df.mean()
    mains_df = mains_df.reset_index()
    # removing any nulls
    mains_df = mains_df.fillna(0)
    return mains_df
    
def concatenate_mains_to_channel(channel,mains_date):
    channel_status = []
    for datetime in mains_date:
        if datetime in channel:
            channel_status.append(1)
        else:
            channel_status.append(0)
    return channel_status
            

def BintoDec(x):
    return(int(x, 2))   
    
def DecToBin(target, length):
    target_len = "{0:0" + str(length) + "b}"
    return [int(target) for target in list(target_len.format(target))]

# read labels


    
# reshape Df for Classification
def reshapeDF(df):
    df = df.astype(str)
    df['bin'] = df.values.sum(axis=1)
    #bin to dec
    df['target']= df.apply(lambda row: BintoDec(row.bin), axis=1)
    return df

def evaluateDissagregation(predicted_df, actual_df):
    results=[]
    for column in actual_df:
        dict_app={}
        dict_app["Appliance"]= column
        dict_app['Accuracy'] = accuracy_score(actual_df[column], predicted_df[column]) 
        dict_app['Precision'] = precision_score(actual_df[column], predicted_df[column], average="macro")
        dict_app['Recall'] = recall_score(actual_df[column], predicted_df[column], average="macro")
        dict_app['F1'] = f1_score(actual_df[column], predicted_df[column], average="macro")
        results.append(dict_app)    
    return results



In [2]:
channel_status_dict= get_ONOFF_data(channel_paths)         



In [3]:
main_df = get_mains_resampled_data('House_2/mains.dat')
main_df = main_df.set_index("Timestamp")

main_df['weekday'] = main_df.index.dayofweek
main_df['hour'] = main_df.index.hour
main_df['second'] = main_df.index.second
main_df['month'] = main_df.index.month
main_df['quarter'] = main_df.index.quarter

mains_date  = main_df.index.astype('str')


In [4]:
newdict = dict()
for channels in channel_status_dict.keys():
    newdict[channels] =concatenate_mains_to_channel(channel_status_dict[channels],mains_date)
    


In [5]:
full_df = pd.DataFrame(newdict, index = mains_date)

In [6]:
full_df.to_csv('full.csv')

In [7]:
df = reshapeDF(full_df)

In [8]:
df

Unnamed: 0_level_0,channel_10,channel_11,channel_12,channel_13,channel_14,channel_15,channel_16,channel_17,channel_18,channel_19,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,channel_8,channel_9,bin,target
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2013-04-16 20:44:00,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,000000000000101000,40
2013-04-16 20:46:00,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,000000000010101000,168
2013-04-16 20:48:00,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,000000000011101000,232
2013-04-16 20:50:00,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,000000000011101000,232
2013-04-16 20:52:00,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,000000000011101000,232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-10-10 05:08:00,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,000000001000000000,512
2013-10-10 05:10:00,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,000000001000000000,512
2013-10-10 05:12:00,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,000000001000000000,512
2013-10-10 05:14:00,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,000000001000000000,512


In [9]:
# predict channels df target using main as feature set
X_train, X_test, y_train, y_test = train_test_split(main_df,df["target"], test_size=0.5, random_state=None, shuffle=False)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(63488, 8)
(63489, 8)
(63488,)
(63489,)


In [10]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_jobs =-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [11]:
channel_list = ['channel_10',
 'channel_11',
 'channel_12',
 'channel_13',
 'channel_14',
 'channel_15',
 'channel_16',
 'channel_17',
 'channel_18',
 'channel_19',
 'channel_2',
 'channel_3',
 'channel_4',
 'channel_5',
 'channel_6',
 'channel_7',
 'channel_8',
 'channel_9']

In [12]:
pred_df= pd.DataFrame({"target":y_pred}, index=y_test.index)
pred_df["Appliance_Binary_data"]=pred_df.apply(lambda row: DecToBin(row.target, len(channel_list)), axis=1)
predicted_df=pd.DataFrame(pred_df["Appliance_Binary_data"].to_list(), columns=channel_list, index=pred_df.index)

In [13]:
act_df= pd.DataFrame({"target":y_test}, index=y_test.index)
act_df["Appliance_Binary_data"]=act_df.apply(lambda row: DecToBin(row.target, len(channel_list)), axis=1)
actual_df=pd.DataFrame(act_df["Appliance_Binary_data"].to_list(), columns=channel_list, index=act_df.index)


In [14]:
result = evaluateDissagregation(predicted_df, actual_df)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
pd.DataFrame(result)

Unnamed: 0,Appliance,Accuracy,Precision,Recall,F1
0,channel_10,0.996976,0.498488,0.5,0.499243
1,channel_11,0.84654,0.42327,0.5,0.458447
2,channel_12,0.997543,0.498771,0.5,0.499385
3,channel_13,0.984706,0.492353,0.5,0.496147
4,channel_14,0.638111,0.319055,0.5,0.389541
5,channel_15,0.995495,0.497748,0.5,0.498871
6,channel_16,0.999732,0.499866,0.5,0.499933
7,channel_17,0.999716,0.499858,0.5,0.499929
8,channel_18,0.254312,0.127156,0.5,0.20275
9,channel_19,0.980233,0.542278,0.508143,0.511198


In [16]:
from apriori import *

In [17]:
predicted_df['Timestamp'] = list(predicted_df.index)
resampled_recs_data = resampling(predicted_df, "30min")

channel_data_dict = dict()

for index, row in resampled_recs_data.iterrows():
    for column in resampled_recs_data:
        if(column == 'Timestamp'):
            continue
        if column not in channel_data_dict:
            channel_data_dict[column] = []
        if(row[column] > 0.5):
            channel_data_dict[column].append(str(row.Timestamp))
min_date = list(resampled_recs_data['Timestamp'])[0].date()
max_date = list(resampled_recs_data['Timestamp'])[-1].date()

Dates = get_dates_list(min_date, max_date)
Time = get_all_times_of_day("30min")
apriori_dt = data_extractor(channel_list, channel_data_dict, Dates, Time)

NameError: name 'timedelta' is not defined

In [None]:

# Generate frequent itemsets and rules one time slice at a time and save them in a dictionary. 
house = 'House_2'
min_support = 0.02
min_confidence = 0.02
considered_rules = 200
resampling_time_in_min = '30'

time_itemset_map = dict()
time_rules_map = dict()
time_channels_map_from_itemsets = dict()
time_channels_map_from_rules = dict()

time_appliance_map = divide_data_into_time(Time, apriori_dt)

for timestamp in list(time_appliance_map.keys()):
    
    print("Generating Itemsets for : " + str(timestamp))
    # Generate frequent itemsets
    time_itemset_map[timestamp] = get_support_and_itemsets(time_appliance_map[timestamp], min_support)
    
#     print("Generating Rules for : " + str(timestamp))
    # Generate rules
    time_rules_map[timestamp] = association_rules(time_itemset_map[timestamp], metric="confidence", min_threshold = min_confidence)
#     print(time_rules_map[timestamp].shape)
    
#     print("Filtering Rules for : " + str(timestamp))
    # Filter rules which starts from current time slice
    rules_df = time_rules_map[timestamp]
    time_rules_map[timestamp] = rules_df[rules_df['antecedents'] == frozenset({timestamp})]
    
    # Get channels from frequent itemsets
    time_channels_map_from_itemsets[timestamp] = get_channels_from_frequent_itemsets(time_itemset_map[timestamp])
    
    # Get channels from rules
    time_channels_map_from_rules[timestamp] = get_channels_from_rules(time_rules_map[timestamp])

In [213]:
for timestamp, df in time_rules_map.items():
    timestamp= str(timestamp).replace(":","_")


In [214]:
def get_display_names_dict(labels_to_name_file):
    display_names_dict = dict()
    display_names_df = pd.read_csv(
        labels_to_name_file, names=["Labels", "Name"], header=0
    )
    for index, row in display_names_df.iterrows():
        display_names_dict[row.Labels] = row.Name
    return display_names_dict

In [215]:
labels_to_name_file = "Labels_to_name_files/House_" + str(2) + ".csv"
labels_df = pd.read_csv("House_2/" + "labels.dat", sep='\\s+', names=['Channel_id','Appliance'])
labels_df["Channel_id"] = ["channel_"+str(i) for i in range(1,labels_df.shape[0]+1)]
labels_dict = dict()
for row in labels_df.iterrows():
    labels_dict[row[1]["Channel_id"]] = row[1]["Appliance"]
show_name_dict = get_display_names_dict(labels_to_name_file)


In [216]:
recommendation_list = []
for timestamp in Time:
    rule_df = time_rules_map[timestamp].sort_values(by=['confidence'], ascending=False)
    recommended_channels = get_channels_from_rules(rule_df[:considered_rules])
    recommendations = get_appliances_from_channels(recommended_channels, labels_df)
#     recommendations = change_names(recommendations, show_name_dict)
    recommendation_list.append(",".join(appliance for appliance in recommendations))
time_recommendation_df = pd.DataFrame({"Time" : Time, "Recommendations" : recommendation_list})

In [220]:
time_recommendation_df = pd.read_csv('recs.csv', header=0, names=['Time','Recommendations'])


In [221]:
time_recommendation_df

Unnamed: 0,Time,Recommendations
0,00:00:00,"channel_18,channel_14,channel_6,channel_2,chan..."
1,00:30:00,"channel_18,channel_14,channel_6,channel_4,chan..."
2,01:00:00,"channel_18,channel_14,channel_6,channel_4,chan..."
3,01:30:00,"channel_18,channel_14,channel_6,channel_4,chan..."
4,02:00:00,"channel_18,channel_14,channel_6,channel_4,chan..."
5,02:30:00,"channel_18,channel_14,channel_6,channel_4,chan..."
6,03:00:00,"channel_18,channel_14,channel_6,channel_5,chan..."
7,03:30:00,"channel_18,channel_14,channel_6,channel_5,chan..."
8,04:00:00,"channel_18,channel_14,channel_6,channel_5,chan..."
9,04:30:00,"channel_18,channel_14,channel_6,channel_5,chan..."
