In [44]:
import json, os, joblib, sys
import pandas as pd, numpy as np
from pymongo import MongoClient
from glob import glob

from tslearn.clustering import TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesResampler

import plotly.graph_objects as go
import plotly as ply
import plotly.express as px
from plotly.subplots import make_subplots
import chart_studio.plotly as py
import plotly.figure_factory as ff

sys.path.insert(0, os.path.abspath('../../'))
from utilsbox.misc import normalScaler

def outlierremoval(x, valuerange):
    """Return ``x`` if it lies strictly inside ``valuerange``, else NaN.

    Parameters
    ----------
    x : scalar
        Value to test.
    valuerange : sequence
        ``valuerange[0]`` is the exclusive lower bound and ``valuerange[1]``
        the exclusive upper bound.

    Returns
    -------
    ``x`` unchanged when ``valuerange[0] < x < valuerange[1]``; ``np.nan``
    otherwise (values equal to a bound and NaN inputs therefore map to NaN).
    """
    # Guard clause: anything not strictly between the bounds is masked out.
    if not (valuerange[0] < x < valuerange[1]):
        return np.nan
    return x

In [36]:
# Connection and output configuration.
# Opens a MongoDB client on localhost:27017 and picks the two collections that
# the loading cell reads from; `dst` is the root directory under which the
# export cell creates the `ptaccu` folder of per-simulation CSV files.
# NOTE(review): host, database name and `dst` are hard-coded to one machine —
# consider moving them to a config cell / environment variables.
mongoclient = MongoClient("localhost", 27017)
database    = mongoclient['PowerTAC2020_RANDPROFILES'] 
collection1 = database['Customer_Usage_Data'] # database['Distribution Report Information']
collection2 = database['Calendar_Info']
dst = '/home/suraj/Desktop/TSFM/database/'

In [37]:
# Pull both collections fully into memory as DataFrames.  The usage collection
# names its game column "Game Name"; rename it so both frames share the
# 'Game_Name' join key.
dataframe1 = pd.DataFrame(list(collection1.find())).rename(columns={"Game Name": "Game_Name"})
dataframe2 = pd.DataFrame(list(collection2.find()))

# Join usage rows with calendar rows; validate='1:1' makes pandas raise if the
# (Game_Name, Timeslot) key is not unique on either side.
dataframe = pd.merge(dataframe1, dataframe2, on=['Game_Name', 'Timeslot'], validate='1:1')
assert not dataframe.duplicated(subset=['Game_Name', 'Timeslot']).any(), 'Duplicates in Dataframe!'

In [38]:
# Customer names present in the merged frame, recovered from the
# 'Usage_Per_Population_<customer>' column names.
gamecustomers = [name.replace('Usage_Per_Population_','') for name in dataframe.columns.tolist() if "Usage_Per_Population" in name]

# Expected customer set; the order of this list fixes the column order of the
# exported frame.
customers = [
 'BrooksideHomes',
 'CentervilleHomes',
 'DowntownOffices',
 'EastsideOffices',
 'FrostyStorage',
 'HextraChemical',
 'MedicalCenter-1',
 'OfficeComplex 1 NS Base',
 'OfficeComplex 1 NS Controllable',
 'OfficeComplex 1 SS Base',
 'OfficeComplex 1 SS Controllable',
 'OfficeComplex 2 NS Base',
 'OfficeComplex 2 NS Controllable',
 'OfficeComplex 2 SS Base',
 'OfficeComplex 2 SS Controllable',
 'Village 1 NS Base',
 'Village 1 NS Controllable',
 'Village 1 RaS Base',
 'Village 1 RaS Controllable',
 'Village 1 ReS Base',
 'Village 1 ReS Controllable',
 'Village 1 SS Base',
 'Village 1 SS Controllable',
 'Village 2 NS Base',
 'Village 2 NS Controllable',
 'Village 2 RaS Base',
 'Village 2 RaS Controllable',
 'Village 2 ReS Base',
 'Village 2 ReS Controllable',
 'Village 2 SS Base',
 'Village 2 SS Controllable',
 'fc2',
 'fc3',
 'freezeco-1',
 'freezeco-2',
 'freezeco-3',
 'seafood-1',
 'seafood-2',
 'sf2',
 'sf3']
# Fail early if the database holds a different customer set than expected.
assert set(gamecustomers) == set(customers), 'Missing Customers'

# Keep only the identifying columns plus one usage column per customer, and
# give them short names: 'Usage_Per_Population_<c>' becomes 'ppusage_<c>'.
requiredcolumns = ['Game_Name', 'Timeslot', 'Date', 'Tariff'] + [f'Usage_Per_Population_{name}' for name in customers]
renamecolumns = ['simulation', 'timeslot', 'timestamp', 'tariff'] + [f'ppusage_{name}' for name in customers]
# .copy() detaches the selection from the merged frame so the in-place column
# writes done by later cells operate on an independent frame (avoids
# SettingWithCopyWarning / ambiguous chained assignment).
dataframe = dataframe[requiredcolumns].copy()
dataframe.columns = renamecolumns

In [39]:
# Clean the per-customer usage columns and export one CSV per simulation.
tagname = 'RAND'
os.makedirs(f'{dst}/ptaccu', exist_ok=True)
# NOTE(review): unit-qualified casts such as 'datetime64[h]' are handled
# differently across pandas versions — confirm against the installed version.
dataframe['timestamp'] = dataframe['timestamp'].astype('datetime64[h]')

# Outlier masking: per customer, keep values strictly inside mean +/- 3*std
# (computed over all simulations) and replace everything else with NaN.
# Mean and std are computed once per column and the mask is applied
# vectorised; the result matches an element-wise strict-bounds check.
for name in customers:
    column = f'ppusage_{name}'
    usage = dataframe.loc[:, column]
    center, spread = usage.mean(), usage.std()
    vrange = (center - 3*spread, center + 3*spread)
    dataframe.loc[:, column] = usage.where((usage > vrange[0]) & (usage < vrange[1]))

simulations = sorted(dataframe['simulation'].unique())
for simno, sim in enumerate(simulations):
    # Index this simulation's rows by timestamp, put them on an hourly grid,
    # and back-fill the gaps (including the NaNs left by outlier masking).
    miniframe = dataframe[dataframe['simulation']==sim].set_index('timestamp')
    miniframe = miniframe.resample('H').interpolate(method='bfill', limit_direction='backward')
    miniframe = miniframe.reset_index()

    # Trim to whole calendar days: drop the leading rows up to the first
    # midnight and the trailing rows after the last 23:00 sample.  The modulo
    # keeps a first day that already starts at hour 0 and a last day that
    # already ends at hour 23 (without it, 24 rows of a complete day are lost).
    n1 = (24 - miniframe.loc[0, 'timestamp'].hour) % 24
    n2 = (miniframe.loc[len(miniframe)-1, 'timestamp'].hour + 1) % 24
    miniframe = miniframe.loc[n1:len(miniframe)-n2-1]

#     for name in customers:
#         miniframe.loc[:, f'ppusage_{name}_EWA'] = miniframe.loc[:, f'ppusage_{name}'].ewm(alpha = 0.6).mean()
#         miniframe.loc[:, f'ppusage_{name}_GWA'] = miniframe.loc[:, f'ppusage_{name}'].rolling(window=5, win_type='gaussian', min_periods=1, center=True).mean(std=1)

    # First 80% of the (sorted) simulations are tagged 'train', the rest 'test'.
    mode = 'train' if (simno/len(simulations))<0.8 else 'test'
    miniframe.to_csv(f'{dst}/ptaccu/{tagname}_{sim}_{mode}.csv', index=False)

In [40]:
# Display the 'ppusage_<customer>' column names, one per entry of `customers`.
[f'ppusage_{name}' for name in customers]

['ppusage_BrooksideHomes',
 'ppusage_CentervilleHomes',
 'ppusage_DowntownOffices',
 'ppusage_EastsideOffices',
 'ppusage_FrostyStorage',
 'ppusage_HextraChemical',
 'ppusage_MedicalCenter-1',
 'ppusage_OfficeComplex 1 NS Base',
 'ppusage_OfficeComplex 1 NS Controllable',
 'ppusage_OfficeComplex 1 SS Base',
 'ppusage_OfficeComplex 1 SS Controllable',
 'ppusage_OfficeComplex 2 NS Base',
 'ppusage_OfficeComplex 2 NS Controllable',
 'ppusage_OfficeComplex 2 SS Base',
 'ppusage_OfficeComplex 2 SS Controllable',
 'ppusage_Village 1 NS Base',
 'ppusage_Village 1 NS Controllable',
 'ppusage_Village 1 RaS Base',
 'ppusage_Village 1 RaS Controllable',
 'ppusage_Village 1 ReS Base',
 'ppusage_Village 1 ReS Controllable',
 'ppusage_Village 1 SS Base',
 'ppusage_Village 1 SS Controllable',
 'ppusage_Village 2 NS Base',
 'ppusage_Village 2 NS Controllable',
 'ppusage_Village 2 RaS Base',
 'ppusage_Village 2 RaS Controllable',
 'ppusage_Village 2 ReS Base',
 'ppusage_Village 2 ReS Controllable',
 'p

In [21]:
# Usage column names read from the exported CSV files.  The clustering cell
# uses them as keys of `book`, and a name's position in this list is the
# numeric class label stored in `Y`.
filecustomers = [
 'ppusage_BrooksideHomes',
 'ppusage_CentervilleHomes',
 'ppusage_DowntownOffices',
 'ppusage_EastsideOffices',
 'ppusage_FrostyStorage',
 'ppusage_HextraChemical',
 'ppusage_MedicalCenter-1',
 'ppusage_OfficeComplex 1 NS Base',
 'ppusage_OfficeComplex 1 NS Controllable',
 'ppusage_OfficeComplex 1 SS Base',
 'ppusage_OfficeComplex 1 SS Controllable',
 'ppusage_OfficeComplex 2 NS Base',
 'ppusage_OfficeComplex 2 NS Controllable',
 'ppusage_OfficeComplex 2 SS Base',
 'ppusage_OfficeComplex 2 SS Controllable',
 'ppusage_Village 1 NS Base',
 'ppusage_Village 1 NS Controllable',
 'ppusage_Village 1 RaS Base',
 'ppusage_Village 1 RaS Controllable',
 'ppusage_Village 1 ReS Base',
 'ppusage_Village 1 ReS Controllable',
 'ppusage_Village 1 SS Base',
 'ppusage_Village 1 SS Controllable',
 'ppusage_Village 2 NS Base',
 'ppusage_Village 2 NS Controllable',
 'ppusage_Village 2 RaS Base',
 'ppusage_Village 2 RaS Controllable',
 'ppusage_Village 2 ReS Base',
 'ppusage_Village 2 ReS Controllable',
 'ppusage_Village 2 SS Base',
 'ppusage_Village 2 SS Controllable',
 'ppusage_fc2',
 'ppusage_fc3',
 'ppusage_freezeco-1',
 'ppusage_freezeco-2',
 'ppusage_freezeco-3',
 'ppusage_seafood-1',
 'ppusage_seafood-2',
 'ppusage_sf2',
 'ppusage_sf3'
]

In [55]:
# Build a data set of daily usage profiles (5 random days per training file
# and customer) and cluster them with soft-DTW k-means.

# Fixed seed so the sampled days and the clustering can be reproduced.
SEED = 42
rng = np.random.default_rng(SEED)

# Training files of every tag except 'RAND'; sorted because glob order is
# filesystem-dependent and would otherwise change which days each draw hits.
files = sorted(glob("/home/suraj/Desktop/TSFM/database/ptaccu/*_train.csv"))
files = [file for file in files if 'RAND' not in file]
if not files:
    raise FileNotFoundError("No non-RAND '*_train.csv' files found under /home/suraj/Desktop/TSFM/database/ptaccu/")

# book[name] collects one 1-D array per sampled (file, day) for that customer.
book = {name:[] for name in filecustomers}
for file in files:
    df = pd.read_csv(file, parse_dates=['timestamp'])
    dates = df['timestamp'].dt.date.unique()
    rng.shuffle(dates)

    # Keep the rows of (up to) five randomly chosen days of this file.
    checkdates = dates[0:5]
    df = df[df['timestamp'].dt.date.isin(checkdates)].reset_index(drop=True)

    # Calendar date of every remaining row, computed once instead of once per
    # (customer, day) pair.
    rowdates = df['timestamp'].dt.date
    for date in checkdates:
        dayrows = df.loc[rowdates==date]
        for name in filecustomers:
            book[name].append(dayrows[name].to_numpy())

# Per customer: stack the daily series into (n_series, n_timestamps, 1), apply
# tslearn's TimeSeriesScalerMeanVariance, and pair the result with a label
# column holding the customer's position in `filecustomers`.
# np.stack requires every sampled day to have the same number of rows.
for idno, name in enumerate(filecustomers):
    series = np.expand_dims(np.stack(book[name], axis=0), axis=2)
    book[name] = ( TimeSeriesScalerMeanVariance().fit_transform(series), idno*np.ones(shape=(len(series),1)) )

# X: all scaled series, Y: matching customer labels (same row order).
X = np.concatenate([book[name][0] for name in filecustomers], axis=0)
Y = np.concatenate([book[name][1] for name in filecustomers], axis=0)
print(X.shape, Y.shape)

# C[i] is the cluster index assigned to series X[i].
model = TimeSeriesKMeans(n_clusters=4, metric="softdtw", max_iter=20, metric_params={"gamma": .01}, verbose=True, random_state=SEED)
C = model.fit_predict(X)

(57600, 24, 1) (57600, 1)
100.609 --> 52.707 --> 50.817 --> 50.449 --> 49.810 --> 50.256 --> 50.966 --> 51.553 --> 51.737 --> 51.561 --> 51.646 --> 51.701 --> 51.777 --> 51.773 --> 51.793 --> 51.432 --> 51.335 --> 51.317 --> 51.287 --> 51.289 --> 


In [56]:
# Bar chart: for every cluster, how many daily profiles of each customer were
# assigned to it.

# Map numeric labels back to names: Y holds positions in `filecustomers`,
# C holds the cluster index of each series.
classnames = [filecustomers[int(value.item())] for value in Y]
clusternames = [f"cluster_{value}" for value in C]
dataframe = pd.DataFrame({'customers': classnames, 'clustername': clusternames})

fig = go.Figure()
for clustername in dataframe['clustername'].unique():
    members = dataframe.loc[dataframe['clustername']==clustername, 'customers']
    # Count series per customer; reindex puts the counts in `filecustomers`
    # order and fills customers absent from this cluster with 0.  Counts stay
    # integers, so bar labels read "5" rather than "5.0".
    counts = members.value_counts().reindex(filecustomers, fill_value=0)
    df = pd.DataFrame({'customers': filecustomers, 'count': counts.to_numpy()})

    fig.add_trace( go.Bar(x=df['customers'], y=df['count'], text=df['count'], textposition='outside', visible=True, name=clustername) )

fig.update_layout(title="PowerTAC Customers Clustering", xaxis_title="Customers", yaxis_title="Cluster Selection")
fig.show()

In [57]:
# Save the figure as an HTML fragment (plotly.js loaded from the CDN) in the
# current working directory.  The encoding is set explicitly so the file does
# not depend on the platform's default text encoding.
with open('cc.html', 'w', encoding='utf-8') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))