In [1]:
import json, os, joblib, sys
import pandas as pd, numpy as np
from pymongo import MongoClient
from glob import glob

sys.path.insert(0, os.path.abspath('../../'))
from utilsbox.misc import normalScaler

In [28]:
# Handles to the local MongoDB server, the database, and the two collections
# that the loading cell reads in full.
mongoclient = MongoClient("localhost", 27017)
database    = mongoclient['PowerTAC2020_RANDPROFILES'] # alternative database: mongoclient['PowerTAC2020_NDC']
collection1 = database['DistributionTransaction_and_Report_Info'] # alternative collection: database['Distribution Report Information']
collection2 = database['Calendar_Info']

# Root directory under which the per-series CSV folders are created.
# The original absolute path stays the default; set TSFM_DATABASE_DIR to
# redirect the export on another machine without editing the notebook.
dst = os.environ.get('TSFM_DATABASE_DIR', '/home/suraj/Desktop/TSFM/database/')

In [29]:
# Materialise both collections completely and turn them into DataFrames.
dataframe1 = pd.DataFrame(list(collection1.find()))
dataframe2 = pd.DataFrame(list(collection2.find()))

# Join on (game, timeslot). `validate='1:1'` makes pandas raise if the key is
# not unique on either side; the assert re-checks the merged result.
# NOTE: this is the default inner join, so rows without a partner are dropped.
dataframe = dataframe1.merge(dataframe2, on=['Game_Name', 'Timeslot'], validate='1:1')
assert not dataframe.duplicated(subset=['Game_Name', 'Timeslot']).any(), 'Duplicates in Dataframe!'

# Keep the four columns of interest, already in their final order and under
# their final names.
# (Alternative schema seen previously: 'Game Name', 'Date', 'Total Production', 'Total Consumption'.)
dataframe = dataframe[['Game_Name', 'Date', 'Total_Consumption', 'Total_Production']].rename(
    columns={
        'Game_Name': 'simulation',
        'Date': 'timestamp',
        'Total_Consumption': 'netconsumption',
        'Total_Production': 'netproduction',
    }
)
# Net demand is consumption minus production.
dataframe['netdemand'] = dataframe['netconsumption'] - dataframe['netproduction']

In [30]:
# Prefix used in every exported CSV file name.
tagname = 'RAND'

# One output folder per exported series; creating an existing folder is a no-op.
for outdir in (f'{dst}/ptacnd', f'{dst}/ptacnc', f'{dst}/ptacnp'):
    os.makedirs(outdir, exist_ok=True)

def outlierremoval(x, valuerange):
    """Return ``x`` if it lies strictly inside ``valuerange``, else NaN.

    Parameters
    ----------
    x : scalar
        Value to check.
    valuerange : sequence
        ``valuerange[0]`` is the exclusive lower bound and ``valuerange[1]``
        the exclusive upper bound.

    Returns
    -------
    ``x`` unchanged when ``valuerange[0] < x < valuerange[1]``; ``np.nan``
    otherwise. A NaN input fails both comparisons and therefore stays NaN.
    """
    # Guard clauses: reject anything at or beyond either bound.
    if not (x > valuerange[0]):
        return np.nan
    if not (x < valuerange[1]):
        return np.nan
    return x

# Bring timestamps to hour resolution.
# `astype('datetime64[h]')` is rejected by pandas >= 2.0 (only s/ms/us/ns are
# supported resolutions), so parse explicitly and floor to the hour instead.
# NOTE(review): assumes the values are parseable by `pd.to_datetime` - confirm
# against the stored 'Date' field.
dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp']).dt.floor('h')

# Replace values outside mean +/- 3 standard deviations with NaN, column by
# column. Bounds are exclusive and are computed over all simulations pooled
# together (the same rule `outlierremoval` applies element-wise); the NaNs are
# filled later by interpolation in the export loop.
# NOTE: this overwrites `dataframe` in place, so re-running the cell recomputes
# the bounds on already-filtered data.
for column in ('netdemand', 'netconsumption', 'netproduction'):
    center = dataframe[column].mean()
    spread = 3 * dataframe[column].std()
    ndrange = (center - spread, center + spread)
    inside = (dataframe[column] > ndrange[0]) & (dataframe[column] < ndrange[1])
    # `where` keeps values where the mask is True and writes NaN elsewhere;
    # existing NaNs fail both comparisons and stay NaN.
    dataframe[column] = dataframe[column].where(inside)

# Export one CSV per simulation and per series (net demand, net consumption,
# net production), trimmed to whole days and tagged as train or test.

# simulations = sorted(dataframe['simulation'].unique(), key=lambda x: int(x[15:]))
simulations = sorted(dataframe['simulation'].unique())
value_columns = ['netdemand', 'netconsumption', 'netproduction']
for simno, sim in enumerate(simulations):
    # Select only the numeric series for this simulation. The string
    # 'simulation' column is never written out, and passing object columns
    # through `interpolate()` is deprecated in recent pandas.
    miniframe = dataframe.loc[dataframe['simulation'] == sim, ['timestamp'] + value_columns]

    # Put the data on a gap-free hourly grid and linearly interpolate missing
    # values (both missing hours and the NaNs left by outlier removal).
    # 'h' replaces the 'H' alias, which is deprecated since pandas 2.2.
    miniframe = miniframe.set_index('timestamp').resample('h').interpolate().reset_index()

    # Trim partial days so the series starts at 00:00 and ends at 23:00.
    # The modulo keeps a series that is already aligned intact; without it a
    # start at 00:00 (or an end at 23:00) would drop a complete day.
    n1 = (24 - miniframe['timestamp'].iloc[0].hour) % 24
    n2 = (miniframe['timestamp'].iloc[-1].hour + 1) % 24

    # Positional slice; the max() guard yields an empty frame (instead of a
    # wrapped negative index) when the series is shorter than the trimmed edges.
    stop = max(len(miniframe) - n2, n1)
    miniframe = miniframe.iloc[n1:stop]
    ndframe = miniframe[['timestamp', 'netdemand']].copy()
    ncframe = miniframe[['timestamp', 'netconsumption']].copy()
    npframe = miniframe[['timestamp', 'netproduction']].copy()

    # First 80% of the (name-sorted) simulations are training data, the rest test data.
    mode = 'train' if (simno/len(simulations))<0.8 else 'test'
    ndframe.to_csv(f'{dst}/ptacnd/{tagname}_{sim}_{mode}.csv', index=False)
    ncframe.to_csv(f'{dst}/ptacnc/{tagname}_{sim}_{mode}.csv', index=False)
    npframe.to_csv(f'{dst}/ptacnp/{tagname}_{sim}_{mode}.csv', index=False)
