In [None]:
# Mount Google Drive inside the Colab runtime so the dataset stored under
# MyDrive becomes reachable at /content/gdrive.
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
# Root folder of the dataset on the mounted Drive; every input path and every
# results folder below is built from it.
data_files_path = '/content/gdrive/MyDrive/Data Files/Release Data'
# List the folder contents as a quick visual check that the mount worked.
!ls '/content/gdrive/MyDrive/Data Files/Release Data'

In [None]:
# Site identifier: selects the sub-folder and file names of the two input CSVs
# and is also used as the prefix of the figures saved at the end of the notebook.
Location = 'USA-2'
# CSV with the harmonics columns for the selected site.
harmonics_data = data_files_path + f'/Harmonics Data/{Location}/{Location}_harmonics.csv'
# CSV with the power columns (including 'ActivePT') for the selected site.
power_data = data_files_path + f'/Power consumption data/{Location}/{Location}.csv'

In [None]:
from math import ceil
import math
import numpy as np
import calendar
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime, timedelta
from pathlib import Path
import glob
import re
from collections import OrderedDict
import os

from numpy.lib.function_base import average
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

import matplotlib.lines as mlines
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")
calendar.setfirstweekday(6)

In [None]:
# Default figure size for every seaborn/matplotlib plot in the notebook.
sns.set(rc={'figure.figsize':(20,8)})

# Fixed colour per cluster label. Labels are handled as strings ('0'..'5')
# throughout the notebook, so the keys are strings too.
_base_colors = list(sns.color_palette())
palette = {str(label): _base_colors[label] for label in range(6)}


In [None]:
def preprocessing_data( path ):
    """Load one CSV and return it as a 1-minute, US/Eastern-indexed frame.

    The CSV's unnamed first column is taken as the timestamp column. It is
    parsed, made the index, converted to the 'US/Eastern' time zone and the
    rows are then averaged into 1-minute bins (minutes without any reading
    come out as NaN rows).

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file. The resolved path is printed.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'datetime' at 1-minute frequency.
    """
    file_path = Path(path)
    print(file_path)

    return (
        pd.read_csv(file_path)
        .rename(columns={'Unnamed: 0': 'datetime'})
        .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
        .set_index('datetime')
        # tz_convert needs a tz-aware index, i.e. timestamps carrying an offset.
        .tz_convert('US/Eastern')
        .resample('1min')
        .mean()
    )

In [None]:
def pca_function(df , components):
  """Project the feature columns of `df` onto their first principal components.

  An 'ActivePT' column, when present, is excluded from the projection.

  Parameters
  ----------
  df : pandas.DataFrame
      Input frame; every column except 'ActivePT' is used as a feature.
  components : int
      Number of principal components to keep.

  Returns
  -------
  pandas.DataFrame
      Columns 'components_1' .. 'components_<components>', sharing `df`'s index.
  """
  features = df.drop(columns=['ActivePT']) if 'ActivePT' in df.columns else df
  projected = PCA(n_components=components).fit_transform(features.values)
  component_names = [f'components_{rank}' for rank in range(1, components + 1)]

  return pd.DataFrame(projected, columns=component_names, index=df.index)

In [None]:
# Load both CSVs through the same pipeline so they share a 1-minute,
# US/Eastern datetime index and can be merged on it.
df_power = preprocessing_data(power_data)
df_harmonics = preprocessing_data(harmonics_data)

In [None]:
# Keep only the columns used downstream: 'ActivePT' from the power file and,
# from the harmonics file, the odd-order '<prefix>I_HR<n>' columns
# (n = 3, 5, ..., 31) for the prefixes A, B and C.
df_power = df_power[['ActivePT']]

harmonic_columns = [
    f'{phase}I_HR{order}'
    for phase in ('A', 'B', 'C')
    for order in range(3, 32, 2)
]
df_harmonics = df_harmonics[harmonic_columns]

In [None]:
# Inner-join the harmonics and power frames on their shared datetime index.
df_merged = df_harmonics.merge(df_power, left_index=True, right_index=True)

# Percentage of cells that are NaN in the merged frame (cell output).
nan_cells = df_merged.isna().sum().sum()
total_cells = len(df_merged) * len(df_merged.columns)
round(nan_cells / total_cells, 4) * 100

In [None]:
#Functions to replace nan values - Functions for Interpolation

def search_past_dates( data, timestamp, col ):
  """Find the nearest earlier day with a valid `col` reading at the same clock time.

  Starting from the day before `timestamp` and walking back to the first day
  in `data`, look up the row at the same wall-clock time and return the first
  one whose `col` value is not NaN.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a DatetimeIndex (tz-aware or tz-naive).
  timestamp : pandas.Timestamp
      Row whose value is missing.
  col : str
      Column that must hold a non-NaN value in the donor row.

  Returns
  -------
  str or bool
      Label of the donor row formatted as 'YYYY-MM-DD HH:MM:SS' (wall-clock
      time in the index's zone), or False when no donor exists.
  """
  index_tz = getattr(data.index, 'tz', None)
  first_stamp = data.index[0]
  clock_time = timestamp.strftime('%H:%M:%S')

  first_day = first_stamp.date()
  # Calendar-day arithmetic: subtracting 24 h from a tz-aware timestamp can
  # skip a whole day across a DST change.
  last_day = timestamp.date() - timedelta(days=1)

  if first_day > last_day:
    return False

  for day in pd.date_range( first_day, last_day )[::-1]:
    label = f'{day.date()} {clock_time}'
    candidate = pd.Timestamp(label)

    if index_tz is not None:
      # The candidate must carry the index's time zone: a naive timestamp is
      # never found in a tz-aware index on pandas >= 2, which made this search
      # always fail. Wall-clock times that do not exist or are ambiguous on a
      # DST-change day become NaT and are skipped.
      candidate = candidate.tz_localize(index_tz, ambiguous='NaT', nonexistent='NaT')
      if pd.isna(candidate):
        continue

    # Every remaining day is earlier still, so nothing more can match.
    if candidate < first_stamp:
      break

    if candidate not in data.index:
      continue

    if not math.isnan( data.loc[candidate, col] ):
      return label

  return False


def search_future_dates( data, timestamp, col ):
  """Find the nearest later day with a valid `col` reading at the same clock time.

  Starting from the day after `timestamp` and walking forward to the last day
  in `data`, look up the row at the same wall-clock time and return the first
  one whose `col` value is not NaN.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a DatetimeIndex (tz-aware or tz-naive).
  timestamp : pandas.Timestamp
      Row whose value is missing.
  col : str
      Column that must hold a non-NaN value in the donor row.

  Returns
  -------
  str or bool
      Label of the donor row formatted as 'YYYY-MM-DD HH:MM:SS' (wall-clock
      time in the index's zone), or False when no donor exists.
  """
  index_tz = getattr(data.index, 'tz', None)
  last_stamp = data.index[-1]
  clock_time = timestamp.strftime('%H:%M:%S')

  # Calendar-day arithmetic: adding 24 h to a tz-aware timestamp can skip or
  # repeat a day across a DST change.
  first_day = timestamp.date() + timedelta(days=1)
  last_day = last_stamp.date()

  if first_day > last_day:
    return False

  for day in pd.date_range( first_day, last_day ):
    label = f'{day.date()} {clock_time}'
    candidate = pd.Timestamp(label)

    if index_tz is not None:
      # The candidate must carry the index's time zone: a naive timestamp is
      # never found in a tz-aware index on pandas >= 2, which made this search
      # always fail. Wall-clock times that do not exist or are ambiguous on a
      # DST-change day become NaT and are skipped.
      candidate = candidate.tz_localize(index_tz, ambiguous='NaT', nonexistent='NaT')
      if pd.isna(candidate):
        continue

    # Every remaining day is later still, so nothing more can match.
    if candidate > last_stamp:
      break

    if candidate not in data.index:
      continue

    if not math.isnan( data.loc[candidate, col] ):
      return label

  return False
    
# Fill the missing data in the dataframe - Interpolating the dataframe

def interpolate_dataframe(df):
  """Fill NaN rows using the same clock time on neighbouring days.

  For every row that contains at least one NaN:

  * If 'ActivePT' is present but a harmonic value is missing, all
    non-'ActivePT' columns of the row are replaced by the donor values. Donors
    are the nearest earlier/later day with a valid 'AI_HR3' at the same time.
  * If 'ActivePT' is missing, donors are the nearest earlier/later day with a
    valid 'ActivePT'. With a single donor only 'ActivePT' is copied; with two
    donors the row's NaN cells are filled with the donors' average.

  With two donors the average is used, with one donor its values are copied,
  and with no donor the row is left untouched (NaNs remain for the caller).

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a DatetimeIndex and the columns 'ActivePT' and 'AI_HR3'.
      It is modified in place.

  Returns
  -------
  pandas.DataFrame
      The same `df` object, after filling.
  """
  harmonic_cols = df.columns != 'ActivePT'

  # A row's gaps are only ever filled when that row itself is visited, so the
  # rows needing work can be determined once instead of testing every row.
  rows_with_gaps = df.index[df.isnull().any(axis=1)]

  for idx in rows_with_gaps:
    power_missing = math.isnan( df.loc[idx, 'ActivePT'] )
    probe_col = 'ActivePT' if power_missing else 'AI_HR3'

    past_idx = search_past_dates( df, idx, probe_col )
    future_idx = search_future_dates( df, idx, probe_col )
    donors = [label for label in (past_idx, future_idx) if label is not False]

    # No usable neighbour on either side: leave the gap rather than indexing
    # the frame with False (which raised KeyError).
    if not donors:
      continue

    if power_missing:
      if len(donors) == 1:
        # Single .loc[row, col] assignment; the chained form
        # df.loc[idx]['ActivePT'] = ... wrote to a temporary copy.
        df.loc[idx, 'ActivePT'] = df.loc[donors[0], 'ActivePT']
      else:
        donor_mean = (df.loc[past_idx] + df.loc[future_idx]) / 2
        # Fill only the missing cells so observed harmonics are not overwritten.
        df.loc[idx] = df.loc[idx].fillna(donor_mean)
    else:
      if len(donors) == 1:
        df.loc[idx, harmonic_cols] = df.loc[donors[0], harmonic_cols]
      else:
        df.loc[idx, harmonic_cols] = (df.loc[past_idx, harmonic_cols] + df.loc[future_idx, harmonic_cols]) / 2

  return df

In [None]:
# Fill the gaps on a copy so df_merged keeps the raw merged values.
df_merged_filled = interpolate_dataframe(df_merged.copy())
# Percentage of cells that are still NaN after interpolation (cell output).
round(df_merged_filled.isna().sum().sum()/(len(df_merged_filled)*len(df_merged_filled.columns)),4)*100

In [None]:
# Fallback for cells the day-based interpolation could not fill: replace them
# with the mean of their column.
if df_merged_filled.isnull().values.any():
  df_merged_filled = df_merged_filled.fillna( df_merged_filled.mean() )
# Cell output: (any NaN left?, remaining NaN percentage).
df_merged_filled.isnull().values.any(), round(df_merged_filled.isna().sum().sum()/(len(df_merged_filled)*len(df_merged_filled.columns)),4)*100

In [None]:
# True only if filling changed nothing, i.e. the merged frame had no gaps.
df_merged.equals(df_merged_filled)

In [None]:

def cluster_kmeans(data, n_clusters, return_centers=False):
  """Cluster `data` with K-Means and return the labels as strings.

  Parameters
  ----------
  data : array-like
      Samples to cluster, as accepted by `KMeans.fit`.
  n_clusters : int
      Number of clusters to form.
  return_centers : bool, default False
      Selects the second element of the returned pair.

  Returns
  -------
  tuple
      (labels, centers) when `return_centers` is True, otherwise
      (labels, inertia). `labels` is a list of strings such as '0', '1', ...
  """
  # Fixed random_state keeps the clustering reproducible between runs.
  fitted = KMeans(n_clusters=n_clusters, random_state=100000).fit(data)
  label_strings = list(map(str, fitted.labels_))

  second = fitted.cluster_centers_ if return_centers == True else fitted.inertia_
  return label_strings, second

In [None]:

# Training the classification model.
# Takes a single DataFrame of feature columns (an 'ActivePT' column, if present, is ignored).

def train_cluster_classification_model(  data, number_of_clusters ):
  """Cluster the rows with K-Means, then fit a classifier that predicts the cluster.

  The K-Means labels of the training rows become the targets of a
  RandomForestClassifier, which can then assign a cluster to unseen rows.

  Parameters
  ----------
  data : pandas.DataFrame
      Feature frame. An 'ActivePT' column, when present, is excluded from
      both the clustering and the classifier. The frame is not modified.
  number_of_clusters : int
      Number of K-Means clusters, i.e. classes of the classifier.

  Returns
  -------
  sklearn.ensemble.RandomForestClassifier
      Classifier fitted on the feature columns with string class labels.
  """
  # Work on a separate feature frame instead of adding a 'cluster_labels'
  # column to the caller's frame: that side effect leaked the column to the
  # caller and fed it back in as a feature on a second call.
  features = data.drop(columns=['ActivePT'], errors='ignore')

  labels, _ = cluster_kmeans( np.array(features), number_of_clusters )

  model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
  model.fit(features, labels)

  return model


In [None]:
# #Saving the Model
# import pickle

# filename = 'january_classification_model.sav'
# pickle.dump(model, open(data_files_path + "/" + filename, 'wb'))

In [None]:
def get_week_of_month(year, month, day):
    """Return the 1-based calendar row (week) of the month that contains `day`.

    Rows come from `calendar.monthcalendar`, so the result depends on the
    first weekday currently configured in the `calendar` module. Raises
    IndexError when `day` does not occur in the month.
    """
    weeks = np.asarray(calendar.monthcalendar(year, month))
    rows_with_day, _ = np.nonzero(weeks == day)
    return rows_with_day[0] + 1


def get_f1_score( data, model, num_clusters ):
  """Compare the classifier's predictions with a fresh K-Means run on `data`.

  Parameters
  ----------
  data : pandas.DataFrame
      Feature rows to evaluate (same columns the model was trained on).
  model : fitted classifier
      Object exposing `predict`.
  num_clusters : int
      Number of clusters for the reference K-Means run.

  Returns
  -------
  tuple
      (micro-averaged F1 between K-Means labels and predictions,
       silhouette score of the K-Means labelling, or 0 when only one
       distinct label was produced).
  """
  # NOTE(review): the reference labels come from a new K-Means fit on `data`;
  # nothing here aligns their numbering with the clusters the classifier was
  # trained on - confirm the F1 is meaningful without a label matching step.
  samples = np.array(data)
  predicted = model.predict(data)
  reference, _ = cluster_kmeans(samples, num_clusters)

  # The silhouette score is undefined for a single cluster.
  silhouette = 0
  if len(set(reference)) > 1:
    silhouette = metrics.silhouette_score(samples, reference, metric='euclidean')

  agreement = metrics.f1_score(reference, predicted, average='micro' )

  return agreement, silhouette

In [None]:
# Training on a week and Testing on week

def predict_multiple_weeks( data, model, num_clusters ):
  """Score `model` on every complete Monday-Sunday week contained in `data`.

  Weeks that straddle two calendar months are skipped.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a DatetimeIndex; an 'ActivePT' column is dropped if present.
  model : fitted classifier
      Passed to `get_f1_score`.
  num_clusters : int
      Passed to `get_f1_score`.

  Returns
  -------
  pandas.DataFrame
      One row per scored week with the columns 'Month_week',
      'week_start_date', 'week_end_date', 'f1_score',
      'Clustering Silhouette Score' and 'month'. Empty when no week qualifies.
  """
  if 'ActivePT' in data.columns:
    data = data.drop(['ActivePT'], axis=1)

  # Move the start forward to the first Monday.
  start_date = data.index[0].date()
  while start_date.isoweekday() != 1:
    start_date += timedelta(days=1)

  # Move the end back to the last Sunday. Weeks close on Sunday, so stopping
  # at Saturday meant the final complete week was never scored.
  end_date = data.index[-1].date()
  while end_date.isoweekday() != 7:
    end_date -= timedelta(days=1)

  records = []

  # One step per Monday; the range is empty when no complete week exists.
  for monday in pd.date_range( start_date, end_date, freq='7D' ):
    week_start = monday.date()
    week_end = week_start + timedelta(days=6)

    if week_start.month != week_end.month:
      continue

    f1_score, clus_sil = get_f1_score( data.loc[str(week_start):str(week_end)], model, num_clusters)
    week_number = get_week_of_month(week_start.year, week_start.month, week_start.day)

    records.append({
        'Month_week': str(week_start.strftime("%B"))[:3] + '-week-' + str(week_number),
        'week_start_date': str(week_start),
        'week_end_date': str(week_end),
        'f1_score': round(f1_score,4),
        # Stored per week; previously only the last week's value was written
        # to every row (and the name was unbound when no week was scored).
        'Clustering Silhouette Score': round(clus_sil,4),
        'month': week_start.strftime("%B"),
    })

  return pd.DataFrame( records, columns=['Month_week', 'week_start_date', 'week_end_date', 'f1_score',
                                         'Clustering Silhouette Score', 'month'] )


In [None]:
# Training on Mondays of a month and Testing on Mondays of other months

def predict_multiple_days( data, model, num_clusters, day_num ):
  """Score `model` separately on every occurrence of one weekday in `data`.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a DatetimeIndex; an 'ActivePT' column is dropped if present.
  model : fitted classifier
      Passed to `get_f1_score`.
  num_clusters : int
      Passed to `get_f1_score`.
  day_num : int
      ISO weekday to test on (1 = Monday ... 7 = Sunday).

  Returns
  -------
  pandas.DataFrame
      One row per matching date with the columns 'Month_week', 'date',
      'f1_score', 'Clustering Silhouette Score' and 'month'.
  """
  if 'ActivePT' in data.columns:
    data = data.drop(columns=['ActivePT'])

  first_day = data.index[0].date()
  last_day = data.index[-1].date()
  n_days = (last_day - first_day).days + 1

  columns = {'Month_week': [], 'date': [], 'f1_score': [],
             'Clustering Silhouette Score': [], 'month': []}

  for stamp in pd.date_range( first_day, periods = n_days ):
    day = stamp.date()

    if day.isoweekday() != day_num:
      continue

    raw_f1, raw_sil = get_f1_score( data.loc[str(day)], model, num_clusters )
    week_number = get_week_of_month( day.year, day.month, day.day )
    month_name = day.strftime("%B")

    columns['Month_week'].append( month_name[:3]+'-week-'+str(week_number) )
    columns['date'].append( str(day) )
    columns['f1_score'].append( round(raw_f1,4) )
    columns['Clustering Silhouette Score'].append( round(raw_sil,4) )
    columns['month'].append( month_name )

  return pd.DataFrame( data=columns )


In [None]:
def train_on_multiple_weeks( data , num_clusters, start_week_Monday = True ):
  """Train the cluster classifier on whole weeks of `data`, or on all of it.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a DatetimeIndex; an 'ActivePT' column is dropped if present.
  num_clusters : int
      Number of clusters passed to `train_cluster_classification_model`.
  start_week_Monday : bool, default True
      When True, only complete Monday-Sunday weeks lying inside a single
      calendar month are used for training. Otherwise every row is used.

  Returns
  -------
  fitted classifier or None
      None when `start_week_Monday` is True and no week qualifies.
  """
  if 'ActivePT' in data.columns:
    data = data.drop(columns=['ActivePT'])

  # No week alignment requested: train on everything that was passed in.
  if not (start_week_Monday == True):
    return train_cluster_classification_model( data, num_clusters )

  # Move the start forward to the first Monday ...
  first_monday = data.index[0].date()
  while first_monday.isoweekday() != 1:
    first_monday += timedelta(days=1)

  # ... and the end back to the last Sunday.
  last_sunday = data.index[-1].date()
  while last_sunday.isoweekday() != 7:
    last_sunday -= timedelta(days=1)

  n_days = (last_sunday - first_monday).days + 1

  weekly_frames = []
  # The range runs Monday..Sunday in whole weeks, so every 7th day is a Monday.
  for monday_stamp in pd.date_range( first_monday, periods = n_days )[::7]:
    week_start = monday_stamp.date()
    week_end = week_start + timedelta(days=6)
    print(week_end)

    # Weeks straddling two months are left out of the training set.
    if week_start.month == week_end.month:
      print(week_start, week_end)
      weekly_frames.append( data.loc[ str(week_start) : str(week_end) ] )

  if not weekly_frames:
    return None

  return train_cluster_classification_model( pd.concat(weekly_frames), num_clusters )


In [None]:
def train_on_multiple_days( data, number_of_clusters, day_num ):
  """Train the cluster classifier on every occurrence of one weekday in `data`.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a DatetimeIndex; an 'ActivePT' column is dropped if present.
  number_of_clusters : int
      Number of clusters passed to `train_cluster_classification_model`.
  day_num : int
      ISO weekday to train on (1 = Monday ... 7 = Sunday).

  Returns
  -------
  fitted classifier
      Model trained on the concatenated rows of all matching dates.
      `pd.concat` raises ValueError when no date matches.
  """
  if 'ActivePT' in data.columns:
    data = data.drop(columns=['ActivePT'])

  first_day = data.index[0].date()
  last_day = data.index[-1].date()
  n_days = (last_day - first_day).days + 1

  matching_days = [
      stamp.date()
      for stamp in pd.date_range( first_day, periods = n_days )
      if stamp.date().isoweekday() == day_num
  ]

  # Announce once, using the month of the first matching date.
  if matching_days:
    print("Training on",matching_days[0].strftime("%B"),matching_days[0].strftime("%A")+"s" )

  daily_frames = [data.loc[str(day)] for day in matching_days]

  train_df = pd.concat(daily_frames)
  return train_cluster_classification_model( train_df, number_of_clusters )

In [None]:
def week_train( data, num_clusters, test_on = 'week', day = 1, all_weeks = False, save_graphs = True ):
  """Train one model per calendar month and score it on that month and all later data.

  For every month found in `data.index` a model is trained either on the
  hard-coded training week of that month (`all_weeks` False) or on all whole
  weeks of the month via `train_on_multiple_weeks` (`all_weeks` True). The
  model is then scored on `data.loc[month:]`, week by week or on one weekday.
  Scores are written to `<data_files_path>/results/<folder>/<Month>.csv`;
  figures are saved next to them when `save_graphs` is True.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a DatetimeIndex holding the feature columns.
  num_clusters : int
      Number of clusters used for training and scoring.
  test_on : str, default 'week'
      'week' scores whole weeks; any other value scores the weekday `day`.
  day : int, default 1
      Weekday to test on (1 = Monday ... 7 = Sunday) when `test_on` is not 'week'.
  all_weeks : bool, default False
      Selects the training set, see above.
  save_graphs : bool, default True
      Whether the per-month and the combined PNG figures are written.

  Returns
  -------
  None
  """

  days=["Monday", "Tuesday", "Wednesday" ,"Thursday", "Friday", "Saturday", "Sunday"]

  if test_on != 'week':
    print('Testing on',days[day-1]+'s')

  # Combined figure: collects the score curve of every training month.
  fig,ax = plt.subplots(figsize=(15,3))
  ax.set(ylim=(0, 1))

  # Results folder name encodes the training and testing configuration.
  if all_weeks == False:
    if test_on == 'week':
      folder_name = 'train_one_week/test_one_week'
    else:
      folder_name = 'train_one_week/test_'+days[day-1]
  else:
    if test_on == 'week':
      folder_name = 'train_all_weeks/test_one_week'
    else:
      folder_name = 'train_all_weeks/test_'+days[day-1]


  if os.path.exists( data_files_path + '/results/' + folder_name ) == False:
    os.makedirs( data_files_path + '/results/' + folder_name )

  # Only January-June are defined in the three lookups below; a month outside
  # that range raises KeyError where it is looked up.
  month_palette = {
      'January': list(sns.color_palette())[0],
      'February': list(sns.color_palette())[1],
      'March': list(sns.color_palette())[2],
      'April': list(sns.color_palette())[3],
      'May': list(sns.color_palette())[4],
      'June': list(sns.color_palette())[5],
  }

  markers = {
      'January': 'o',
      'February': 'x',
      'March': '^',
      'April': '+',
      'May': 's',
      'June': 'D',
  } #['o', 'x', '^', '+', '*', '8', 's', 'p', 'D', 'V']

  # [first day, last day] of the single training week per month (2022 dates),
  # used when all_weeks is False.
  training_weeks = {
      'January': ['2022-01-10','2022-01-16'],
      'February': ['2022-02-14','2022-02-20'],
      'March': ['2022-03-07','2022-03-13'],
      'April': ['2022-04-11','2022-04-17'],
      'May': ['2022-05-16','2022-05-22'],
      'June': ['2022-06-06','2022-06-12'],
  }

  # Legend handles for the combined figure, one per month.
  marker_plot = []

  # Iterate over the distinct 'YYYY-MM' periods present in the index, in order.
  for month in sorted(list(set([str(i) for i in data.index.to_period('M')]))):
    month_name = datetime.strptime(month, '%Y-%m').strftime("%B")
    print("Genrating Scores for", month_name)

    if all_weeks == False:
      model = train_cluster_classification_model( data.loc[ training_weeks[month_name][0]:training_weeks[month_name][1] ], num_clusters )
    else:
      # NOTE(review): train_on_multiple_weeks returns None when the month has
      # no whole week; the predict calls below would then fail - confirm.
      model = train_on_multiple_weeks( data.loc[month], num_clusters )


    # Score on the training month and everything after it.
    if test_on == 'week':
      f1_score_df = predict_multiple_weeks( data.loc[month:], model, num_clusters )
    else:
      f1_score_df = predict_multiple_days( data.loc[month:], model, num_clusters, day )

    # The CSV is written regardless of save_graphs.
    with open( os.path.join(data_files_path,'results',folder_name,month_name+'.csv'), 'w', encoding = 'utf-8-sig') as f:
      f1_score_df.to_csv(f)

    if save_graphs == True:
      # Per-month figure.
      fig2,ax2 = plt.subplots(figsize=(15,3))
      ax2.set(ylim=(0, 1))
      sns.lineplot( data=f1_score_df, x='Month_week', y='f1_score', c='black', alpha=0.1, ax=ax2 )
      sns.scatterplot( data=f1_score_df, x='Month_week', y='f1_score', s=50, ax=ax2)
      # Circle the week the model was trained on.
      if all_weeks == False and test_on ==  'week':
        sns.scatterplot( data=f1_score_df.loc[ f1_score_df['week_start_date'] == training_weeks[month_name][0] ], 
                        x='Month_week', y='f1_score', color='none', edgecolor='red', s=150, ax=ax2 )
      # plt.xticks goes through pyplot's current axes rather than ax2.
      plt.xticks(rotation=45)

      # NOTE(review): the title always ends with days[day-1], even when
      # test_on == 'week' - confirm this wording is intended.
      if all_weeks == False:
        ax2.set_title("F1 Scores using Trained Model on one week of "+month_name+" Testing on "+days[day-1]).set_fontsize(15)
      else:
        ax2.set_title("F1 Scores using Trained Model on all week of "+month_name+" Testing on "+days[day-1]).set_fontsize(15)

      # Always true here: already inside the save_graphs branch.
      if save_graphs == True:
        fig2.savefig(os.path.join(data_files_path,'results',folder_name,month_name+'.png'), bbox_inches='tight', dpi=800)
      fig2.clf()
      plt.close(fig2)

    # Add this month's curve to the combined figure.
    sns.lineplot( data=f1_score_df, x='Month_week', y='f1_score', c='black', alpha=0.1, ax=ax )
    # NOTE(review): no per-month colour is passed here, while the legend
    # handle below uses month_palette - check that points and legend match.
    sns.scatterplot( data=f1_score_df, x='Month_week', y='f1_score', s=50, marker = markers[month_name], ax=ax)
    if all_weeks == False and test_on ==  'week':
      sns.scatterplot( data=f1_score_df.loc[ f1_score_df['week_start_date'] == training_weeks[month_name][0] ],
                      x='Month_week', y='f1_score', color='none', edgecolor='red', s=150, ax=ax )

    # NOTE(review): acts on pyplot's current axes, which depends on the
    # figures created/closed above - presumably meant for `ax`; verify.
    plt.xticks(rotation=45)
    marker_plot.append( mlines.Line2D([], [], color=month_palette[month_name], marker=markers[month_name], linestyle='None',markersize=10, label=month_name ))
    

  ax.legend(handles=marker_plot, bbox_to_anchor=(1.01, 1), loc='upper left', borderaxespad=0)
  if all_weeks == False:
    if test_on == 'week':
      ax.set_title("F1 Scores using Trained Model on one week Testing on one week").set_fontsize(15)
    else:
      ax.set_title("F1 Scores using Trained Model on one week Testing on "+days[day-1]).set_fontsize(15)
  else:
    if test_on == 'week':
      ax.set_title("F1 Scores using Trained Model on all weeks Testing on one week").set_fontsize(15)
    else:
      ax.set_title("F1 Scores using Trained Model on all weeks Testing on "+days[day-1]).set_fontsize(15)
  if save_graphs == True:
    fig.savefig(os.path.join(data_files_path,'results',folder_name,'all_months.png'), bbox_inches='tight', dpi=800)
  fig.clf()
  plt.close(fig)

  # These only unbind the local names; the caller's frame is unaffected.
  del f1_score_df
  del data

In [None]:
def day_train( data, num_clusters, train_day = 1, test_day = None, save_graphs = True ):
  """Train one model per month on one weekday and score it on a weekday of all later data.

  For every month found in `data.index` a model is trained on all
  `train_day` dates of that month (`train_on_multiple_days`) and scored on
  every `test_day` date of `data.loc[month:]` (`predict_multiple_days`).
  Scores are written to
  `<data_files_path>/results/train_<Day>/test_<Day>/<Month>.csv`; figures are
  saved next to them when `save_graphs` is True.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a DatetimeIndex holding the feature columns.
  num_clusters : int
      Number of clusters used for training and scoring.
  train_day : int, default 1
      Weekday to train on (1 = Monday ... 7 = Sunday).
  test_day : int or None, default None
      Weekday to test on; None means the same as `train_day`.
  save_graphs : bool, default True
      Whether the per-month and the combined PNG figures are written.

  Returns
  -------
  None
  """

  if test_day == None:
    test_day = train_day

  days=["Monday", "Tuesday", "Wednesday" ,"Thursday", "Friday", "Saturday", "Sunday"]

  # NOTE(review): the message says 'Testing on' but prints train_day - confirm
  # whether test_day (or the word 'Training') was intended.
  print('Testing on',days[train_day-1]+'s')

  # Combined figure: collects the score curve of every training month.
  fig,ax = plt.subplots(figsize=(15,3))
  ax.set(ylim=(0, 1))

  train_folder_name = 'train_'+days[train_day-1]
  test_folder_name = 'test_'+days[test_day-1]

  if os.path.exists( os.path.join( data_files_path,'results', train_folder_name, test_folder_name ) ) == False:
    os.makedirs( os.path.join( data_files_path,'results', train_folder_name, test_folder_name ) )

  # Only January-June are defined in the two lookups below; a month outside
  # that range raises KeyError where it is looked up.
  month_palette = {
      'January': list(sns.color_palette())[0],
      'February': list(sns.color_palette())[1],
      'March': list(sns.color_palette())[2],
      'April': list(sns.color_palette())[3],
      'May': list(sns.color_palette())[4],
      'June': list(sns.color_palette())[5],
  }

  markers = {
      'January': 'o',
      'February': 'x',
      'March': '^',
      'April': '+',
      'May': 's',
      'June': 'D',
  } #['o', 'x', '^', '+', '*', '8', 's', 'p', 'D', 'V']

  # Legend handles for the combined figure, one per month.
  marker_plot = []

  # Iterate over the distinct 'YYYY-MM' periods present in the index, in order.
  for month in sorted(list(set([str(i) for i in data.index.to_period('M')]))):
    month_name = datetime.strptime(month, '%Y-%m').strftime("%B")

    print("Genrating Scores for", month_name)
    model = train_on_multiple_days( data.loc[month], num_clusters, train_day ) #(1 for Monday, #7 for Sundays)
    # Score on the training month and everything after it.
    f1_score_df = predict_multiple_days( data.loc[month:], model, num_clusters, test_day ) #(1 for Monday, #7 for Sundays)

    # The CSV is written regardless of save_graphs.
    with open( os.path.join( data_files_path,'results', train_folder_name, test_folder_name, month_name +'.csv' ), 'w', encoding = 'utf-8-sig') as f:
      f1_score_df.to_csv(f)

    # Per-month figure (created even when it is not saved).
    fig2,ax2 = plt.subplots(figsize=(15,3))
    sns.lineplot( data=f1_score_df, x='Month_week', y='f1_score', c='black', alpha=0.1, ax=ax2 )
    # NOTE(review): `palette` is passed without a `hue` variable - presumably
    # it has no effect on the point colours; verify against seaborn.
    sns.scatterplot( data=f1_score_df, x='Month_week', y='f1_score', s=50, palette = month_palette, ax=ax2)
    # plt.xticks goes through pyplot's current axes rather than ax2.
    plt.xticks(rotation=45)

    # ax2.legend(bbox_to_anchor=(1.01, 1), loc='upper left', borderaxespad=0)
    ax2.set_title(f"F1 Scores using Trained Model on {month_name} {days[train_day-1]}s Testing on {days[test_day-1]}").set_fontsize(15)
    
    if save_graphs == True:
      fig2.savefig( os.path.join( data_files_path,'results', train_folder_name, test_folder_name, month_name +'.png' ), bbox_inches='tight', dpi=1200)
    fig2.clf()
    plt.close(fig2)


    # Add this month's curve to the combined figure.
    sns.lineplot( data=f1_score_df, x='Month_week', y='f1_score', c='black', alpha=0.1, ax=ax )
    sns.scatterplot( data=f1_score_df, x='Month_week', y='f1_score', s=50, palette = month_palette, marker = markers[month_name], ax=ax)
    marker_plot.append( mlines.Line2D([], [], color=month_palette[month_name], marker=markers[month_name], linestyle='None',markersize=10, label=month_name ))
    # NOTE(review): acts on pyplot's current axes, which depends on the
    # figures created/closed above - presumably meant for `ax`; verify.
    plt.xticks(rotation=45)

  ax.legend(handles=marker_plot, bbox_to_anchor=(1.01, 1), loc='upper left', borderaxespad=0)
  ax.set_title(f"F1 Scores using Trained Model on {days[train_day-1]} Testing on {days[test_day-1]}").set_fontsize(15)
  
  if save_graphs == True:
    fig.savefig(os.path.join( data_files_path,'results', train_folder_name, test_folder_name, 'all_months' +'.png' ), bbox_inches='tight', dpi=1200)
  fig.clf()
  plt.close(fig)
  
  # These only unbind the local names; the caller's frame is unaffected.
  del f1_score_df
  del data

In [None]:
# Distinct calendar dates present in the filled frame, in ascending order.
dates_in_dataframe = sorted({stamp.date() for stamp in df_merged_filled.index})

In [None]:
# First and last timestamp covered by the filled frame (cell output).
df_merged_filled.index[0], df_merged_filled.index[-1]

In [None]:
# Training window: the first seven dates present in the data, as
# 'year-month-day' strings used for label slicing below.
# Raises IndexError when fewer than seven dates are available.
train_start = f'{dates_in_dataframe[0].year}-{dates_in_dataframe[0].month}-{dates_in_dataframe[0].day}'
train_end = f'{dates_in_dataframe[6].year}-{dates_in_dataframe[6].month}-{dates_in_dataframe[6].day}'

In [None]:
# Train on every row of the 7-day window; start_week_Monday=False skips the
# Monday-Sunday week alignment.
model = train_on_multiple_weeks(df_merged_filled.loc[train_start:train_end], num_clusters=6, start_week_Monday=False)

In [None]:
# For every date after the 7-day training window: predict the cluster of each
# row, then save a two-panel figure, (a) active power over time and (b) the
# first two principal components, both coloured by the predicted cluster.
for date_item in dates_in_dataframe[7:]:
  date = f'{date_item.year}-{date_item.month}-{date_item.day}'
  test_df = df_merged_filled.loc[date]
  prediction_labels = model.predict(test_df.drop(['ActivePT'], axis=1))

  principalDf = pca_function(test_df, 2)

  fig, ax = plt.subplots(2,1, figsize=(12,5))

  sns.scatterplot(data=test_df, x=test_df.index, y='ActivePT', hue=prediction_labels, palette = palette, ax=ax[0])
  sns.scatterplot( data = principalDf, x="components_1", y="components_2", hue=prediction_labels, palette=palette,ax = ax[1])

  ax[0].legend(bbox_to_anchor=(1.01, 1), loc='upper left', borderaxespad=0)
  ax[1].legend(bbox_to_anchor=(1.01, 1), loc='upper left', borderaxespad=0)

  ax[0].set_title('(a) Cluster with Active Power on Y-Axis').set_fontsize(15)
  ax[0].set_xlabel('datetime')
  ax[1].set_title('(b) PCA ').set_fontsize(15)

  fig.tight_layout()
  fig.savefig(f'{Location}_{date}')

  # Render the figure in the notebook, then release it: one figure is created
  # per date, and leaving them all open keeps every one of them in memory.
  plt.show()
  plt.close(fig)


In [None]:
# %rm -rf figs

In [None]:
# !mkdir figs

In [None]:
# !mv *.png figs/

In [None]:
# !zip -r figs.zip figs/