**Table of contents**<a id='toc0_'></a>    
- [Import packages](#toc1_)    
- [Define read and save path](#toc2_)    
- [Load data](#toc3_)    
- [Training](#toc4_)    
- [Cluster Analysis](#toc5_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
# !pip install -q folium sktime tslearn validclust tsfresh tsfel autoelbow deeptime
# !pip install -q smac==0.8.0 autocluster

This is the main notebook for modelling

TODO:

1. Cluster analysis: what are the common characteristics of each cluster?
2. Ensemble clustering
3. Automatic clustering
4. Better imputation methods
5. Other feature-extraction methods
6. Other dimensionality-reduction methods

# <a id='toc1_'></a>[Import packages](#toc0_)

In [2]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from matplotlib.colors import to_hex
from importlib import reload

import sys
# Make the project's custom modules importable: use the Google Drive copy when
# the Drive mount path exists, otherwise the folder next to this notebook.
if Path('/content/drive/MyDrive').exists():
  sys.path.append('/content/drive/MyDrive/Colab Notebooks/custom_modules')
else:
  sys.path.append('./custom_modules')

# Import the module object first and reload it so that edits to the
# TSClustering module are picked up when this cell is re-run without
# restarting the kernel.
import TSClustering
reload(TSClustering)
# From here on the name `TSClustering` refers to the object imported from the
# module (it is instantiated further down), no longer to the module itself.
# That is why the plain `import TSClustering` above must run before `reload`.
from TSClustering import TSClustering
from basic_funs import read_file_with_stem

# <a id='toc2_'></a>[Define read and save path](#toc0_)

In [3]:
# Candidate roots for the preprocessed data: a local checkout first, with the
# Google Drive folder as the fallback.
local_path = Path('../data_preprocessed')
drive_path = Path('/content/drive/MyDrive/ProcessedData_Melbourne_Footfalls')

# Probe the filesystem once so that every path below is derived from the same
# answer (previously the check was repeated for each path).
use_local = local_path.exists()

base_path = local_path if use_local else drive_path

# Folder that receives the clustering results. `parents=True, exist_ok=True`
# already makes the call a no-op when the folder exists, so no separate
# existence check is needed and the cell is safe to re-run.
save_dir = Path('../Results') if use_local else Path('/content/drive/MyDrive/Results_Melbourne_Footfalls')
save_dir.mkdir(parents=True, exist_ok=True)

# Inputs: the merged/processed footfall data and the raw data folder.
read_processed_dir = base_path / '1. merged_peds_data_hist_curr'
read_raw_dir = Path('./Data (20230918)') if use_local else Path('/content/drive/MyDrive/Data/Melbourne_Footfalls')

# <a id='toc3_'></a>[Load data](#toc0_)

In [4]:
# Stem of the merged footfall file to look up in the processed-data folder.
# NOTE(review): `read_file_with_stem` lives in basic_funs; presumably it
# resolves the file by stem regardless of extension - confirm there.
# Direct alternative kept for reference (the data should be unpivoted):
#   data = pd.read_csv(read_processed_dir / 'footfall_merged.csv')
file_name_to_search = "footfall_merged"

# Load and harmonise the sensor-name column in one expression instead of
# mutating the frame in place afterwards.
data = (
  read_file_with_stem(read_processed_dir, file_name_to_search)
  .rename(columns={'New_Sensor_Name': 'Sensor_Name'})
)
data.head()

Unnamed: 0,Date_Time,Sensor_Name,Hourly_Counts,Location_ID,Installation_Date,Location_Type,Status,Latitude,Longitude,Location,Year,Month,MDate,Day
0,2009-05-01 00:00:00,Bourke Street Mall (North) | Bou292_T [1],53,1,2009-03-24,Outdoor,A,-37.813494,144.965153,"-37.81349441, 144.96515323",2009,5,1,Friday
1,2009-05-01 00:00:00,Bourke Street Mall (South) | Bou283_T [2],52,2,2009-03-30,Outdoor,A,-37.813807,144.965167,"-37.81380668, 144.96516718",2009,5,1,Friday
2,2009-05-01 00:00:00,Collins Place (North) | Col12_T [18],36,18,2009-03-30,Outdoor,A,-37.813449,144.973054,"-37.81344862, 144.97305353",2009,5,1,Friday
3,2009-05-01 00:00:00,Collins Place (South) | Col15_T [17],28,17,2009-03-30,Outdoor,A,-37.813625,144.973236,"-37.81362543, 144.97323591",2009,5,1,Friday
4,2009-05-01 00:00:00,Flinders Street Station Underpass | FliS_T [6],139,6,2009-03-25,Outdoor,A,-37.819117,144.965583,"-37.81911705, 144.96558255",2009,5,1,Friday


The original data is unpivoted (long format): one row per sensor per timestamp.

In [5]:
# Sanity check: (rows, columns) of the loaded footfall frame.
data.shape

(4378809, 14)

In [6]:
# Sensor metadata comes from the processed workbook; the raw alternative is
# kept for reference:
#   pd.read_excel(read_raw_dir / 'pedestrian-counting-system-sensor-locations.xlsx')
# The original `Sensor_Name` column is dropped and `New_Sensor_Name` takes its
# place, so the column name matches the one used in `data`.
sensor_locations = (
  pd.read_excel(read_processed_dir / 'sensor_locations_processed.xlsx')
  .drop(columns='Sensor_Name')
  .rename(columns={'New_Sensor_Name': 'Sensor_Name'})
)
sensor_locations.head()

Unnamed: 0,Location_ID,Sensor_Description,Sensor_Name,Installation_Date,Note,Location_Type,Status,Direction_1,Direction_2,Latitude,Longitude,Location
0,24,Spencer St-Collins St (North),Spencer St-Collins St (North) | Col620_T [24],2013-09-02,,Outdoor,A,East,West,-37.81888,144.954492,"-37.81887963, 144.95449198"
1,25,Melbourne Convention Exhibition Centre,Melbourne Convention Exhibition Centre | MCEC_...,2013-08-28,,Outdoor,A,East,West,-37.824018,144.956044,"-37.82401776, 144.95604426"
2,36,Queen St (West),Queen St (West) | Que85_T [36],2015-01-20,"Pushbox Upgrade, 03/08/2023",Outdoor,A,North,South,-37.816525,144.961211,"-37.81652527, 144.96121062"
3,37,Lygon St (East),Lygon St (East) | Lyg260_T [37],2015-02-11,"Pushbox Upgrade, 30/06/2023",Outdoor,A,North,South,-37.803103,144.966715,"-37.80310271, 144.96671451"
4,41,Flinders La-Swanston St (West),Flinders La-Swanston St (West) | Swa31 [41],2017-06-29,,Outdoor,A,North,South,-37.816686,144.966897,"-37.81668634, 144.96689733"


# <a id='toc4_'></a>[Training](#toc0_)

    """
    Parameters:
    - data: hourly footfall data, by default unpivoted (long format: one row per sensor per timestamp)
    - metric: 
      "euclidean", "dtw", "softdtw" or None
    - scale: None or
      "day", 'week', 'month', 'year', 'hour'
      'early_morning', 'morning', 'midday', 'afternoon', 'evening'
      'workday', 'weekend'
    - algorithm: 
      "kmeans", "kshape", "kernelkmeans", "birch", "ensemble"
    - time_span: str, float, int, list or None
      "normal" (before 2020), 
      2019 (or other single year), 
      [start_date, end_date] or None
    - normalise: 
      "meanvariance", "minmax" or None
    - feature_extraction: 
      True, False or None
    - dim_reduction: 
      'PCA', 'IPCA' or None
    - order_of_impute_agg_norm: 
      "impute_agg_norm", "impute_norm_agg", "agg_impute_norm", or "agg_norm_impute"
    """

In [7]:
# Keyword arguments handed through to the underlying clustering model.
model_configs = dict(
  # metric='dtw',
  random_state=42,
)

# Per-row sensor coordinates taken straight from the footfall frame.
# Alternative source: sensor_locations.copy() (sensor location meta data).
sensor_coords = data[['Sensor_Name', 'Latitude', 'Longitude', 'Location']]

# Everything TSClustering is constructed with; the training cell unpacks this
# mapping as keyword arguments.
configs = dict(
  data=data.copy(),
  target_column='Sensor_Name',    # target (sensor name)
  time_column='Date_Time',        # feature names (timestamp)
  value_column='Hourly_Counts',   # value
  sensor_locations=sensor_coords,
  save_dir=save_dir,
  algorithm='kmeans',
  scale='week',
  order_of_impute_agg_norm="impute_agg_norm",
  time_span=2019,
  feature_extraction=None,
  dim_reduction="PCA",
  normalise="meanvariance",
  model_configs=model_configs,
  remove_missing_data=True,
  seed=42,
  verbose=False,
)

In [8]:
%%capture
if configs['algorithm'] == 'birch':
  # the data has been split into chunks, sensors in each chunk should have same time span 
  # and have less than 50% missing values

  read_path = base_path / '4. final_group'

  # List all files in the directory and sort them based on the start year for processing in order
  # all_files = [f for f in os.listdir(read_path) if f.startswith('grouped_data_')] # pivoted format
  # all_files = sorted(all_files, key=lambda x: int(x.split('_')[2]))

  all_files = [f for f in os.listdir(read_path) if f.startswith('data_')] # wide format
  all_files = sorted(all_files, key=lambda x: int(x.split('_')[1]))

  TSClustering(**configs).online_training(all_files)
else:
  scales = ["day", "week", "month", "hour",
          "early_morning", "morning", "midday", "afternoon", "evening",
          "workday", "weekend"]
  for scale in scales:
    configs['scale'] = scale
    TSClustering(**configs).offline_training()

# <a id='toc5_'></a>[Cluster Analysis](#toc0_)
Are there certain years or sensors that tend to cluster together more often?