In [1]:
# % matplotlib inline
import logging as log
from time import strftime
from copy import deepcopy
from utils.data_processing import *
from logger.logger import setup_logging
from utils.configs import BaseConf
from dataloaders.grid_loader import GridDataLoaders
from datasets.grid_dataset import GridDataGroup
from utils.utils import get_data_sub_paths, by_ref
import pandas as pd

pd.set_option('display.max_columns', None)

In [2]:
import os

os.environ['NUMEXPR_MAX_THREADS'] = str(os.cpu_count())

In [3]:
import pandas as pd
import numpy as np
from pprint import pprint

pd.set_option('mode.chained_assignment', None)

In [4]:
data_sub_paths = get_data_sub_paths()
pprint(sorted(data_sub_paths))

['T168H-X1275M-Y1320M_2001-01-04_2019-01-03_#1f1',
 'T1H-X2550M-Y2640M_2014-01-01_2019-01-01_#7ce',
 'T24H-X1275M-Y1320M_2012-01-01_2019-01-01_#939',
 'T24H-X1700M-Y1760M_2014-01-01_2019-01-01_#828',
 'T24H-X255M-Y220M_2012-01-01_2019-01-01_#c97',
 'T24H-X425M-Y440M_2012-01-01_2019-01-01_#827',
 'T24H-X850M-Y880M_2012-01-01_2019-01-01_#826',
 'Totals_T168H_2001-01-04_2019-01-10_LAT41.641_42.024_LON-87.821_-87.525',
 'Totals_T1H_2014-01-01_2019-01-01_LAT41.641_42.024_LON-87.821_-87.525',
 'Totals_T24H_2001-01-01_2019-01-02_LAT41.641_42.024_LON-87.821_-87.525']


In [5]:
data_sub_path = by_ref("7ce")[0]
print(f"using: {data_sub_path}")

using: T1H-X2550M-Y2640M_2014-01-01_2019-01-01_#7ce


In [6]:
conf = BaseConf()
conf.model_name = "test"  # needs to be created

conf.data_path = f"./data/processed/{data_sub_path}/"

if not os.path.exists(conf.data_path):
    raise Exception(f"Directory ({conf.data_path}) needs to exist.")

conf.model_path = f"{conf.data_path}models/{conf.model_name}/"
os.makedirs(conf.data_path, exist_ok=True)
os.makedirs(conf.model_path, exist_ok=True)

# logging config is set globally thus we only need to call this in this file
# imported function logs will follow the configuration
setup_logging(save_dir=conf.model_path, log_config='./logger/standard_logger_config.json', default_level=log.INFO)
log.info("=====================================BEGIN=====================================")

info = deepcopy(conf.__dict__)
info["start_time"] = strftime("%Y-%m-%dT%H:%M:%S")

# DATA LOADER SETUP
np.random.seed(conf.seed)
use_cuda = False  # torch.cuda.is_available()
torch.manual_seed(conf.seed)
if use_cuda:
    torch.cuda.manual_seed(conf.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0" if use_cuda else "cpu")
log.info(f"Device: {device}")
info["device"] = device.type
conf.device = device

2021-09-18T16:58:16 | root | INFO | Device: cpu


In [7]:
conf.batch_size = 128

conf.cap_crime_percentile = 0  # 99.95

# CRIME DATA
data_group = GridDataGroup(data_path=conf.data_path,
                           conf=conf)

loaders = GridDataLoaders(data_group=data_group,
                          conf=conf)

2021-09-18T16:58:27 | root | INFO | Initialising Grid Data Group
2021-09-18T16:58:27 | root | INFO | 	t_range: (43825,) 2014-01-01 00:00:00 -> 2019-01-01 00:00:00
2021-09-18T16:58:27 | root | INFO | 	target_len:	43320	(100.000%)
2021-09-18T16:58:27 | root | INFO | 	trn_val_size:	34680	(80.055%)
2021-09-18T16:58:27 | root | INFO | 	trn_size:	26010	(60.042%)
2021-09-18T16:58:27 | root | INFO | 	val_size:	8670	(20.014%)
2021-09-18T16:58:27 | root | INFO | 	tst_size:	8640 	(19.945%)


In [8]:
grids = data_group.to_counts(sparse_data=data_group.crimes)

In [9]:
from utils.interactive import new_interactive_heatmap, State, InteractiveHeatmapsWithLines
from ipywidgets import Layout, widgets

ModuleNotFoundError: No module named 'geopy'

In [10]:
# from utils.interactive import InteractiveHeatmaps

InteractiveHeatmapsWithLines(
    date_range=data_group.t_range,
    col_wrap=1,
    Counts=grids[:, 0],  #data_group.crimes[:,0],
).app

NameError: name 'InteractiveHeatmapsWithLines' is not defined

In [11]:
if data_group.t_range.freqstr == 'H':
    temporal_variables = ["Hour", "Day of Week"]  # , "Time of Month", "Time of Year"],
else:
    temporal_variables = ["Day of Week", "Time of Month", "Time of Year"]

In [12]:
from utils.interactive import interactive_grid_visualiser

interactive_grid_visualiser(
    grids=grids[:, 0],
    t_range=data_group.t_range,
    mutual_info=True,
    max_offset=365,
    temporal_variables=temporal_variables,
    bins=0,
)

ModuleNotFoundError: No module named 'geopy'

In [13]:
from sparse_discrete_table import conditional_mutual_information_over_grid, mutual_information_over_grid

In [14]:
from utils.plots import plot

In [15]:
grids = data_group.to_counts(sparse_data=data_group.crimes)

squeezed_grids = data_group.shaper.squeeze(grids)

log.info(f"temporal_variables: {temporal_variables}")

cmi_grid = conditional_mutual_information_over_grid(
    dense_grid=squeezed_grids[:, 0],
    t_range=data_group.t_range,
    max_offset=366,
    temporal_variables=temporal_variables
)

2021-09-18T16:58:43 | root | INFO | temporal_variables: ['Day of Week', 'Time of Month', 'Time of Year']
2021-09-18T16:58:43 | root | INFO | => 0000/0087 => 0.000


KeyboardInterrupt: 

In [None]:
cmi_grid_ = np.expand_dims(np.swapaxes(cmi_grid, 0, 1), 1)
cmi_grid_ = data_group.shaper.unsqueeze(cmi_grid_)

interactive_grid_visualiser(
    grids=cmi_grid_[:, 0],
    t_range=np.arange(len(cmi_grid_[:, 0])),
)

In [None]:
mi_grid = mutual_information_over_grid(
    dense_grid=squeezed_grids[:, 0],
    max_offset=366,
)

In [None]:
mi_grid_ = np.expand_dims(np.swapaxes(mi_grid, 0, 1), 1)
mi_grid_ = data_group.shaper.unsqueeze(mi_grid_)

interactive_grid_visualiser(
    grids=mi_grid_[:, 0],
    t_range=np.arange(len(mi_grid_[:, 0])),
)

In [None]:
isinstance(data_group.t_range, pd.core.indexes.datetimes.DatetimeIndex)

In [None]:
kwargs = {
    "MI": mi_grid_[:, 0],
    "CMI": cmi_grid_[:, 0],
}

InteractiveHeatmapsWithLines(
    date_range=np.arange(1, 1 + len(mi_grid_[:, 0])),
    col_wrap=2,
    **kwargs,
).app

In [None]:
# (l, max_offset)
mu_grid = np.mean(cmi_grid, axis=1, keepdims=True)
std_grid = np.std(cmi_grid, axis=1, keepdims=True)
dev_grid = np.abs(cmi_grid - mu_grid) / std_grid
# dev_grid = (cmi_grid - mu_grid)/std_grid

In [None]:
plot(a=dev_grid[0, :])

In [None]:
dev2weeks = dev_grid[:, :14].mean(1)
dev2weeks = np.reshape(dev2weeks, (1, 1, -1))
dev2weeks_grid = data_group.shaper.unsqueeze(dev2weeks)[0, 0]
new_interactive_heatmap(z=dev2weeks_grid)

In [None]:
def gradient(data):
    x = np.arange(0, len(data))
    y = np.array(data)
    grad, bias = np.polyfit(x, y, deg=1)
    return grad

In [None]:
dev_sum = []
for dev in dev_grid:
    #     dev_sum.append(dev[0])
    #     dev_sum.append(dev[:10].sum())
    dev_sum.append(len(dev[dev > 3]))
#     dev_sum.append(gradient(dev[:10]))
dev_sum = np.array(dev_sum)
dev_sum = np.reshape(dev_sum, (1, 1, -1))
dev_sum_grid = data_group.shaper.unsqueeze(dev_sum)[0, 0]

new_interactive_heatmap(z=dev_sum_grid)

In [None]:
from utils.interactive import interactive_grid_visualiser

interactive_grid_visualiser(
    grids=grids[:, 0],
    t_range=data_group.t_range,
    mutual_info=True,
    max_offset=365,
    temporal_variables=["Hour"],  # "Day of Week", "Time of Month", "Time of Year"],
)