## Feature Selection with Mutual Information

The aim of this notebook:
- Look at column features and how they influence a cell's self-exciting property (mutual info)

In [1]:
import os
import logging as log
from time import strftime
from copy import deepcopy
from torch import nn, optim
import torch.nn.functional as F
from utils.data_processing import *
from logger.logger import setup_logging
from utils.configs import BaseConf
from utils.utils import write_json, Timer, get_data_sub_paths, pshape, get_data_resolutions
from models.kangkang_fnn_models import KangFeedForwardNetwork, SimpleKangFNN
from dataloaders.flat_loader import FlatDataLoaders, MockLoader, MockLoaders
from datasets.flat_dataset import FlatDataGroup

from dataloaders.cell_loader import CellDataLoaders
from datasets.cell_dataset import CellDataGroup

from utils.metrics import PRCurvePlotter, ROCCurvePlotter, LossPlotter, PerTimeStepPlotter
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score
from models.model_result import ModelResult, ModelMetrics, save_metrics
from utils.mock_data import mock_fnn_data_classification
from utils.plots import im
from trainers.generic_trainer import train_model
from models.kangkang_fnn_models import train_epoch_for_fnn
from utils.configs import BaseConf
from utils.metrics import best_threshold, get_y_pred
from dataloaders.grid_loader import GridDataLoaders
from datasets.grid_dataset import GridDataGroup

from utils.mock_data import mock_fnn_data_classification
import matplotlib.pyplot as plt
from utils.plots import im
from utils.metrics import best_threshold, get_y_pred, get_y_pred_by_thresholds, best_thresholds
from models.model_result import ModelResult, ModelMetrics, save_metrics, compare_all_models,\
                                get_models_metrics, get_models_results
from pprint import pprint
from time import time
from utils.setup import setup

In [20]:
conf = BaseConf()

conf.model_name = f"Mutual Info"

conf.data_path = f"./data/processed/{data_sub_path}/"

if not os.path.exists(conf.data_path):
    raise Exception(f"Directory ({conf.data_path}) needs to exist.")

conf.model_path = f"{conf.data_path}models/{conf.model_name}/"
os.makedirs(conf.data_path, exist_ok=True)
os.makedirs(conf.model_path, exist_ok=True)

setup_logging(save_dir=conf.model_path,
              log_config='./logger/standard_logger_config.json',
              default_level=log.INFO)

with np.load(conf.data_path + "generated_data.npz") as zip_file:  # context helper ensures zip_file is closed
    sparse_crimes = zip_file["crime_types_grids"]
    crime_feature_indices = zip_file["crime_feature_indices"]


shaper = Shaper(data=sparse_crimes,
                conf=conf)

#     return conf, shaper, sparse_crimes, crime_feature_indices


In [2]:
data_sub_paths = get_data_sub_paths()
pprint(np.sort(data_sub_paths))

data_sub_path = 'T24H-X850M-Y880M_2013-01-01_2017-01-01'

array(['T12H-X850M-Y880M_2013-01-01_2017-01-01',
       'T1H-X1700M-Y1760M_2013-01-01_2017-01-01',
       'T24H-X1275M-Y1320M_2012-01-01_2019-01-01',
       'T24H-X1700M-Y1760M_2012-01-01_2019-01-01',
       'T24H-X255M-Y220M_2013-01-01_2017-01-01',
       'T24H-X425M-Y440M_2012-01-01_2019-01-01',
       'T24H-X425M-Y440M_2013-01-01_2017-01-01',
       'T24H-X850M-Y880M_2012-01-01_2019-01-01',
       'T24H-X850M-Y880M_2013-01-01_2017-01-01',
       'T24H-X85M-Y110M_2013-01-01_2017-01-01',
       'T3H-X850M-Y880M_2013-01-01_2017-01-01',
       'T6H-X850M-Y880M_2013-01-01_2017-01-01'], dtype='<U40')


In [3]:
conf, shaper, sparse_crimes, crime_feature_indices = setup(data_sub_path=data_sub_path)

In [4]:
from utils.preprocessing import Shaper

In [5]:
for i,k in enumerate(crime_feature_indices):
    print(f"'{k}':{i},")

'TOTAL':0,
'THEFT':1,
'BATTERY':2,
'CRIMINAL DAMAGE':3,
'NARCOTICS':4,
'ASSAULT':5,
'BURGLARY':6,
'MOTOR VEHICLE THEFT':7,
'ROBBERY':8,
'Arrest':9,


In [6]:
i = 0
crimes = sparse_crimes[:,i:i+1]
print(conf.shaper_threshold) # sum over all time should be above this threshold
print(conf.shaper_top_k) # if larger than 0 we filter out only the top k most active cells of the data grid
new_shaper = Shaper(crimes, conf)

0
-1


In [11]:
dense_crimes = shaper.squeeze(sparse_crimes)
from utils.utils import describe_array

In [19]:
print(describe_array(dense_crimes[:,4]))

____________________________________________________________
{'max': 43.0,
 'mean': 0.084798899185383,
 'min': 0.0,
 'shape': (1461, 772),
 'std': 0.37116642461852944}
____________________________________________________________


In [56]:
dense_crimes[dense_crimes > 0] = 1

In [57]:
c_now = dense_crimes[:-1,1:-1]
c_prev = dense_crimes[1:,1:-1]

In [58]:
c_now_flt = np.reshape(c_now.swapaxes(1,2),(-1,8))
c_prev_flt = np.reshape(c_prev.swapaxes(1,2),(-1,8))

In [59]:
df_now = pd.DataFrame(c_now_flt,columns=crime_feature_indices[1:-1])
df_prev = pd.DataFrame(c_prev_flt,columns=crime_feature_indices[1:-1])

In [60]:
df_now.corr()

Unnamed: 0,THEFT,BATTERY,CRIMINAL DAMAGE,NARCOTICS,ASSAULT,BURGLARY,MOTOR VEHICLE THEFT,ROBBERY
THEFT,1.0,0.105854,0.075943,0.068029,0.066417,0.050817,0.047021,0.060383
BATTERY,0.105854,1.0,0.101798,0.139691,0.104406,0.061343,0.049696,0.079825
CRIMINAL DAMAGE,0.075943,0.101798,1.0,0.074156,0.061469,0.051847,0.038608,0.04731
NARCOTICS,0.068029,0.139691,0.074156,1.0,0.086472,0.047265,0.044761,0.069765
ASSAULT,0.066417,0.104406,0.061469,0.086472,1.0,0.036869,0.031553,0.050848
BURGLARY,0.050817,0.061343,0.051847,0.047265,0.036869,1.0,0.02823,0.033261
MOTOR VEHICLE THEFT,0.047021,0.049696,0.038608,0.044761,0.031553,0.02823,1.0,0.029594
ROBBERY,0.060383,0.079825,0.04731,0.069765,0.050848,0.033261,0.029594,1.0


In [66]:
X = dense_crimes[:-1,:-1].swapaxes(1,2).reshape(-1,9)
y = dense_crimes[1:,0].reshape(-1)
y[y > 0] = 1
from sklearn.feature_selection import mutual_info_classif

In [67]:
mutual_info_classif(X,y)

array([0.07804144, 0.02078052, 0.02245209, 0.01046461, 0.01601758,
       0.0073597 , 0.00517525, 0.00221414, 0.00494581])