In [1]:
import os
import sys
import glob
import pickle

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt 

# ML related imports
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics.pairwise import haversine_distances


# AT utility imports
from utils.directory_structure import DATA_DIR, OUTPUT_DIR
from utils.preprocessing import read_split, pairwise_dist_imputer_catnum
from utils.custom_encoder import custom_categorical_encoder


# Apply matplotlib's "bmh" style sheet to the figures drawn in this notebook.
mpl.style.use("bmh")
# IPython magic: set the inline backend's figure format option to 'retina'.
%config InlineBackend.figure_format = 'retina'

# Reading data

Read the original `dins` file, which is a heterogeneous tabular dataset, then conduct a simple exploration of the distribution and nature of its features.

In [2]:
# Set to True to rebuild the train/test split from the raw CSV and refresh the
# cache; leave False to reuse the previously pickled split.
new_features = False
case_name = "dins_2017_2022"

# Pickle cache holding the train/test split for this case, stored in OUTPUT_DIR.
fname = os.path.join(OUTPUT_DIR, f"{case_name}_train_test_vars.pkl")
if new_features:
    # Remove any stale cache before rebuilding. os.remove replaces the former
    # shell `rm` call: it is portable and is not affected by spaces or shell
    # metacharacters in the path.
    if os.path.exists(fname):
        os.remove(fname)

    # NOTE(review): col_names is only bound on this branch and is not written
    # to the cache, so it is unavailable when the split is loaded from disk.
    X_train_full, X_test, y_train_full, y_test, col_names = read_split(f"{case_name}.csv")
    data_to_save = {
        "X_train_full": X_train_full,
        "X_test": X_test,
        "y_train_full": y_train_full,
        "y_test": y_test,
    }

    with open(fname, 'wb') as file:
        pickle.dump(data_to_save, file)
else:
    # SECURITY: pickle.load can execute arbitrary code; only load cache files
    # that were produced by this notebook.
    with open(fname, 'rb') as file:
        data_dict = pickle.load(file)
    X_train_full = data_dict["X_train_full"]
    y_train_full = data_dict["y_train_full"]
    X_test = data_dict["X_test"]
    y_test = data_dict["y_test"]

# Summarize column dtypes and non-null counts of the training features.
X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69725 entries, 49272 to 49682
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ROOFCONSTRUCTION   66280 non-null  object 
 1   EAVES              66220 non-null  object 
 2   VENTSCREEN         66180 non-null  object 
 3   EXTERIORSIDING     66267 non-null  object 
 4   WINDOWPANE         66212 non-null  object 
 5   DECKPORCHONGRADE   56244 non-null  object 
 6   DECKPORCHELEVATED  56242 non-null  object 
 7   PATIOCOVER         56238 non-null  object 
 8   FENCE              56242 non-null  object 
 9   YEARBUILT          42390 non-null  float64
 10  LATITUDE           69725 non-null  float64
 11  LONGITUDE          69725 non-null  float64
 12  DISTANCE           69725 non-null  float64
 13  utm_easting        69725 non-null  float64
 14  utm_northing       69725 non-null  float64
 15  utm_zone           69725 non-null  float64
dtypes: float64(7), object(9)

# Data processing

The data preprocessing steps are as follows:

1. Separate the data into train and test cases with 20% going to the test set.
2. Design imputation strategies, fit and apply them on the train set, then apply them to the test set.
3. To enable use of a variety of models:
    - Normalize the numerical variables
    - Conduct `OneHotEncoding` on categorical variables
4. Resample so that all classes are equally represented in the train set.
5. If necessary, apply a `PCA` transformation.
6. Put all steps into a pipeline under one function.

## Imputation strategies

The strategy differs between `categorical` and `numerical` features, and even among features of the same type.

*DINS*: Adopted strategy for features with missing values in samples:

- `ROOFCONSTRUCTION` has `82817` non-null objects: Nearest neighbor imputation.
- `EAVES` has `82741` non-null objects: Nearest neighbor imputation.
- `VENTSCREEN` has `82692` non-null objects: Nearest neighbor imputation.
- `EXTERIORSIDING` has `82800` non-null objects: Nearest neighbor imputation.
- `WINDOWPANE` has `82732` non-null objects: Nearest neighbor imputation.
- `DECKPORCHONGRADE` has `70291` non-null objects: Nearest neighbor imputation.
- `DECKPORCHELEVATED` has `70290` non-null objects: Nearest neighbor imputation.
- `PATIOCOVER` has `70286` non-null objects: Nearest neighbor imputation.
- `FENCE` has `70289` non-null objects: Nearest neighbor imputation.
- `YEARBUILT` has `53075` non-null objects: Nearest neighbor imputation.

*Wildfire cases*: Adopted strategy for features with missing values in samples:

- `ZIPCODE` has `15` non-null floats: Reverse geocoding can be used if this feature proves useful, potentially in future studies.
- `ROOFCONSTR` has `19318` non-null samples: Nearest neighbor imputation
- `EAVES` has `19318` non-null samples: Nearest neighbor imputation
- `VENTSCREEN` has `19318` non-null samples: Nearest neighbor imputation
- `EXTERIORSI` has `19318` non-null samples: Nearest neighbor imputation
- `WINDOWPANE` has `19318` non-null samples: Nearest neighbor imputation
- `DECKPORCHO` has `19318` non-null samples: Nearest neighbor imputation
- `DECKPORCHE` has `19318` non-null samples: Nearest neighbor imputation
- `PATIOCOVER` has `19318` non-null samples: Nearest neighbor imputation
- `FENCEATTAC` has `19317` non-null samples: Nearest neighbor imputation
- `YEARBUILT` has `22501` non-null samples: Nearest neighbor imputation or median
- `VSD` has `3504` non-null samples: Aggregate (mean, median, etc.)
- `EMBER` has `11549` non-null samples: Aggregate (mean, median, etc.), potentially with KNN
- `FLAME` has `14578` non-null samples: Aggregate (mean, median, etc.), potentially with KNN


# Impute data

Imputation is done based on the statistics of the $k$ nearest neighbor points. For numerical features the aggregation uses the `mean`, and for categorical features it uses the `mode`.

In [3]:
# Set to True to recompute the imputation and refresh the cache; leave False to
# reuse the previously pickled imputed dataset.
new_features = False
fname = os.path.join(OUTPUT_DIR, "dins_imputed_dataset.pkl")

if new_features:
    # Impute only the columns that actually contain missing values.
    X_train_nan_cols = X_train_full.columns[X_train_full.isna().any()].tolist()
    # Bind the result to X_train (not X_train_full) so that this branch defines
    # the same names as the cached branch below; previously X_train / y_train
    # were left undefined here and the encoding cell failed with NameError.
    # Not rebinding X_train_full also keeps the cell re-runnable on raw data.
    X_train = pairwise_dist_imputer_catnum(X_train_full, nan_cols=X_train_nan_cols)
    y_train = y_train_full

    # The test set is imputed separately, using its own missing-value columns.
    X_test_nan_cols = X_test.columns[X_test.isna().any()].tolist()
    X_test = pairwise_dist_imputer_catnum(X_test, nan_cols=X_test_nan_cols)

    imputed_data = {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
    }

    with open(fname, 'wb') as file:
        pickle.dump(imputed_data, file)
else:
    # SECURITY: pickle.load can execute arbitrary code; only load cache files
    # that were produced by this notebook.
    with open(fname, 'rb') as file:
        imputed_data = pickle.load(file)

    X_train = imputed_data["X_train"]
    y_train = imputed_data["y_train"]
    X_test = imputed_data["X_test"]
    y_test = imputed_data["y_test"]

# Encoding

In [5]:
# Encode the features with the project's custom categorical encoder.
# The encoder is fitted on the training features (fit_transform) and the same
# fitted instance is then applied to the test features (transform).
# NOTE(review): presumably this follows scikit-learn's fit/transform
# convention, so the test set does not influence the fitted encoding — confirm
# against utils.custom_encoder.
feature_encoder = custom_categorical_encoder()
X_train_encoded = feature_encoder.fit_transform(X_train)
X_test_encoded  = feature_encoder.transform(X_test)