In [1]:
import os
import sys
import glob
import pickle

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt 

# ML related imports
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics.pairwise import haversine_distances


# AT utility imports
from utils.directory_structure import DATA_DIR, OUTPUT_DIR
from utils.preprocessing import read_split, pairwise_dist_imputer_catnum
from utils.custom_encoder import custom_categorical_encoder


# Plot configuration for the whole notebook: select matplotlib's "bmh" style
# sheet for figures created after this point.
mpl.style.use("bmh")
# IPython magic: sets the inline backend's figure format to 'retina'.
%config InlineBackend.figure_format = 'retina'

# Reading data

Read the original `dins` file, a heterogeneous tabular dataset, then run a simple exploration of the distribution and nature of its features.

In [2]:
# Load the train/test split for `case_name`.
#
# new_features = True  -> re-read the raw CSV through read_split and refresh
#                         the pickle cache in OUTPUT_DIR.
# new_features = False -> load the split cached by a previous run.
new_features = False
case_name = "dins_2017_2022"

fname = os.path.join(OUTPUT_DIR, f"{case_name}_train_test_vars.pkl")
if new_features:
    # Remove any stale cache before regenerating, so a failure inside
    # read_split cannot leave an outdated split on disk. os.remove is used
    # rather than shelling out to `rm`: it works on every platform and is
    # not affected by spaces or shell metacharacters in the path.
    try:
        os.remove(fname)
    except FileNotFoundError:
        pass  # nothing cached yet

    # NOTE: col_names is only bound on this branch; it is not written to the
    # cache, so it is unavailable when the split is loaded from disk.
    X_train_full, X_test, y_train_full, y_test, col_names = read_split(f"{case_name}.csv")
    data_to_save = {"X_train_full": X_train_full, "X_test": X_test,
                    "y_train_full": y_train_full, "y_test": y_test}

    with open(fname, 'wb') as file:
        pickle.dump(data_to_save, file)
else:
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load caches that this notebook produced itself.
    with open(fname, 'rb') as file:
        data_dict = pickle.load(file)
    X_train_full = data_dict["X_train_full"]
    y_train_full = data_dict["y_train_full"]
    X_test = data_dict["X_test"]
    y_test = data_dict["y_test"]

# Column names, non-null counts and dtypes of the training features.
X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69725 entries, 49272 to 49682
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ROOFCONSTRUCTION   66280 non-null  object 
 1   EAVES              66220 non-null  object 
 2   VENTSCREEN         66180 non-null  object 
 3   EXTERIORSIDING     66267 non-null  object 
 4   WINDOWPANE         66212 non-null  object 
 5   DECKPORCHONGRADE   56244 non-null  object 
 6   DECKPORCHELEVATED  56242 non-null  object 
 7   PATIOCOVER         56238 non-null  object 
 8   FENCE              56242 non-null  object 
 9   YEARBUILT          42390 non-null  float64
 10  LATITUDE           69725 non-null  float64
 11  LONGITUDE          69725 non-null  float64
 12  DISTANCE           69725 non-null  float64
 13  utm_easting        69725 non-null  float64
 14  utm_northing       69725 non-null  float64
 15  utm_zone           69725 non-null  float64
dtypes: float64(7), object(9)

# Data processing

## Impute data

Imputation is based on the statistics of the $k$ nearest neighboring points: numerical features are aggregated with the `mean`, and categorical features with the `mode`.

In [3]:
# Impute missing values in the train and test features.
#
# new_features = True  -> run the imputer on the current split and refresh
#                         the pickle cache.
# new_features = False -> load the imputed split cached by a previous run.
#
# Both branches bind the same names (X_train, y_train, X_test, y_test), so
# the cells below work regardless of which branch ran. X_train_full is left
# untouched on both branches, which also makes this cell safe to re-run.
new_features = False
fname = os.path.join(OUTPUT_DIR, "dins_imputed_dataset.pkl")

if new_features:
    # Only the columns that actually contain NaNs are handed to the imputer.
    X_train_nan_cols = X_train_full.columns[X_train_full.isna().any()].tolist()
    X_train = pairwise_dist_imputer_catnum(X_train_full, nan_cols=X_train_nan_cols)
    y_train = y_train_full

    # The test set is imputed separately, using its own NaN columns.
    X_test_nan_cols = X_test.columns[X_test.isna().any()].tolist()
    X_test = pairwise_dist_imputer_catnum(X_test, nan_cols=X_test_nan_cols)

    imputed_data = {"X_train": X_train, "y_train": y_train,
                    "X_test": X_test, "y_test": y_test}

    with open(fname, 'wb') as file:
        pickle.dump(imputed_data, file)
else:
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load caches that this notebook produced itself.
    with open(fname, 'rb') as file:
        imputed_data = pickle.load(file)

    X_train = imputed_data["X_train"]
    y_train = imputed_data["y_train"]
    X_test  = imputed_data["X_test"]
    y_test  = imputed_data["y_test"]

## Encoding

In [5]:
# Encode the features with the project's custom_categorical_encoder.
# fit_transform is called on the training frame only; the test frame goes
# through transform on that same encoder object, so the order of the two
# calls matters.
feature_encoder = custom_categorical_encoder()
X_train_encoded = feature_encoder.fit_transform(X_train)
X_test_encoded  = feature_encoder.transform(X_test)