In [1]:
import os
import sys
import glob
import pickle

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# ML related imports
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics.pairwise import haversine_distances
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import KMeansSMOTE


# AT utility imports
from utils.directory_structure import DATA_DIR, OUTPUT_DIR
from utils.preprocessing import data_preprocessing_pipeline


# Apply matplotlib's "bmh" style sheet to figures created after this point.
mpl.style.use("bmh")
# IPython magic (only valid inside a notebook/IPython session): request
# 'retina' (high-resolution) output from the inline figure backend.
%config InlineBackend.figure_format = 'retina'

# Data preprocessing pipeline

In [4]:
## Encoded + scaled features for binary classification, one run per dataset.
case_list = [
    "Tubbs",
    "Camp",
    "Glass",
    "Kincade",
    "Thomas",
    "dins_2017_2022",
]

for case in case_list:
    # The pipeline's return value is not used in this cell, hence `_`.
    _ = data_preprocessing_pipeline(
        case,
        task_type="binary",
        renew_data=True,
        encode_data=True,
        scale_data=True,
    )

/home/maryamz/DINS_data_preparation/data/WUI_fires/Tubbs_original.csv
Read, feature engineer, and split between train and test
Imputation based on location information
Encoding
Normalize the required features and drop extra information!
/home/maryamz/DINS_data_preparation/data/WUI_fires/Camp_original.csv
Read, feature engineer, and split between train and test
Imputation based on location information


In [3]:
# Disabled cell (kept for reference only): reads dins_2017_2022.csv from
# DATA_DIR into a DataFrame, treating the listed placeholder strings as NaN.
# data_file_path = os.path.join(DATA_DIR, "dins_2017_2022.csv")
# missing_values = ["", "NA", "na", "n/a", "N/A", "--", "nan", "Unknown"]
# df = pd.read_csv(data_file_path, delimiter=",", na_values=missing_values)

In [3]:
## Features for binary classification on the concatenated dataset.
# NOTE(review): this cell was originally labelled "without encoding", yet it
# passes encode_data=True. The output saved with the cell ("No encoding for
# the features!", Tubbs file) comes from an earlier version of the cell.
# Confirm which setting is intended and re-run before relying on the result.
case_list = ["concatenated_df"]

for case in case_list:
    # The pipeline's return value is not used in this cell, hence `_`.
    _ = data_preprocessing_pipeline(
        case,
        task_type="binary",
        renew_data=True,
        encode_data=True,
        scale_data=True,
    )

/home/maryamz/DINS_data_preparation/data/WUI_fires/Tubbs_original.csv
Read, feature engineer, and split between train and test
Imputation based on location information
No encoding for the features!
Normalize the required features and drop extra information!


In [10]:
## Scaled but NOT encoded features for binary classification (full DINS set).
case_list = ["dins_2017_2022"]

for case in case_list:
    # encode_data=False is the only difference from the encoded runs.
    _ = data_preprocessing_pipeline(
        case,
        task_type="binary",
        renew_data=True,
        encode_data=False,
        scale_data=True,
    )

/home/maryamz/DINS_data_preparation/data/dins_2017_2022.csv
Read, feature engineer, and split between train and test


  df = pd.read_csv(data_file_path, delimiter=",", na_values=missing_values)


Imputation based on location information
No encoding for the features!
Normalize the required features and drop extra information!
