In [67]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import BinaryRelevance
# from skmultilearn.ensemble import
from skmultilearn.problem_transform import ClassifierChain
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from skmultilearn.ensemble import RakelO

Load combined MA and DDS training data

In [68]:
# Read the combined training set. The relative path is resolved against the
# kernel's current working directory.
df = pd.read_parquet('combined_training_set.parquet')
print(df.shape)
# Preview the first rows to sanity-check the loaded columns.
df.head()

(5894, 21)


Unnamed: 0,incidentId,state,incidentType,region,maType,maPriority,supportFunction,agencyId,maId,declarationType,...,statementOfWork,index,femaDeclarationString,incidentBeginDate,fipsStateCode,designatedIncidentTypes,declarationTitle,year,month,day
45,2018072401,CA,Fire,9,FOS,High,13.0,DHS-FPS,4382DRCAFPS01,DR,...,"As directed by an in coordination with FEMA, F...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
66,2018072401,CA,Fire,9,FOS,High,10.0,EPA,4382DRCAEPA01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
136,2018072401,CA,Fire,9,FOS,Lifesaving,1.0,DOT,4382DRCADOT01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
159,2018072401,CA,Fire,9,FOS,High,12.0,DOE,4382DRCADOE01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
206,2018072401,CA,Fire,9,FOS,Lifesaving,5.0,DOD,4382DRCADOD01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0


In [None]:
def create_set(
        df: pd.DataFrame,
        feature_columns: list,
        target_column: str,
        ) -> tuple[pd.DataFrame, pd.Series]:
    """
    Collapse `df` to one row per feature combination and collect its targets.

    Rows are grouped by `feature_columns`. For every group, the distinct
    non-null values of `target_column` are gathered into a list sorted in
    ascending order, so the result is reproducible from run to run and never
    contains NaN as a label.

    Parameters
    ----------
    df : pd.DataFrame
        Source rows. The frame is only read, never modified.
    feature_columns : list
        Column names that identify a sample (the grouping keys).
    target_column : str
        Column whose values become the multi-label target. Its values must be
        mutually orderable because they are sorted.

    Returns
    -------
    X : pd.DataFrame
        One row per unique combination of `feature_columns`.
    y : pd.Series
        For each row of X, the sorted list of distinct targets seen with it.
        A group whose targets are all missing yields an empty list.

    Notes
    -----
    pandas' default `groupby` settings apply: groups come back sorted by key,
    and rows with a missing value in any grouping column are left out.
    """
    # groupby does not mutate its input, so no defensive copy is needed.
    grouped = (
        df.groupby(feature_columns)[target_column]
        # .tolist() keeps the labels as plain Python scalars; sorting makes
        # the per-group label order deterministic (a set's order is not).
        .agg(lambda labels: sorted(labels.dropna().unique().tolist()))
        .reset_index()
    )

    X = grouped[feature_columns]
    y = grouped[target_column]

    return X, y


In [70]:
# Columns that describe an incident; each unique combination becomes one sample.
feature_columns = [
    'state',
    'incidentType',
    'region',
]
# Column whose values are collected into the multi-label target.
target_column = 'supportFunction'

# Collapse the raw rows to one sample per feature combination.
X_train, y_train = create_set(df, feature_columns, target_column)
print(X_train.shape, len(y_train))
# Inspect the first few label lists and the matching feature rows.
print(y_train[:5])
X_train.head(5)

(195, 3) 195
0                                      [8.0, 0.0, 3.0]
1    [0.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 10.0, 11.0...
2                                     [3.0, 11.0, 7.0]
3                       [0.0, 1.0, 3.0, 4.0, 5.0, 7.0]
4                                      [8.0, 0.0, 3.0]
Name: supportFunction, dtype: object


Unnamed: 0,state,incidentType,region
0,AK,Biological,10
1,AK,Earthquake,10
2,AK,Flood,10
3,AK,Severe Storm,10
4,AL,Biological,4


In [115]:
def transform_x_y(
        X: pd.DataFrame,
        y: pd.Series,
        feature_columns: list,
        target_column: str,
        ) -> tuple[pd.DataFrame, pd.DataFrame, "ColumnTransformer", "MultiLabelBinarizer"]:
    """
    Encode features and multi-label targets into numeric frames for modelling.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame. Columns listed in `feature_columns` are one-hot
        encoded; any other column is passed through unchanged.
    y : pd.Series
        One iterable of labels per row of X.
    feature_columns : list
        Names of the categorical columns in X to one-hot encode.
    target_column : str
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    X_transformed : pd.DataFrame
        Dense encoded features, named from the transformer's output feature
        names and indexed like X.
    y_transformed : pd.DataFrame
        Binary indicator matrix with one column per label class, indexed like
        y when y carries an index.
    transformer_X : ColumnTransformer
        Fitted feature transformer, reusable via `.transform` on new data.
        Categories unseen during fit are ignored rather than raising
        (`handle_unknown='ignore'`).
    transformer_y : MultiLabelBinarizer
        Fitted label binarizer.
    """
    # Process X
    transformer_X = ColumnTransformer(
        transformers=[
            (
                'onehot',
                # sparse_output=False makes the encoder return a dense array
                OneHotEncoder(handle_unknown='ignore', sparse_output=False),
                feature_columns,
            ),
        ],
        remainder='passthrough',  # keep the remaining columns as they are
    )

    # Carry X's index over so encoded rows stay aligned with their source rows
    # even when X does not have a default RangeIndex.
    X_transformed = pd.DataFrame(
        transformer_X.fit_transform(X),
        columns=transformer_X.get_feature_names_out(),
        index=X.index,
    )

    # Process y
    transformer_y = MultiLabelBinarizer()

    y_transformed = pd.DataFrame(
        transformer_y.fit_transform(y),
        columns=transformer_y.classes_,
        # y may be a plain sequence without an index; fall back to the default.
        index=getattr(y, "index", None),
    )

    return X_transformed, y_transformed, transformer_X, transformer_y

In [94]:
# Fit the feature and label encoders on the training set, keeping both the
# encoded frames and the fitted transformers (preprocessor_X / preprocessor_y).
X_train_transformed, y_train_transformed, preprocessor_X, preprocessor_y = transform_x_y(
    X_train, y_train, feature_columns, target_column
)

In [117]:
# Check the size of the encoded feature matrix and preview it.
print(X_train_transformed.shape)
X_train_transformed.head()

(195, 84)


Unnamed: 0,onehot__state_AK,onehot__state_AL,onehot__state_AR,onehot__state_AS,onehot__state_AZ,onehot__state_CA,onehot__state_CO,onehot__state_CT,onehot__state_DC,onehot__state_DE,...,onehot__region_1,onehot__region_2,onehot__region_3,onehot__region_4,onehot__region_5,onehot__region_6,onehot__region_7,onehot__region_8,onehot__region_9,onehot__region_10
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [118]:
# Check the size of the binarized label matrix (one column per label class)
# and preview it.
print(y_train_transformed.shape)
y_train_transformed.head()

(195, 16)


Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0
0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0
1,1,1,0,1,1,1,0,1,1,0,1,1,1,1,0,0
2,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
3,1,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0
4,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [114]:
# Binary Relevance baseline with an SVC base estimator. Both X and y are
# requested in dense form via require_dense.
br_classifier = BinaryRelevance(
    classifier = SVC(gamma='auto'), 
    require_dense = [True, True]
)
# Fit on the encoded training features and binarized labels.
br_classifier.fit(X_train_transformed, y_train_transformed)

In [96]:
# Predict on the same rows the model was fitted on (no held-out data here).
predictions = br_classifier.predict(X_train_transformed)



In [97]:
# Convert the prediction matrix to a dense array for printing and scoring.
predictions_dense = predictions.toarray()

In [98]:
# Summarize the fitted Binary Relevance model.
print("--- Training Complete ---")
print(f"Number of individual classifiers trained: {len(br_classifier.classifiers_)}")

# Show the true label matrix next to the predicted one.
print("\n--- Prediction Results ---")
print(f"True Multi-Label Targets (Y_train):\n{y_train_transformed}")
print(f"\nPredicted Multi-Label Targets:\n{predictions_dense}")

# Subset accuracy: a row only counts as correct when every label matches, so
# it is a strict metric for multi-label output. The score is computed on the
# training rows themselves, so it says nothing about generalization.
subset_accuracy = accuracy_score(y_train_transformed, predictions_dense)
print(f"\nSubset Accuracy (Exact Match): {subset_accuracy:.2f}")


--- Training Complete ---
Number of individual classifiers trained: 16

--- Prediction Results ---
True Multi-Label Targets (Y_train):
     0.0  1.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0  10.0  11.0  12.0  13.0  \
0      1    0    0    1    0    0    0    0    1    0     0     0     0     0   
1      1    1    0    1    1    1    0    1    1    0     1     1     1     1   
2      0    0    0    1    0    0    0    1    0    0     0     1     0     0   
3      1    1    0    1    1    1    0    1    0    0     0     0     0     0   
4      1    0    0    1    0    0    0    0    1    0     0     0     0     0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   ...   ...   
190    0    0    0    1    1    0    0    1    0    0     0     0     0     1   
191    1    0    0    0    0    0    0    1    0    0     0     0     0     1   
192    0    0    0    1    0    0    0    1    0    0     0     0     0     1   
193    1    0    0    0    1    0    0    0    1    0  

In [99]:
# Base estimator used inside the chain: an SVC with a sigmoid kernel.
base_classifier = SVC(gamma='auto',
                      kernel='sigmoid',
                    #   class_weight='balanced',
                      probability=True) 

# Build a single Classifier Chain around the base estimator.
# NOTE(review): despite the `ecc_` prefix this is not an ensemble — only one
# ClassifierChain is constructed, and the `chains` argument below is disabled.
ecc_classifier = ClassifierChain(
    classifier=base_classifier, 
    # chains=10, 
    require_dense=[True, True]
)

# Train the chain on the encoded training features and binarized labels.
ecc_classifier.fit(X_train_transformed, y_train_transformed)


In [100]:
# Predict on the training rows and convert the result to a dense array.
predictions_ecc = ecc_classifier.predict(X_train_transformed)
predictions_ecc_dense = predictions_ecc.toarray()

In [101]:
# A simple (though not ideal) evaluation metric for multi-label
subset_accuracy = accuracy_score(y_train_transformed, predictions_ecc_dense)
print(f"\nSubset Accuracy (Exact Match): {subset_accuracy:.2f}")


Subset Accuracy (Exact Match): 0.05


In [102]:
# Spot check: predicted label vector vs. true label vector for row 15.
print(predictions_ecc_dense[15])
print(y_train_transformed.iloc[15])

[1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
0.0     0
1.0     0
2.0     0
3.0     0
4.0     0
5.0     0
6.0     0
7.0     0
8.0     0
9.0     0
10.0    0
11.0    0
12.0    0
13.0    1
14.0    0
15.0    0
Name: 15, dtype: int64


In [103]:
# Boolean mask over rows: True where the chain predicted at least one label.
rows_with_any_one = (predictions_ecc_dense == 1).any(axis=1)
# Total number of rows, then how many of them have a positive prediction.
print(rows_with_any_one.shape[0])
print(np.count_nonzero(rows_with_any_one))
# Positional indices of the rows with at least one predicted label.
np.flatnonzero(rows_with_any_one)

195
195


array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [104]:
# Spot check: predicted label vector vs. true label vector for row 0.
print(predictions_ecc_dense[0])
print(y_train_transformed.iloc[0])

[1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
0.0     1
1.0     0
2.0     0
3.0     1
4.0     0
5.0     0
6.0     0
7.0     0
8.0     1
9.0     0
10.0    0
11.0    0
12.0    0
13.0    0
14.0    0
15.0    0
Name: 0, dtype: int64


In [105]:
# RAkEL (overlapping labelsets) with a Gaussian Naive Bayes base classifier.
# NOTE(review): labelset_size=16 equals the 16 label columns of
# y_train_transformed, so each labelset presumably spans the whole label space
# and the 30 models may be redundant — confirm against the RakelO docs.
classifier_rake = RakelO(
    base_classifier=GaussianNB(),
    base_classifier_require_dense=[True, True],
    labelset_size=16,
    model_count=30
)

classifier_rake.fit(X_train_transformed, y_train_transformed)
# Predictions are made on the training rows themselves, not on held-out data.
predictions_rake = classifier_rake.predict(X_train_transformed)



In [106]:
# Convert the RAkEL prediction matrix to a dense array for scoring.
predictions_rake_dense = predictions_rake.toarray()

In [107]:
# A simple (though not ideal) evaluation metric for multi-label
subset_accuracy_rake = accuracy_score(y_train_transformed, predictions_rake_dense)
print(f"\nSubset Accuracy (Exact Match): {subset_accuracy_rake:.2f}")


Subset Accuracy (Exact Match): 0.93


In [108]:
# Boolean mask over rows: True where RAkEL predicted at least one label.
rows_with_any_one_rake = (predictions_rake_dense == 1).any(axis=1)
# Total number of rows, then how many of them have a positive prediction.
print(rows_with_any_one_rake.shape[0])
print(np.count_nonzero(rows_with_any_one_rake))
# Positional indices of the rows with at least one predicted label.
np.flatnonzero(rows_with_any_one_rake)

195
195


array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [109]:
# Spot check: RAkEL's predicted label vector vs. the true one for row 0.
print(predictions_rake_dense[0])
print(y_train_transformed.iloc[0])

[1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0]
0.0     1
1.0     0
2.0     0
3.0     1
4.0     0
5.0     0
6.0     0
7.0     0
8.0     1
9.0     0
10.0    0
11.0    0
12.0    0
13.0    0
14.0    0
15.0    0
Name: 0, dtype: int64


In [110]:
# Flag every row where at least one predicted label differs from the truth.
incorrect_predictions = (predictions_rake_dense != y_train_transformed).any(axis=1)
# Positional indices of the rows RAkEL got wrong.
np.flatnonzero(incorrect_predictions)

array([  4,  64,  94, 101, 114, 115, 120, 125, 152, 163, 178, 185, 192])

In [111]:
# Inspect row 4, one of the mismatched rows: the predicted label vector,
# then its input features together with the true label vector.
print(predictions_rake_dense[4])
print(X_train.iloc[4], y_train_transformed.iloc[4])

[1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0]
state                   AL
incidentType    Biological
region                   4
Name: 4, dtype: object 0.0     1
1.0     0
2.0     0
3.0     1
4.0     0
5.0     0
6.0     0
7.0     0
8.0     1
9.0     0
10.0    0
11.0    0
12.0    0
13.0    0
14.0    0
15.0    0
Name: 4, dtype: int64
