In [77]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import BinaryRelevance
# from skmultilearn.ensemble import
from skmultilearn.problem_transform import ClassifierChain
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from skmultilearn.ensemble import RakelO

In [2]:
# Read the combined training set from its parquet file, report its
# (rows, columns) shape, and preview the first five records.
training_set_path = 'combined_training_set.parquet'
df = pd.read_parquet(training_set_path)
print(df.shape)
df.head(n=5)

(5894, 21)


Unnamed: 0,incidentId,state,incidentType,region,maType,maPriority,supportFunction,agencyId,maId,declarationType,...,statementOfWork,index,femaDeclarationString,incidentBeginDate,fipsStateCode,designatedIncidentTypes,declarationTitle,year,month,day
45,2018072401,CA,Fire,9,FOS,High,13.0,DHS-FPS,4382DRCAFPS01,DR,...,"As directed by an in coordination with FEMA, F...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
66,2018072401,CA,Fire,9,FOS,High,10.0,EPA,4382DRCAEPA01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
136,2018072401,CA,Fire,9,FOS,Lifesaving,1.0,DOT,4382DRCADOT01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
159,2018072401,CA,Fire,9,FOS,High,12.0,DOE,4382DRCADOE01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
206,2018072401,CA,Fire,9,FOS,Lifesaving,5.0,DOD,4382DRCADOD01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0


In [16]:
# Input columns for the models; every one of them is treated as categorical.
categorical_cols = ['state', 'incidentType', 'region']

# Features and target side by side in one frame.
# NOTE(review): df_predict is not referenced by any later cell visible here —
# confirm whether it is still needed.
df_predict = df[categorical_cols + ['supportFunction']]

# Feature frame.
X_train = df[categorical_cols]

# Target kept as a one-column DataFrame (not a Series) so it can be fed to a
# ColumnTransformer that selects the column by name.
y_train = df[['supportFunction']]


In [17]:
# Display the target: a one-column DataFrame holding `supportFunction`.
y_train

Unnamed: 0,supportFunction
45,13.0
66,10.0
136,1.0
159,12.0
206,5.0
...,...
7607,8.0
7608,12.0
7609,0.0
7610,3.0


In [18]:
def _build_onehot_preprocessor(columns):
    """Return an unfitted ColumnTransformer that one-hot encodes `columns`.

    The encoder is configured with handle_unknown='ignore' and
    sparse_output=False (dense array output). Columns not listed in
    `columns` are passed through unchanged (remainder='passthrough').
    """
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    return ColumnTransformer(
        transformers=[('onehot', encoder, columns)],
        remainder='passthrough',
    )


# Independent transformers: one for the feature columns, one for the target,
# so each is fitted on its own frame.
preprocessor_X = _build_onehot_preprocessor(categorical_cols)
preprocessor_y = _build_onehot_preprocessor(['supportFunction'])

In [19]:
# One-hot encode the categorical feature columns. The encoder was configured
# with sparse_output=False, so fit_transform hands back a plain array; wrap it
# in a DataFrame labelled with the generated feature names.
# No index is passed, so the rows are labelled 0..n-1 in the same order as
# X_train (positional lookups with .iloc still line up with df).
X_train_transformed = preprocessor_X.fit_transform(X_train)
feature_names = preprocessor_X.get_feature_names_out()
X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names)

# Leave the preview as the cell's last expression instead of print(): the
# notebook then renders it as a table, consistent with the target-encoding cell.
X_train_transformed.head()

   onehot__state_AK  onehot__state_AL  onehot__state_AR  onehot__state_AS  \
0               0.0               0.0               0.0               0.0   
1               0.0               0.0               0.0               0.0   
2               0.0               0.0               0.0               0.0   
3               0.0               0.0               0.0               0.0   
4               0.0               0.0               0.0               0.0   

   onehot__state_AZ  onehot__state_CA  onehot__state_CO  onehot__state_CT  \
0               0.0               1.0               0.0               0.0   
1               0.0               1.0               0.0               0.0   
2               0.0               1.0               0.0               0.0   
3               0.0               1.0               0.0               0.0   
4               0.0               1.0               0.0               0.0   

   onehot__state_DC  onehot__state_DE  ...  onehot__region_1  \
0         

In [30]:
# One-hot encode the single target column into one indicator column per
# distinct supportFunction value, then label the columns with the names the
# transformer generated.
y_encoded = preprocessor_y.fit_transform(y_train)
feature_names = preprocessor_y.get_feature_names_out()
y_train_transformed = pd.DataFrame(data=y_encoded, columns=feature_names)

# Preview the encoded target (rich display as the cell's last expression).
y_train_transformed.head(n=5)

Unnamed: 0,onehot__supportFunction_0.0,onehot__supportFunction_1.0,onehot__supportFunction_2.0,onehot__supportFunction_3.0,onehot__supportFunction_4.0,onehot__supportFunction_5.0,onehot__supportFunction_6.0,onehot__supportFunction_7.0,onehot__supportFunction_8.0,onehot__supportFunction_9.0,onehot__supportFunction_10.0,onehot__supportFunction_11.0,onehot__supportFunction_12.0,onehot__supportFunction_13.0,onehot__supportFunction_14.0,onehot__supportFunction_15.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Binary Relevance with an SVC (gamma='auto') as the per-label base estimator.
# Both entries of require_dense are True, i.e. dense input is requested for
# the feature and the label matrices handed to the base estimator.
br_base_estimator = SVC(gamma='auto')
br_classifier = BinaryRelevance(classifier=br_base_estimator, require_dense=[True, True])

# Fit on the one-hot encoded features / label-indicator targets; the fitted
# estimator is the cell's displayed value.
br_classifier.fit(X_train_transformed, y_train_transformed)

In [27]:
# Predict on the same features the model was fitted on, so these are
# in-sample predictions rather than a held-out evaluation.
predictions = br_classifier.predict(X_train_transformed)



In [28]:
# Convert the predictions to a dense array via .toarray()
# (presumably a SciPy sparse matrix is returned by predict — TODO confirm).
predictions_dense = predictions.toarray()

In [None]:
# Report on the fitted Binary Relevance model and its predictions.
# NOTE: predictions_dense was computed from X_train_transformed, so everything
# printed below describes the fit on the training data, not held-out performance.
print("--- Training Complete ---")
print(f"Number of individual classifiers trained: {len(br_classifier.classifiers_)}")

print("\n--- Prediction Results ---")
print(f"True Multi-Label Targets (Y_train):\n{y_train_transformed}")
print(f"\nPredicted Multi-Label Targets:\n{predictions_dense}")

# Exact-match ("subset") accuracy: a row is scored as correct only when the
# whole predicted label vector matches the true one. Strict for multi-label
# output, and here it is measured on the training rows.
subset_accuracy = accuracy_score(y_train_transformed, predictions_dense)
print(f"\nSubset Accuracy (Exact Match): {subset_accuracy:.2f}")


--- Training Complete ---
Number of individual classifiers trained: 16

--- Prediction Results ---
True Multi-Label Targets (Y_test):
      onehot__supportFunction_0.0  onehot__supportFunction_1.0  \
0                             0.0                          0.0   
1                             0.0                          0.0   
2                             0.0                          1.0   
3                             0.0                          0.0   
4                             0.0                          0.0   
...                           ...                          ...   
5889                          0.0                          0.0   
5890                          0.0                          0.0   
5891                          1.0                          0.0   
5892                          0.0                          0.0   
5893                          0.0                          0.0   

      onehot__supportFunction_2.0  onehot__supportFunction_3.0  \
0      

In [47]:
# Base estimator handed to the chain: an SVC with a sigmoid kernel.
# probability=True makes the SVC also fit probability estimates; the seed
# below pins the random shuffling scikit-learn uses for those estimates so a
# re-run reproduces them (it is ignored by SVC when probability=False).
# NOTE(review): the later cells only call .predict(); if predict_proba is never
# needed, probability=True presumably just adds training time — confirm.
base_classifier = SVC(
    gamma='auto',
    kernel='sigmoid',
    probability=True,
    random_state=42,  # arbitrary fixed seed, for reproducibility only
)

# A single ClassifierChain. Despite the `ecc_` prefix this is NOT an ensemble
# of chains: no chain count or label ordering is passed, so exactly one chain
# with the library's default label order is built. The variable name is kept
# because later cells reference it.
ecc_classifier = ClassifierChain(
    classifier=base_classifier,
    require_dense=[True, True]
)

# Fit the chain on the one-hot encoded features and label-indicator targets;
# the fitted estimator is the cell's displayed value.
ecc_classifier.fit(X_train_transformed, y_train_transformed)


In [51]:
# In-sample predictions from the classifier chain (same rows it was fitted on),
# followed by conversion to a dense array via .toarray().
predictions_ecc = ecc_classifier.predict(X_train_transformed)
predictions_ecc_dense = predictions_ecc.toarray()

In [52]:
# Exact-match ("subset") accuracy of the classifier chain: a row counts as
# correct only when its full predicted label vector equals the true one.
# Measured against the training targets, so this is a training-set score.
subset_accuracy = accuracy_score(y_train_transformed, predictions_ecc_dense)
print(f"\nSubset Accuracy (Exact Match): {subset_accuracy:.2f}")


Subset Accuracy (Exact Match): 0.13


In [58]:
# Compare the chain's predicted label vector with the true one-hot target for
# one row, addressed by position in both structures.
sample_pos = 15
print(predictions_ecc_dense[sample_pos])
print(y_train_transformed.iloc[sample_pos])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
onehot__supportFunction_0.0     0.0
onehot__supportFunction_1.0     0.0
onehot__supportFunction_2.0     0.0
onehot__supportFunction_3.0     0.0
onehot__supportFunction_4.0     0.0
onehot__supportFunction_5.0     0.0
onehot__supportFunction_6.0     0.0
onehot__supportFunction_7.0     0.0
onehot__supportFunction_8.0     0.0
onehot__supportFunction_9.0     0.0
onehot__supportFunction_10.0    0.0
onehot__supportFunction_11.0    0.0
onehot__supportFunction_12.0    0.0
onehot__supportFunction_13.0    1.0
onehot__supportFunction_14.0    0.0
onehot__supportFunction_15.0    0.0
Name: 15, dtype: float64


In [64]:
# Boolean mask over rows: True where the chain predicted at least one label.
rows_with_any_one = (predictions_ecc_dense == 1).any(axis=1)
# Positions of those rows.
ecc_positive_positions = np.flatnonzero(rows_with_any_one)

print(rows_with_any_one.size)       # total number of rows
print(ecc_positive_positions.size)  # rows with at least one predicted label
ecc_positive_positions

5894
1902


array([1343, 1344, 1345, ..., 5884, 5885, 5886])

In [68]:
# Compare prediction and true target for one of the rows where the chain did
# predict a label (position taken from the listing of positive rows).
sample_pos = 5884
print(predictions_ecc_dense[sample_pos])
print(y_train_transformed.iloc[sample_pos])

[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
onehot__supportFunction_0.0     0.0
onehot__supportFunction_1.0     0.0
onehot__supportFunction_2.0     0.0
onehot__supportFunction_3.0     0.0
onehot__supportFunction_4.0     0.0
onehot__supportFunction_5.0     0.0
onehot__supportFunction_6.0     0.0
onehot__supportFunction_7.0     0.0
onehot__supportFunction_8.0     1.0
onehot__supportFunction_9.0     0.0
onehot__supportFunction_10.0    0.0
onehot__supportFunction_11.0    0.0
onehot__supportFunction_12.0    0.0
onehot__supportFunction_13.0    0.0
onehot__supportFunction_14.0    0.0
onehot__supportFunction_15.0    0.0
Name: 5884, dtype: float64


In [75]:
# Show the source record at position 5884. .iloc is positional, which matches
# the 0-based row order of the transformed feature/target frames built from df.
df.iloc[5884]

incidentId                                                        2020030901
state                                                                     DE
incidentType                                                      Biological
region                                                                     3
maType                                                                   DFA
maPriority                                                              High
supportFunction                                                          8.0
agencyId                                                                 HHS
maId                                                           4526DRDEHHS06
declarationType                                                           DR
assistanceRequested        Clinical Staff (Non-Vaccinators)-- \r\nDelawar...
statementOfWork            In support of Delaware request, as directed by...
index                                                                 4949.0

In [76]:
# All source records whose supportFunction equals 8.0.
support_function_is_8 = df['supportFunction'] == 8.0
df.loc[support_function_is_8]

Unnamed: 0,incidentId,state,incidentType,region,maType,maPriority,supportFunction,agencyId,maId,declarationType,...,statementOfWork,index,femaDeclarationString,incidentBeginDate,fipsStateCode,designatedIncidentTypes,declarationTitle,year,month,day
671,2023010401,CA,Flood,9,FOS,High,8.0,HHS-ASPR,4683DRCAHHS-ASPR01,DR,...,"As directed by and in coordination with FEMA, ...",63821.0,DR-4683-CA,2022-12-27 00:00:00+00:00,6.0,"5,M,W,F","SEVERE WINTER STORMS, FLOODING, LANDSLIDES, AN...",2022.0,12.0,27.0
764,2022092201,FL,Hurricane,4,FOS,Lifesaving,8.0,HHS-ASPR,4673DRFLHHS-ASPR08,DR,...,"As directed by and in coordination with FEMA, ...",1685.0,DR-4675-FL,2022-09-23 00:00:00+00:00,12.0,,HURRICANE IAN - SEMINOLE TRIBE OF FLORIDA,2022.0,9.0,23.0
765,2022092201,FL,Hurricane,4,FOS,Lifesaving,8.0,HHS-ASPR,4673DRFLHHS-ASPR08,DR,...,"As directed by and in coordination with FEMA, ...",1693.0,DR-4673-FL,2022-09-23 00:00:00+00:00,12.0,,HURRICANE IAN,2022.0,9.0,23.0
766,2022092201,FL,Hurricane,4,FOS,High,8.0,HHS-ASPR,4673DRFLHHS-ASPR07,DR,...,"As directed by and in coordination with FEMA, ...",1685.0,DR-4675-FL,2022-09-23 00:00:00+00:00,12.0,,HURRICANE IAN - SEMINOLE TRIBE OF FLORIDA,2022.0,9.0,23.0
767,2022092201,FL,Hurricane,4,FOS,High,8.0,HHS-ASPR,4673DRFLHHS-ASPR07,DR,...,"As directed by and in coordination with FEMA, ...",1693.0,DR-4673-FL,2022-09-23 00:00:00+00:00,12.0,,HURRICANE IAN,2022.0,9.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7318,2020030901,VI,Biological,2,FOS,Normal,8.0,HHS,4513DRVIHHS03,DR,...,"As directed by and in coordination with FEMA, ...",6105.0,DR-4513-VI,2020-01-20 00:00:00+00:00,78.0,,COVID-19 PANDEMIC,2020.0,1.0,20.0
7357,2020030901,AS,Biological,9,DFA,Life Sustaining,8.0,VA,4537DRASVA04,DR,...,VHA as directed by and in coordination with th...,4139.0,DR-4537-AS,2020-01-20 00:00:00+00:00,60.0,,COVID-19 PANDEMIC,2020.0,1.0,20.0
7381,2020030901,DE,Biological,3,DFA,High,8.0,HHS,4526DRDEHHS06,DR,...,"In support of Delaware request, as directed by...",4949.0,DR-4526-DE,2020-01-20 00:00:00+00:00,10.0,,COVID-19 PANDEMIC,2020.0,1.0,20.0
7468,2020030901,NY,Biological,2,DFA,Normal,8.0,HHS-ASPR,4480DRNYHHS-ASPR06,DR,...,As directed by and in coordination with FEMA i...,8368.0,DR-4480-NY,2020-01-20 00:00:00+00:00,36.0,,COVID-19 PANDEMIC,2020.0,1.0,20.0


In [82]:
# RakelO with Gaussian Naive Bayes as the base learner, 30 models requested.
# NOTE(review): labelset_size=16 equals the number of one-hot label columns in
# y_train_transformed, so each labelset presumably spans the whole label space
# rather than a random subset of it — confirm this is intended.
rakel_base_learner = GaussianNB()
classifier_rake = RakelO(
    base_classifier=rakel_base_learner,
    base_classifier_require_dense=[True, True],
    labelset_size=16,
    model_count=30,
)

# Fit, then predict on the training features themselves (in-sample predictions).
classifier_rake.fit(X_train_transformed, y_train_transformed)
predictions_rake = classifier_rake.predict(X_train_transformed)



In [83]:
# Convert the RakelO predictions to a dense array via .toarray()
# (presumably a SciPy sparse matrix is returned by predict — TODO confirm).
predictions_rake_dense = predictions_rake.toarray()

In [84]:
# Exact-match ("subset") accuracy of the RakelO model: a row counts as correct
# only when its full predicted label vector equals the true one.
# Measured against the training targets, so this is a training-set score.
subset_accuracy_rake = accuracy_score(y_train_transformed, predictions_rake_dense)
print(f"\nSubset Accuracy (Exact Match): {subset_accuracy_rake:.2f}")


Subset Accuracy (Exact Match): 0.04


In [85]:
# Boolean mask over rows: True where RakelO predicted at least one label.
rows_with_any_one_rake = (predictions_rake_dense == 1).any(axis=1)
# Positions of those rows.
rake_positive_positions = np.flatnonzero(rows_with_any_one_rake)

print(rows_with_any_one_rake.size)   # total number of rows
print(rake_positive_positions.size)  # rows with at least one predicted label
rake_positive_positions

5894
5894


array([   0,    1,    2, ..., 5891, 5892, 5893])

In [91]:
# Compare RakelO's predicted label vector with the true one-hot target for the
# first row, addressed by position in both structures.
sample_pos = 0
print(predictions_rake_dense[sample_pos])
print(y_train_transformed.iloc[sample_pos])

[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
onehot__supportFunction_0.0     0.0
onehot__supportFunction_1.0     0.0
onehot__supportFunction_2.0     0.0
onehot__supportFunction_3.0     0.0
onehot__supportFunction_4.0     0.0
onehot__supportFunction_5.0     0.0
onehot__supportFunction_6.0     0.0
onehot__supportFunction_7.0     0.0
onehot__supportFunction_8.0     0.0
onehot__supportFunction_9.0     0.0
onehot__supportFunction_10.0    0.0
onehot__supportFunction_11.0    0.0
onehot__supportFunction_12.0    0.0
onehot__supportFunction_13.0    1.0
onehot__supportFunction_14.0    0.0
onehot__supportFunction_15.0    0.0
Name: 0, dtype: float64


In [90]:
# Print the source record at position 5884 (positional lookup via .iloc);
# the same record is also displayed by an earlier cell.
print(df.iloc[5884])

incidentId                                                        2020030901
state                                                                     DE
incidentType                                                      Biological
region                                                                     3
maType                                                                   DFA
maPriority                                                              High
supportFunction                                                          8.0
agencyId                                                                 HHS
maId                                                           4526DRDEHHS06
declarationType                                                           DR
assistanceRequested        Clinical Staff (Non-Vaccinators)-- \r\nDelawar...
statementOfWork            In support of Delaware request, as directed by...
index                                                                 4949.0

In [92]:
# Preview the first five source records again (same view as right after loading).
df.head(5)

Unnamed: 0,incidentId,state,incidentType,region,maType,maPriority,supportFunction,agencyId,maId,declarationType,...,statementOfWork,index,femaDeclarationString,incidentBeginDate,fipsStateCode,designatedIncidentTypes,declarationTitle,year,month,day
45,2018072401,CA,Fire,9,FOS,High,13.0,DHS-FPS,4382DRCAFPS01,DR,...,"As directed by an in coordination with FEMA, F...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
66,2018072401,CA,Fire,9,FOS,High,10.0,EPA,4382DRCAEPA01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
136,2018072401,CA,Fire,9,FOS,Lifesaving,1.0,DOT,4382DRCADOT01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
159,2018072401,CA,Fire,9,FOS,High,12.0,DOE,4382DRCADOE01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
206,2018072401,CA,Fire,9,FOS,Lifesaving,5.0,DOD,4382DRCADOD01,DR,...,"As directed by and in coordination with FEMA, ...",10006.0,DR-4382-CA,2018-07-23 00:00:00+00:00,6.0,,WILDFIRES AND HIGH WINDS,2018.0,7.0,23.0
