In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import clone
from scipy import stats
import sklearn.preprocessing
import pickle
from tqdm import tqdm

import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir

import random



In [2]:
# Labelled, preprocessed posts (one JSON record per line), generated in analye-data.ipynb.
df = pd.read_json(
    path_or_buf="Trec_data/Preprocessed_labelled.json",
    lines=True,
    orient='records',
)
df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText,entities,extended_entities,favorite_count,hashtagEntities,...,num_terms,num_unique_words,num_urls,num_words,vader compound,vader neg,vader neu,vader pos,regression_priority,sparseCategories
0,fireColorado2012,wildfire,212365530391252993,[Factoid],Low,"The High Park fire west of Fort Collins, #CO h...","{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,25,28,0,29,-0.3400,0.091,0.909,0.000,0.25,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,fireColorado2012,wildfire,217744670753689603,[MultimediaShare],Low,Pic of the #FlagstaffFire in boulder from our ...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...","{'media': [{'sizes': {'small': {'w': 510, 'res...",0.0,,...,10,11,1,11,0.0000,0.000,1.000,0.000,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,25,25,0,28,0.0000,0.000,1.000,0.000,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,fireColorado2012,wildfire,217030749856088066,[],Low,I have it on good authority that most of Color...,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",,0.0,,...,26,29,0,32,0.0552,0.177,0.610,0.214,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,23,25,0,26,0.0000,0.000,1.000,0.000,0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72468,covidNewZealand2020,covid,1296006183178784768,"[FirstPartyObservation, MultimediaShare, Advice]",Low,Its personal choice to wear a mask\nDon't put ...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1296006181022916608, 'id_str...",19.0,,...,24,21,1,24,-0.3412,0.099,0.901,0.000,0.25,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
72469,covidNewZealand2020,covid,1296214212046237698,[Irrelevant],Low,The Government did the right thing. They shoul...,"{'hashtags': [], 'symbols': [], 'user_mentions...",,1.0,,...,16,16,1,18,-0.0516,0.173,0.663,0.163,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
72470,houstonExplosion2020,explosion,1220704310520094720,"[ThirdPartyObservation, Location, EmergingThre...",Low,Um. Jon? You get jolted awake early this morni...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,10,12,1,13,0.0000,0.000,1.000,0.000,0.25,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."
72471,texasAMCommerceShooting2020,shooting,1224419435043123200,"[ThirdPartyObservation, Location, MultimediaSh...",Low,"2 dead, 1 hurt in shooting at college residenc...","{'hashtags': [{'text': 'SmartNews', 'indices':...",,0.0,,...,12,13,1,13,-0.8271,0.490,0.510,0.000,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, ..."


# **`Generate Event Types`**

In [3]:
# Every distinct event type in df, in order of first appearance.
fullEventTypes = df['eventType'].unique()

# Keep only the event types that span more than one distinct eventID
# (presumably so one event can be held out while others of the same type remain).
eventTypes = [
    event_type
    for event_type in fullEventTypes
    if df.loc[df['eventType'] == event_type]['eventID'].unique().size > 1
]
print(eventTypes)

['wildfire', 'earthquake', 'flood', 'typhoon', 'shooting', 'bombing', 'covid', 'explosion', 'storm']


In [4]:
# Event ensembles: for each target event type, the list of event types whose rows
# are pooled (via df['eventType'].isin(...)) when building that type's models.
# The target type itself is always listed first.
# Similar events are based on the "Pairs of Event Types" similarity matrix found in the ReadMe.
# NOTE(review): 'hostage', 'fire' and 'tornado' are not among the multi-event types
# printed by the previous cell; they only contribute rows if df contains such
# eventType values -- confirm against the data.
EventEnsembles = {
    'wildfire': ['wildfire', 'flood', 'typhoon'],
    'earthquake': ['earthquake', 'typhoon', 'flood'],
    'flood': ['flood', 'typhoon', 'wildfire', 'earthquake'],
    'typhoon': ['typhoon', 'flood', 'storm'],
    'shooting': ['shooting', 'bombing', 'hostage', 'covid'],
    'bombing': ['bombing', 'typhoon', 'earthquake'],
    'covid': ['covid', 'typhoon', 'earthquake'],
    # Fixed: this entry used to read 'tornade', which does not match the
    # 'tornado' spelling used by the 'storm' ensemble below.
    'explosion': ['explosion', 'tornado', 'fire', 'wildfire'],
    'storm': ['storm', 'tornado', 'typhoon', 'flood'],
}

# **`Model Related Methods`**

In [5]:
def train_data(data, column, heldout_ids):
    """Return the rows of ``data`` whose ``column`` value is NOT held out.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    column : str
        Name of the column holding the ids to compare against.
    heldout_ids : scalar or list-like
        A single id, or a collection of ids (list, tuple, set, array, Series).
        A lone string counts as a single id.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` where ``data[column]`` is not in ``heldout_ids``.
    """
    # Wrap a lone id; hand any list-like collection to isin() unchanged.
    # The earlier `type(heldout_ids) != list` test also wrapped tuples, sets and
    # arrays, so the whole collection was compared as if it were one id.
    if not pd.api.types.is_list_like(heldout_ids):
        heldout_ids = [heldout_ids]

    return data.loc[~data[column].isin(heldout_ids)]

def test_data(data, column, heldout_ids):
    if type(heldout_ids) != list:
        heldout_ids = [heldout_ids]
    test = data.loc[data[column].isin(heldout_ids)]
    
    return test

In [6]:
#Currently unused
def save_model(model, filename):
    """Pickle ``model`` to ``filename`` (opened in binary write mode, so an
    existing file is overwritten).

    Parameters
    ----------
    model : object
        Any picklable object.
    filename : str or path-like
        Destination path.
    """
    # Use a context manager so the handle is closed even if pickling fails;
    # the previous bare open() never closed the file explicitly.
    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)
    
def load_model(filename):
    """Load and return the object pickled in ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Path of a pickle file.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load files you trust.
    # The context manager closes the handle; the previous bare open() never did.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

# **`Generate Generic Variables`**

In [7]:
# Columns of df used as model inputs (selected with data[features]).
features = [
    "num_chars",
    "num_chars_total",
    "num_terms",
    "num_words",
    "num_unique_words",
    "vader neg",
    "vader pos",
    "vader neu",
    "vader compound",
    "num_hashtags",
    "num_mentions",
    "num_urls",
    "is_retweet",
    "num_media",
    "is_verified",
    "caps_ratio",
]

# NOTE (kept from the original author): "I think you need to make a list of lists"
# -- it is unclear what this refers to; TODO clarify or remove.

# Keyword arguments passed to RandomForestClassifier(**rf_params).
rf_params = dict(
    random_state=1337,
    class_weight='balanced',
    n_estimators=128,
    n_jobs=-1,
    max_depth=50,
    max_features=14,
    min_samples_leaf=33,
    min_samples_split=96,
    verbose=0,
)

# **`Generate and Test postPriority Models`**

In [8]:
def generate_scores_by_event_Prio(data, event, features, target, modelType):
    """Leave-one-event-out evaluation of ``modelType`` on a single-label target.

    For every distinct ``eventID`` whose ``eventType`` equals ``event``, a fresh
    clone of ``modelType`` is fitted on all rows of ``data`` that do not belong
    to that eventID and evaluated on the rows that do.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'eventType', 'eventID', ``features`` and ``target``.
    event : str
        Event type whose events are held out one at a time (also the progress-bar label).
    features : list of str
        Input columns.
    target : str
        Label column.
    modelType : estimator
        Unfitted estimator; it is cloned for every fold and never fitted itself.

    Returns
    -------
    list
        ``[scores, f1s, label_scores, label_f1s]``:
        index 0 -- ``model.score`` per fold; index 1 -- macro F1 per fold;
        index 2 / 3 -- dicts mapping each label to the per-fold score / macro F1
        computed only on the test rows carrying that label. Labels that never
        occurred in any held-out fold are left out of both dicts.
    """
    def _as_array(column):
        # A Series is rebuilt element by element into an ndarray;
        # anything else is handed back untouched.
        if not isinstance(column, pd.Series):
            return column
        return np.array([np.array(item) for item in column])

    known_labels = data[target].unique()
    per_label_f1 = {lbl: [] for lbl in known_labels}
    per_label_score = {lbl: [] for lbl in known_labels}
    fold_scores = []
    fold_f1s = []

    held_out_candidates = data.loc[data['eventType'] == event]['eventID'].unique()
    for held_out in tqdm(held_out_candidates, position=1, desc=event):
        # Split: everything except the held-out event trains, the event itself tests.
        train_fold = train_data(data, 'eventID', held_out)
        test_fold = test_data(data, 'eventID', held_out)

        X_train, X_test = train_fold[features], test_fold[features]
        y_train = _as_array(train_fold[target])
        y_test = _as_array(test_fold[target])

        # Fit a fresh copy so folds never share fitted state.
        fitted = clone(modelType)
        fitted.fit(X_train, y_train)

        # Whole-fold metrics.
        predictions = fitted.predict(X_test)
        fold_f1s.append(f1_score(y_test, predictions, average="macro", zero_division=0))
        fold_scores.append(fitted.score(X_test, y_test))

        # Metrics restricted to the test rows of each true label.
        for lbl in known_labels:
            mask = test_fold[target] == lbl
            X_subset = X_test[mask]
            if X_subset.size == 0:
                # This label does not occur in the held-out event.
                continue

            y_subset = y_test[mask]
            pred_subset = predictions[mask]
            per_label_f1[lbl].append(
                f1_score(y_subset, pred_subset, average="macro", zero_division=0)
            )
            per_label_score[lbl].append(fitted.score(X_subset, y_subset))

    # Drop labels that were never scored, so callers do not average empty lists.
    scored = [lbl for lbl in known_labels if len(per_label_f1[lbl]) != 0]
    per_label_score = {lbl: per_label_score[lbl] for lbl in scored}
    per_label_f1 = {lbl: per_label_f1[lbl] for lbl in scored}

    # Accuracy is 0, F1 is 1, label Acc is 2, label F1 is 3
    return [fold_scores, fold_f1s, per_label_score, per_label_f1]

In [9]:
prioLabel = 'postPriority'
prioModel = RandomForestClassifier(**rf_params)

# Scores of the ensemble priority model, keyed by event type.
# (The equivalent run over all of df, genPrioScores, is disabled in this notebook.)
ensPrioScores = {}

for event in tqdm(eventTypes, position=0, desc='Events'):
    # Only rows whose event type belongs to this event's ensemble are used.
    eventDF = df[df['eventType'].isin(EventEnsembles[event])]
    ensPrioScores[event] = generate_scores_by_event_Prio(
        eventDF, event, features, prioLabel, prioModel
    )
    

Events:   0%|          | 0/9 [00:00<?, ?it/s]
wildfire:   0%|          | 0/5 [00:00<?, ?it/s][A
wildfire:  20%|██        | 1/5 [00:09<00:39,  9.90s/it][A
wildfire:  40%|████      | 2/5 [00:20<00:29,  9.99s/it][A
wildfire:  60%|██████    | 3/5 [00:29<00:19,  9.77s/it][A
wildfire:  80%|████████  | 4/5 [00:38<00:09,  9.63s/it][A
wildfire: 100%|██████████| 5/5 [00:48<00:00,  9.74s/it][A
Events:  11%|█         | 1/9 [00:48<06:30, 48.80s/it]
earthquake:   0%|          | 0/11 [00:00<?, ?it/s][A
earthquake:   9%|▉         | 1/11 [00:11<01:50, 11.08s/it][A
earthquake:  18%|█▊        | 2/11 [00:22<01:40, 11.15s/it][A
earthquake:  27%|██▋       | 3/11 [00:33<01:29, 11.25s/it][A
earthquake:  36%|███▋      | 4/11 [00:45<01:18, 11.25s/it][A
earthquake:  45%|████▌     | 5/11 [00:54<01:04, 10.76s/it][A
earthquake:  55%|█████▍    | 6/11 [01:06<00:54, 10.98s/it][A
earthquake:  64%|██████▎   | 7/11 [01:17<00:43, 11.00s/it][A
earthquake:  73%|███████▎  | 8/11 [01:27<00:32, 10.88s/it][A
eart

In [10]:
#Store prio scores in readable format
labels = ['Low', 'Medium', 'High', 'Critical']

prioEnsScoreDf = pd.DataFrame()
for event in eventTypes:
    row = pd.Series(
        {
            'ensScores': ensPrioScores[event][0:1],
            'ensLabelScores': ensPrioScores[event][2:3],
            'avgAccEns': np.mean(ensPrioScores[event][0]),
            'avgF1Ens': np.mean(ensPrioScores[event][1]),
            'semAccEns': stats.sem(ensPrioScores[event][0]),
            'semF1Ens': stats.sem(ensPrioScores[event][1])
        }, name=event)
    #Add label specific columns
    for label in labels:
        if label in label in ensPrioScores[event][2]:
            labelCol = pd.Series(
                {
                    'avgAccEns' + label: np.mean(ensPrioScores[event][2][label]),
                    'avgF1Ens' + label: np.mean(ensPrioScores[event][3][label]),
                    'semAccEns' + label: stats.sem(ensPrioScores[event][2][label]),
                    'semF1Ens' + label: stats.sem(ensPrioScores[event][3][label])
                })
            row = row.append(labelCol)
    row.name = event
    prioEnsScoreDf = prioEnsScoreDf.append(row)
    
#Reorder for easy readability
cols = ['ensScores', 'ensLabelScores']
scoreTypes = ['F1', 'Acc']
accumTypes = ['avg', 'sem']
testTypes = ['Ens']

for score in scoreTypes:
    for accum in accumTypes:
        for test in testTypes:
            cols.append(accum+score+test)
            for label in labels:
                cols.append(accum+score+test+label)

print(cols)
prioEnsScoreDf = prioEnsScoreDf[cols]
prioEnsScoreDf

['ensScores', 'ensLabelScores', 'avgF1Ens', 'avgF1EnsLow', 'avgF1EnsMedium', 'avgF1EnsHigh', 'avgF1EnsCritical', 'semF1Ens', 'semF1EnsLow', 'semF1EnsMedium', 'semF1EnsHigh', 'semF1EnsCritical', 'avgAccEns', 'avgAccEnsLow', 'avgAccEnsMedium', 'avgAccEnsHigh', 'avgAccEnsCritical', 'semAccEns', 'semAccEnsLow', 'semAccEnsMedium', 'semAccEnsHigh', 'semAccEnsCritical']


Unnamed: 0,ensScores,ensLabelScores,avgF1Ens,avgF1EnsLow,avgF1EnsMedium,avgF1EnsHigh,avgF1EnsCritical,semF1Ens,semF1EnsLow,semF1EnsMedium,...,avgAccEns,avgAccEnsLow,avgAccEnsMedium,avgAccEnsHigh,avgAccEnsCritical,semAccEns,semAccEnsLow,semAccEnsMedium,semAccEnsHigh,semAccEnsCritical
wildfire,"[[0.3688212927756654, 0.361890694239291, 0.583...","[{'Low': [0.4657534246575342, 0.41698841698841...",0.202251,0.153688,0.092522,0.174069,0.020915,0.015307,0.014953,0.027397,...,0.443695,0.505617,0.242259,0.425467,0.044643,0.040028,0.053035,0.084683,0.057333,0.02247
earthquake,"[[0.5182186234817814, 0.43506493506493504, 0.3...","[{'Low': [0.5658536585365853, 0.69565217391304...",0.230435,0.172174,0.149301,0.079489,0.017565,0.019924,0.009056,0.044878,...,0.468017,0.536775,0.330394,0.197041,0.033438,0.027626,0.042061,0.084269,0.046104,0.017416
flood,"[[0.4425531914893617, 0.40274599542334094, 0.4...","[{'Low': [0.4766355140186916, 0.53979238754325...",0.215031,0.143717,0.102911,0.132326,0.070643,0.022735,0.008549,0.021419,...,0.428595,0.479972,0.269458,0.36894,0.123637,0.026688,0.032778,0.059105,0.065021,0.043699
typhoon,"[[0.5368852459016393, 0.4308510638297872, 0.42...","[{'Medium': [0.0410958904109589, 0.12558139534...",0.248919,0.183633,0.099873,0.094797,0.060591,0.01698,0.009066,0.018183,...,0.478765,0.595937,0.241202,0.231571,0.128453,0.026825,0.045358,0.050312,0.046748,0.038132
shooting,"[[0.37037037037037035, 0.38048343777976723, 0....","[{'Low': [0.4727272727272727, 0.34799482535575...",0.227856,0.142436,0.219044,0.062413,0.060135,0.030391,0.010477,0.088474,...,0.383667,0.409552,0.438659,0.145323,0.137591,0.034332,0.042553,0.084873,0.031787,0.044666
bombing,"[[0.483695652173913, 0.5121495327102804, 0.336...","[{'Low': [0.6181818181818182, 0.58947368421052...",0.272539,0.167678,0.117551,0.121924,0.072222,0.045026,0.020606,0.015604,...,0.444028,0.515559,0.311739,0.333228,0.148148,0.054516,0.088657,0.053635,0.08642,0.097991
covid,"[[0.4195501730103806, 0.24949939927913498, 0.5...","[{'Low': [0.44017563117453345, 0.2347753044939...",0.19092,0.132536,0.182156,0.057395,0.002717,0.022647,0.011016,0.010244,...,0.359603,0.372216,0.477145,0.121939,0.005556,0.033822,0.042899,0.02296,0.028266,0.005556
explosion,"[[0.5399293286219081, 0.5771543086172345]]","[{'Low': [0.5689922480620155, 0.65365853658536...",0.202261,0.151586,0.081275,0.0786,0.022727,0.020224,0.006526,2.5e-05,...,0.558542,0.611325,0.255002,0.21131,0.05,0.018612,0.042333,0.0001,0.002976,0.05
storm,"[[0.4495548961424332, 0.41]]","[{'Medium': [0.43952802359882004, 0.4792899408...",0.361336,0.141995,0.157332,0.154236,0.079845,0.006135,0.023751,0.004668,...,0.429777,0.402801,0.459409,0.446081,0.191209,0.019777,0.093067,0.019881,0.003373,0.037363


In [11]:
# Priority scores saved by an earlier run, read back from disk.
prioScoreDf = pd.read_json(path_or_buf="Trec_data/prioScoreDF.json")
prioScoreDf

Unnamed: 0,genScores,specScores,genLabelScores,specLabelScores,avgF1Gen,avgF1GenLow,avgF1GenMedium,avgF1GenHigh,avgF1GenCritical,avgF1Spec,...,semAccGen,semAccGenLow,semAccGenMedium,semAccGenHigh,semAccGenCritical,semAccSpec,semAccSpecLow,semAccSpecMedium,semAccSpecHigh,semAccSpecCritical
wildfire,"[[0.3155893536, 0.3855243722, 0.5748598879, 0....","[[0.319391635, 0.3633677991, 0.5364291433, 0.5...","[{'Low': [0.3835616438, 0.42471042470000003, 0...","[{'Low': [0.40410958900000005, 0.4324324324, 0...",0.202101,0.1416,0.130968,0.106043,0.127141,0.187077,...,0.05021,0.065923,0.104937,0.084171,0.14989,0.054762,0.079378,0.056346,0.064275,0.29068
earthquake,"[[0.4251012146, 0.3961038961, 0.3495145631, 0....","[[0.3076923077, 0.4025974026, 0.2330097087, 0....","[{'Low': [0.4390243902, 0.6304347826000001, 0....","[{'Low': [0.31219512200000005, 0.5652173913, 0...",0.19879,0.155774,0.136921,0.13299,0.006173,0.185111,...,0.030326,0.04304,0.060163,0.039883,0.01,0.037698,0.059038,0.07196,0.066753,0.025852
flood,"[[0.4425531915, 0.38215102970000003, 0.4695290...","[[0.4553191489, 0.5102974828, 0.55540166200000...","[{'Low': [0.46728971960000004, 0.4982698962, 0...","[{'Medium': [0.2753623188, 0.0, 0.1025641026, ...",0.222481,0.148776,0.115905,0.111624,0.13142,0.264798,...,0.02539,0.031716,0.055928,0.053553,0.106968,0.027955,0.029163,0.064864,0.037964,0.07769
typhoon,"[[0.4221311475, 0.4397163121, 0.46875000000000...","[[0.49590163930000003, 0.384751773, 0.41666666...","[{'Low': [0.6058394161, 0.5637583893, 0.666666...","[{'Medium': [0.0821917808, 0.0511627907, 0.105...",0.217954,0.145295,0.141466,0.104312,0.012844,0.251374,...,0.02831,0.04057,0.057983,0.033915,0.01751,0.023641,0.038407,0.058892,0.057699,0.031805
shooting,"[[0.3765432099, 0.3518352731, 0.6669335468, 0....","[[0.3888888889, 0.4010743062, 0.615692554, 0.2...","[{'Low': [0.3818181818, 0.3285899094, 0.696207...","[{'Low': [0.4909090909, 0.3893919793, 0.637835...",0.252764,0.161721,0.220437,0.091062,0.064464,0.231618,...,0.050532,0.059765,0.086671,0.05874,0.03832,0.045154,0.050021,0.073222,0.035149,0.049598
bombing,"[[0.4565217391, 0.5457943925000001, 0.42829457...","[[0.3206521739, 0.4485981308, 0.4646317829]]","[{'Low': [0.6000000000000001, 0.6263157895, 0....","[{'Medium': [0.0769230769, 0.2558139535, 0.164...",0.2864,0.177861,0.130975,0.110299,0.083586,0.228205,...,0.035412,0.057187,0.064239,0.055244,0.08114,0.045557,0.012782,0.051644,0.121363,0.065385
covid,"[[0.4074394464, 0.2843412095, 0.5410286612, 0....","[[0.5308535179, 0.3219863837, 0.58029053790000...","[{'Low': [0.422978412, 0.2717345653, 0.5803876...","[{'Medium': [0.368869936, 0.4757281553, 0.2571...",0.157048,0.106401,0.143802,0.072371,0.005319,0.213951,...,0.031436,0.038061,0.025847,0.032613,0.011111,0.027976,0.041091,0.034505,0.028679,0.0
explosion,"[[0.2706713781, 0.4649298597]]","[[0.15689045940000002, 0.6913827655]]","[{'Low': [0.2542635659, 0.5024390244], 'Medium...","[{'Low': [0.1387596899, 0.8048780488], 'Medium...",0.154832,0.124148,0.137005,0.104779,0.030303,0.201402,...,0.097129,0.124088,0.023709,0.136905,0.05,0.267246,0.333059,0.02521,0.255952,0.116667
storm,"[[0.4347181009, 0.386]]","[[0.3419881306, 0.35000000000000003]]","[{'Low': [0.4669421488, 0.2831858407], 'Medium...","[{'High': [0.3702185792, 0.5520833333], 'Criti...",0.32844,0.13475,0.155279,0.150199,0.055771,0.275047,...,0.024359,0.091878,0.011005,0.002305,0.095055,0.004006,0.1217,0.015011,0.090932,0.000549


In [12]:
# Put the freshly computed ensemble columns next to the previously saved scores.
# The combined frame is later written back to the same prioScoreDF.json that
# prioScoreDf was read from, so on a re-run the saved frame already carries the
# ensemble columns. Those stale copies are dropped first; otherwise the concat
# would produce duplicate column names (which to_json cannot serialise).
staleEnsColumns = prioScoreDf.columns.isin(prioEnsScoreDf.columns)
prioBothScoreDf = pd.concat(
    [prioScoreDf.loc[:, ~staleEnsColumns], prioEnsScoreDf],
    axis=1,
)
prioBothScoreDf

Unnamed: 0,genScores,specScores,genLabelScores,specLabelScores,avgF1Gen,avgF1GenLow,avgF1GenMedium,avgF1GenHigh,avgF1GenCritical,avgF1Spec,...,avgAccEns,avgAccEnsLow,avgAccEnsMedium,avgAccEnsHigh,avgAccEnsCritical,semAccEns,semAccEnsLow,semAccEnsMedium,semAccEnsHigh,semAccEnsCritical
wildfire,"[[0.3155893536, 0.3855243722, 0.5748598879, 0....","[[0.319391635, 0.3633677991, 0.5364291433, 0.5...","[{'Low': [0.3835616438, 0.42471042470000003, 0...","[{'Low': [0.40410958900000005, 0.4324324324, 0...",0.202101,0.1416,0.130968,0.106043,0.127141,0.187077,...,0.443695,0.505617,0.242259,0.425467,0.044643,0.040028,0.053035,0.084683,0.057333,0.02247
earthquake,"[[0.4251012146, 0.3961038961, 0.3495145631, 0....","[[0.3076923077, 0.4025974026, 0.2330097087, 0....","[{'Low': [0.4390243902, 0.6304347826000001, 0....","[{'Low': [0.31219512200000005, 0.5652173913, 0...",0.19879,0.155774,0.136921,0.13299,0.006173,0.185111,...,0.468017,0.536775,0.330394,0.197041,0.033438,0.027626,0.042061,0.084269,0.046104,0.017416
flood,"[[0.4425531915, 0.38215102970000003, 0.4695290...","[[0.4553191489, 0.5102974828, 0.55540166200000...","[{'Low': [0.46728971960000004, 0.4982698962, 0...","[{'Medium': [0.2753623188, 0.0, 0.1025641026, ...",0.222481,0.148776,0.115905,0.111624,0.13142,0.264798,...,0.428595,0.479972,0.269458,0.36894,0.123637,0.026688,0.032778,0.059105,0.065021,0.043699
typhoon,"[[0.4221311475, 0.4397163121, 0.46875000000000...","[[0.49590163930000003, 0.384751773, 0.41666666...","[{'Low': [0.6058394161, 0.5637583893, 0.666666...","[{'Medium': [0.0821917808, 0.0511627907, 0.105...",0.217954,0.145295,0.141466,0.104312,0.012844,0.251374,...,0.478765,0.595937,0.241202,0.231571,0.128453,0.026825,0.045358,0.050312,0.046748,0.038132
shooting,"[[0.3765432099, 0.3518352731, 0.6669335468, 0....","[[0.3888888889, 0.4010743062, 0.615692554, 0.2...","[{'Low': [0.3818181818, 0.3285899094, 0.696207...","[{'Low': [0.4909090909, 0.3893919793, 0.637835...",0.252764,0.161721,0.220437,0.091062,0.064464,0.231618,...,0.383667,0.409552,0.438659,0.145323,0.137591,0.034332,0.042553,0.084873,0.031787,0.044666
bombing,"[[0.4565217391, 0.5457943925000001, 0.42829457...","[[0.3206521739, 0.4485981308, 0.4646317829]]","[{'Low': [0.6000000000000001, 0.6263157895, 0....","[{'Medium': [0.0769230769, 0.2558139535, 0.164...",0.2864,0.177861,0.130975,0.110299,0.083586,0.228205,...,0.444028,0.515559,0.311739,0.333228,0.148148,0.054516,0.088657,0.053635,0.08642,0.097991
covid,"[[0.4074394464, 0.2843412095, 0.5410286612, 0....","[[0.5308535179, 0.3219863837, 0.58029053790000...","[{'Low': [0.422978412, 0.2717345653, 0.5803876...","[{'Medium': [0.368869936, 0.4757281553, 0.2571...",0.157048,0.106401,0.143802,0.072371,0.005319,0.213951,...,0.359603,0.372216,0.477145,0.121939,0.005556,0.033822,0.042899,0.02296,0.028266,0.005556
explosion,"[[0.2706713781, 0.4649298597]]","[[0.15689045940000002, 0.6913827655]]","[{'Low': [0.2542635659, 0.5024390244], 'Medium...","[{'Low': [0.1387596899, 0.8048780488], 'Medium...",0.154832,0.124148,0.137005,0.104779,0.030303,0.201402,...,0.558542,0.611325,0.255002,0.21131,0.05,0.018612,0.042333,0.0001,0.002976,0.05
storm,"[[0.4347181009, 0.386]]","[[0.3419881306, 0.35000000000000003]]","[{'Low': [0.4669421488, 0.2831858407], 'Medium...","[{'High': [0.3702185792, 0.5520833333], 'Criti...",0.32844,0.13475,0.155279,0.150199,0.055771,0.275047,...,0.429777,0.402801,0.459409,0.446081,0.191209,0.019777,0.093067,0.019881,0.003373,0.037363


In [13]:
#Save prio scores
prioBothScoreDf.to_json("Trec_data/prioScoreDF.json")

filename = 'Trec_data/prio_results.pkl'
outfile = open(filename,'wb')
pickle.dump(prioBothScoreDf, outfile)
outfile.close()

In [14]:
# Read the saved JSON back, replacing the in-memory frame with its on-disk form.
prioBothScoreDf = pd.read_json(path_or_buf="Trec_data/prioScoreDF.json")
prioBothScoreDf

Unnamed: 0,genScores,specScores,genLabelScores,specLabelScores,avgF1Gen,avgF1GenLow,avgF1GenMedium,avgF1GenHigh,avgF1GenCritical,avgF1Spec,...,avgAccEns,avgAccEnsLow,avgAccEnsMedium,avgAccEnsHigh,avgAccEnsCritical,semAccEns,semAccEnsLow,semAccEnsMedium,semAccEnsHigh,semAccEnsCritical
wildfire,"[[0.3155893536, 0.3855243722, 0.5748598879, 0....","[[0.319391635, 0.3633677991, 0.5364291433, 0.5...","[{'Low': [0.3835616438, 0.42471042470000003, 0...","[{'Low': [0.40410958900000005, 0.4324324324, 0...",0.202101,0.1416,0.130968,0.106043,0.127141,0.187077,...,0.443695,0.505617,0.242259,0.425467,0.044643,0.040028,0.053035,0.084683,0.057333,0.02247
earthquake,"[[0.4251012146, 0.3961038961, 0.3495145631, 0....","[[0.3076923077, 0.4025974026, 0.2330097087, 0....","[{'Low': [0.4390243902, 0.6304347826000001, 0....","[{'Low': [0.31219512200000005, 0.5652173913, 0...",0.19879,0.155774,0.136921,0.13299,0.006173,0.185111,...,0.468017,0.536775,0.330394,0.197041,0.033438,0.027626,0.042061,0.084269,0.046104,0.017416
flood,"[[0.4425531915, 0.38215102970000003, 0.4695290...","[[0.4553191489, 0.5102974828, 0.55540166200000...","[{'Low': [0.46728971960000004, 0.4982698962, 0...","[{'Medium': [0.2753623188, 0.0, 0.1025641026, ...",0.222481,0.148776,0.115905,0.111624,0.13142,0.264798,...,0.428595,0.479972,0.269458,0.36894,0.123637,0.026688,0.032778,0.059105,0.065021,0.043699
typhoon,"[[0.4221311475, 0.4397163121, 0.46875000000000...","[[0.49590163930000003, 0.384751773, 0.41666666...","[{'Low': [0.6058394161, 0.5637583893, 0.666666...","[{'Medium': [0.0821917808, 0.0511627907, 0.105...",0.217954,0.145295,0.141466,0.104312,0.012844,0.251374,...,0.478765,0.595937,0.241202,0.231571,0.128453,0.026825,0.045358,0.050312,0.046748,0.038132
shooting,"[[0.3765432099, 0.3518352731, 0.6669335468, 0....","[[0.3888888889, 0.4010743062, 0.615692554, 0.2...","[{'Low': [0.3818181818, 0.3285899094, 0.696207...","[{'Low': [0.4909090909, 0.3893919793, 0.637835...",0.252764,0.161721,0.220437,0.091062,0.064464,0.231618,...,0.383667,0.409552,0.438659,0.145323,0.137591,0.034332,0.042553,0.084873,0.031787,0.044666
bombing,"[[0.4565217391, 0.5457943925000001, 0.42829457...","[[0.3206521739, 0.4485981308, 0.4646317829]]","[{'Low': [0.6000000000000001, 0.6263157895, 0....","[{'Medium': [0.0769230769, 0.2558139535, 0.164...",0.2864,0.177861,0.130975,0.110299,0.083586,0.228205,...,0.444028,0.515559,0.311739,0.333228,0.148148,0.054516,0.088657,0.053635,0.08642,0.097991
covid,"[[0.4074394464, 0.2843412095, 0.5410286612, 0....","[[0.5308535179, 0.3219863837, 0.58029053790000...","[{'Low': [0.422978412, 0.2717345653, 0.5803876...","[{'Medium': [0.368869936, 0.4757281553, 0.2571...",0.157048,0.106401,0.143802,0.072371,0.005319,0.213951,...,0.359603,0.372216,0.477145,0.121939,0.005556,0.033822,0.042899,0.02296,0.028266,0.005556
explosion,"[[0.2706713781, 0.4649298597]]","[[0.15689045940000002, 0.6913827655]]","[{'Low': [0.2542635659, 0.5024390244], 'Medium...","[{'Low': [0.1387596899, 0.8048780488], 'Medium...",0.154832,0.124148,0.137005,0.104779,0.030303,0.201402,...,0.558542,0.611325,0.255002,0.21131,0.05,0.018612,0.042333,0.0001,0.002976,0.05
storm,"[[0.4347181009, 0.386]]","[[0.3419881306, 0.35000000000000003]]","[{'Low': [0.4669421488, 0.2831858407], 'Medium...","[{'High': [0.3702185792, 0.5520833333], 'Criti...",0.32844,0.13475,0.155279,0.150199,0.055771,0.275047,...,0.429777,0.402801,0.459409,0.446081,0.191209,0.019777,0.093067,0.019881,0.003373,0.037363


# **`Generate and Test postCategories Models`**

In [15]:
def generate_scores_by_event_Cat(data, event, features, target, modelType):
    """Leave-one-event-out evaluation of ``modelType`` on the ``target`` column.

    For every distinct ``eventID`` whose ``eventType`` equals ``event``, a fresh
    clone of ``modelType`` is fitted on all rows of ``data`` outside that eventID
    and evaluated on the rows inside it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'eventType', 'eventID', ``features`` and ``target``.
    event : str
        Event type whose events are held out one at a time (also the progress-bar label).
    features : list of str
        Input columns.
    target : str
        Label column; each cell is converted with ``np.array`` and the cells are
        stacked into one array (presumably one indicator vector per row -- confirm).
    modelType : estimator
        Unfitted estimator; it is cloned for every fold and never fitted itself.

    Returns
    -------
    list
        ``[scores, f1s]``: index 0 holds ``model.score`` per fold, index 1 the
        macro-averaged F1 per fold.
    """
    def _stack(column):
        # One ndarray per cell, stacked into a single array.
        return np.array([np.array(entry) for entry in column])

    fold_scores = []
    fold_f1s = []

    rows_of_type = data.loc[data['eventType'] == event]
    for held_out in tqdm(rows_of_type['eventID'].unique(), position=1, desc=event):
        # Split: everything except the held-out event trains, the event itself tests.
        train_fold = train_data(data, 'eventID', held_out)
        test_fold = test_data(data, 'eventID', held_out)

        X_train, X_test = train_fold[features], test_fold[features]
        y_train = _stack(train_fold[target])
        y_test = _stack(test_fold[target])

        # Fit a fresh copy so folds never share fitted state.
        fitted = clone(modelType)
        fitted.fit(X_train, y_train)

        # Score the held-out event.
        predictions = fitted.predict(X_test)
        fold_f1s.append(f1_score(y_test, predictions, average="macro", zero_division=0))
        fold_scores.append(fitted.score(X_test, y_test))

    # Accuracy is 0, F1 is 1
    return [fold_scores, fold_f1s]

In [16]:
catLabel = 'sparseCategories'
catModel = MultiOutputClassifier(RandomForestClassifier(**rf_params))

# Scores of the ensemble category model, keyed by event type.
# (The equivalent run over all of df, genCatScores, is disabled in this notebook.)
ensCatScores = {}

for event in tqdm(eventTypes, position=0, desc='Events'):
    # Only rows whose event type belongs to this event's ensemble are used.
    eventDF = df[df['eventType'].isin(EventEnsembles[event])]
    ensCatScores[event] = generate_scores_by_event_Cat(
        eventDF, event, features, catLabel, catModel
    )


Events:   0%|          | 0/9 [00:00<?, ?it/s]
wildfire:   0%|          | 0/5 [00:00<?, ?it/s][A
wildfire:  20%|██        | 1/5 [03:25<13:40, 205.21s/it][A
wildfire:  40%|████      | 2/5 [06:46<10:12, 204.15s/it][A
wildfire:  60%|██████    | 3/5 [09:50<06:35, 197.95s/it][A
wildfire:  80%|████████  | 4/5 [12:58<03:15, 195.15s/it][A
wildfire: 100%|██████████| 5/5 [16:21<00:00, 196.38s/it][A
Events:  11%|█         | 1/9 [16:21<2:10:55, 981.97s/it]
earthquake:   0%|          | 0/11 [00:00<?, ?it/s][A
earthquake:   9%|▉         | 1/11 [04:06<41:00, 246.01s/it][A
earthquake:  18%|█▊        | 2/11 [08:13<36:58, 246.48s/it][A
earthquake:  27%|██▋       | 3/11 [12:22<32:57, 247.24s/it][A
earthquake:  36%|███▋      | 4/11 [16:27<28:46, 246.62s/it][A
earthquake:  45%|████▌     | 5/11 [19:46<23:12, 232.12s/it][A
earthquake:  55%|█████▍    | 6/11 [23:53<19:43, 236.72s/it][A
earthquake:  64%|██████▎   | 7/11 [27:56<15:54, 238.73s/it][A
earthquake:  73%|███████▎  | 8/11 [31:46<11:48, 236

In [17]:
#Store cat scores in readable format
catEnsScoreDf = pd.DataFrame(columns=['ensScores',
                                   'avgAccEns', 'avgF1Ens',
                                   'semAccEns', 'semF1Ens'])
for event in eventTypes:
    row = pd.Series(
        {
            'ensScores': ensCatScores[event],
            'avgAccEns': np.mean(ensCatScores[event][0]),
            'avgF1Ens': np.mean(ensCatScores[event][1]),
            'semAccEns': stats.sem(ensCatScores[event][0]),
            'semF1Ens': stats.sem(ensCatScores[event][1])
        }, name=event)
    catEnsScoreDf = catEnsScoreDf.append(row)
    
catEnsScoreDf

Unnamed: 0,ensScores,avgAccEns,avgF1Ens,semAccEns,semF1Ens
wildfire,"[[0.0038022813688212928, 0.0029542097488921715...",0.009665,0.162375,0.006573,0.022227
earthquake,"[[0.008097165991902834, 0.006493506493506494, ...",0.013568,0.128629,0.003654,0.012552
flood,"[[0.0, 0.0, 0.0013850415512465374, 0.0, 0.0168...",0.01498,0.147314,0.006477,0.019037
typhoon,"[[0.0, 0.012411347517730497, 0.020833333333333...",0.017404,0.147333,0.003417,0.020923
shooting,"[[0.006172839506172839, 0.0035810205908683975,...",0.049364,0.136698,0.01189,0.016898
bombing,"[[0.010869565217391304, 0.014953271028037384, ...",0.010061,0.134627,0.003084,0.009953
covid,"[[0.022779700115340255, 0.016019223067681217, ...",0.013331,0.150677,0.002981,0.016965
explosion,"[[0.014840989399293287, 0.006012024048096192],...",0.010427,0.231288,0.004414,0.006194
storm,"[[0.014094955489614243, 0.0], [0.2721476073552...",0.007047,0.248874,0.007047,0.023274


In [18]:
# Category scores saved by an earlier run, read back from disk.
catScoreDf = pd.read_json(path_or_buf="Trec_data/catScoreDF.json")
catScoreDf

Unnamed: 0,genScores,specScores,avgAccGen,avgAccSpec,avgF1Gen,avgF1Spec,semAccGen,semAccSpec,semF1Gen,semF1Spec
wildfire,"[[0.0038022814, 0.0029542097, 0.0060048038, 0....","[[0.0, 0.0029542097, 0.0028022418000000003, 0....",0.013614,0.002852,0.15888,0.140397,0.007799,0.001553,0.02328,0.01892
earthquake,"[[0.0, 0.012987013, 0.019417475700000002, 0.00...","[[0.0, 0.0064935065, 0.0, 0.012861736300000001...",0.017292,0.008435,0.127264,0.126537,0.004763,0.002904,0.014041,0.014521
flood,"[[0.0170212766, 0.0, 0.0013850416, 0.00243309,...","[[0.0042553191, 0.0, 0.0013850416, 0.0, 0.0126...",0.022876,0.020919,0.143337,0.142961,0.008987,0.009397,0.019121,0.018898
typhoon,"[[0.0040983607, 0.0212765957, 0.03125, 0.01066...","[[0.0, 0.0177304965, 0.0208333333, 0.012696800...",0.016698,0.015797,0.146908,0.140081,0.002859,0.003128,0.02008,0.020394
shooting,"[[0.0, 0.0053715309, 0.040832666100000005, 0.0...","[[0.0061728395, 0.0044762757, 0.0760608487, 0....",0.034712,0.110057,0.145381,0.138891,0.008621,0.036107,0.01854,0.01721
bombing,"[[0.0217391304, 0.0074766355, 0.0208333333], [...","[[0.0, 0.0317757009, 0.0], [0.1100342447, 0.14...",0.016683,0.010592,0.143931,0.122796,0.004611,0.010592,0.012781,0.010933
covid,"[[0.0210495963, 0.013215859000000002, 0.016489...","[[0.0320069204, 0.018021626000000002, 0.038869...",0.015181,0.022111,0.147563,0.150193,0.00312,0.004062,0.016414,0.017765
explosion,"[[0.0148409894, 0.0040080160000000005], [0.225...","[[0.014134275600000001, 0.0180360721], [0.2362...",0.009425,0.016085,0.21877,0.236504,0.005416,0.001951,0.007173,0.000221
storm,"[[0.0140949555, 0.004], [0.2660248778, 0.22146...","[[0.0059347181, 0.008], [0.1981539273, 0.23248...",0.009047,0.006967,0.243746,0.215318,0.005047,0.001033,0.022279,0.017164


In [19]:
#Combine the new data into the old data
# The combined frame is later written back to the same JSON file that
# catScoreDf was loaded from, so on a re-run catScoreDf may already contain
# the ensemble columns. Drop any such stale columns first; otherwise concat
# would produce duplicate column names and the subsequent to_json would fail.
catBothScoreDf = pd.concat(
    [
        catScoreDf.drop(columns=catEnsScoreDf.columns, errors="ignore"),
        catEnsScoreDf,
    ],
    axis=1,
)
catBothScoreDf

Unnamed: 0,genScores,specScores,avgAccGen,avgAccSpec,avgF1Gen,avgF1Spec,semAccGen,semAccSpec,semF1Gen,semF1Spec,ensScores,avgAccEns,avgF1Ens,semAccEns,semF1Ens
wildfire,"[[0.0038022814, 0.0029542097, 0.0060048038, 0....","[[0.0, 0.0029542097, 0.0028022418000000003, 0....",0.013614,0.002852,0.15888,0.140397,0.007799,0.001553,0.02328,0.01892,"[[0.0038022813688212928, 0.0029542097488921715...",0.009665,0.162375,0.006573,0.022227
earthquake,"[[0.0, 0.012987013, 0.019417475700000002, 0.00...","[[0.0, 0.0064935065, 0.0, 0.012861736300000001...",0.017292,0.008435,0.127264,0.126537,0.004763,0.002904,0.014041,0.014521,"[[0.008097165991902834, 0.006493506493506494, ...",0.013568,0.128629,0.003654,0.012552
flood,"[[0.0170212766, 0.0, 0.0013850416, 0.00243309,...","[[0.0042553191, 0.0, 0.0013850416, 0.0, 0.0126...",0.022876,0.020919,0.143337,0.142961,0.008987,0.009397,0.019121,0.018898,"[[0.0, 0.0, 0.0013850415512465374, 0.0, 0.0168...",0.01498,0.147314,0.006477,0.019037
typhoon,"[[0.0040983607, 0.0212765957, 0.03125, 0.01066...","[[0.0, 0.0177304965, 0.0208333333, 0.012696800...",0.016698,0.015797,0.146908,0.140081,0.002859,0.003128,0.02008,0.020394,"[[0.0, 0.012411347517730497, 0.020833333333333...",0.017404,0.147333,0.003417,0.020923
shooting,"[[0.0, 0.0053715309, 0.040832666100000005, 0.0...","[[0.0061728395, 0.0044762757, 0.0760608487, 0....",0.034712,0.110057,0.145381,0.138891,0.008621,0.036107,0.01854,0.01721,"[[0.006172839506172839, 0.0035810205908683975,...",0.049364,0.136698,0.01189,0.016898
bombing,"[[0.0217391304, 0.0074766355, 0.0208333333], [...","[[0.0, 0.0317757009, 0.0], [0.1100342447, 0.14...",0.016683,0.010592,0.143931,0.122796,0.004611,0.010592,0.012781,0.010933,"[[0.010869565217391304, 0.014953271028037384, ...",0.010061,0.134627,0.003084,0.009953
covid,"[[0.0210495963, 0.013215859000000002, 0.016489...","[[0.0320069204, 0.018021626000000002, 0.038869...",0.015181,0.022111,0.147563,0.150193,0.00312,0.004062,0.016414,0.017765,"[[0.022779700115340255, 0.016019223067681217, ...",0.013331,0.150677,0.002981,0.016965
explosion,"[[0.0148409894, 0.0040080160000000005], [0.225...","[[0.014134275600000001, 0.0180360721], [0.2362...",0.009425,0.016085,0.21877,0.236504,0.005416,0.001951,0.007173,0.000221,"[[0.014840989399293287, 0.006012024048096192],...",0.010427,0.231288,0.004414,0.006194
storm,"[[0.0140949555, 0.004], [0.2660248778, 0.22146...","[[0.0059347181, 0.008], [0.1981539273, 0.23248...",0.009047,0.006967,0.243746,0.215318,0.005047,0.001033,0.022279,0.017164,"[[0.014094955489614243, 0.0], [0.2721476073552...",0.007047,0.248874,0.007047,0.023274


In [20]:
#Save cat scores
# JSON copy for easy reloading. Note: to_json rounds floats to 10 decimal
# places by default, so the pickle written below is the full-precision copy.
catBothScoreDf.to_json("Trec_data/catScoreDF.json")

filename = 'Trec_data/cat_results.pkl'
# Use a context manager so the file handle is flushed and closed even if
# pickle.dump raises part-way through.
with open(filename, 'wb') as outfile:
    pickle.dump(catBothScoreDf, outfile)

In [21]:
# Read the combined (general / specific / ensemble) scores back from disk
# to confirm the JSON round-trip and to work from the persisted copy.
catBothScoreDf = pd.read_json(path_or_buf="Trec_data/catScoreDF.json")
catBothScoreDf

Unnamed: 0,genScores,specScores,avgAccGen,avgAccSpec,avgF1Gen,avgF1Spec,semAccGen,semAccSpec,semF1Gen,semF1Spec,ensScores,avgAccEns,avgF1Ens,semAccEns,semF1Ens
wildfire,"[[0.0038022814, 0.0029542097, 0.0060048038, 0....","[[0.0, 0.0029542097, 0.0028022418000000003, 0....",0.013614,0.002852,0.15888,0.140397,0.007799,0.001553,0.02328,0.01892,"[[0.0038022814, 0.0029542097, 0.0012009608, 0....",0.009665,0.162375,0.006573,0.022227
earthquake,"[[0.0, 0.012987013, 0.019417475700000002, 0.00...","[[0.0, 0.0064935065, 0.0, 0.012861736300000001...",0.017292,0.008435,0.127264,0.126537,0.004763,0.002904,0.014041,0.014521,"[[0.008097166000000001, 0.0064935065, 0.0, 0.0...",0.013568,0.128629,0.003654,0.012552
flood,"[[0.0170212766, 0.0, 0.0013850416, 0.00243309,...","[[0.0042553191, 0.0, 0.0013850416, 0.0, 0.0126...",0.022876,0.020919,0.143337,0.142961,0.008987,0.009397,0.019121,0.018898,"[[0.0, 0.0, 0.0013850416, 0.0, 0.0168302945000...",0.01498,0.147314,0.006477,0.019037
typhoon,"[[0.0040983607, 0.0212765957, 0.03125, 0.01066...","[[0.0, 0.0177304965, 0.0208333333, 0.012696800...",0.016698,0.015797,0.146908,0.140081,0.002859,0.003128,0.02008,0.020394,"[[0.0, 0.012411347500000001, 0.0208333333, 0.0...",0.017404,0.147333,0.003417,0.020923
shooting,"[[0.0, 0.0053715309, 0.040832666100000005, 0.0...","[[0.0061728395, 0.0044762757, 0.0760608487, 0....",0.034712,0.110057,0.145381,0.138891,0.008621,0.036107,0.01854,0.01721,"[[0.0061728395, 0.0035810206, 0.0728582866, 0....",0.049364,0.136698,0.01189,0.016898
bombing,"[[0.0217391304, 0.0074766355, 0.0208333333], [...","[[0.0, 0.0317757009, 0.0], [0.1100342447, 0.14...",0.016683,0.010592,0.143931,0.122796,0.004611,0.010592,0.012781,0.010933,"[[0.0108695652, 0.014953271, 0.004360465100000...",0.010061,0.134627,0.003084,0.009953
covid,"[[0.0210495963, 0.013215859000000002, 0.016489...","[[0.0320069204, 0.018021626000000002, 0.038869...",0.015181,0.022111,0.147563,0.150193,0.00312,0.004062,0.016414,0.017765,"[[0.0227797001, 0.0160192231, 0.0227718885, 0....",0.013331,0.150677,0.002981,0.016965
explosion,"[[0.0148409894, 0.0040080160000000005], [0.225...","[[0.014134275600000001, 0.0180360721], [0.2362...",0.009425,0.016085,0.21877,0.236504,0.005416,0.001951,0.007173,0.000221,"[[0.0148409894, 0.006012024], [0.2374823763000...",0.010427,0.231288,0.004414,0.006194
storm,"[[0.0140949555, 0.004], [0.2660248778, 0.22146...","[[0.0059347181, 0.008], [0.1981539273, 0.23248...",0.009047,0.006967,0.243746,0.215318,0.005047,0.001033,0.022279,0.017164,"[[0.0140949555, 0.0], [0.2721476074, 0.2255999...",0.007047,0.248874,0.007047,0.023274
