In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import clone
from scipy import stats
import sklearn.preprocessing
import pickle
from tqdm import tqdm

import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir

import random



In [2]:
# Labelled tweet dataframe generated in collect-tweets.ipynb.
# The file is read as line-delimited JSON, one record per line.
df = pd.read_json(
    "Trec_data/Preprocessed_labelled.json",
    orient="records",
    lines=True,
)
df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText,entities,extended_entities,favorite_count,hashtagEntities,...,num_terms,num_unique_words,num_urls,num_words,vader compound,vader neg,vader neu,vader pos,regression_priority,sparseCategories
0,fireColorado2012,wildfire,212365530391252993,[Factoid],Low,"The High Park fire west of Fort Collins, #CO h...","{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,25,28,0,29,-0.3400,0.091,0.909,0.000,0.25,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,fireColorado2012,wildfire,217744670753689603,[MultimediaShare],Low,Pic of the #FlagstaffFire in boulder from our ...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...","{'media': [{'sizes': {'small': {'w': 510, 'res...",0.0,,...,10,11,1,11,0.0000,0.000,1.000,0.000,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,25,25,0,28,0.0000,0.000,1.000,0.000,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,fireColorado2012,wildfire,217030749856088066,[],Low,I have it on good authority that most of Color...,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",,0.0,,...,26,29,0,32,0.0552,0.177,0.610,0.214,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,23,25,0,26,0.0000,0.000,1.000,0.000,0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72468,covidNewZealand2020,covid,1296006183178784768,"[FirstPartyObservation, MultimediaShare, Advice]",Low,Its personal choice to wear a mask\nDon't put ...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1296006181022916608, 'id_str...",19.0,,...,24,21,1,24,-0.3412,0.099,0.901,0.000,0.25,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
72469,covidNewZealand2020,covid,1296214212046237698,[Irrelevant],Low,The Government did the right thing. They shoul...,"{'hashtags': [], 'symbols': [], 'user_mentions...",,1.0,,...,16,16,1,18,-0.0516,0.173,0.663,0.163,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
72470,houstonExplosion2020,explosion,1220704310520094720,"[ThirdPartyObservation, Location, EmergingThre...",Low,Um. Jon? You get jolted awake early this morni...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,10,12,1,13,0.0000,0.000,1.000,0.000,0.25,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."
72471,texasAMCommerceShooting2020,shooting,1224419435043123200,"[ThirdPartyObservation, Location, MultimediaSh...",Low,"2 dead, 1 hurt in shooting at college residenc...","{'hashtags': [{'text': 'SmartNews', 'indices':...",,0.0,,...,12,13,1,13,-0.8271,0.490,0.510,0.000,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, ..."


# **`Generate Event Types`**

In [3]:
# Keep only the event types that cover more than one distinct eventID, in
# order of first appearance in the dataframe.
fullEventTypes = df['eventType'].unique()
eventTypes = [
    eventType
    for eventType in fullEventTypes
    if df.loc[df['eventType'] == eventType, 'eventID'].unique().size > 1
]
print(eventTypes)

['wildfire', 'earthquake', 'flood', 'typhoon', 'shooting', 'bombing', 'covid', 'explosion', 'storm']


# **`Model Related Methods`**

In [4]:
def train_data(data, column, heldout_ids):
    """Return the rows of ``data`` whose ``column`` value is NOT held out.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    column : str
        Name of the column whose values identify the held-out groups.
    heldout_ids : scalar or list-like
        A single identifier (e.g. a string) or any list-like collection of
        identifiers (list, tuple, set, numpy array, Series, ...).

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` with every held-out identifier removed; the
        original index is preserved.
    """
    # is_list_like is False for strings, so a single ID is still wrapped, while
    # tuples/sets/arrays are no longer mistaken for a single value.
    if pd.api.types.is_list_like(heldout_ids):
        heldout_ids = list(heldout_ids)
    else:
        heldout_ids = [heldout_ids]

    return data.loc[~data[column].isin(heldout_ids)]

def test_data(data, column, heldout_ids):
    if type(heldout_ids) != list:
        heldout_ids = [heldout_ids]
    test = data.loc[data[column].isin(heldout_ids)]
    
    return test

In [5]:
#Currently unused
def save_model(model, filename):
    """Pickle ``model`` to the file at ``filename`` (overwriting it).

    Parameters
    ----------
    model : object
        Any picklable object.
    filename : str or path-like
        Destination path; opened in binary write mode.
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises part-way through.
    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)
    
def load_model(filename):
    """Load and return the object pickled in the file at ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file; opened in binary read mode.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load files that
    # were produced by a trusted source.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

# **`Generate Generic Variables`**

In [6]:
# Names of the dataframe columns passed to the models as input features.
features = [
    "num_chars",
    "num_chars_total",
    "num_terms",
    "num_words",
    "num_unique_words",
    "vader neg",
    "vader pos",
    "vader neu",
    "vader compound",
    "num_hashtags",
    "num_mentions",
    "num_urls",
    "is_retweet",
    "num_media",
    "is_verified",
    "caps_ratio",
]

# NOTE(review): original author note, intent unclear --
# "I think you need to make a list of lists"

# Keyword arguments unpacked into RandomForestClassifier(**rf_params).
rf_params = dict(
    random_state=1337,
    class_weight='balanced',
    n_estimators=128,
    n_jobs=-1,
    max_depth=50,
    max_features=14,
    min_samples_leaf=33,
    min_samples_split=96,
    verbose=0,
)

# **`Generate and Test postPriority Models`**

In [11]:
def generate_scores_by_event_Prio(data, event, features, target, modelType):
    """Leave-one-event-out evaluation of a single-target classifier.

    For every distinct ``eventID`` whose ``eventType`` equals ``event``, a
    fresh clone of ``modelType`` is fitted on all rows of ``data`` that do not
    belong to that eventID and evaluated on the rows that do.

    Parameters
    ----------
    data : DataFrame holding 'eventType', 'eventID', the ``features`` columns
        and the ``target`` column.
    event : event type whose eventIDs are held out one at a time.
    features : list of column names used as model inputs.
    target : name of the column holding the label to predict.
    modelType : estimator that is cloned (never fitted in place) for each fold.

    Returns
    -------
    list
        ``[accuracy_accum, f1_accum, label_score_accum, label_f1_accum]``:
        index 0 is the per-fold ``model.score``, index 1 the per-fold macro F1,
        index 2 / 3 are dicts mapping each label to its per-fold score / F1.
        Labels that never occur in any held-out fold are omitted from the dicts.

    Notes
    -----
    The per-label figures are computed on only those test rows whose true
    label equals the label in question; the F1 there is still macro-averaged.
    """
    labels = data[target].unique()

    # One accumulator list per label value present in the target column.
    label_f1_accum = {label: [] for label in labels}
    label_score_accum = {label: [] for label in labels}
    accuracy_accum, f1_accum = [], []

    eventIDs = data.loc[data['eventType'] == event, 'eventID'].unique()
    for heldoutEvent in tqdm(eventIDs, position=1, desc=event):
        # Split: everything except the held-out event trains, the event tests.
        training = train_data(data, 'eventID', heldoutEvent)
        test = test_data(data, 'eventID', heldoutEvent)

        X_train, y_train = training[features], training[target]
        X_test, y_test = test[features], test[target]

        # Fit a fresh, unfitted copy so folds never share state.
        model = clone(modelType)
        model.fit(X_train, y_train)

        # Whole-fold metrics.
        y_infer_local = model.predict(X_test)
        f1_accum.append(f1_score(y_test, y_infer_local, average="macro", zero_division=0))
        accuracy_accum.append(model.score(X_test, y_test))

        # Separate scores per label.
        for label in labels:
            label_ids = y_test == label
            x_label = X_test[label_ids]
            if x_label.size == 0:
                # This label does not occur in the held-out event.
                continue

            y_label = y_test[label_ids]
            y_infer_label = y_infer_local[label_ids]

            label_f1_accum[label].append(
                f1_score(y_label, y_infer_label, average="macro", zero_division=0)
            )
            label_score_accum[label].append(model.score(x_label, y_label))

    # Drop labels that were never scored so callers do not see empty lists.
    scored = [label for label in labels if len(label_f1_accum[label]) != 0]
    label_f1_accum = {label: label_f1_accum[label] for label in scored}
    label_score_accum = {label: label_score_accum[label] for label in scored}

    # Accuracy is 0, F1 is 1, label Acc is 2, label F1 is 3
    return [accuracy_accum, f1_accum, label_score_accum, label_f1_accum]

In [12]:
prioLabel = 'postPriority'
prioModel = RandomForestClassifier(**rf_params)

genPrioScores, specPrioScores = {}, {}

for event in tqdm(eventTypes, position=0, desc='Events'):
    # General model: trained on rows of every event type.
    genPrioScores[event] = generate_scores_by_event_Prio(
        df, event, features, prioLabel, prioModel
    )

    # Specific model: trained only on rows of the current event type.
    eventDF = df.loc[df['eventType'] == event]
    specPrioScores[event] = generate_scores_by_event_Prio(
        eventDF, event, features, prioLabel, prioModel
    )
    

Events:   0%|          | 0/9 [00:00<?, ?it/s]
wildfire:   0%|          | 0/5 [00:00<?, ?it/s][A
wildfire:  20%|██        | 1/5 [00:32<02:10, 32.53s/it][A
wildfire:  40%|████      | 2/5 [01:05<01:37, 32.64s/it][A
wildfire:  60%|██████    | 3/5 [01:36<01:04, 32.09s/it][A
wildfire:  80%|████████  | 4/5 [02:07<00:31, 31.72s/it][A
wildfire: 100%|██████████| 5/5 [02:40<00:00, 32.11s/it][A

wildfire:   0%|          | 0/5 [00:00<?, ?it/s][A
wildfire:  20%|██        | 1/5 [00:01<00:06,  1.60s/it][A
wildfire:  40%|████      | 2/5 [00:03<00:04,  1.58s/it][A
wildfire:  60%|██████    | 3/5 [00:04<00:02,  1.45s/it][A
wildfire:  80%|████████  | 4/5 [00:05<00:01,  1.34s/it][A
wildfire: 100%|██████████| 5/5 [00:06<00:00,  1.38s/it][A
Events:  11%|█         | 1/9 [02:47<22:21, 167.65s/it]
earthquake:   0%|          | 0/11 [00:00<?, ?it/s][A
earthquake:   9%|▉         | 1/11 [00:32<05:25, 32.52s/it][A
earthquake:  18%|█▊        | 2/11 [01:06<04:55, 32.86s/it][A
earthquake:  27%|██▋       |

covid:  20%|██        | 2/10 [00:07<00:29,  3.67s/it][A
covid:  30%|███       | 3/10 [00:11<00:26,  3.74s/it][A
covid:  40%|████      | 4/10 [00:15<00:23,  3.94s/it][A
covid:  50%|█████     | 5/10 [00:20<00:20,  4.11s/it][A
covid:  60%|██████    | 6/10 [00:24<00:16,  4.21s/it][A
covid:  70%|███████   | 7/10 [00:29<00:12,  4.28s/it][A
covid:  80%|████████  | 8/10 [00:33<00:08,  4.32s/it][A
covid:  90%|█████████ | 9/10 [00:38<00:04,  4.35s/it][A
covid: 100%|██████████| 10/10 [00:42<00:00,  4.25s/it][A
Events:  78%|███████▊  | 7/9 [37:40<09:59, 299.53s/it]
explosion:   0%|          | 0/2 [00:00<?, ?it/s][A
explosion:  50%|█████     | 1/2 [00:32<00:32, 32.74s/it][A
explosion: 100%|██████████| 2/2 [01:06<00:00, 33.02s/it][A

explosion:   0%|          | 0/2 [00:00<?, ?it/s][A
explosion:  50%|█████     | 1/2 [00:00<00:00,  3.65it/s][A
explosion: 100%|██████████| 2/2 [00:00<00:00,  2.85it/s][A
Events:  89%|████████▉ | 8/9 [38:47<03:49, 229.77s/it]
storm:   0%|          | 0/2 [00

In [13]:
#Store prio scores in readable format
labels = ['Low', 'Medium', 'High', 'Critical']

prioScoreDf = pd.DataFrame()#columns=cols)
for event in eventTypes:
    row = pd.Series(
        {
            'genScores': genPrioScores[event][0:1], 'specScores': specPrioScores[event][0:1],
            'genLabelScores': genPrioScores[event][2:3], 'specLabelScores': specPrioScores[event][2:3],
            'avgAccGen': np.mean(genPrioScores[event][0]), 'avgAccSpec': np.mean(specPrioScores[event][0]),
            'avgF1Gen': np.mean(genPrioScores[event][1]), 'avgF1Spec': np.mean(specPrioScores[event][1]),
            'semAccGen': stats.sem(genPrioScores[event][0]), 'semAccSpec': stats.sem(specPrioScores[event][0]),
            'semF1Gen': stats.sem(genPrioScores[event][1]), 'semF1Spec': stats.sem(specPrioScores[event][1])
        }, name=event)
    #Add label specific columns
    for label in labels:
        if label in genPrioScores[event][2] and label in specPrioScores[event][2]:
            labelCol = pd.Series(
                {
                    'avgAccGen' + label: np.mean(genPrioScores[event][2][label]), 'avgAccSpec' + label: np.mean(specPrioScores[event][2][label]),
                    'avgF1Gen' + label: np.mean(genPrioScores[event][3][label]), 'avgF1Spec' + label: np.mean(specPrioScores[event][3][label]),
                    'semAccGen' + label: stats.sem(genPrioScores[event][2][label]), 'semAccSpec' + label: stats.sem(specPrioScores[event][2][label]),
                    'semF1Gen' + label: stats.sem(genPrioScores[event][3][label]), 'semF1Spec' + label: stats.sem(specPrioScores[event][3][label])
                })
            row = row.append(labelCol)
    row.name = event
    prioScoreDf = prioScoreDf.append(row)
    
#Reorder for easy readability
cols = ['genScores', 'specScores', 'genLabelScores', 'specLabelScores']
scoreTypes = ['F1', 'Acc']
accumTypes = ['avg', 'sem']
testTypes = ['Gen', 'Spec']

for score in scoreTypes:
    for accum in accumTypes:
        for test in testTypes:
            cols.append(accum+score+test)
            for label in labels:
                cols.append(accum+score+test+label)

print(cols)
prioScoreDf = prioScoreDf[cols]
prioScoreDf

['genScores', 'specScores', 'genLabelScores', 'specLabelScores', 'avgF1Gen', 'avgF1GenLow', 'avgF1GenMedium', 'avgF1GenHigh', 'avgF1GenCritical', 'avgF1Spec', 'avgF1SpecLow', 'avgF1SpecMedium', 'avgF1SpecHigh', 'avgF1SpecCritical', 'semF1Gen', 'semF1GenLow', 'semF1GenMedium', 'semF1GenHigh', 'semF1GenCritical', 'semF1Spec', 'semF1SpecLow', 'semF1SpecMedium', 'semF1SpecHigh', 'semF1SpecCritical', 'avgAccGen', 'avgAccGenLow', 'avgAccGenMedium', 'avgAccGenHigh', 'avgAccGenCritical', 'avgAccSpec', 'avgAccSpecLow', 'avgAccSpecMedium', 'avgAccSpecHigh', 'avgAccSpecCritical', 'semAccGen', 'semAccGenLow', 'semAccGenMedium', 'semAccGenHigh', 'semAccGenCritical', 'semAccSpec', 'semAccSpecLow', 'semAccSpecMedium', 'semAccSpecHigh', 'semAccSpecCritical']


Unnamed: 0,genScores,specScores,genLabelScores,specLabelScores,avgF1Gen,avgF1GenLow,avgF1GenMedium,avgF1GenHigh,avgF1GenCritical,avgF1Spec,...,semAccGen,semAccGenLow,semAccGenMedium,semAccGenHigh,semAccGenCritical,semAccSpec,semAccSpecLow,semAccSpecMedium,semAccSpecHigh,semAccSpecCritical
wildfire,"[[0.3155893536121673, 0.38552437223042835, 0.5...","[[0.3193916349809886, 0.36336779911373707, 0.5...","[{'Low': [0.3835616438356164, 0.42471042471042...","[{'Low': [0.4041095890410959, 0.43243243243243...",0.202101,0.1416,0.130968,0.106043,0.127141,0.187077,...,0.05021,0.065923,0.104937,0.084171,0.14989,0.054762,0.079378,0.056346,0.064275,0.29068
earthquake,"[[0.4251012145748988, 0.3961038961038961, 0.34...","[[0.3076923076923077, 0.4025974025974026, 0.23...","[{'Low': [0.43902439024390244, 0.6304347826086...","[{'Low': [0.3121951219512195, 0.56521739130434...",0.19879,0.155774,0.136921,0.13299,0.006173,0.185111,...,0.030326,0.04304,0.060163,0.039883,0.01,0.037698,0.059038,0.07196,0.066753,0.025852
flood,"[[0.4425531914893617, 0.38215102974828374, 0.4...","[[0.4553191489361702, 0.5102974828375286, 0.55...","[{'Low': [0.4672897196261682, 0.49826989619377...","[{'Medium': [0.2753623188405797, 0.0, 0.102564...",0.222481,0.148776,0.115905,0.111624,0.13142,0.264798,...,0.02539,0.031716,0.055928,0.053553,0.106968,0.027955,0.029163,0.064864,0.037964,0.07769
typhoon,"[[0.42213114754098363, 0.4397163120567376, 0.4...","[[0.4959016393442623, 0.38475177304964536, 0.4...","[{'Low': [0.6058394160583942, 0.56375838926174...","[{'Medium': [0.0821917808219178, 0.05116279069...",0.217954,0.145295,0.141466,0.104312,0.012844,0.251374,...,0.02831,0.04057,0.057983,0.033915,0.01751,0.023641,0.038407,0.058892,0.057699,0.031805
shooting,"[[0.3765432098765432, 0.35183527305282003, 0.6...","[[0.3888888888888889, 0.40107430617726053, 0.6...","[{'Low': [0.38181818181818183, 0.3285899094437...","[{'Low': [0.4909090909090909, 0.38939197930142...",0.252764,0.161721,0.220437,0.091062,0.064464,0.231618,...,0.050532,0.059765,0.086671,0.05874,0.03832,0.045154,0.050021,0.073222,0.035149,0.049598
bombing,"[[0.45652173913043476, 0.5457943925233645, 0.4...","[[0.32065217391304346, 0.4485981308411215, 0.4...","[{'Low': [0.6, 0.6263157894736842, 0.443117178...","[{'Medium': [0.07692307692307693, 0.2558139534...",0.2864,0.177861,0.130975,0.110299,0.083586,0.228205,...,0.035412,0.057187,0.064239,0.055244,0.08114,0.045557,0.012782,0.051644,0.121363,0.065385
covid,"[[0.40743944636678203, 0.2843412094513416, 0.5...","[[0.5308535178777394, 0.32198638366039245, 0.5...","[{'Low': [0.4229784120014636, 0.27173456530869...","[{'Medium': [0.36886993603411516, 0.4757281553...",0.157048,0.106401,0.143802,0.072371,0.005319,0.213951,...,0.031436,0.038061,0.025847,0.032613,0.011111,0.027976,0.041091,0.034505,0.028679,0.0
explosion,"[[0.2706713780918728, 0.4649298597194389]]","[[0.1568904593639576, 0.6913827655310621]]","[{'Low': [0.25426356589147286, 0.5024390243902...","[{'Low': [0.1387596899224806, 0.80487804878048...",0.154832,0.124148,0.137005,0.104779,0.030303,0.201402,...,0.097129,0.124088,0.023709,0.136905,0.05,0.267246,0.333059,0.02521,0.255952,0.116667
storm,"[[0.43471810089020774, 0.386]]","[[0.34198813056379823, 0.35]]","[{'Low': [0.4669421487603306, 0.28318584070796...","[{'High': [0.3702185792349727, 0.5520833333333...",0.32844,0.13475,0.155279,0.150199,0.055771,0.275047,...,0.024359,0.091878,0.011005,0.002305,0.095055,0.004006,0.1217,0.015011,0.090932,0.000549


In [14]:
#Save prio scores
prioScoreDf.to_json("Trec_data/prioScoreDF.json")

filename = 'Trec_data/prio_results.pkl'
# The context manager closes the handle even if pickling raises.
with open(filename, 'wb') as outfile:
    pickle.dump(prioScoreDf, outfile)

In [15]:
# Reload the priority summary from its JSON file; this rebinds prioScoreDf to
# the copy read back from disk.
prioScoreDf = pd.read_json(
    "Trec_data/prioScoreDF.json"
)
prioScoreDf

Unnamed: 0,genScores,specScores,genLabelScores,specLabelScores,avgF1Gen,avgF1GenLow,avgF1GenMedium,avgF1GenHigh,avgF1GenCritical,avgF1Spec,...,semAccGen,semAccGenLow,semAccGenMedium,semAccGenHigh,semAccGenCritical,semAccSpec,semAccSpecLow,semAccSpecMedium,semAccSpecHigh,semAccSpecCritical
wildfire,"[[0.3155893536, 0.3855243722, 0.5748598879, 0....","[[0.319391635, 0.3633677991, 0.5364291433, 0.5...","[{'Low': [0.3835616438, 0.42471042470000003, 0...","[{'Low': [0.40410958900000005, 0.4324324324, 0...",0.202101,0.1416,0.130968,0.106043,0.127141,0.187077,...,0.05021,0.065923,0.104937,0.084171,0.14989,0.054762,0.079378,0.056346,0.064275,0.29068
earthquake,"[[0.4251012146, 0.3961038961, 0.3495145631, 0....","[[0.3076923077, 0.4025974026, 0.2330097087, 0....","[{'Low': [0.4390243902, 0.6304347826000001, 0....","[{'Low': [0.31219512200000005, 0.5652173913, 0...",0.19879,0.155774,0.136921,0.13299,0.006173,0.185111,...,0.030326,0.04304,0.060163,0.039883,0.01,0.037698,0.059038,0.07196,0.066753,0.025852
flood,"[[0.4425531915, 0.38215102970000003, 0.4695290...","[[0.4553191489, 0.5102974828, 0.55540166200000...","[{'Low': [0.46728971960000004, 0.4982698962, 0...","[{'Medium': [0.2753623188, 0.0, 0.1025641026, ...",0.222481,0.148776,0.115905,0.111624,0.13142,0.264798,...,0.02539,0.031716,0.055928,0.053553,0.106968,0.027955,0.029163,0.064864,0.037964,0.07769
typhoon,"[[0.4221311475, 0.4397163121, 0.46875000000000...","[[0.49590163930000003, 0.384751773, 0.41666666...","[{'Low': [0.6058394161, 0.5637583893, 0.666666...","[{'Medium': [0.0821917808, 0.0511627907, 0.105...",0.217954,0.145295,0.141466,0.104312,0.012844,0.251374,...,0.02831,0.04057,0.057983,0.033915,0.01751,0.023641,0.038407,0.058892,0.057699,0.031805
shooting,"[[0.3765432099, 0.3518352731, 0.6669335468, 0....","[[0.3888888889, 0.4010743062, 0.615692554, 0.2...","[{'Low': [0.3818181818, 0.3285899094, 0.696207...","[{'Low': [0.4909090909, 0.3893919793, 0.637835...",0.252764,0.161721,0.220437,0.091062,0.064464,0.231618,...,0.050532,0.059765,0.086671,0.05874,0.03832,0.045154,0.050021,0.073222,0.035149,0.049598
bombing,"[[0.4565217391, 0.5457943925000001, 0.42829457...","[[0.3206521739, 0.4485981308, 0.4646317829]]","[{'Low': [0.6000000000000001, 0.6263157895, 0....","[{'Medium': [0.0769230769, 0.2558139535, 0.164...",0.2864,0.177861,0.130975,0.110299,0.083586,0.228205,...,0.035412,0.057187,0.064239,0.055244,0.08114,0.045557,0.012782,0.051644,0.121363,0.065385
covid,"[[0.4074394464, 0.2843412095, 0.5410286612, 0....","[[0.5308535179, 0.3219863837, 0.58029053790000...","[{'Low': [0.422978412, 0.2717345653, 0.5803876...","[{'Medium': [0.368869936, 0.4757281553, 0.2571...",0.157048,0.106401,0.143802,0.072371,0.005319,0.213951,...,0.031436,0.038061,0.025847,0.032613,0.011111,0.027976,0.041091,0.034505,0.028679,0.0
explosion,"[[0.2706713781, 0.4649298597]]","[[0.15689045940000002, 0.6913827655]]","[{'Low': [0.2542635659, 0.5024390244], 'Medium...","[{'Low': [0.1387596899, 0.8048780488], 'Medium...",0.154832,0.124148,0.137005,0.104779,0.030303,0.201402,...,0.097129,0.124088,0.023709,0.136905,0.05,0.267246,0.333059,0.02521,0.255952,0.116667
storm,"[[0.4347181009, 0.386]]","[[0.3419881306, 0.35000000000000003]]","[{'Low': [0.4669421488, 0.2831858407], 'Medium...","[{'High': [0.3702185792, 0.5520833333], 'Criti...",0.32844,0.13475,0.155279,0.150199,0.055771,0.275047,...,0.024359,0.091878,0.011005,0.002305,0.095055,0.004006,0.1217,0.015011,0.090932,0.000549


# **`Generate and Test postCategories Models`**

In [24]:
def generate_scores_by_event_Cat(data, event, features, target, modelType):
    """Leave-one-event-out evaluation where each target cell is a label vector.

    For every distinct ``eventID`` whose ``eventType`` equals ``event``, a
    fresh clone of ``modelType`` is fitted on all rows of ``data`` outside that
    eventID and evaluated on the rows inside it.

    Parameters
    ----------
    data : DataFrame holding 'eventType', 'eventID', the ``features`` columns
        and the ``target`` column.
    event : event type whose eventIDs are held out one at a time.
    features : list of column names used as model inputs.
    target : name of a column whose cells are sequences; every cell is turned
        into a numpy array and the cells are stacked into one array per split
        (presumably equal-length indicator vectors -- verify upstream).
    modelType : estimator that is cloned (never fitted in place) for each fold.

    Returns
    -------
    list
        ``[accuracy_accum, f1_accum]``: index 0 holds the per-fold
        ``model.score`` values, index 1 the per-fold macro F1 values.
    """
    def _stack_targets(frame):
        # One array per row, then one outer array per split.
        return np.array([np.array(cell) for cell in frame[target]])

    accuracy_accum, f1_accum = [], []

    eventIDs = data.loc[data['eventType'] == event, 'eventID'].unique()
    for heldoutEvent in tqdm(eventIDs, position=1, desc=event):
        # Split: everything except the held-out event trains, the event tests.
        training = train_data(data, 'eventID', heldoutEvent)
        test = test_data(data, 'eventID', heldoutEvent)

        X_train, X_test = training[features], test[features]
        y_train = _stack_targets(training)
        y_test = _stack_targets(test)

        # Fit a fresh, unfitted copy so folds never share state.
        model = clone(modelType)
        model.fit(X_train, y_train)

        # Whole-fold metrics.
        y_infer_local = model.predict(X_test)
        f1_accum.append(f1_score(y_test, y_infer_local, average="macro", zero_division=0))
        accuracy_accum.append(model.score(X_test, y_test))

    # Accuracy is 0, F1 is 1
    return [accuracy_accum, f1_accum]

In [None]:
catLabel = 'sparseCategories'
# One random forest (configured by rf_params) per output column.
catModel = MultiOutputClassifier(RandomForestClassifier(**rf_params))

genCatScores, specCatScores = {}, {}

for event in tqdm(eventTypes, position=0, desc='Events'):
    # General model: trained on rows of every event type.
    genCatScores[event] = generate_scores_by_event_Cat(
        df, event, features, catLabel, catModel
    )

    # Specific model: trained only on rows of the current event type.
    eventDF = df.loc[df['eventType'] == event]
    specCatScores[event] = generate_scores_by_event_Cat(
        eventDF, event, features, catLabel, catModel
    )


Events:   0%|          | 0/9 [00:00<?, ?it/s]
wildfire:   0%|          | 0/5 [00:00<?, ?it/s][A
wildfire:  20%|██        | 1/5 [11:02<44:10, 662.56s/it][A
wildfire:  40%|████      | 2/5 [22:00<33:03, 661.21s/it][A
wildfire:  60%|██████    | 3/5 [32:37<21:48, 654.04s/it][A
wildfire:  80%|████████  | 4/5 [43:20<10:50, 650.48s/it][A
wildfire: 100%|██████████| 5/5 [54:18<00:00, 651.64s/it][A

wildfire:   0%|          | 0/5 [00:00<?, ?it/s][A
wildfire:  20%|██        | 1/5 [00:31<02:04, 31.03s/it][A
wildfire:  40%|████      | 2/5 [00:59<01:30, 30.32s/it][A
wildfire:  60%|██████    | 3/5 [01:17<00:53, 26.71s/it][A
wildfire:  80%|████████  | 4/5 [01:39<00:25, 25.03s/it][A
wildfire: 100%|██████████| 5/5 [02:09<00:00, 25.86s/it][A
Events:  11%|█         | 1/9 [56:27<7:31:42, 3387.78s/it]
earthquake:   0%|          | 0/11 [00:00<?, ?it/s][A
earthquake:   9%|▉         | 1/11 [10:59<1:49:52, 659.28s/it][A
earthquake:  18%|█▊        | 2/11 [22:00<1:38:59, 659.93s/it][A
earthquake:  2

In [None]:
#Store cat scores in readable format
catScoreCols = ['genScores', 'specScores',
                'avgAccGen', 'avgAccSpec', 'avgF1Gen', 'avgF1Spec',
                'stdAccGen', 'stdAccSpec', 'stdF1Gen', 'stdF1Spec']

# Each genCatScores/specCatScores entry is [accuracy_accum, f1_accum].
# Rows are collected first and the frame is built once: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the whole frame on
# every iteration.
catRows = []
for event in eventTypes:
    gen = genCatScores[event]
    spec = specCatScores[event]
    row = pd.Series(
        {
            'genScores': gen, 'specScores': spec,
            'avgAccGen': np.mean(gen[0]), 'avgAccSpec': np.mean(spec[0]),
            'avgF1Gen': np.mean(gen[1]), 'avgF1Spec': np.mean(spec[1]),
            'stdAccGen': np.std(gen[0]), 'stdAccSpec': np.std(spec[0]),
            'stdF1Gen': np.std(gen[1]), 'stdF1Spec': np.std(spec[1])
        }, name=event)
    catRows.append(row)

# One row per event type (indexed by the Series names), columns in fixed order.
catScoreDf = pd.DataFrame(catRows, columns=catScoreCols)

catScoreDf

In [None]:
#Save cat scores
catScoreDf.to_json("Trec_data/catScoreDF.json")

filename = 'Trec_data/cat_results.pkl'
# The context manager closes the handle even if pickling raises.
with open(filename, 'wb') as outfile:
    pickle.dump(catScoreDf, outfile)

In [None]:
# Reload the category summary from its JSON file; this rebinds catScoreDf to
# the copy read back from disk.
catScoreDf = pd.read_json(
    "Trec_data/catScoreDF.json"
)
catScoreDf