In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import clone
import sklearn.preprocessing
import pickle
from tqdm import tqdm

import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir

import random



In [2]:
# Labelled posts, stored as one JSON record per line; the file is
# generated in analye-data.ipynb.
df = pd.read_json(
    "./Trec_data/Preprocessed_labelled.json",
    lines=True,
    orient='records',
)
df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,text,entities,extended_entities,favorite_count,hashtagEntities,...,num_terms,num_unique_words,num_urls,num_words,vader compound,vader neg,vader neu,vader pos,regression_priority,sparseCategories
0,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,25,25,0,28,0.0000,0.000,1.000,0.00,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
1,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,23,25,0,26,0.0000,0.000,1.000,0.00,0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,fireColorado2012,wildfire,217732012314861568,[FirstPartyObservation],Medium,2 wildfires in Boulder County. We can see smok...,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",,0.0,,...,12,13,0,14,0.0000,0.000,1.000,0.00,0.50,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,fireColorado2012,wildfire,216961334129078272,[Discussion],Low,RT @Jon_G3: Seeing 1/3 of Colorado on fire mak...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,22,22,0,23,-0.6124,0.306,0.563,0.13,0.25,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,fireColorado2012,wildfire,212552860590813184,[MultimediaShare],Medium,RT @dhorning11: RT @LarimerCounty: #HighParkFi...,"{'symbols': [], 'urls': [{'expanded_url': 'htt...",,0.0,,...,15,17,1,20,0.0000,0.000,1.000,0.00,0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38071,tennesseeTornadoOutbreak2020,tornado,1235236359310368768,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Hottest Google Search in 31.2 hrs. Nashville t...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,16,14,1,22,0.0000,0.000,1.000,0.00,0.25,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."
38072,tennesseeTornadoOutbreak2020,tornado,1235337290144239616,"[ThirdPartyObservation, Location, MultimediaSh...",Low,A live report is next on the Nashville tornado...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,0.0,,...,11,11,0,11,0.0000,0.000,1.000,0.00,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."
38073,tennesseeTornadoOutbreak2020,tornado,1235258820139638784,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Officials are still cleaning up after tornadoe...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,0.0,,...,20,21,0,22,0.0000,0.000,1.000,0.00,0.25,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."
38074,tennesseeTornadoOutbreak2020,tornado,1235253249957126144,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Putnam County: Cookeville area tornado victims...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,8,9,1,9,-0.3182,0.247,0.753,0.00,0.25,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."


# **`Generate Event Types`**

In [3]:
# Every distinct event type found in the data.
fullEventTypes = df['eventType'].unique()

# Keep only the event types that are covered by more than one distinct
# eventID; the evaluation below holds out one event at a time, so a type
# with a single event would leave nothing of that type to train on.
eventTypes = [
    event_type
    for event_type in fullEventTypes
    if df.loc[df['eventType'] == event_type, 'eventID'].unique().size > 1
]
print(eventTypes)

['wildfire', 'earthquake', 'flood', 'typhoon', 'shooting', 'bombing', 'storm']


# **`Model Related Methods`**

In [4]:
def train_data(data, column, heldout_ids):
    """Return the rows of ``data`` that are NOT held out.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    column : str
        Column whose values are compared against ``heldout_ids``.
    heldout_ids : scalar or list-like
        A single value, or any list-like collection (list, tuple, set,
        array, Series, ...) of values to exclude.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``column`` value is not among ``heldout_ids``.
    """
    # Only wrap true scalars (strings included). Wrapping a tuple/set/array
    # in another list would make ``isin`` compare against the collection as a
    # whole, so nothing would be excluded.
    if not pd.api.types.is_list_like(heldout_ids):
        heldout_ids = [heldout_ids]

    return data.loc[~data[column].isin(heldout_ids)]

def test_data(data, column, heldout_ids):
    if type(heldout_ids) != list:
        heldout_ids = [heldout_ids]
    test = data.loc[data[column].isin(heldout_ids)]
    
    return test

In [5]:
def generate_scores_by_event(data, event, features, target, modelType):
    """Evaluate ``modelType`` by holding out one event at a time.

    Each distinct ``eventID`` whose ``eventType`` equals ``event`` is held out
    in turn. A fresh clone of ``modelType`` is fitted on all remaining rows of
    ``data`` and evaluated on the held-out rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``eventType``, ``eventID``, the ``features`` columns and
        the ``target`` column(s).
    event : str
        Event type whose events are held out one by one.
    features : list of str
        Input column names.
    target : str or list
        Label column(s). A single column name yields a Series, whose entries
        are stacked into one array before fitting.
    modelType : estimator
        Unfitted template; it is cloned for every fold and never fitted itself.

    Returns
    -------
    list
        ``[accuracies, f1_scores]`` with one entry per held-out event:
        index 0 holds the values of the model's ``score`` method, index 1 the
        macro-averaged F1 scores.
    """
    def _as_label_array(labels):
        # A Series of labels (scalars or per-row lists) becomes a single array
        # with one row per sample; anything else is passed through untouched.
        if not isinstance(labels, pd.Series):
            return labels
        return np.array([np.array(entry) for entry in labels])

    accuracies = []
    macro_f1s = []

    held_out_candidates = data.loc[data['eventType'] == event]['eventID'].unique()
    for heldoutEvent in tqdm(held_out_candidates, position=1, desc=event):
        # Split: everything except the held-out event vs. the held-out event.
        train_rows = train_data(data, 'eventID', heldoutEvent)
        test_rows = test_data(data, 'eventID', heldoutEvent)

        X_train, y_train = train_rows[features], _as_label_array(train_rows[target])
        X_test, y_test = test_rows[features], _as_label_array(test_rows[target])

        # Fit an independent copy so no state carries over between folds.
        fitted = clone(modelType)
        fitted.fit(X_train, y_train)

        predictions = fitted.predict(X_test)
        fold_f1 = f1_score(y_test, predictions, average="macro", zero_division=0)
        fold_accuracy = fitted.score(X_test, y_test)

        accuracies.append(fold_accuracy)
        macro_f1s.append(fold_f1)

    return [accuracies, macro_f1s]

In [6]:
#Currently unused
def save_model(model, filename):
    """Pickle ``model`` to ``filename``, overwriting any existing file.

    Parameters
    ----------
    model : object
        Any picklable object.
    filename : str or path-like
        Destination path; opened in binary write mode.
    """
    # The context manager closes (and therefore flushes) the handle even when
    # pickling raises, instead of leaving that to garbage collection.
    with open(filename, 'wb') as outfile:
        pickle.dump(model, outfile)
    
def load_model(filename):
    """Load and return the object pickled in ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Path of a pickle file; opened in binary read mode.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the handle even when unpickling raises.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

# **`Generate Generic Variables`**

In [7]:
# Input columns used by every model in this notebook.
features = [
    "num_chars",
    "num_chars_total",
    "num_terms",
    "num_words",
    "num_unique_words",
    "vader neg",
    "vader pos",
    "vader neu",
    "vader compound",
    "num_hashtags",
    "num_mentions",
    "num_urls",
    "is_retweet",
    "num_media",
    "is_verified",
    "caps_ratio",
]

# NOTE(review): an earlier remark here read "I think you need to make a list
# of lists"; it is unclear what it refers to -- confirm or remove.

# Keyword arguments unpacked into RandomForestClassifier(**rf_params).
# The fixed random_state keeps repeated runs comparable.
rf_params = dict(
    random_state=1337,
    class_weight='balanced',
    n_estimators=128,
    n_jobs=-1,
    max_depth=50,
    max_features=14,
    min_samples_leaf=33,
    min_samples_split=96,
    verbose=0,
)

# **`Generate and Test postPriority Models`**

In [9]:
# Target column and unfitted template model; generate_scores_by_event clones
# the template for every held-out event, so it is never fitted itself.
prioLabel = 'postPriority'
prioModel = RandomForestClassifier(**rf_params)

# Per event type: [accuracies, f1_scores], one entry per held-out event.
#   genPrioScores  -> trained on all remaining rows of the full dataset
#   specPrioScores -> trained only on remaining rows of the same event type
genPrioScores, specPrioScores = {}, {}

for event in tqdm(eventTypes, position=0, desc='Events'):
    eventDF = df[df['eventType'] == event]
    genPrioScores[event] = generate_scores_by_event(
        df, event, features, prioLabel, prioModel)
    specPrioScores[event] = generate_scores_by_event(
        eventDF, event, features, prioLabel, prioModel)
    

Events:   0%|          | 0/7 [00:00<?, ?it/s]
wildfire:   0%|          | 0/5 [00:00<?, ?it/s][A
wildfire:  20%|██        | 1/5 [00:12<00:50, 12.73s/it][A
wildfire:  40%|████      | 2/5 [00:25<00:38, 12.79s/it][A
wildfire:  60%|██████    | 3/5 [00:38<00:25, 12.68s/it][A
wildfire:  80%|████████  | 4/5 [00:50<00:12, 12.64s/it][A
wildfire: 100%|██████████| 5/5 [01:03<00:00, 12.72s/it][A

wildfire:   0%|          | 0/5 [00:00<?, ?it/s][A
wildfire:  20%|██        | 1/5 [00:01<00:04,  1.07s/it][A
wildfire:  40%|████      | 2/5 [00:02<00:03,  1.05s/it][A
wildfire:  60%|██████    | 3/5 [00:02<00:01,  1.04it/s][A
wildfire:  80%|████████  | 4/5 [00:03<00:00,  1.09it/s][A
wildfire: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s][A
Events:  14%|█▍        | 1/7 [01:08<06:50, 68.34s/it]
earthquake:   0%|          | 0/11 [00:00<?, ?it/s][A
earthquake:   9%|▉         | 1/11 [00:13<02:10, 13.08s/it][A
earthquake:  18%|█▊        | 2/11 [00:26<01:57, 13.08s/it][A
earthquake:  27%|██▋       | 

In [10]:
#Store prio scores in readable format
prioScoreDf = pd.DataFrame(columns=['genScores', 'specScores',
                                    'avgAccGen', 'avgAccSpec', 'avgF1Gen', 'avgF1Spec',
                                    'stdAccGen', 'stdAccSpec', 'stdF1Gen', 'stdF1Spec'])
for event in eventTypes:
    row = pd.Series(
        {
            'genScores': genPrioScores[event], 'specScores': specPrioScores[event],
            'avgAccGen': np.mean(genPrioScores[event][0]), 'avgAccSpec': np.mean(specPrioScores[event][0]),
            'avgF1Gen': np.mean(genPrioScores[event][1]), 'avgF1Spec': np.mean(specPrioScores[event][1]),
            'stdAccGen': np.std(genPrioScores[event][0]), 'stdAccSpec': np.std(specPrioScores[event][0]),
            'stdF1Gen': np.std(genPrioScores[event][1]), 'stdF1Spec': np.std(specPrioScores[event][1])
        }, name=event)
    prioScoreDf = prioScoreDf.append(row)
    
prioScoreDf

Unnamed: 0,genScores,specScores,avgAccGen,avgAccSpec,avgF1Gen,avgF1Spec,stdAccGen,stdAccSpec,stdF1Gen,stdF1Spec
wildfire,"[[0.3464052287581699, 0.33511586452762926, 0.5...","[[0.37254901960784315, 0.4117647058823529, 0.5...",0.399168,0.43085,0.198208,0.187518,0.081312,0.094574,0.025732,0.039396
earthquake,"[[0.3389830508474576, 0.44545454545454544, 0.2...","[[0.4067796610169492, 0.39090909090909093, 0.3...",0.387411,0.400638,0.189896,0.210901,0.108158,0.100199,0.065743,0.061783
flood,"[[0.46994535519125685, 0.363013698630137, 0.38...","[[0.4644808743169399, 0.4041095890410959, 0.34...",0.385024,0.451105,0.204074,0.237013,0.08492,0.104282,0.08753,0.071933
typhoon,"[[0.4606741573033708, 0.38209606986899564, 0.4...","[[0.5056179775280899, 0.3864628820960699, 0.41...",0.388729,0.474315,0.20831,0.243035,0.081501,0.077312,0.070284,0.058649
shooting,"[[0.3283582089552239, 0.3264094955489614, 0.63...","[[0.3880597014925373, 0.5192878338278932, 0.64...",0.43592,0.40355,0.244393,0.236753,0.145294,0.133933,0.096777,0.10317
bombing,"[[0.4583333333333333, 0.4967948717948718, 0.34...","[[0.38333333333333336, 0.4391025641025641, 0.5...",0.434117,0.464192,0.253625,0.237865,0.063418,0.078299,0.05092,0.024966
storm,"[[0.4433497536945813, 0.37333333333333335], [0...","[[0.18226600985221675, 0.37333333333333335], [...",0.408342,0.2778,0.298814,0.230705,0.035008,0.095534,0.031746,0.055244


In [11]:
#Save prio scores
prioScoreDf.to_json("./Trec_data/prioScoreDF.json")

filename = './Trec_data/prio_results.pkl'
outfile = open(filename,'wb')
pickle.dump(prioScoreDf, outfile)
outfile.close()

In [12]:
# Reload the priority summary from its JSON copy and display it.
# NOTE(review): the per-event scores shown after reloading appear rounded to
# about 10 decimal places compared with the in-memory frame displayed before
# saving; use the pickle copy when exact values matter.
prioScoreDf = pd.read_json("./Trec_data/prioScoreDF.json")
prioScoreDf

Unnamed: 0,genScores,specScores,avgAccGen,avgAccSpec,avgF1Gen,avgF1Spec,stdAccGen,stdAccSpec,stdF1Gen,stdF1Spec
wildfire,"[[0.3464052288, 0.3351158645, 0.5447338618, 0....","[[0.37254901960000003, 0.4117647059, 0.5537938...",0.399168,0.43085,0.198208,0.187518,0.081312,0.094574,0.025732,0.039396
earthquake,"[[0.3389830508, 0.4454545455, 0.2121212121, 0....","[[0.406779661, 0.39090909090000003, 0.37878787...",0.387411,0.400638,0.189896,0.210901,0.108158,0.100199,0.065743,0.061783
flood,"[[0.4699453552, 0.3630136986, 0.3856812933, 0....","[[0.4644808743, 0.40410958900000005, 0.3418013...",0.385024,0.451105,0.204074,0.237013,0.08492,0.104282,0.08753,0.071933
typhoon,"[[0.4606741573, 0.3820960699, 0.4342105263, 0....","[[0.5056179775, 0.3864628821, 0.4144736842, 0....",0.388729,0.474315,0.20831,0.243035,0.081501,0.077312,0.070284,0.058649
shooting,"[[0.328358209, 0.32640949550000004, 0.63501646...","[[0.3880597015, 0.5192878338, 0.6405049396, 0....",0.43592,0.40355,0.244393,0.236753,0.145294,0.133933,0.096777,0.10317
bombing,"[[0.45833333330000003, 0.49679487180000004, 0....","[[0.3833333333, 0.43910256410000004, 0.5701388...",0.434117,0.464192,0.253625,0.237865,0.063418,0.078299,0.05092,0.024966
storm,"[[0.4433497537, 0.3733333333], [0.3305603022, ...","[[0.1822660099, 0.3733333333], [0.1754611262, ...",0.408342,0.2778,0.298814,0.230705,0.035008,0.095534,0.031746,0.055244


# **`Generate and Test postCategories Models`**

In [13]:
# Target column and unfitted template model. Each row of 'sparseCategories'
# holds a 0/1 vector, so the forest is wrapped in MultiOutputClassifier.
# generate_scores_by_event clones the template for every held-out event.
catLabel = 'sparseCategories'
catModel = MultiOutputClassifier(RandomForestClassifier(**rf_params))

# Per event type: [accuracies, f1_scores], one entry per held-out event.
#   genCatScores  -> trained on all remaining rows of the full dataset
#   specCatScores -> trained only on remaining rows of the same event type
genCatScores, specCatScores = {}, {}

for event in tqdm(eventTypes, position=0, desc='Events'):
    eventDF = df[df['eventType'] == event]
    genCatScores[event] = generate_scores_by_event(
        df, event, features, catLabel, catModel)
    specCatScores[event] = generate_scores_by_event(
        eventDF, event, features, catLabel, catModel)


Events:   0%|          | 0/7 [00:00<?, ?it/s]
wildfire:   0%|          | 0/5 [00:00<?, ?it/s][A
wildfire:  20%|██        | 1/5 [04:32<18:11, 272.89s/it][A
wildfire:  40%|████      | 2/5 [09:03<13:37, 272.34s/it][A
wildfire:  60%|██████    | 3/5 [13:20<08:54, 267.47s/it][A
wildfire:  80%|████████  | 4/5 [17:42<04:26, 266.04s/it][A
wildfire: 100%|██████████| 5/5 [22:13<00:00, 266.69s/it][A

wildfire:   0%|          | 0/5 [00:00<?, ?it/s][A
wildfire:  20%|██        | 1/5 [00:21<01:27, 21.77s/it][A
wildfire:  40%|████      | 2/5 [00:41<01:03, 21.12s/it][A
wildfire:  60%|██████    | 3/5 [00:54<00:37, 18.67s/it][A
wildfire:  80%|████████  | 4/5 [01:09<00:17, 17.67s/it][A
wildfire: 100%|██████████| 5/5 [01:29<00:00, 17.89s/it][A
Events:  14%|█▍        | 1/7 [23:42<2:22:17, 1422.94s/it]
earthquake:   0%|          | 0/11 [00:00<?, ?it/s][A
earthquake:   9%|▉         | 1/11 [04:30<45:07, 270.75s/it][A
earthquake:  18%|█▊        | 2/11 [09:02<40:40, 271.13s/it][A
earthquake:  27%|█

In [22]:
#Store cat scores in readable format
catScoreDf = pd.DataFrame(columns=['genScores', 'specScores',
                                   'avgAccGen', 'avgAccSpec', 'avgF1Gen', 'avgF1Spec',
                                   'stdAccGen', 'stdAccSpec', 'stdF1Gen', 'stdF1Spec'])
for event in eventTypes:
    row = pd.Series(
        {
            'genScores': genCatScores[event], 'specScores': specCatScores[event],
            'avgAccGen': np.mean(genCatScores[event][0]), 'avgAccSpec': np.mean(specCatScores[event][0]),
            'avgF1Gen': np.mean(genCatScores[event][1]), 'avgF1Spec': np.mean(specCatScores[event][1]),
            'stdAccGen': np.std(genCatScores[event][0]), 'stdAccSpec': np.std(specCatScores[event][0]),
            'stdF1Gen': np.std(genCatScores[event][1]), 'stdF1Spec': np.std(specCatScores[event][1])
        }, name=event)
    catScoreDf = catScoreDf.append(row)
    
catScoreDf

Unnamed: 0,genScores,specScores,avgAccGen,avgAccSpec,avgF1Gen,avgF1Spec,stdAccGen,stdAccSpec,stdF1Gen,stdF1Spec
wildfire,"[[0.006535947712418301, 0.0017825311942959, 0....","[[0.006535947712418301, 0.0017825311942959, 0....",0.021559,0.0025,0.153854,0.139205,0.020001,0.002177,0.040578,0.034574
earthquake,"[[0.01694915254237288, 0.01818181818181818, 0....","[[0.005649717514124294, 0.00909090909090909, 0...",0.028901,0.010777,0.125732,0.117239,0.022097,0.009528,0.043702,0.036152
flood,"[[0.02185792349726776, 0.0, 0.0023094688221709...","[[0.01092896174863388, 0.0, 0.0, 0.00334448160...",0.047879,0.048423,0.141255,0.131016,0.057599,0.065098,0.055937,0.053121
typhoon,"[[0.0056179775280898875, 0.013100436681222707,...","[[0.0056179775280898875, 0.004366812227074236,...",0.04204,0.019895,0.132398,0.122341,0.02821,0.010345,0.06742,0.055603
shooting,"[[0.007462686567164179, 0.001483679525222552, ...","[[0.007462686567164179, 0.002967359050445104, ...",0.086057,0.117026,0.136266,0.131367,0.081442,0.118685,0.045523,0.044265
bombing,"[[0.03333333333333333, 0.02564102564102564, 0....","[[0.0, 0.022435897435897436, 0.000694444444444...",0.027297,0.00771,0.143216,0.113067,0.004411,0.010417,0.030101,0.025661
storm,"[[0.013136288998357963, 0.0], [0.2170454200136...","[[0.0, 0.0], [0.16055900712677387, 0.202134446...",0.006568,0.0,0.213506,0.181347,0.006568,0.0,0.003539,0.020788


In [23]:
#Save cat scores
catScoreDf.to_json("./Trec_data/catScoreDF.json")

filename = './Trec_data/cat_results.pkl'
outfile = open(filename,'wb')
pickle.dump(catScoreDf,outfile)
outfile.close()

In [24]:
# Reload the category summary from its JSON copy and display it.
# NOTE(review): the per-event scores shown after reloading appear rounded to
# about 10 decimal places compared with the in-memory frame displayed before
# saving; use the pickle copy when exact values matter.
catScoreDf = pd.read_json("./Trec_data/catScoreDF.json")
catScoreDf

Unnamed: 0,genScores,specScores,avgAccGen,avgAccSpec,avgF1Gen,avgF1Spec,stdAccGen,stdAccSpec,stdF1Gen,stdF1Spec
wildfire,"[[0.0065359477, 0.0017825312, 0.0084937712, 0....","[[0.0065359477, 0.0017825312, 0.0016987542, 0....",0.021559,0.0025,0.153854,0.139205,0.020001,0.002177,0.040578,0.034574
earthquake,"[[0.016949152500000002, 0.0181818182, 0.015151...","[[0.0056497175, 0.0090909091, 0.0, 0.019379845...",0.028901,0.010777,0.125732,0.117239,0.022097,0.009528,0.043702,0.036152
flood,"[[0.0218579235, 0.0, 0.0023094688, 0.010033444...","[[0.0109289617, 0.0, 0.0, 0.0033444816, 0.0123...",0.047879,0.048423,0.141255,0.131016,0.057599,0.065098,0.055937,0.053121
typhoon,"[[0.0056179775, 0.0131004367, 0.03947368420000...","[[0.0056179775, 0.0043668122, 0.02631578950000...",0.04204,0.019895,0.132398,0.122341,0.02821,0.010345,0.06742,0.055603
shooting,"[[0.007462686600000001, 0.0014836795, 0.083973...","[[0.007462686600000001, 0.0029673591, 0.059275...",0.086057,0.117026,0.136266,0.131367,0.081442,0.118685,0.045523,0.044265
bombing,"[[0.033333333300000004, 0.025641025600000002, ...","[[0.0, 0.0224358974, 0.0006944444], [0.0829538...",0.027297,0.00771,0.143216,0.113067,0.004411,0.010417,0.030101,0.025661
storm,"[[0.013136289, 0.0], [0.21704542000000002, 0.2...","[[0.0, 0.0], [0.1605590071, 0.2021344467]]",0.006568,0.0,0.213506,0.181347,0.006568,0.0,0.003539,0.020788


## Visualizing Data

In [None]:
#priority
infile = open('./Trec_data/prio_results.pkl','rb')
prioDF = pickle.load(infile)
infile.close()

#plt.figure(figsize=(7, 5))
ax = plt.axes()

genF1 = [item for item in prioDF['genF1']]
specF1 = [item for item in prioDF['specF1']]
index = [item for item in prioDF['Event']]

df_prio = pd.DataFrame({'genF1': genF1,

                   'specF1': specF1}, index=index)

ax.set_title('Priority Types')
ax = df_prio.plot.bar(rot=0, ax=ax)

plt.show()

In [None]:
#categories
infile = open('./Trec_data/cat_results.pkl','rb')
prioDF = pickle.load(infile)
infile.close()

ax = plt.axes()

catgenF1 = [item for item in catDF['genF1']]
catspecF1 = [item for item in catDF['specF1']]
index = [item for item in catDF['Event']]

df2 = pd.DataFrame({'genF1': catgenF1,

                   'specF1': catspecF1}, index=index)

ax.set_title('Information Types')

ax = df2.plot.bar(rot=0, ax=ax)
plt.show()