In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import clone
from scipy import stats
import sklearn.preprocessing
import pickle
from tqdm import tqdm

import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir

import random



In [2]:
# Labelled posts produced by analye-data.ipynb, stored as one JSON record per line.
df = pd.read_json(
    path_or_buf="Trec_data/Preprocessed_labelled.json",
    orient='records',
    lines=True,
)
df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText,entities,extended_entities,favorite_count,hashtagEntities,...,num_terms,num_unique_words,num_urls,num_words,vader compound,vader neg,vader neu,vader pos,regression_priority,sparseCategories
0,fireColorado2012,wildfire,212365530391252993,[Factoid],Low,"The High Park fire west of Fort Collins, #CO h...","{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,25,28,0,29,-0.3400,0.091,0.909,0.000,0.25,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,fireColorado2012,wildfire,217744670753689603,[MultimediaShare],Low,Pic of the #FlagstaffFire in boulder from our ...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...","{'media': [{'sizes': {'small': {'w': 510, 'res...",0.0,,...,10,11,1,11,0.0000,0.000,1.000,0.000,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,25,25,0,28,0.0000,0.000,1.000,0.000,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,fireColorado2012,wildfire,217030749856088066,[],Low,I have it on good authority that most of Color...,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",,0.0,,...,26,29,0,32,0.0552,0.177,0.610,0.214,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,23,25,0,26,0.0000,0.000,1.000,0.000,0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72468,covidNewZealand2020,covid,1296006183178784768,"[FirstPartyObservation, MultimediaShare, Advice]",Low,Its personal choice to wear a mask\nDon't put ...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1296006181022916608, 'id_str...",19.0,,...,24,21,1,24,-0.3412,0.099,0.901,0.000,0.25,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
72469,covidNewZealand2020,covid,1296214212046237698,[Irrelevant],Low,The Government did the right thing. They shoul...,"{'hashtags': [], 'symbols': [], 'user_mentions...",,1.0,,...,16,16,1,18,-0.0516,0.173,0.663,0.163,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
72470,houstonExplosion2020,explosion,1220704310520094720,"[ThirdPartyObservation, Location, EmergingThre...",Low,Um. Jon? You get jolted awake early this morni...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,10,12,1,13,0.0000,0.000,1.000,0.000,0.25,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."
72471,texasAMCommerceShooting2020,shooting,1224419435043123200,"[ThirdPartyObservation, Location, MultimediaSh...",Low,"2 dead, 1 hurt in shooting at college residenc...","{'hashtags': [{'text': 'SmartNews', 'indices':...",,0.0,,...,12,13,1,13,-0.8271,0.490,0.510,0.000,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, ..."


# **`Generate Event Types`**

In [3]:
# Keep only the event types that cover more than one distinct eventID:
# the leave-one-event-out evaluation below needs at least one other event
# of the same type to remain after holding one out.
fullEventTypes = df['eventType'].unique()
eventTypes = [
    candidate
    for candidate in fullEventTypes
    if df.loc[df['eventType'] == candidate, 'eventID'].unique().size > 1
]
print(eventTypes)

['wildfire', 'earthquake', 'flood', 'typhoon', 'shooting', 'bombing', 'covid', 'explosion', 'storm']


In [4]:
# Event ensembles: for each event type, the list of event types whose posts
# are pooled when building that type's model (the type itself comes first).
EventEnsembles = {
    'wildfire': ['wildfire', 'flood', 'typhoon'],
    'earthquake': ['earthquake', 'typhoon', 'flood'],
    'flood': ['flood', 'typhoon', 'storm'],
    'typhoon': ['typhoon', 'flood', 'storm'],
    'shooting': ['shooting', 'flood', 'wildfire'],
    # Placeholder partners: the similarity matrices still need fixing.
    'bombing': ['bombing', '', ''],
    'covid': ['covid', '', ''],
    'explosion': ['explosion', '', ''],
    # Same members as the typhoon ensemble.
    'storm': ['storm', 'flood', 'typhoon'],
}

# **`Model Related Methods`**

In [5]:
def train_data(data, column, heldout_ids):
    """Return the rows of ``data`` whose ``column`` value is not held out.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the column compared against ``heldout_ids``.
    heldout_ids : scalar or list-like
        A single ID (strings count as scalars) or any list-like collection
        of IDs (list, tuple, set, numpy array, pandas Series/Index).

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` where ``data[column]`` is not in ``heldout_ids``.
    """
    # Only true scalars are wrapped. The previous ``type(...) != list`` check
    # also wrapped tuples/arrays/sets, which made ``isin`` match nothing and
    # silently left the held-out rows in the training set.
    if not pd.api.types.is_list_like(heldout_ids):
        heldout_ids = [heldout_ids]
    training = data.loc[~data[column].isin(heldout_ids)]

    return training

def test_data(data, column, heldout_ids):
    if type(heldout_ids) != list:
        heldout_ids = [heldout_ids]
    test = data.loc[data[column].isin(heldout_ids)]
    
    return test

In [6]:
#Currently unused
def save_model(model, filename):
    """Pickle ``model`` to the file at ``filename`` (overwriting it).

    Parameters
    ----------
    model : object
        Any picklable object.
    filename : str or path-like
        Destination path; opened in binary write mode.
    """
    # The context manager closes (and flushes) the handle even if pickling
    # fails; the previous bare ``open(...)`` never closed it.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    
def load_model(filename):
    """Load and return the object pickled in the file at ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Path of a pickle file; opened in binary read mode.

    Returns
    -------
    object
        Whatever object was stored in the pickle.

    Notes
    -----
    SECURITY: unpickling executes arbitrary code embedded in the file, so
    only load files that come from a trusted source.
    """
    # The context manager closes the handle once loading finishes or fails;
    # the previous bare ``open(...)`` never closed it.
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)
    return model

# **`Generate Generic Variables`**

In [None]:
# Columns of the dataframe used as model inputs.
features = [
    "num_chars",
    "num_chars_total",
    "num_terms",
    "num_words",
    "num_unique_words",
    "vader neg",
    "vader pos",
    "vader neu",
    "vader compound",
    "num_hashtags",
    "num_mentions",
    "num_urls",
    "is_retweet",
    "num_media",
    "is_verified",
    "caps_ratio",
]

# Open question from the original author: this may need to be a list of lists.

# Keyword arguments forwarded to RandomForestClassifier for every model below.
rf_params = dict(
    random_state=1337,
    class_weight='balanced',
    n_estimators=128,
    n_jobs=-1,
    max_depth=50,
    max_features=14,
    min_samples_leaf=33,
    min_samples_split=96,
    verbose=0,
)

# **`Generate and Test postPriority Models`**

In [None]:
def generate_scores_by_event_Prio(data, event, features, target, modelType):
    """Leave-one-event-out evaluation of a model for one event type.

    Each ``eventID`` whose ``eventType`` equals ``event`` is held out in turn.
    A fresh clone of ``modelType`` is fitted on every other row of ``data``
    and scored on the held-out rows, both overall and separately for the
    rows carrying each distinct ``target`` label.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``eventType``, ``eventID``, the ``features`` columns and
        the ``target`` column.
    event : str
        Event type whose events are held out one at a time.
    features : list of str
        Input column names.
    target : str
        Label column name.
    modelType : estimator
        Unfitted estimator supporting ``clone``/``fit``/``predict``/``score``.

    Returns
    -------
    list
        ``[accuracy_accum, f1_accum, label_score_accum, label_f1_accum]``:
        two lists with one entry per held-out event, followed by two dicts
        mapping each label to its per-event score list. Labels that never
        appeared in any held-out event are removed from both dicts.
    """
    labels = data[target].unique()
    label_f1_accum = {label: [] for label in labels}
    label_score_accum = {label: [] for label in labels}
    accuracy_accum = []
    f1_accum = []

    event_rows = data.loc[data['eventType'] == event]
    for heldoutEvent in tqdm(event_rows['eventID'].unique(), position=1, desc=event):
        # Split on the held-out event.
        training = train_data(data, 'eventID', heldoutEvent)
        test = test_data(data, 'eventID', heldoutEvent)

        X_train, y_train = training[features], training[target]
        X_test, y_test = test[features], test[target]

        # A single target column comes back as a Series; turn it into an ndarray.
        if isinstance(y_train, pd.Series):
            y_train = np.array([np.array(val) for val in training[target]])
        if isinstance(y_test, pd.Series):
            y_test = np.array([np.array(val) for val in test[target]])

        # Fit an untouched copy of the estimator for this fold.
        model = clone(modelType)
        model.fit(X_train, y_train)

        # Overall scores on the held-out event.
        y_infer_local = model.predict(X_test)
        f1_accum.append(f1_score(y_test, y_infer_local, average="macro", zero_division=0))
        accuracy_accum.append(model.score(X_test, y_test))

        # Scores restricted to the held-out rows of each label.
        for label in labels:
            label_mask = test[target] == label
            x_label = X_test[label_mask]
            y_label = y_test[label_mask]
            y_infer_label = y_infer_local[label_mask]

            if x_label.size == 0:
                continue

            label_f1_accum[label].append(
                f1_score(y_label, y_infer_label, average="macro", zero_division=0))
            label_score_accum[label].append(model.score(x_label, y_label))

    # Discard labels for which no score was ever recorded.
    for label in labels:
        if not label_f1_accum[label]:
            label_f1_accum.pop(label, None)
            label_score_accum.pop(label, None)

    # Index 0: accuracy, 1: F1, 2: per-label accuracy, 3: per-label F1.
    return [accuracy_accum, f1_accum, label_score_accum, label_f1_accum]

In [None]:
prioLabel = 'postPriority'
prioModel = RandomForestClassifier(**rf_params)

# Per event type: the score lists returned by generate_scores_by_event_Prio,
# computed on the posts of that type's ensemble only.
ensPrioScores = {}

for event in tqdm(eventTypes, position=0, desc='Events'):
    ensembleMask = df['eventType'].isin(EventEnsembles[event])
    eventDF = df.loc[ensembleMask]
    ensPrioScores[event] = generate_scores_by_event_Prio(
        eventDF, event, features, prioLabel, prioModel)
    

In [None]:
#Store prio scores in readable format
labels = ['Low', 'Medium', 'High', 'Critical']

# Build one plain dict per event type and create the frame in a single call.
# Series.append / DataFrame.append were removed in pandas 2.0, and growing a
# frame row by row is quadratic anyway.
prioRows = []
for event in eventTypes:
    accScores, f1Scores, labelAccScores, labelF1Scores = ensPrioScores[event]
    row = {
        # NOTE(review): [0:1] keeps only the accuracy list and [2:3] only the
        # per-label accuracy dict (each wrapped in a list). Possibly [0:2] and
        # [2:4] were intended; left unchanged to keep the stored format stable.
        'ensScores': ensPrioScores[event][0:1],
        'ensLabelScores': ensPrioScores[event][2:3],
        'avgAccEns': np.mean(accScores),
        'avgF1Ens': np.mean(f1Scores),
        'semAccEns': stats.sem(accScores),
        'semF1Ens': stats.sem(f1Scores),
    }
    #Add label specific columns
    for label in labels:
        # Was ``label in label in ...``: a chained comparison that only worked
        # because a string always contains itself.
        if label in labelAccScores:
            row['avgAccEns' + label] = np.mean(labelAccScores[label])
            row['avgF1Ens' + label] = np.mean(labelF1Scores[label])
            row['semAccEns' + label] = stats.sem(labelAccScores[label])
            row['semF1Ens' + label] = stats.sem(labelF1Scores[label])
    prioRows.append(row)

prioEnsScoreDf = pd.DataFrame(prioRows, index=list(eventTypes))

#Reorder for easy readability
cols = ['ensScores', 'ensLabelScores']
scoreTypes = ['F1', 'Acc']
accumTypes = ['avg', 'sem']
testTypes = ['Ens']

for score in scoreTypes:
    for accum in accumTypes:
        for test in testTypes:
            cols.append(accum+score+test)
            for label in labels:
                cols.append(accum+score+test+label)

print(cols)
# reindex (rather than ``prioEnsScoreDf[cols]``) so that a label which never
# occurred in any held-out event yields a NaN column instead of a KeyError.
prioEnsScoreDf = prioEnsScoreDf.reindex(columns=cols)
prioEnsScoreDf

In [None]:
# Previously stored priority scores, to be combined with the ensemble scores.
prioScoreDf = pd.read_json(
    path_or_buf="Trec_data/prioScoreDF.json",
)
prioScoreDf

In [None]:
# The stored file is overwritten with the combined frame further down, so on a
# re-run it already contains ensemble columns. Drop those stale columns first:
# otherwise concat would create duplicate column names, which to_json rejects.
prioBothScoreDf = pd.concat(
    [prioScoreDf.drop(columns=prioEnsScoreDf.columns, errors='ignore'), prioEnsScoreDf],
    axis=1,
)
prioBothScoreDf

In [None]:
#Save prio scores
prioBothScoreDf.to_json("Trec_data/prioScoreDF.json")

filename = 'Trec_data/prio_results.pkl'
# The context manager closes the file even if pickling raises.
with open(filename, 'wb') as outfile:
    pickle.dump(prioBothScoreDf, outfile)

In [None]:
# Read the combined priority scores back from disk to check the saved file.
prioBothScoreDf = pd.read_json(
    path_or_buf="Trec_data/prioScoreDF.json",
)
prioBothScoreDf

# **`Generate and Test postCategories Models`**

In [None]:
def generate_scores_by_event_Cat(data, event, features, target, modelType):
    """Leave-one-event-out evaluation of a model for one event type.

    Each ``eventID`` whose ``eventType`` equals ``event`` is held out in turn.
    A fresh clone of ``modelType`` is fitted on every other row of ``data``
    and scored on the held-out rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``eventType``, ``eventID``, the ``features`` columns and
        the ``target`` column.
    event : str
        Event type whose events are held out one at a time.
    features : list of str
        Input column names.
    target : str
        Label column name; every cell is converted with ``np.array`` and the
        cells are stacked into a single array.
    modelType : estimator
        Unfitted estimator supporting ``clone``/``fit``/``predict``/``score``.

    Returns
    -------
    list
        ``[accuracy_accum, f1_accum]``, each with one entry per held-out event.
    """
    def _stack_targets(frame):
        # Convert each cell of the target column to an array and stack them.
        return np.array([np.array(val) for val in frame[target]])

    accuracy_accum = []
    f1_accum = []

    event_rows = data.loc[data['eventType'] == event]
    for heldoutEvent in tqdm(event_rows['eventID'].unique(), position=1, desc=event):
        # Split on the held-out event.
        training = train_data(data, 'eventID', heldoutEvent)
        test = test_data(data, 'eventID', heldoutEvent)

        X_train, X_test = training[features], test[features]
        y_train = _stack_targets(training)
        y_test = _stack_targets(test)

        # Fit an untouched copy of the estimator for this fold.
        model = clone(modelType)
        model.fit(X_train, y_train)

        # Score the held-out event.
        y_infer_local = model.predict(X_test)
        f1_accum.append(f1_score(y_test, y_infer_local, average="macro", zero_division=0))
        accuracy_accum.append(model.score(X_test, y_test))

    # Index 0: accuracy, 1: F1.
    return [accuracy_accum, f1_accum]

In [None]:
catLabel = 'sparseCategories'
catModel = MultiOutputClassifier(RandomForestClassifier(**rf_params))

# Per event type: the score lists returned by generate_scores_by_event_Cat,
# computed on the posts of that type's ensemble only.
ensCatScores = {}

for event in tqdm(eventTypes, position=0, desc='Events'):
    ensembleMask = df['eventType'].isin(EventEnsembles[event])
    eventDF = df.loc[ensembleMask]
    ensCatScores[event] = generate_scores_by_event_Cat(
        eventDF, event, features, catLabel, catModel)


In [None]:
#Store cat scores in readable format
# Build one plain dict per event type and create the frame in a single call.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic anyway.
catRows = []
for event in eventTypes:
    accScores, f1Scores = ensCatScores[event]
    catRows.append(
        {
            'ensScores': ensCatScores[event],
            'avgAccEns': np.mean(accScores),
            'avgF1Ens': np.mean(f1Scores),
            # np.std is the population standard deviation (ddof=0).
            'stdAccEns': np.std(accScores),
            'stdF1Ens': np.std(f1Scores),
        })

catEnsScoreDf = pd.DataFrame(
    catRows,
    index=list(eventTypes),
    columns=['ensScores',
             'avgAccEns', 'avgF1Ens',
             'stdAccEns', 'stdF1Ens'])

catEnsScoreDf

In [None]:
# Previously stored category scores, to be combined with the ensemble scores.
catScoreDf = pd.read_json(
    path_or_buf="Trec_data/catScoreDF.json",
)
catScoreDf

In [None]:
#Combine the new data into the old data
# The stored file is overwritten with the combined frame further down, so on a
# re-run it already contains ensemble columns. Drop those stale columns first:
# otherwise concat would create duplicate column names, which to_json rejects.
catBothScoreDf = pd.concat(
    [catScoreDf.drop(columns=catEnsScoreDf.columns, errors='ignore'), catEnsScoreDf],
    axis=1,
)
catBothScoreDf

In [None]:
#Save cat scores
catBothScoreDf.to_json("Trec_data/catScoreDF.json")

filename = 'Trec_data/cat_results.pkl'
# The context manager closes the file even if pickling raises.
with open(filename, 'wb') as outfile:
    pickle.dump(catBothScoreDf, outfile)

In [None]:
# Read the combined category scores back from disk to check the saved file.
catBothScoreDf = pd.read_json(
    path_or_buf="Trec_data/catScoreDF.json",
)
catBothScoreDf