In [2]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from tqdm import tqdm

import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir

import random



In [3]:
#Dataframe  generated in analye-data.ipynb
df = pd.read_json("./Trec_data/processed.json", orient='records',lines=True)

In [4]:
df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText,processed_text
0,stormJorge2020,typhoon,1231307896362807296,[Irrelevant],Low,Flood Warning: River Severn at Hanley Castle a...,"[flood, february]"
1,stormJorge2020,typhoon,1231569665043976192,[Irrelevant],Low,Flood Warning: River Ouse at Naburn Lock 12:46...,"[flood, naburn_lock, february]"
2,stormJorge2020,typhoon,1232264304067477504,[Irrelevant],Low,Our Assistant Director of Care and Support kin...,"[assistant, director, care, support, kindly, l..."
3,stormJorge2020,typhoon,1232070602778959872,[Irrelevant],Low,@hollywills please can you help support @HopeR...,"[help, support, following, recent, flooding, c..."
4,stormJorge2020,typhoon,1232648900105965568,[Irrelevant],Low,Police order 'immediate evacuation' in Shropsh...,"[police, order, flooding, send]"
...,...,...,...,...,...,...,...
91510,whaleyBridgeCollapse2020,flood,1155430270457323520,[Irrelevant],Low,Flood Alert: River Ecclesbourne in Derbyshire ...,"[flood, alert, river]"
91511,whaleyBridgeCollapse2020,flood,1156993824591417344,"[Location, EmergingThreats, MultimediaShare, N...",High,Dam at Whaley Bridge in Peak District threaten...,"[peak, district, threaten, burst, gofh, pb, nc..."
91512,whaleyBridgeCollapse2020,flood,1157020257388769280,"[ThirdPartyObservation, Location, MultimediaSh...",Low,Floods in Whaley Bridge today.\nhttps://t.co/7...,"[flood, today]"
91513,whaleyBridgeCollapse2020,flood,1156926115069485056,"[MovePeople, ThirdPartyObservation, Location, ...",Critical,Evacuation of Whaley Bridge | Derbyshire Const...,"[evacuation, constabulary]"


# **`Generate Heldout Events`**

In [5]:
eventTypes = df['eventType'].unique()
print(eventTypes)

['typhoon' 'storm' 'wildfire' 'covid' 'flood' 'bombing' 'shooting'
 'earthquake' 'explosion' 'hostage' 'fire' 'tornado']


In [6]:
heldout_events = pd.DataFrame(columns=['eventID'])

#Choose heldout event and saves in the heldout_events dataframe
for event in eventTypes:
    crises = df.loc[df['eventType']==event]['eventID'].unique()
    heldout_events.loc[event]=[crises[random.choice(np.arange(crises.size))]]

heldout_events.to_json('./Trec_data/heldout_events.json')

In [7]:
#Simple read to keep the index by events
heldout_events = pd.read_json('./Trec_data/heldout_events.json')
heldout_events

Unnamed: 0,eventID
bombing,bostonBombings2013
covid,covidHouston2020
earthquake,philippinesEarthquake2019
explosion,beirutExplosion2020
fire,sanFranciscoPierFire2020
flood,albertaFloods2013
hostage,virraMallHostageSituation2020
shooting,brooklynBlockPartyShooting2020
storm,southeastTornadoOutbreak2020
tornado,tennesseeTornadoOutbreak2020


# **`Model Related Methods`**

In [8]:
def train_data(data, column, heldout_ids):
    training = data.loc[~data[column].isin(heldout_ids)]
    
    return training

def test_data(data, column, heldout_ids):
    test = data.loc[data[column].isin(heldout_ids)]
    
    return test

In [18]:
def generate_model(data, features, target, modelParameters):
    
    model = RandomForestClassifier(**modelParameters)
    model.fit(data[features].to_numpy(), data[target].to_numpy())
    
    return model

In [10]:
def generate_model_by_events(data, features, target, modelParameters):
    modelList = pd.DataFrame(columns=['model'])
    for event in tqdm(eventTypes):
        #Create training and test dataframe
        training = train_data(df, 'eventID', heldout_crisis)
        
        #generate event specific model
        model = generate_model(training, features, target, modelParameters)
        
        #Add model to list
        modelList.loc[event] = [model]
        
        #print('')
    return modelList

In [None]:
def test_model(data, features, target, model):
    

# **`Generate Generic Variables`**

In [14]:
features = ['vectorized_text'] #put in list of columns you want it to be trained on (Other features etc.)
features.append(other_features_names)

rf_params = {
    'random_state': 1337,
    'class_weight': 'balanced',
    'n_estimators': 128, 
    'n_jobs': -1,
    'max_depth': 50,
    'max_features': 14,
    'min_samples_leaf': 33,
    'min_samples_split': 96,
}

#Training data withholding all heldout events for general models
generalTraining = train_data(df, 'eventID', heldout_events['eventID'].tolist())
print(generalTraining.shape)

(78746, 7)


# **`Generate postPriority Models`**

In [20]:
generalTraining[features].to_numpy()

array([[list(['flood', 'february'])],
       [list(['flood', 'naburn_lock', 'february'])],
       [list(['assistant', 'director', 'care', 'support', 'kindly', 'lend', 'local', 'resilience', 'forum', 'colleague', 'kitchen', 'yesterday', 'assist', 'flooding', 'selby', 'yesterday'])],
       ...,
       [list(['flood', 'today'])],
       [list(['evacuation', 'constabulary'])],
       [list(['pray', 'safe'])]], dtype=object)

In [19]:
#generate general model
genPrioModel = generate_model(generalTraining, features, 'postPriority', rf_params)

#generate event specific models
#specPrioModels = generate_model_by_events(df, features, 'postPriority', rf_params)

ValueError: setting an array element with a sequence.

In [None]:
#Save postPriority models
model.save(model/postPriority.h5)

# **`Generate postCategories Models`**

In [None]:
#generate general model
catLabels = ['postCategories'] #target
genCatModel =  generate_model(generalTraining, features, catLabels, rf_params)

#generate event specific models
specCatModels = generate_model_by_events(df, features, catLabels, rf_params)

In [None]:
#Save postCategories models

# **`Test All Models`**

In [None]:
#Load All Models
genPrioModel
specPrioModels
genCatModel
specCatModels

In [None]:
#Test postPriority models

In [None]:
#Test postCategories models