In [None]:
import os
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.downloader import base_dir
from gensim.models import KeyedVectors
from sklearn import datasets
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from tqdm import tqdm

In [None]:
# Labeled feature table, stored as one JSON record per line
# (generated in analye-data.ipynb).
features_path = "./Trec_data/Features_Labeled.json"
df = pd.read_json(features_path, orient='records', lines=True)
df

# **`Generate Heldout Events`**

In [None]:
# Keep only the event types that have more than one distinct eventID, so that
# one event can be held out while at least one other remains for training.
fullEventTypes = df['eventType'].unique()
eventTypes = []
for event in fullEventTypes:
    # Boolean mask on the 'eventType' column, then select the 'eventID' column.
    # (The previous form compared the string 'eventType' with the event, which
    # indexed the frame with False, and used the undefined bare name eventID.)
    events = df.loc[df['eventType'] == event, 'eventID'].unique()
    if events.size > 1:
        eventTypes.append(event)
print(eventTypes)

In [None]:
# Pick one held-out eventID at random for every event type.
# NOTE: no random seed is set in this cell, so re-running it selects (and
# persists) a different set of held-out events.
heldout_events = {}
for event in eventTypes:
    crises = df.loc[df['eventType'] == event, 'eventID'].unique()
    # Draw an index with randrange instead of random.choice on a NumPy array:
    # some Python versions evaluate the truthiness of the sequence, which is
    # ambiguous for multi-element arrays.
    heldout_events[event] = [crises[random.randrange(crises.size)]]

# A plain dict has no .to_json; convert to a one-row DataFrame (one column per
# event type) so it can be written out and read back with pd.read_json.
heldout_events = pd.DataFrame(heldout_events)
heldout_events.to_json('./Trec_data/heldout_events.json')

In [None]:
# Re-read the persisted held-out selection so the frame's columns are the
# event types.
heldout_path = './Trec_data/heldout_events.json'
heldout_events = pd.read_json(heldout_path)
heldout_events

# **`Model Related Methods`**

In [None]:
def train_data(data, column, heldout_ids):
    """Return the rows of ``data`` whose ``column`` value is NOT in ``heldout_ids``.

    Parameters:
        data: DataFrame to filter.
        column: name of the column compared against ``heldout_ids``.
        heldout_ids: list-like of values to exclude.

    Returns:
        The filtered DataFrame (original index labels are kept).
    """
    is_heldout = data[column].isin(heldout_ids)
    return data.loc[~is_heldout]

def test_data(data, column, heldout_ids):
    test = data.loc[data[column].isin(heldout_ids)]
    
    return test

In [None]:
def generate_model(data, features, target, modelType):
    """Fit a fresh copy of ``modelType`` on ``data`` and return it.

    Parameters:
        data: DataFrame holding both the feature and target columns.
        features: column label(s) used as model input.
        target: column label(s) used as the training target.
        modelType: estimator used as a template; it is cloned, so the passed
            instance itself is not fitted.

    Returns:
        The fitted clone.
    """
    estimator = clone(modelType)
    X_train, y_train = data[features], data[target]
    estimator.fit(X_train, y_train)
    return estimator

In [None]:
def generate_model_by_events(data, features, target, modelType):
    """Train one model per event type, leaving out that type's held-out event.

    Relies on the notebook-level ``eventTypes`` (event types to iterate over)
    and ``heldout_events`` (held-out eventID(s) per event type).

    Parameters:
        data: DataFrame with 'eventType' and 'eventID' columns plus the
            feature and target columns.
        features: column label(s) used as model input.
        target: column label(s) used as the training target.
        modelType: estimator template passed on to ``generate_model``.

    Returns:
        dict mapping each event type to its fitted model.
    """
    modelList = {}
    for event in tqdm(eventTypes):
        # Filter the frame that was passed in; the previous version silently
        # ignored ``data`` and always read the global ``df``.
        eventDF = data.loc[data['eventType'] == event]

        # Drop the held-out event(s) of this type from the training rows.
        training = train_data(eventDF, 'eventID', heldout_events[event])

        # Event-specific model.
        modelList[event] = generate_model(training, features, target, modelType)
    return modelList

# **`Generate Generic Variables`**

In [None]:
# Feature columns fed to every model.
features = ["num_chars", "num_chars_total", 
            "num_terms", "num_words", "num_unique_words", "vader neg", "vader pos",
            "vader neu", "vader compound", 
            "num_hashtags", "num_mentions", 
            "num_urls", 
            "is_retweet", "num_media",
            "is_verified", 
            "caps_ratio"]

# Random forest hyper-parameters.
rf_params = {
    'random_state': 1337,
    'class_weight': 'balanced',
    'n_estimators': 128, 
    'n_jobs': -1,
    'max_depth': 50,
    'max_features': 14,
    'min_samples_leaf': 33,
    'min_samples_split': 96,
}

# Training data withholding all heldout events for general models.
# heldout_events holds the held-out eventID(s) per event type; flatten them
# into one 1-D array of IDs. Going through pd.DataFrame works whether
# heldout_events is a DataFrame (as read from JSON) or a dict of lists, and
# avoids calling .values(), which is not callable on a DataFrame.
heldout_ids = pd.DataFrame(heldout_events).to_numpy().ravel()
generalTraining = train_data(df, 'eventID', heldout_ids)
print(generalTraining.shape)

# **`Generate postPriority Models`**

In [None]:
prioLabel = 'postPriority'
# Use the rf_params dict defined with the generic variables; the name
# modelParameters is not defined anywhere in this notebook.
prioModel = RandomForestClassifier(**rf_params)

# General model: trained on every event except the held-out ones.
genPrioModel = generate_model(generalTraining, features, prioLabel, prioModel)

# Event-specific models: one per event type.
specPrioModels = generate_model_by_events(df, features, prioLabel, prioModel)

# **`Generate postCategories Models`**

In [None]:
# Use the rf_params dict defined with the generic variables; the name
# modelParameters is not defined anywhere in this notebook.
catModel = MultiOutputClassifier(RandomForestClassifier(**rf_params))

# General model: trained on every event except the held-out ones.
# The target is passed as a one-element list, so the model is fitted on a
# single-column 2-D target.
catLabel = ['postCategories']
genCatModel = generate_model(generalTraining, features, catLabel, catModel)

# Event-specific models: one per event type.
specCatModels = generate_model_by_events(df, features, catLabel, catModel)

In [None]:
def save_model(model, filename):
    """Pickle ``model`` to ``filename``, overwriting any existing file.

    Parameters:
        model: any picklable object.
        filename: path of the file to write.
    """
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)


def load_model(filename):
    """Load and return the object pickled in ``filename``.

    SECURITY: unpickling can execute arbitrary code; only load files that
    were produced by a trusted source (e.g. ``save_model`` in this notebook).
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

In [None]:
# Persist the postPriority models: the general model under the 'gen' prefix,
# then one file per event type prefixed with the event type.
baseFilename = 'PrioModel.pkl'

generalPath = 'gen' + baseFilename
save_model(genPrioModel, generalPath)

for event in eventTypes:
    eventModel = specPrioModels[event]
    save_model(eventModel, event + baseFilename)

In [None]:
# Persist the postCategories models: the general model under the 'gen' prefix,
# then one file per event type prefixed with the event type.
baseFilename = 'CatModel.pkl'

generalPath = 'gen' + baseFilename
save_model(genCatModel, generalPath)

for event in eventTypes:
    eventModel = specCatModels[event]
    save_model(eventModel, event + baseFilename)

# **`Test All Models`**

In [None]:
# Load all models back from the pickle files written by the save cells, so the
# evaluation below can run without retraining. File names mirror the save
# cells: 'gen' + base name for the general model, event type + base name for
# each event-specific model.
# NOTE(review): the original cell only referenced the four in-memory names and
# loaded nothing; loading from disk is the presumed intent - confirm.
# SECURITY: pickle files can execute code on load; only load files produced by
# this notebook.
genPrioModel = load_model('gen' + 'PrioModel.pkl')
specPrioModels = {event: load_model(event + 'PrioModel.pkl') for event in eventTypes}
genCatModel = load_model('gen' + 'CatModel.pkl')
specCatModels = {event: load_model(event + 'CatModel.pkl') for event in eventTypes}

In [None]:
def test_prio_model(data, features, target, model):
    """Evaluate ``model`` on ``data``.

    Parameters:
        data: DataFrame holding the feature and target columns.
        features: column label(s) used as model input.
        target: column label(s) holding the true labels.
        model: fitted estimator exposing ``predict`` and ``score``.

    Returns:
        ``[score, f1]`` where ``score`` is ``model.score(...)`` and ``f1`` is
        the weighted-average F1 of the model's predictions.
    """
    inputs, expected = data[features], data[target]
    predicted = model.predict(inputs)
    weighted_f1 = f1_score(expected, predicted, average="weighted")
    model_score = model.score(inputs, expected)
    return [model_score, weighted_f1]

In [None]:
# Test postPriority models: score the general and the event-specific model on
# each event type's held-out event.
prioRecords = []
for event in eventTypes:
    eventDf = df.loc[df['eventType'] == event]
    # Pass the held-out ID(s) directly; wrapping them in another list hands
    # isin() a nested container instead of the IDs themselves.
    testDf = test_data(eventDf, 'eventID', heldout_events[event])
    genScores = test_prio_model(testDf, features, prioLabel, genPrioModel)
    # Argument order is (data, features, target, model): the target is the
    # label column and the model is the event-specific one. (Previously the
    # model was passed as the target and the general model was scored twice.)
    specScores = test_prio_model(testDf, features, prioLabel, specPrioModels[event])
    prioRecords.append({'Event': event, 'genAccuracy': genScores[0], 'genF1': genScores[1],
                        'specAccuracy': specScores[0], 'specF1': specScores[1]})

# Build the frame once from the collected rows; DataFrame.append was removed
# in pandas 2.0 and growing a frame row by row is quadratic.
prioDF = pd.DataFrame(prioRecords,
                      columns=['Event', 'genAccuracy', 'genF1', 'specAccuracy', 'specF1'])
prioDF

In [None]:
def test_cat_model(data, features, target, model):
    """Evaluate a category ``model`` on ``data``.

    Parameters:
        data: DataFrame holding the feature and target columns.
        features: column label(s) used as model input.
        target: column label(s) holding the true labels.
        model: fitted estimator exposing ``predict`` and ``score``.

    Returns:
        A two-element list: ``model.score(...)`` followed by the
        weighted-average F1 of the model's predictions.
    """
    X_eval = data[features]
    y_true = data[target]

    # Predictions are only needed for the F1 computation; score() is called
    # on the raw inputs afterwards.
    y_pred = model.predict(X_eval)
    f1_weighted = f1_score(y_true, y_pred, average="weighted")

    return [model.score(X_eval, y_true), f1_weighted]

In [None]:
# Test postCategories models: score the general and the event-specific model
# on each event type's held-out event.
catRecords = []
for event in eventTypes:
    eventDf = df.loc[df['eventType'] == event]
    # Pass the held-out ID(s) directly; wrapping them in another list hands
    # isin() a nested container instead of the IDs themselves.
    testDf = test_data(eventDf, 'eventID', heldout_events[event])
    genScores = test_cat_model(testDf, features, catLabel, genCatModel)
    specScores = test_cat_model(testDf, features, catLabel, specCatModels[event])
    catRecords.append({'Event': event, 'genAccuracy': genScores[0], 'genF1': genScores[1],
                       'specAccuracy': specScores[0], 'specF1': specScores[1]})

# Build the frame once from the collected rows; DataFrame.append was removed
# in pandas 2.0 and growing a frame row by row is quadratic.
catDF = pd.DataFrame(catRecords,
                     columns=['Event', 'genAccuracy', 'genF1', 'specAccuracy', 'specF1'])
catDF