In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.base import clone

from tqdm import tqdm

import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir
from sklearn.preprocessing import MultiLabelBinarizer

import random



In [2]:
import nltk
nltk.download("stopwords")
nltk.download('vader_lexicon')
from nltk.corpus import stopwords

from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS
import re
import itertools 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/g/g01107/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/g/g01107/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
full_df = pd.read_json("./Trec_data/org_combined_labeled.json", orient='records',lines=True)
full_df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText,contributors,contributorsIDs,coordinates,createdAt,...,source,symbolEntities,text,timestamp_ms,topic,truncated,urlEntities,user,userMentionEntities,withheld_in_countries
0,fireColorado2012,wildfire,212365530391252993,[Factoid],Low,"The High Park fire west of Fort Collins, #CO h...",,,,,...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,"The High Park fire west of Fort Collins, #CO h...",NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'United States', 'default_profile...",,
1,fireColorado2012,wildfire,217744670753689603,[MultimediaShare],Low,Pic of the #FlagstaffFire in boulder from our ...,,,"{'type': 'Point', 'coordinates': [-105.1348135...",,...,"<a href=""http://twitter.com/download/iphone"" r...",,Pic of the #FlagstaffFire in boulder from our ...,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Erie, Co', 'default_profile': Tr...",,
2,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,,,,,...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,RT @CBSDenver: The copter is on the way to the...,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Boulder, Colorado', 'default_pro...",,
3,fireColorado2012,wildfire,217030749856088066,[],Low,I have it on good authority that most of Color...,,,,,...,"<a href=""http://tapbots.com/tweetbot"" rel=""nof...",,I have it on good authority that most of Color...,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Louisville, KY', 'default_profil...",,
4,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,,,,,...,"<a href=""http://twitter.com/download/iphone"" r...",,RT @ColoradoRapids: Photo of #FlagStaffFire in...,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Denver', 'default_profile': Fals...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72468,covidNewZealand2020,covid,1296006183178784768,"[FirstPartyObservation, MultimediaShare, Advice]",Low,Its personal choice to wear a mask\nDon't put ...,,,,,...,"<a href=""http://twitter.com/download/android"" ...",,,NaT,TRECIS-CTIT-H-075,0.0,,"{'id': 783147530884227072, 'id_str': '78314753...",,
72469,covidNewZealand2020,covid,1296214212046237698,[Irrelevant],Low,The Government did the right thing. They shoul...,,,,,...,"<a href=""http://twitter.com/download/android"" ...",,,NaT,TRECIS-CTIT-H-075,0.0,,"{'id': 58133627, 'id_str': '58133627', 'name':...",,
72470,houstonExplosion2020,explosion,1220704310520094720,"[ThirdPartyObservation, Location, EmergingThre...",Low,Um. Jon? You get jolted awake early this morni...,,,,,...,"<a href=""http://www.facebook.com/twitter"" rel=...",,Um. Jon? You get jolted awake early this morni...,2020-01-24 13:46:01.874,TRECIS-CTIT-H-076,0.0,,"{'id': 20885534, 'id_str': '20885534', 'name':...",,
72471,texasAMCommerceShooting2020,shooting,1224419435043123200,"[ThirdPartyObservation, Location, MultimediaSh...",Low,"2 dead, 1 hurt in shooting at college residenc...",,,,,...,"<a href=""https://www.smartnews.com/"" rel=""nofo...",,"2 dead, 1 hurt in shooting at college residenc...",2020-02-03 19:48:36.584,TRECIS-CTIT-H-078,0.0,,"{'id': 282685004, 'id_str': '282685004', 'name...",,


In [4]:
#Remove certain non-neccessary columns
cols = ['eventID',
 'eventType',
 'postID',
 'postCategories',
 'postPriority',
 'postText',
 #'contributors',
 #'contributorsIDs',
 #'coordinates', #Not useful for
 #'createdAt', #When there is two of these it tends to be a list of dtype and of objects
 #'created_at',
 #'currentUserRetweetId',
 #'displayTextRangeEnd', #Unsure how these three variables could be useful
 #'displayTextRangeStart',
 #'display_text_range',
 'entities',
 'extended_entities',
 #'extended_tweet',
 'favorite_count',
 #'filter_level', #Low variability
 #'geo',
 #'geoLocation',
 'hashtagEntities',
 'inReplyToScreenName',
 'inReplyToStatusId',
 'inReplyToUserId',
 #'in_reply_to_screen_name',
 #'in_reply_to_status_id',
 #'in_reply_to_status_id_str',
 #'in_reply_to_user_id',
 #'in_reply_to_user_id_str',
 'isFavorited',
 #'isPossiblySensitive', #Applies to 14 rows
 'isRetweeted',
 'isTruncated',
 'is_quote_status',
 'lang',
 'matching_rules',
 'mediaEntities',
 'metadata',
 #'place', #Could be really useful, containes a lot of location data, for now disabled because its too much info and also low reliability
 'possibly_sensitive', #Might have value
 #'possibly_sensitive_appealable',
 #'quote_count',
 #'quotedStatus',
 #'quotedStatusId',
 #'quoted_status',
 #'quoted_status_id',
 #'quoted_status_id_str',
 #'quoted_status_permalink',
 #'reply_count',
 'retweet_count', #Utilize to replace if retweeted, maybe use to double check retweeted_status
 #'retweeted',
 'retweeted_status',
 #'scopes', #Only NaN values
 #'source', No clue how this may be utilized
 #'symbolEntities',
 #'timestamp_ms', #Time series bad
 #'topic', #Same as eventID pretty much, I think
 'truncated', #Useful, mark NaN as 0 and switch to boolean?
 #'urlEntities', #Has information regarding urls
 'user' #Very useful but there is a lot of associated data
 #'userMentionEntities', #Useful somehow i'm sure just don't know right now
 #'withheld_in_countries' #Only relates to 4 tweets
       ]
full_df = full_df[cols]
full_df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText,entities,extended_entities,favorite_count,hashtagEntities,...,is_quote_status,lang,matching_rules,mediaEntities,metadata,possibly_sensitive,retweet_count,retweeted_status,truncated,user
0,fireColorado2012,wildfire,212365530391252993,[Factoid],Low,"The High Park fire west of Fort Collins, #CO h...","{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,en,,,,,13.0,,0.0,"{'location': 'United States', 'default_profile..."
1,fireColorado2012,wildfire,217744670753689603,[MultimediaShare],Low,Pic of the #FlagstaffFire in boulder from our ...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...","{'media': [{'sizes': {'small': {'w': 510, 'res...",0.0,,...,0.0,en,,,,0.0,0.0,,0.0,"{'location': 'Erie, Co', 'default_profile': Tr..."
2,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,en,,,,,2.0,"{'contributors': None, 'text': 'The copter is ...",0.0,"{'location': 'Boulder, Colorado', 'default_pro..."
3,fireColorado2012,wildfire,217030749856088066,[],Low,I have it on good authority that most of Color...,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",,0.0,,...,0.0,en,,,,,0.0,,0.0,"{'location': 'Louisville, KY', 'default_profil..."
4,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,en,,,,,11.0,"{'retweeted': False, 'in_reply_to_screen_name'...",0.0,"{'location': 'Denver', 'default_profile': Fals..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72468,covidNewZealand2020,covid,1296006183178784768,"[FirstPartyObservation, MultimediaShare, Advice]",Low,Its personal choice to wear a mask\nDon't put ...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1296006181022916608, 'id_str...",19.0,,...,0.0,en,,,"{'iso_language_code': 'en', 'result_type': 're...",0.0,0.0,,0.0,"{'id': 783147530884227072, 'id_str': '78314753..."
72469,covidNewZealand2020,covid,1296214212046237698,[Irrelevant],Low,The Government did the right thing. They shoul...,"{'hashtags': [], 'symbols': [], 'user_mentions...",,1.0,,...,0.0,en,,,"{'iso_language_code': 'en', 'result_type': 're...",0.0,0.0,,0.0,"{'id': 58133627, 'id_str': '58133627', 'name':..."
72470,houstonExplosion2020,explosion,1220704310520094720,"[ThirdPartyObservation, Location, EmergingThre...",Low,Um. Jon? You get jolted awake early this morni...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,0.0,en,"[{'tag': 'terms', 'id': 1115966485465209294, '...",,,0.0,0.0,,0.0,"{'id': 20885534, 'id_str': '20885534', 'name':..."
72471,texasAMCommerceShooting2020,shooting,1224419435043123200,"[ThirdPartyObservation, Location, MultimediaSh...",Low,"2 dead, 1 hurt in shooting at college residenc...","{'hashtags': [{'text': 'SmartNews', 'indices':...",,0.0,,...,0.0,en,"[{'tag': 'terms', 'id': 7193928916782307852, '...",,,0.0,0.0,,0.0,"{'id': 282685004, 'id_str': '282685004', 'name..."


In [5]:
full_df.to_json("./Trec_data/Feature_Reduction.json", orient='records',lines=True)

In [6]:
full_df = pd.read_json("./Trec_data/Feature_Reduction.json", orient='records',lines=True)
full_df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText,entities,extended_entities,favorite_count,hashtagEntities,...,is_quote_status,lang,matching_rules,mediaEntities,metadata,possibly_sensitive,retweet_count,retweeted_status,truncated,user
0,fireColorado2012,wildfire,212365530391252993,[Factoid],Low,"The High Park fire west of Fort Collins, #CO h...","{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,en,,,,,13.0,,0.0,"{'location': 'United States', 'default_profile..."
1,fireColorado2012,wildfire,217744670753689603,[MultimediaShare],Low,Pic of the #FlagstaffFire in boulder from our ...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...","{'media': [{'sizes': {'small': {'w': 510, 'res...",0.0,,...,0.0,en,,,,0.0,0.0,,0.0,"{'location': 'Erie, Co', 'default_profile': Tr..."
2,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,en,,,,,2.0,"{'contributors': None, 'text': 'The copter is ...",0.0,"{'location': 'Boulder, Colorado', 'default_pro..."
3,fireColorado2012,wildfire,217030749856088066,[],Low,I have it on good authority that most of Color...,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",,0.0,,...,0.0,en,,,,,0.0,,0.0,"{'location': 'Louisville, KY', 'default_profil..."
4,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,en,,,,,11.0,"{'retweeted': False, 'in_reply_to_screen_name'...",0.0,"{'location': 'Denver', 'default_profile': Fals..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72468,covidNewZealand2020,covid,1296006183178784768,"[FirstPartyObservation, MultimediaShare, Advice]",Low,Its personal choice to wear a mask\nDon't put ...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1296006181022916608, 'id_str...",19.0,,...,0.0,en,,,"{'iso_language_code': 'en', 'result_type': 're...",0.0,0.0,,0.0,"{'id': 783147530884227072, 'id_str': '78314753..."
72469,covidNewZealand2020,covid,1296214212046237698,[Irrelevant],Low,The Government did the right thing. They shoul...,"{'hashtags': [], 'symbols': [], 'user_mentions...",,1.0,,...,0.0,en,,,"{'iso_language_code': 'en', 'result_type': 're...",0.0,0.0,,0.0,"{'id': 58133627, 'id_str': '58133627', 'name':..."
72470,houstonExplosion2020,explosion,1220704310520094720,"[ThirdPartyObservation, Location, EmergingThre...",Low,Um. Jon? You get jolted awake early this morni...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,0.0,en,"[{'tag': 'terms', 'id': 1115966485465209294, '...",,,0.0,0.0,,0.0,"{'id': 20885534, 'id_str': '20885534', 'name':..."
72471,texasAMCommerceShooting2020,shooting,1224419435043123200,"[ThirdPartyObservation, Location, MultimediaSh...",Low,"2 dead, 1 hurt in shooting at college residenc...","{'hashtags': [{'text': 'SmartNews', 'indices':...",,0.0,,...,0.0,en,"[{'tag': 'terms', 'id': 7193928916782307852, '...",,,0.0,0.0,,0.0,"{'id': 282685004, 'id_str': '282685004', 'name..."


In [7]:
# Generate Additional Features
local_tokenizer = TweetTokenizer()
def tokenizer_wrapper(text):
    return local_tokenizer.tokenize(text)

sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    #return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))
    return(parsed_text)


other_features_names = ["num_chars", "num_chars_total", 
                        "num_terms", "num_words", "num_unique_words", "vader neg", "vader pos",
                        "vader neu", "vader compound", 
                        "num_hashtags", "num_mentions", 
                        "num_urls", 
                        "is_retweet", "num_media",
                        "is_verified", 
                        "caps_ratio"]

## Taken from Davidson et al.
def other_features(data):
    new_df = pd.DataFrame()
    
    for index, tweet in tqdm(data.iterrows(), total=data.shape[0], position=0, leave=True):
        #print(tweet)
        """This function takes a string and returns a list of features.
        These include Sentiment scores, Text and Readability scores,
        as well as Twitter specific features"""
        tweet_text = tweet["postText"]

        ##SENTIMENT
        sentiment = sentiment_analyzer.polarity_scores(tweet_text)

        words = local_tokenizer.tokenize(tweet_text) #Get text only

        num_chars = sum(len(w) for w in words) #num chars in words
        num_chars_total = len(tweet_text)
        num_terms = len(tweet_text.split())
        num_words = len(words)
        num_unique_terms = len(set([x.lower() for x in words]))

        caps_count = sum([1 if x.isupper() else 0 for x in tweet_text])
        caps_ratio = caps_count / num_chars_total

        parsed_text = count_twitter_objs(tweet_text) #Count #, @, and http://
        twitter_objs = (parsed_text.count('URLHERE'), parsed_text.count('MENTIONHERE'), parsed_text.count('HASHTAGHERE'))
        num_media = 0
        if "entities" in tweet and tweet["entities"] != None and "media" in tweet["entities"]:
                num_media = len(tweet["entities"]["media"])
        retweet = 0
        if "rt" in words or "retweeted_status" in tweet:
            retweet = 1
        

        has_place = 1 if "coordinates" in tweet else 0

        author = tweet["user"]
        is_verified = 1 if ("verified" in author and author["verified"]) else 0
        
        features = [num_chars, num_chars_total, num_terms, num_words, num_unique_terms,
                    sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                    twitter_objs[2], twitter_objs[1], twitter_objs[0],
                    retweet, num_media, is_verified, caps_ratio]
        
        
        features = [round(x, 4) for x in features]
        
        new_row = {}
        for feature, name in zip(features, other_features_names):
            new_row[name] = feature
        
        #new_row['postID'] = tweet['postID']
        
        new_df = new_df.append(new_row, ignore_index=True)
        
        
    return new_df


In [8]:
#Creating extra features
other_ftr_df = other_features(full_df)
other_ftr_df

100%|██████████| 72473/72473 [03:24<00:00, 354.05it/s]


Unnamed: 0,caps_ratio,is_retweet,is_verified,num_chars,num_chars_total,num_hashtags,num_media,num_mentions,num_terms,num_unique_words,num_urls,num_words,vader compound,vader neg,vader neu,vader pos
0,0.0735,1.0,1.0,112.0,136.0,4.0,0.0,0.0,25.0,28.0,0.0,29.0,-0.3400,0.091,0.909,0.000
1,0.1268,1.0,0.0,62.0,71.0,1.0,1.0,0.0,10.0,11.0,1.0,11.0,0.0000,0.000,1.000,0.000
2,0.0929,1.0,0.0,116.0,140.0,1.0,0.0,2.0,25.0,25.0,0.0,28.0,0.0000,0.000,1.000,0.000
3,0.0429,1.0,0.0,115.0,140.0,0.0,0.0,0.0,26.0,29.0,0.0,32.0,0.0552,0.177,0.610,0.214
4,0.1214,1.0,0.0,117.0,140.0,1.0,0.0,2.0,23.0,25.0,0.0,26.0,0.0000,0.000,1.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72468,0.0588,1.0,0.0,112.0,136.0,0.0,1.0,0.0,24.0,21.0,1.0,24.0,-0.3412,0.099,0.901,0.000
72469,0.1048,1.0,0.0,90.0,105.0,0.0,0.0,0.0,16.0,16.0,1.0,18.0,-0.0516,0.173,0.663,0.163
72470,0.1233,1.0,0.0,64.0,73.0,0.0,0.0,0.0,10.0,12.0,1.0,13.0,0.0000,0.000,1.000,0.000
72471,0.0575,1.0,0.0,76.0,87.0,1.0,0.0,0.0,12.0,13.0,1.0,13.0,-0.8271,0.490,0.510,0.000


In [9]:
#Combine the extra features into original database
featured_df = pd.concat([full_df, other_ftr_df], axis=1)
featured_df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText,entities,extended_entities,favorite_count,hashtagEntities,...,num_media,num_mentions,num_terms,num_unique_words,num_urls,num_words,vader compound,vader neg,vader neu,vader pos
0,fireColorado2012,wildfire,212365530391252993,[Factoid],Low,"The High Park fire west of Fort Collins, #CO h...","{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,0.0,25.0,28.0,0.0,29.0,-0.3400,0.091,0.909,0.000
1,fireColorado2012,wildfire,217744670753689603,[MultimediaShare],Low,Pic of the #FlagstaffFire in boulder from our ...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...","{'media': [{'sizes': {'small': {'w': 510, 'res...",0.0,,...,1.0,0.0,10.0,11.0,1.0,11.0,0.0000,0.000,1.000,0.000
2,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,2.0,25.0,25.0,0.0,28.0,0.0000,0.000,1.000,0.000
3,fireColorado2012,wildfire,217030749856088066,[],Low,I have it on good authority that most of Color...,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",,0.0,,...,0.0,0.0,26.0,29.0,0.0,32.0,0.0552,0.177,0.610,0.214
4,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,2.0,23.0,25.0,0.0,26.0,0.0000,0.000,1.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72468,covidNewZealand2020,covid,1296006183178784768,"[FirstPartyObservation, MultimediaShare, Advice]",Low,Its personal choice to wear a mask\nDon't put ...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1296006181022916608, 'id_str...",19.0,,...,1.0,0.0,24.0,21.0,1.0,24.0,-0.3412,0.099,0.901,0.000
72469,covidNewZealand2020,covid,1296214212046237698,[Irrelevant],Low,The Government did the right thing. They shoul...,"{'hashtags': [], 'symbols': [], 'user_mentions...",,1.0,,...,0.0,0.0,16.0,16.0,1.0,18.0,-0.0516,0.173,0.663,0.163
72470,houstonExplosion2020,explosion,1220704310520094720,"[ThirdPartyObservation, Location, EmergingThre...",Low,Um. Jon? You get jolted awake early this morni...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,0.0,0.0,10.0,12.0,1.0,13.0,0.0000,0.000,1.000,0.000
72471,texasAMCommerceShooting2020,shooting,1224419435043123200,"[ThirdPartyObservation, Location, MultimediaSh...",Low,"2 dead, 1 hurt in shooting at college residenc...","{'hashtags': [{'text': 'SmartNews', 'indices':...",,0.0,,...,0.0,0.0,12.0,13.0,1.0,13.0,-0.8271,0.490,0.510,0.000


In [10]:
# generate sentence embedding
class SBERT:

    def __init__(self, lang="en"):
        from sentence_transformers import SentenceTransformer
        self.name = "SBERT"
        if lang == "fr":
            self.model = SentenceTransformer(
                "/home/bmazoyer/Dev/pytorch_bert/output/sts_fr_long_multilingual_bert-2019-10-01_15-07-03")
        elif lang == "en": #Does this need to be changed?
            self.model = SentenceTransformer(
                # "bert-large-nli-stsb-mean-tokens"
                "roberta-large-nli-stsb-mean-tokens"
            )
# roberta-large-nli-stsb-mean-tokens
    def compute_vectors(self, data):
        data["postText"] = data.postText.str.slice(0, 500)
        vectors = np.array(self.model.encode(data.postText.tolist()))
        return vectors

In [11]:
sbert=SBERT()

In [12]:
#Ignore for now
v=sbert.compute_vectors(featured_df) #Takes ages
featured_df['vectorized_text']=[item for item in tqdm(v)]
featured_df['vectorized_text']

Exception ignored in: <generator object tqdm_notebook.__iter__ at 0x2b07362a0a50>
Traceback (most recent call last):
  File "/afs/cad/linux/anaconda3.8/anaconda/lib/python3.8/site-packages/tqdm/notebook.py", line 220, in __iter__
    self.sp(bar_style='danger')
AttributeError: 'tqdm_notebook' object has no attribute 'sp'


KeyboardInterrupt: 

In [13]:
#Save new df
featured_df.to_json("./Trec_data/Features_Labeled.json", orient='records',lines=True)

In [14]:
featured_df = pd.read_json("./Trec_data/Features_Labeled.json", orient='records',lines=True)
featured_df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText,entities,extended_entities,favorite_count,hashtagEntities,...,num_media,num_mentions,num_terms,num_unique_words,num_urls,num_words,vader compound,vader neg,vader neu,vader pos
0,fireColorado2012,wildfire,212365530391252993,[Factoid],Low,"The High Park fire west of Fort Collins, #CO h...","{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0,0,25,28,0,29,-0.3400,0.091,0.909,0.000
1,fireColorado2012,wildfire,217744670753689603,[MultimediaShare],Low,Pic of the #FlagstaffFire in boulder from our ...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...","{'media': [{'sizes': {'small': {'w': 510, 'res...",0.0,,...,1,0,10,11,1,11,0.0000,0.000,1.000,0.000
2,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0,2,25,25,0,28,0.0000,0.000,1.000,0.000
3,fireColorado2012,wildfire,217030749856088066,[],Low,I have it on good authority that most of Color...,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",,0.0,,...,0,0,26,29,0,32,0.0552,0.177,0.610,0.214
4,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0,2,23,25,0,26,0.0000,0.000,1.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72468,covidNewZealand2020,covid,1296006183178784768,"[FirstPartyObservation, MultimediaShare, Advice]",Low,Its personal choice to wear a mask\nDon't put ...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1296006181022916608, 'id_str...",19.0,,...,1,0,24,21,1,24,-0.3412,0.099,0.901,0.000
72469,covidNewZealand2020,covid,1296214212046237698,[Irrelevant],Low,The Government did the right thing. They shoul...,"{'hashtags': [], 'symbols': [], 'user_mentions...",,1.0,,...,0,0,16,16,1,18,-0.0516,0.173,0.663,0.163
72470,houstonExplosion2020,explosion,1220704310520094720,"[ThirdPartyObservation, Location, EmergingThre...",Low,Um. Jon? You get jolted awake early this morni...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,0,0,10,12,1,13,0.0000,0.000,1.000,0.000
72471,texasAMCommerceShooting2020,shooting,1224419435043123200,"[ThirdPartyObservation, Location, MultimediaSh...",Low,"2 dead, 1 hurt in shooting at college residenc...","{'hashtags': [{'text': 'SmartNews', 'indices':...",,0.0,,...,0,0,12,13,1,13,-0.8271,0.490,0.510,0.000


In [15]:
#Change priority target, might not be needed
priorityDict = {'Unknown':0.0, 'Low':0.25, 'Medium':.5, 'High':.75, 'Critical':1}
featured_df['regression_priority']=[priorityDict[item] for item in featured_df['postPriority']]
featured_df[['regression_priority', 'postPriority']]

Unnamed: 0,regression_priority,postPriority
0,0.25,Low
1,0.25,Low
2,0.00,Unknown
3,0.25,Low
4,0.50,Medium
...,...,...
72468,0.25,Low
72469,0.25,Low
72470,0.25,Low
72471,0.25,Low


In [16]:
#Change categories to be multiple boolean columns
#categories = featured_df['postCategories'].explode().unique()
#cat_df = pd.DataFrame(index=np.arange(featured_df.shape[0]))
#for cat in categories:
#    name = 'bool_' + cat
#    cat_df[name] = False
#for index, row in tqdm(featured_df.iterrows(), total=data.shape[0], position=0, leave=True):
#    for cat in row['postCategories']:
#        cat_df.loc[index]['bool_' + cat] = True
#featured_df = pd.concat([featured_df, cat_df], axis=1)

In [17]:
#Switch from multiple boolean columns to MultiLabelBinarizer
categories = MultiLabelBinarizer().fit_transform(featured_df['postCategories']) #this should yield 25 in second dimension
print(type(categories))
categories

<class 'numpy.ndarray'>


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 1]])

In [30]:
#Merge with featured_df
featured_df['sparseCategories'] = categories.tolist()
featured_df['sparseCategories']

0        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
1        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
2        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
3        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
                               ...                        
72468    [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ...
72469    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
72470    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, ...
72471    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, ...
72472    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
Name: sparseCategories, Length: 72473, dtype: object

In [31]:
#Save new df
featured_df.to_json("./Trec_data/Preprocessed_labelled.json", orient='records',lines=True)

In [32]:
#Load new df to skip above processes
featured_df = pd.read_json("./Trec_data/Preprocessed_labelled.json", orient='records',lines=True)
featured_df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText,entities,extended_entities,favorite_count,hashtagEntities,...,num_terms,num_unique_words,num_urls,num_words,vader compound,vader neg,vader neu,vader pos,regression_priority,sparseCategories
0,fireColorado2012,wildfire,212365530391252993,[Factoid],Low,"The High Park fire west of Fort Collins, #CO h...","{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,25,28,0,29,-0.3400,0.091,0.909,0.000,0.25,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,fireColorado2012,wildfire,217744670753689603,[MultimediaShare],Low,Pic of the #FlagstaffFire in boulder from our ...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...","{'media': [{'sizes': {'small': {'w': 510, 'res...",0.0,,...,10,11,1,11,0.0000,0.000,1.000,0.000,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,25,25,0,28,0.0000,0.000,1.000,0.000,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,fireColorado2012,wildfire,217030749856088066,[],Low,I have it on good authority that most of Color...,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",,0.0,,...,26,29,0,32,0.0552,0.177,0.610,0.214,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,23,25,0,26,0.0000,0.000,1.000,0.000,0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72468,covidNewZealand2020,covid,1296006183178784768,"[FirstPartyObservation, MultimediaShare, Advice]",Low,Its personal choice to wear a mask\nDon't put ...,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1296006181022916608, 'id_str...",19.0,,...,24,21,1,24,-0.3412,0.099,0.901,0.000,0.25,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
72469,covidNewZealand2020,covid,1296214212046237698,[Irrelevant],Low,The Government did the right thing. They shoul...,"{'hashtags': [], 'symbols': [], 'user_mentions...",,1.0,,...,16,16,1,18,-0.0516,0.173,0.663,0.163,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
72470,houstonExplosion2020,explosion,1220704310520094720,"[ThirdPartyObservation, Location, EmergingThre...",Low,Um. Jon? You get jolted awake early this morni...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,10,12,1,13,0.0000,0.000,1.000,0.000,0.25,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."
72471,texasAMCommerceShooting2020,shooting,1224419435043123200,"[ThirdPartyObservation, Location, MultimediaSh...",Low,"2 dead, 1 hurt in shooting at college residenc...","{'hashtags': [{'text': 'SmartNews', 'indices':...",,0.0,,...,12,13,1,13,-0.8271,0.490,0.510,0.000,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, ..."
