In [87]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation

from tqdm import tqdm

import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir

import random

In [88]:
import nltk
nltk.download("stopwords")
nltk.download('vader_lexicon')
from nltk.corpus import stopwords

from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS
import re
import itertools 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/g/g01115/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/g/g01115/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [89]:
baseJsonPath = '/research/cbuntain/datasets/twitter/trecis/allEventJson/'
baseFileName = 'TRECIS-CTIT-H-*.json.gz' #* is to be replaced

In [90]:
#Dataframe  generated in analye-data.ipynb
df = pd.read_json("./Trec_data/labeled.json", orient='records',lines=True)
df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText
0,stormJorge2020,typhoon,1231307896362807296,[Irrelevant],Low,Flood Warning: River Severn at Hanley Castle a...
1,stormJorge2020,typhoon,1231569665043976192,[Irrelevant],Low,Flood Warning: River Ouse at Naburn Lock 12:46...
2,stormJorge2020,typhoon,1232264304067477504,[Irrelevant],Low,Our Assistant Director of Care and Support kin...
3,stormJorge2020,typhoon,1232070602778959872,[Irrelevant],Low,@hollywills please can you help support @HopeR...
4,stormJorge2020,typhoon,1232648900105965568,[Irrelevant],Low,Police order 'immediate evacuation' in Shropsh...
...,...,...,...,...,...,...
91510,whaleyBridgeCollapse2020,flood,1155430270457323520,[Irrelevant],Low,Flood Alert: River Ecclesbourne in Derbyshire ...
91511,whaleyBridgeCollapse2020,flood,1156993824591417344,"[Location, EmergingThreats, MultimediaShare, N...",High,Dam at Whaley Bridge in Peak District threaten...
91512,whaleyBridgeCollapse2020,flood,1157020257388769280,"[ThirdPartyObservation, Location, MultimediaSh...",Low,Floods in Whaley Bridge today.\nhttps://t.co/7...
91513,whaleyBridgeCollapse2020,flood,1156926115069485056,"[MovePeople, ThirdPartyObservation, Location, ...",Critical,Evacuation of Whaley Bridge | Derbyshire Const...


In [91]:
#Run through each json file and see if it relates
recordCount = 123
n = 6
nCount = 0
full_df = pd.DataFrame()

for i in tqdm(range(1, recordCount), position=0, leave=True):
    filepath = baseJsonPath + baseFileName.replace('*', str(i).zfill(3))
    json = pd.read_json(filepath, orient='records',lines=True)
    temp_df = pd.merge(df, json, how='inner', left_on='postID', right_on='id')
    full_df = pd.concat([full_df, temp_df], ignore_index=True)
    if recordCount / n * nCount < i:
        print(full_df.shape)
        nCount += 1
    


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  full_df = pd.concat([full_df, temp_df], ignore_index=True)
  2%|▏         | 2/122 [00:00<00:50,  2.38it/s]

(153, 34)


 17%|█▋        | 21/122 [00:16<03:15,  1.94s/it]

(14518, 40)


 34%|███▍      | 42/122 [00:25<00:25,  3.10it/s]

(31437, 65)


 51%|█████     | 62/122 [02:03<07:11,  7.19s/it]

(47352, 65)


 68%|██████▊   | 83/122 [04:58<03:16,  5.05s/it]

(55230, 71)


 84%|████████▍ | 103/122 [09:29<03:55, 12.39s/it]

(55860, 71)


100%|██████████| 122/122 [14:10<00:00,  6.97s/it]


In [92]:
full_df.columns

Index(['contributors', 'contributorsIDs', 'coordinates', 'createdAt',
       'created_at', 'currentUserRetweetId', 'displayTextRangeEnd',
       'displayTextRangeStart', 'display_text_range', 'entities', 'eventID',
       'eventType', 'extended_entities', 'extended_tweet', 'favoriteCount',
       'favorite_count', 'favorited', 'filter_level', 'full_text', 'geo',
       'geoLocation', 'hashtagEntities', 'id', 'id_str', 'inReplyToScreenName',
       'inReplyToStatusId', 'inReplyToUserId', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str', 'isFavorited',
       'isPossiblySensitive', 'isRetweeted', 'isTruncated', 'is_quote_status',
       'lang', 'matching_rules', 'mediaEntities', 'metadata', 'place',
       'possibly_sensitive', 'possibly_sensitive_appealable', 'postCategories',
       'postID', 'postPriority', 'postText', 'quote_count', 'quotedStatus',
       'quotedStatusId', 'quoted_status', 

In [93]:
full_df.loc[pd.isna(full_df['postText'])]['eventID'].explode().unique()

array(['fireColorado2012', 'costaRicaEarthquake2012', 'floodColorado2013',
       'typhoonPablo2012', 'laAirportShooting2013',
       'westTexasExplosion2013', 'guatemalaEarthquake2012',
       'italyEarthquakes2012', 'philipinnesFloods2012',
       'albertaFloods2013', 'australiaBushfire2013', 'bostonBombings2013',
       'manilaFloods2013', 'queenslandFloods2013', 'typhoonYolanda2013',
       'joplinTornado2011', 'chileEarthquake2014', 'typhoonHagupit2014',
       'nepalEarthquake2015', 'flSchoolShooting2018', 'parisAttacks2015'],
      dtype=object)

In [94]:
full_df.loc[pd.isna(full_df['eventType'])]

Unnamed: 0,contributors,contributorsIDs,coordinates,createdAt,created_at,currentUserRetweetId,displayTextRangeEnd,displayTextRangeStart,display_text_range,entities,...,source,symbolEntities,text,timestamp_ms,topic,truncated,urlEntities,user,userMentionEntities,withheld_in_countries


In [95]:
full_df.columns.tolist()

['contributors',
 'contributorsIDs',
 'coordinates',
 'createdAt',
 'created_at',
 'currentUserRetweetId',
 'displayTextRangeEnd',
 'displayTextRangeStart',
 'display_text_range',
 'entities',
 'eventID',
 'eventType',
 'extended_entities',
 'extended_tweet',
 'favoriteCount',
 'favorite_count',
 'favorited',
 'filter_level',
 'full_text',
 'geo',
 'geoLocation',
 'hashtagEntities',
 'id',
 'id_str',
 'inReplyToScreenName',
 'inReplyToStatusId',
 'inReplyToUserId',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'isFavorited',
 'isPossiblySensitive',
 'isRetweeted',
 'isTruncated',
 'is_quote_status',
 'lang',
 'matching_rules',
 'mediaEntities',
 'metadata',
 'place',
 'possibly_sensitive',
 'possibly_sensitive_appealable',
 'postCategories',
 'postID',
 'postPriority',
 'postText',
 'quote_count',
 'quotedStatus',
 'quotedStatusId',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',


In [97]:
#Reorder columns and remove certain non-neccessary columns
cols = ['eventID',
 'eventType',
 'postID',
 'postCategories',
 'postPriority',
 'text',
 'contributors',
 'contributorsIDs',
 'coordinates', #Not useful for
 'createdAt', #When there is two of these it tends to be a list of dtype and of objects
 'created_at',
 'currentUserRetweetId',
 'displayTextRangeEnd', #Unsure how these three variables could be useful
 'displayTextRangeStart',
 'display_text_range',
 'entities',
 'extended_entities',
 'extended_tweet',
 'favorite_count',
 'filter_level', #Low variability
 'geo',
 'geoLocation',
 'hashtagEntities',
 'inReplyToScreenName',
 'inReplyToStatusId',
 'inReplyToUserId',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'isFavorited',
 'isPossiblySensitive', #Applies to 14 rows
 'isRetweeted',
 'isTruncated',
 'is_quote_status',
 'lang',
 'matching_rules',
 'mediaEntities',
 'metadata',
 'place', #Could be really useful, containes a lot of location data, for now disabled because its too much info and also low reliability
 'possibly_sensitive', #Might have value
 'possibly_sensitive_appealable', 
 #'processed_text',
 'quote_count',
 'quotedStatus',
 'quotedStatusId',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'quoted_status_permalink',
 'reply_count',
 'retweet_count', #Utilize to replace if retweeted, maybe use to double check retweeted_status
 'retweeted',
 'retweeted_status',
 'scopes', #Only NaN values
 'source', #No clue how this may be utilized
 'symbolEntities',
 'timestamp_ms', #Time series bad
 'topic', #Same as eventID pretty much, I think
 'truncated', #Useful, mark NaN as 0 and switch to boolean?
 'urlEntities', #Has information regarding urls
 'user', #Very useful but there is a lot of associated data
 'withheld_in_countries' #Only relates to 4 tweets
       ]
full_df = full_df[cols]
full_df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,text,contributors,contributorsIDs,coordinates,createdAt,...,retweeted_status,scopes,source,symbolEntities,timestamp_ms,topic,truncated,urlEntities,user,withheld_in_countries
0,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,,,,,...,"{'contributors': None, 'text': 'The copter is ...",,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,NaT,TRECIS-CTIT-H-001,False,,"{'location': 'Boulder, Colorado', 'default_pro...",
1,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,,,,,...,"{'retweeted': False, 'in_reply_to_screen_name'...",,"<a href=""http://twitter.com/download/iphone"" r...",,NaT,TRECIS-CTIT-H-001,False,,"{'location': 'Denver', 'default_profile': Fals...",
2,fireColorado2012,wildfire,217732012314861568,[FirstPartyObservation],Medium,2 wildfires in Boulder County. We can see smok...,,,,,...,,,"<a href=""http://twitter.com/download/android"" ...",,NaT,TRECIS-CTIT-H-001,False,,"{'location': 'Beautiful Colorado', 'default_pr...",
3,fireColorado2012,wildfire,216961334129078272,[Discussion],Low,RT @Jon_G3: Seeing 1/3 of Colorado on fire mak...,,,,,...,"{'contributors': None, 'text': 'Seeing 1/3 of ...",,"<a href=""http://www.tweetcaster.com"" rel=""nofo...",,NaT,TRECIS-CTIT-H-001,False,,"{'location': 'Salem, OR', 'default_profile': F...",
4,fireColorado2012,wildfire,212552860590813184,[MultimediaShare],Medium,RT @dhorning11: RT @LarimerCounty: #HighParkFi...,,,,,...,,,"<a href=""https://about.twitter.com/products/tw...",,NaT,TRECIS-CTIT-H-001,False,,"{'location': 'San Francisco', 'default_profile...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56025,tennesseeTornadoOutbreak2020,tornado,1235236359310368768,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Hottest Google Search in 31.2 hrs. Nashville t...,,,,,...,,,"<a href=""https://trendsmetadata.com"" rel=""nofo...",,2020-03-04 16:11:12.298,TRECIS-CTIT-H-120,True,,"{'id': 290889806, 'id_str': '290889806', 'name...",
56026,tennesseeTornadoOutbreak2020,tornado,1235337290144239616,"[ThirdPartyObservation, Location, MultimediaSh...",Low,A live report is next on the Nashville tornado...,,,,,...,,,"<a href=""http://twitter.com/download/iphone"" r...",,2020-03-04 22:52:16.084,TRECIS-CTIT-H-120,False,,"{'id': 579146884, 'id_str': '579146884', 'name...",
56027,tennesseeTornadoOutbreak2020,tornado,1235258820139638784,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Officials are still cleaning up after tornadoe...,,,,,...,,,"<a href=""http://www.socastdigital.com"" rel=""no...",,2020-03-04 17:40:27.377,TRECIS-CTIT-H-120,False,,"{'id': 829719163, 'id_str': '829719163', 'name...",
56028,tennesseeTornadoOutbreak2020,tornado,1235253249957126144,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Putnam County: Cookeville area tornado victims...,,,,,...,,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,2020-03-04 17:18:19.342,TRECIS-CTIT-H-120,False,,"{'id': 825386050243682311, 'id_str': '82538605...",


In [98]:
full_df.to_json("./Trec_data/Full_Labelled.json", orient='records',lines=True)

In [99]:
full_df = pd.read_json("./Trec_data/Full_Labelled.json", orient='records',lines=True)
full_df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,text,contributors,contributorsIDs,coordinates,createdAt,...,retweeted_status,scopes,source,symbolEntities,timestamp_ms,topic,truncated,urlEntities,user,withheld_in_countries
0,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,,,,,...,"{'contributors': None, 'text': 'The copter is ...",,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Boulder, Colorado', 'default_pro...",
1,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,,,,,...,"{'retweeted': False, 'in_reply_to_screen_name'...",,"<a href=""http://twitter.com/download/iphone"" r...",,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Denver', 'default_profile': Fals...",
2,fireColorado2012,wildfire,217732012314861568,[FirstPartyObservation],Medium,2 wildfires in Boulder County. We can see smok...,,,,,...,,,"<a href=""http://twitter.com/download/android"" ...",,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Beautiful Colorado', 'default_pr...",
3,fireColorado2012,wildfire,216961334129078272,[Discussion],Low,RT @Jon_G3: Seeing 1/3 of Colorado on fire mak...,,,,,...,"{'contributors': None, 'text': 'Seeing 1/3 of ...",,"<a href=""http://www.tweetcaster.com"" rel=""nofo...",,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Salem, OR', 'default_profile': F...",
4,fireColorado2012,wildfire,212552860590813184,[MultimediaShare],Medium,RT @dhorning11: RT @LarimerCounty: #HighParkFi...,,,,,...,,,"<a href=""https://about.twitter.com/products/tw...",,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'San Francisco', 'default_profile...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56025,tennesseeTornadoOutbreak2020,tornado,1235236359310368768,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Hottest Google Search in 31.2 hrs. Nashville t...,,,,,...,,,"<a href=""https://trendsmetadata.com"" rel=""nofo...",,2020-03-04 16:11:12.298,TRECIS-CTIT-H-120,1.0,,"{'id': 290889806, 'id_str': '290889806', 'name...",
56026,tennesseeTornadoOutbreak2020,tornado,1235337290144239616,"[ThirdPartyObservation, Location, MultimediaSh...",Low,A live report is next on the Nashville tornado...,,,,,...,,,"<a href=""http://twitter.com/download/iphone"" r...",,2020-03-04 22:52:16.084,TRECIS-CTIT-H-120,0.0,,"{'id': 579146884, 'id_str': '579146884', 'name...",
56027,tennesseeTornadoOutbreak2020,tornado,1235258820139638784,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Officials are still cleaning up after tornadoe...,,,,,...,,,"<a href=""http://www.socastdigital.com"" rel=""no...",,2020-03-04 17:40:27.377,TRECIS-CTIT-H-120,0.0,,"{'id': 829719163, 'id_str': '829719163', 'name...",
56028,tennesseeTornadoOutbreak2020,tornado,1235253249957126144,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Putnam County: Cookeville area tornado victims...,,,,,...,,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,2020-03-04 17:18:19.342,TRECIS-CTIT-H-120,0.0,,"{'id': 825386050243682311, 'id_str': '82538605...",


In [100]:
#Useful for checking features for usefulness

column = 'text'
#print(full_df[column].dtype)
if full_df[column].dtype != 'object':
    print(full_df[column].explode().unique())
    print(full_df[column])
else:
    print(full_df.loc[full_df[column]== full_df[column]][column])
full_df.loc[full_df[column]== None]
#full_df[column][52869]

0        RT @CBSDenver: The copter is on the way to the...
1        RT @ColoradoRapids: Photo of #FlagStaffFire in...
2        2 wildfires in Boulder County. We can see smok...
3        RT @Jon_G3: Seeing 1/3 of Colorado on fire mak...
4        RT @dhorning11: RT @LarimerCounty: #HighParkFi...
                               ...                        
56025    Hottest Google Search in 31.2 hrs. Nashville t...
56026    A live report is next on the Nashville tornado...
56027    Officials are still cleaning up after tornadoe...
56028    Putnam County: Cookeville area tornado victims...
56029    All Bad: Guy Gets Stuck 375 Ft High In A Crane...
Name: text, Length: 38076, dtype: object


Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,text,contributors,contributorsIDs,coordinates,createdAt,...,retweeted_status,scopes,source,symbolEntities,timestamp_ms,topic,truncated,urlEntities,user,withheld_in_countries


In [101]:
#Reorder columns and remove certain non-neccessary columns
cols = ['eventID',
 'eventType',
 'postID',
 'postCategories',
 'postPriority',
 'text',
 #'contributors',
 #'contributorsIDs',
 #'coordinates', #Not useful for
 #'createdAt', #When there is two of these it tends to be a list of dtype and of objects
 #'created_at',
 #'currentUserRetweetId',
 #'displayTextRangeEnd', #Unsure how these three variables could be useful
 #'displayTextRangeStart',
 #'display_text_range',
 'entities',
 'extended_entities',
 #'extended_tweet',
 'favorite_count',
 #'filter_level', #Low variability
 #'geo',
 #'geoLocation',
 'hashtagEntities',
 'inReplyToScreenName',
 'inReplyToStatusId',
 'inReplyToUserId',
 #'in_reply_to_screen_name',
 #'in_reply_to_status_id',
 #'in_reply_to_status_id_str',
 #'in_reply_to_user_id',
 #'in_reply_to_user_id_str',
 'isFavorited',
 #'isPossiblySensitive', #Applies to 14 rows
 'isRetweeted',
 'isTruncated',
 'is_quote_status',
 'lang',
 'matching_rules',
 'mediaEntities',
 'metadata',
 #'place', #Could be really useful, containes a lot of location data, for now disabled because its too much info and also low reliability
 'possibly_sensitive', #Might have value
 #'possibly_sensitive_appealable',
# 'processed_text',
 #'quote_count',
 #'quotedStatus',
 #'quotedStatusId',
 #'quoted_status',
 #'quoted_status_id',
 #'quoted_status_id_str',
 #'quoted_status_permalink',
 #'reply_count',
 'retweet_count', #Utilize to replace if retweeted, maybe use to double check retweeted_status
 #'retweeted',
 'retweeted_status',
 #'scopes', #Only NaN values
 #'source', No clue how this may be utilized
 #'symbolEntities',
 #'timestamp_ms', #Time series bad
 #'topic', #Same as eventID pretty much, I think
 'truncated', #Useful, mark NaN as 0 and switch to boolean?
 #'urlEntities', #Has information regarding urls
 'user' #Very useful but there is a lot of associated data
 #'userMentionEntities', #Useful somehow i'm sure just don't know right now
 #'withheld_in_countries' #Only relates to 4 tweets
       ]
full_df = full_df[cols]
full_df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,text,entities,extended_entities,favorite_count,hashtagEntities,...,is_quote_status,lang,matching_rules,mediaEntities,metadata,possibly_sensitive,retweet_count,retweeted_status,truncated,user
0,fireColorado2012,wildfire,212311994286620672,[MultimediaShare],Unknown,RT @CBSDenver: The copter is on the way to the...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,en,,,,,2.0,"{'contributors': None, 'text': 'The copter is ...",0.0,"{'location': 'Boulder, Colorado', 'default_pro..."
1,fireColorado2012,wildfire,217746356842926080,[MultimediaShare],Medium,RT @ColoradoRapids: Photo of #FlagStaffFire in...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,en,,,,,11.0,"{'retweeted': False, 'in_reply_to_screen_name'...",0.0,"{'location': 'Denver', 'default_profile': Fals..."
2,fireColorado2012,wildfire,217732012314861568,[FirstPartyObservation],Medium,2 wildfires in Boulder County. We can see smok...,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",,0.0,,...,0.0,en,,,,,0.0,,0.0,"{'location': 'Beautiful Colorado', 'default_pr..."
3,fireColorado2012,wildfire,216961334129078272,[Discussion],Low,RT @Jon_G3: Seeing 1/3 of Colorado on fire mak...,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",,0.0,,...,0.0,en,,,,,7.0,"{'contributors': None, 'text': 'Seeing 1/3 of ...",0.0,"{'location': 'Salem, OR', 'default_profile': F..."
4,fireColorado2012,wildfire,212552860590813184,[MultimediaShare],Medium,RT @dhorning11: RT @LarimerCounty: #HighParkFi...,"{'symbols': [], 'urls': [{'expanded_url': 'htt...",,0.0,,...,0.0,en,,,,0.0,1.0,,0.0,"{'location': 'San Francisco', 'default_profile..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56025,tennesseeTornadoOutbreak2020,tornado,1235236359310368768,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Hottest Google Search in 31.2 hrs. Nashville t...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,0.0,en,"[{'tag': 'terms', 'id': 386668493812155987, 'i...",,,0.0,0.0,,1.0,"{'id': 290889806, 'id_str': '290889806', 'name..."
56026,tennesseeTornadoOutbreak2020,tornado,1235337290144239616,"[ThirdPartyObservation, Location, MultimediaSh...",Low,A live report is next on the Nashville tornado...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,0.0,,...,0.0,en,"[{'tag': 'terms', 'id': 386668493812155987, 'i...",,,,0.0,,0.0,"{'id': 579146884, 'id_str': '579146884', 'name..."
56027,tennesseeTornadoOutbreak2020,tornado,1235258820139638784,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Officials are still cleaning up after tornadoe...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,0.0,,...,0.0,en,"[{'tag': 'terms', 'id': 386668493812155987, 'i...",,,,0.0,,0.0,"{'id': 829719163, 'id_str': '829719163', 'name..."
56028,tennesseeTornadoOutbreak2020,tornado,1235253249957126144,"[ThirdPartyObservation, Weather, Location, Mul...",Low,Putnam County: Cookeville area tornado victims...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,0.0,,...,0.0,en,"[{'tag': 'terms', 'id': 386668493812155987, 'i...",,,0.0,0.0,,0.0,"{'id': 825386050243682311, 'id_str': '82538605..."


In [102]:
#Remove null text values
full_df = full_df[(~full_df["text"].isnull())]


In [103]:
full_df.to_json("./Trec_data/Feature_Reduction.json", orient='records',lines=True)

In [104]:
full_df = pd.read_json("./Trec_data/Feature_Reduction.json", orient='records',lines=True)


In [105]:
# Generate Additional Features
local_tokenizer = TweetTokenizer()
def tokenizer_wrapper(text):
    return local_tokenizer.tokenize(text)

sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    #return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))
    return(parsed_text)


other_features_names = ["num_chars", "num_chars_total", 
                        "num_terms", "num_words", "num_unique_words", "vader neg", "vader pos",
                        "vader neu", "vader compound", 
                        "num_hashtags", "num_mentions", 
                        "num_urls", 
                        "is_retweet", "num_media",
                        "is_verified", 
                        "caps_ratio"]

## Taken from Davidson et al.
def other_features(data):
    new_df = pd.DataFrame()
    
    for index, tweet in tqdm(data.iterrows(), total=data.shape[0], position=0, leave=True):
        #print(tweet)
        """This function takes a string and returns a list of features.
        These include Sentiment scores, Text and Readability scores,
        as well as Twitter specific features"""
        tweet_text = tweet["text"]

        ##SENTIMENT
        sentiment = sentiment_analyzer.polarity_scores(tweet_text)

        words = local_tokenizer.tokenize(tweet_text) #Get text only

        num_chars = sum(len(w) for w in words) #num chars in words
        num_chars_total = len(tweet_text)
        num_terms = len(tweet_text.split())
        num_words = len(words)
        num_unique_terms = len(set([x.lower() for x in words]))

        caps_count = sum([1 if x.isupper() else 0 for x in tweet_text])
        caps_ratio = caps_count / num_chars_total

        parsed_text = count_twitter_objs(tweet_text) #Count #, @, and http://
        twitter_objs = (parsed_text.count('URLHERE'), parsed_text.count('MENTIONHERE'), parsed_text.count('HASHTAGHERE'))
        num_media = 0
        if "entities" in tweet and tweet["entities"] != None and "media" in tweet["entities"]:
                num_media = len(tweet["entities"]["media"])
        retweet = 0
        if "rt" in words or "retweeted_status" in tweet:
            retweet = 1
        

        has_place = 1 if "coordinates" in tweet else 0

        author = tweet["user"]
        is_verified = 1 if ("verified" in author and author["verified"]) else 0
        
        features = [num_chars, num_chars_total, num_terms, num_words, num_unique_terms,
                    sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                    twitter_objs[2], twitter_objs[1], twitter_objs[0],
                    retweet, num_media, is_verified, caps_ratio]
        
        
        features = [round(x, 4) for x in features]
        
        new_row = {}
        for feature, name in zip(features, other_features_names):
            new_row[name] = feature
        
        #new_row['postID'] = tweet['postID']
        
        new_df = new_df.append(new_row, ignore_index=True)
        
        
    return new_df


In [106]:
#Creating extra features
other_ftr_df = other_features(full_df)
other_ftr_df

100%|██████████| 38076/38076 [01:22<00:00, 461.01it/s]


Unnamed: 0,caps_ratio,is_retweet,is_verified,num_chars,num_chars_total,num_hashtags,num_media,num_mentions,num_terms,num_unique_words,num_urls,num_words,vader compound,vader neg,vader neu,vader pos
0,0.0929,1.0,0.0,116.0,140.0,1.0,0.0,2.0,25.0,25.0,0.0,28.0,0.0000,0.000,1.000,0.00
1,0.1214,1.0,0.0,117.0,140.0,1.0,0.0,2.0,23.0,25.0,0.0,26.0,0.0000,0.000,1.000,0.00
2,0.0492,1.0,0.0,50.0,61.0,0.0,0.0,0.0,12.0,13.0,0.0,14.0,0.0000,0.000,1.000,0.00
3,0.0667,1.0,0.0,114.0,135.0,1.0,0.0,1.0,22.0,22.0,0.0,23.0,-0.6124,0.306,0.563,0.13
4,0.1273,1.0,0.0,96.0,110.0,1.0,0.0,2.0,15.0,17.0,1.0,20.0,0.0000,0.000,1.000,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38071,0.0714,1.0,0.0,125.0,140.0,0.0,0.0,0.0,16.0,14.0,1.0,22.0,0.0000,0.000,1.000,0.00
38072,0.1111,1.0,1.0,44.0,54.0,0.0,0.0,1.0,11.0,11.0,0.0,11.0,0.0000,0.000,1.000,0.00
38073,0.0565,1.0,0.0,105.0,124.0,0.0,0.0,0.0,20.0,21.0,0.0,22.0,0.0000,0.000,1.000,0.00
38074,0.1111,1.0,0.0,74.0,81.0,0.0,0.0,0.0,8.0,9.0,1.0,9.0,-0.3182,0.247,0.753,0.00


In [107]:
#Combine the extra features into original database
featured_df = pd.concat([full_df, other_ftr_df], axis=1)

In [110]:
featured_df.columns

Index(['eventID', 'eventType', 'postID', 'postCategories', 'postPriority',
       'text', 'entities', 'extended_entities', 'favorite_count',
       'hashtagEntities', 'inReplyToScreenName', 'inReplyToStatusId',
       'inReplyToUserId', 'isFavorited', 'isRetweeted', 'isTruncated',
       'is_quote_status', 'lang', 'matching_rules', 'mediaEntities',
       'metadata', 'possibly_sensitive', 'retweet_count', 'retweeted_status',
       'truncated', 'user', 'caps_ratio', 'is_retweet', 'is_verified',
       'num_chars', 'num_chars_total', 'num_hashtags', 'num_media',
       'num_mentions', 'num_terms', 'num_unique_words', 'num_urls',
       'num_words', 'vader compound', 'vader neg', 'vader neu', 'vader pos'],
      dtype='object')

In [111]:
# generate sentnece embedding
class SBERT:

    def __init__(self, lang="en"):
        from sentence_transformers import SentenceTransformer
        self.name = "SBERT"
        if lang == "fr":
            self.model = SentenceTransformer(
                "/home/bmazoyer/Dev/pytorch_bert/output/sts_fr_long_multilingual_bert-2019-10-01_15-07-03")
        elif lang == "en": #Does this need to be changed?
            self.model = SentenceTransformer(
                # "bert-large-nli-stsb-mean-tokens"
                "roberta-large-nli-stsb-mean-tokens"
            )
# roberta-large-nli-stsb-mean-tokens
    def compute_vectors(self, data):
        data["processed"] = data.text.str.slice(0, 500)
        vectors = np.array(self.model.encode(data.text.tolist()))
        return vectors

In [112]:
sbert=SBERT()

In [None]:
v=sbert.compute_vectors(featured_df)
featured_df['vectorized_text']=[item for item in tqdm(v)]
featured_df['vectorized_text']

In [None]:
#Save new df
featured_df.to_json("./Trec_data/Features_Labeled.json", orient='records',lines=True)

In [None]:
featured_df = pd.read_json("./Trec_data/Features_Labeled.json", orient='records',lines=True)
featured_df

In [None]:
#Change priority target, might not be needed
priorityDict = {'Unknown':0.0, 'Low':0.25, 'Medium':.5, 'High':.75, 'Critical':1}
featured_df['regression_priority']=[priorityDict[item] for item in featured_df['postPriority']]
featured_df[['regression_priority', 'postPriority']]

In [None]:
#Change categories to be multiple boolean columns
categories = featured_df['postCategories'].explode().unique()
cat_df = pd.DataFrame()
for cat in categories:
    cat_df['bool_' + cat] = featured_df['postCategories'].isin(cat)
cat_df

In [None]:
#Save new df
featured_df.to_json("./Trec_data/preprocessed.json", orient='records',lines=True)

In [None]:
#Load new df to skip above processes
featured_df = pd.read_json("./Trec_data/preprocessed.json", orient='records',lines=True)
featured_df