In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.base import clone
import json
from pandas.io.json import json_normalize

from tqdm import tqdm

import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir
from sklearn.preprocessing import MultiLabelBinarizer

import random



In [2]:
import nltk
nltk.download("stopwords")
nltk.download('vader_lexicon')
from nltk.corpus import stopwords

from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS
import re
import itertools 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/g/g01107/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/g/g01107/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
baseJsonPath = '/research/cbuntain/datasets/twitter/trecis/allEventJson/'
baseFileName = 'TRECIS-CTIT-H-*.json.gz' #* is to be replaced

In [4]:
#Opens the data and reorganizes it by tweets instead of by events
with open('./Trec_data/labeled_by_event.json') as f:
    js = json.load(f)
df = json_normalize(data=js['events'], record_path='tweets')
df.to_json('./Trec_data/labeled.json', orient='records', lines=True)

df['postID'] = df['postID'].astype(int)

df

Unnamed: 0,eventID,eventType,postID,postCategories,postPriority,postText
0,stormJorge2020,typhoon,1231307896362807298,[Irrelevant],Low,Flood Warning: River Severn at Hanley Castle a...
1,stormJorge2020,typhoon,1231569665043976192,[Irrelevant],Low,Flood Warning: River Ouse at Naburn Lock 12:46...
2,stormJorge2020,typhoon,1232264304067477504,[Irrelevant],Low,Our Assistant Director of Care and Support kin...
3,stormJorge2020,typhoon,1232070602778959872,[Irrelevant],Low,@hollywills please can you help support @HopeR...
4,stormJorge2020,typhoon,1232648900105965568,[Irrelevant],Low,Police order 'immediate evacuation' in Shropsh...
...,...,...,...,...,...,...
91510,whaleyBridgeCollapse2020,flood,1155430270457323520,[Irrelevant],Low,Flood Alert: River Ecclesbourne in Derbyshire ...
91511,whaleyBridgeCollapse2020,flood,1156993824591417346,"[Location, EmergingThreats, MultimediaShare, N...",High,Dam at Whaley Bridge in Peak District threaten...
91512,whaleyBridgeCollapse2020,flood,1157020257388769280,"[ThirdPartyObservation, Location, MultimediaSh...",Low,Floods in Whaley Bridge today.\nhttps://t.co/7...
91513,whaleyBridgeCollapse2020,flood,1156926115069485056,"[MovePeople, ThirdPartyObservation, Location, ...",Critical,Evacuation of Whaley Bridge | Derbyshire Const...


In [None]:
#Run through each json file and see if it relates
recordCount = 123
full_df = pd.DataFrame()

for i in tqdm(range(1, recordCount), position=0, leave=True):
    filepath = baseJsonPath + baseFileName.replace('*', str(i).zfill(3))
    jsonDF = pd.read_json(filepath, orient='records',lines=True)
    temp_df = pd.merge(df, jsonDF, how='inner', left_on='postID', right_on='id')
    full_df = pd.concat([full_df, temp_df], ignore_index=True)
    

full_df

In [None]:
#Save full_df
full_df.to_json('./Trec_data/full_comb_labeled.json', orient='records', lines=True)

In [5]:
full_df = pd.read_json("./Trec_data/full_comb_labeled.json", orient='records',lines=True)
full_df

Unnamed: 0,contributors,contributorsIDs,coordinates,createdAt,created_at,currentUserRetweetId,displayTextRangeEnd,displayTextRangeStart,display_text_range,entities,...,source,symbolEntities,text,timestamp_ms,topic,truncated,urlEntities,user,userMentionEntities,withheld_in_countries
0,,,,,2012-06-12 02:07:42,,,,,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,"The High Park fire west of Fort Collins, #CO h...",NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'United States', 'default_profile...",,
1,,,"{'type': 'Point', 'coordinates': [-105.1348135...",,2012-06-26 22:22:32,,,,,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",...,"<a href=""http://twitter.com/download/iphone"" r...",,Pic of the #FlagstaffFire in boulder from our ...,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Erie, Co', 'default_profile': Tr...",,
2,,,,,2012-06-11 22:34:58,,,,,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,RT @CBSDenver: The copter is on the way to the...,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Boulder, Colorado', 'default_pro...",,
3,,,,,2012-06-24 23:05:37,,,,,"{'symbols': [], 'urls': [], 'hashtags': [], 'u...",...,"<a href=""http://tapbots.com/tweetbot"" rel=""nof...",,I have it on good authority that most of Color...,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Louisville, KY', 'default_profil...",,
4,,,,,2012-06-26 22:29:11,,,,,"{'symbols': [], 'urls': [], 'hashtags': [{'tex...",...,"<a href=""http://twitter.com/download/iphone"" r...",,RT @ColoradoRapids: Photo of #FlagStaffFire in...,NaT,TRECIS-CTIT-H-001,0.0,,"{'location': 'Denver', 'default_profile': Fals...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101108,,,,,2020-03-04 17:18:19,,,,,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,Putnam County: Cookeville area tornado victims...,2020-03-04 17:18:19.342,TRECIS-CTIT-H-120,0.0,,"{'id': 825386050243682311, 'id_str': '82538605...",,
101109,,,,,2020-03-04 15:53:11,,,,,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",...,"<a href=""http://twitter.com/download/iphone"" r...",,All Bad: Guy Gets Stuck 375 Ft High In A Crane...,2020-03-04 15:53:11.876,TRECIS-CTIT-H-120,0.0,,"{'id': 1192317385719648256, 'id_str': '1192317...",,
101110,,,,,2020-03-04 18:36:46,,,,,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,"Earth Catastrophe Resources, Nashville Storm f...",2020-03-04 18:36:46.758,TRECIS-CTIT-H-120,0.0,,"{'id': 1198906372659830784, 'id_str': '1198906...",,
101111,,,,,2020-03-04 18:36:20,,,,,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,"Earth Catastrophe Resources, Nashville Storm f...",2020-03-04 18:36:20.643,TRECIS-CTIT-H-120,0.0,,"{'id': 1198892581553393664, 'id_str': '1198892...",,


In [7]:
eventTypes = df['eventType'].unique()
for event in eventTypes:
    eventsDF = df.loc[df['eventType']==event]['eventID'].unique()
    events = full_df.loc[full_df['eventType']==event]['eventID'].unique()
    print(event + ': ' + str(eventsDF.size) + ' : ' + str(events.size))
    print(eventsDF)
    print(events)

typhoon: 13 : 13
['stormJorge2020' 'hurricaneBarry2020' 'joplinTornado2011'
 'cycloneKenneth2019' 'tropicalStormCristobal2020' 'stormDennis2020'
 'typhoonLekima2020' 'typhoonKrosa2020' 'typhoonHagupit2014'
 'stormCiara2020' 'typhoonPablo2012' 'typhoonYolanda2013'
 'hurricaneFlorence2018']
['typhoonPablo2012' 'typhoonYolanda2013' 'joplinTornado2011'
 'typhoonHagupit2014' 'hurricaneFlorence2018' 'cycloneKenneth2019'
 'hurricaneBarry2020' 'typhoonLekima2020' 'typhoonKrosa2020'
 'stormCiara2020' 'stormDennis2020' 'stormJorge2020'
 'tropicalStormCristobal2020']
storm: 2 : 2
['tennesseeDerecho2020' 'southeastTornadoOutbreak2020']
['southeastTornadoOutbreak2020' 'tennesseeDerecho2020']
wildfire: 5 : 5
['australiaBushfire2013' 'albertaWildfires2019' 'fireColorado2012'
 'siberianWildfires2020' 'fireYMM2016']
['fireColorado2012' 'australiaBushfire2013' 'fireYMM2016'
 'albertaWildfires2019' 'siberianWildfires2020']
covid: 10 : 10
['covidNYC2020' 'covidHouston2020' 'covidDC2020' 'covidSeattle2020'

In [18]:
nullTexts = ['postText', 'text', 'full_text']

for textVar in nullTexts:
    print(textVar)
    print()
    events = full_df.loc[pd.isna(full_df[textVar])].eventID.unique()
    for event in events:
        total = full_df.loc[pd.isna(full_df[textVar]) & (full_df['eventID']==event)].size
        partial = full_df.loc[pd.isna(full_df[textVar]) & (full_df['eventID']==event)].size
        percentage = 100*partial/total
        print(event + ': ' + str(percentage) +'%')
    print()


postText

fireColorado2012: 100.0%
costaRicaEarthquake2012: 100.0%
floodColorado2013: 100.0%
typhoonPablo2012: 100.0%
laAirportShooting2013: 100.0%
westTexasExplosion2013: 100.0%
guatemalaEarthquake2012: 100.0%
italyEarthquakes2012: 100.0%
philipinnesFloods2012: 100.0%
albertaFloods2013: 100.0%
australiaBushfire2013: 100.0%
bostonBombings2013: 100.0%
manilaFloods2013: 100.0%
queenslandFloods2013: 100.0%
typhoonYolanda2013: 100.0%
joplinTornado2011: 100.0%
chileEarthquake2014: 100.0%
typhoonHagupit2014: 100.0%
nepalEarthquake2015: 100.0%
flSchoolShooting2018: 100.0%
parisAttacks2015: 100.0%

text

covidNYC2020: 100.0%
covidDC2020: 100.0%
covidWashingtonState2020: 100.0%
houstonExplosion2020: 100.0%
texasAMCommerceShooting2020: 100.0%
southeastTornadoOutbreak2020: 100.0%
stormCiara2020: 100.0%
stormDennis2020: 100.0%
virraMallHostageSituation2020: 100.0%
stormJorge2020: 100.0%
tennesseeTornadoOutbreak2020: 100.0%
tennesseeDerecho2020: 100.0%
edenvilleDamFailure2020: 100.0%
sanFranciscoPi

In [32]:
nullTexts = ['postText', 'text', 'full_text']
nullTextDf = pd.DataFrame(columns = nullTexts)
for textVar in nullTexts:
    events = full_df.loc[pd.isna(full_df[textVar])].eventID.unique()
    for event in events:
        if event not in nullTextDf.index:
            nullTextDf.loc[event] = {'postText':False, 'text':False, 'full_text':False}
        nullTextDf.loc[event][textVar] = True

print(nullTextDf.to_string())

                               postText   text full_text
fireColorado2012                   True  False      True
costaRicaEarthquake2012            True  False      True
floodColorado2013                  True  False      True
typhoonPablo2012                   True  False      True
laAirportShooting2013              True  False      True
westTexasExplosion2013             True  False      True
guatemalaEarthquake2012            True  False      True
italyEarthquakes2012               True  False      True
philipinnesFloods2012              True  False      True
albertaFloods2013                  True  False      True
australiaBushfire2013              True  False      True
bostonBombings2013                 True  False      True
manilaFloods2013                   True  False      True
queenslandFloods2013               True  False      True
typhoonYolanda2013                 True  False      True
joplinTornado2011                  True  False      True
chileEarthquake2014            

In [None]:
full_df.postID.duplicated().sum()

In [None]:
full_df=full_df.drop_duplicates(subset=['postID'])
full_df

In [None]:
#Combine text into PostText
full_df['postText'] = full_df['postText'].fillna(full_df['text'])
full_df['postText']

In [None]:
#Remove null text values
full_df = full_df[(~full_df["postText"].isnull())]
full_df

In [None]:
full_df = full_df.reset_index()

In [None]:
#Save full_df
full_df.to_json('./Trec_data/combined_labeled.json', orient='records', lines=True)

In [None]:
full_df = pd.read_json("./Trec_data/combined_labeled.json", orient='records',lines=True)
full_df

In [6]:
full_df.columns.tolist()

['contributors',
 'contributorsIDs',
 'coordinates',
 'createdAt',
 'created_at',
 'currentUserRetweetId',
 'displayTextRangeEnd',
 'displayTextRangeStart',
 'display_text_range',
 'entities',
 'eventID',
 'eventType',
 'extended_entities',
 'extended_tweet',
 'favoriteCount',
 'favorite_count',
 'favorited',
 'filter_level',
 'full_text',
 'geo',
 'geoLocation',
 'hashtagEntities',
 'id',
 'id_str',
 'inReplyToScreenName',
 'inReplyToStatusId',
 'inReplyToUserId',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'isFavorited',
 'isPossiblySensitive',
 'isRetweeted',
 'isTruncated',
 'is_quote_status',
 'lang',
 'matching_rules',
 'mediaEntities',
 'metadata',
 'place',
 'possibly_sensitive',
 'possibly_sensitive_appealable',
 'postCategories',
 'postID',
 'postPriority',
 'postText',
 'quote_count',
 'quotedStatus',
 'quotedStatusId',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',


In [None]:
#Reorder columns
cols = ['eventID',
 'eventType',
 'postID',
 'postCategories',
 'postPriority',
 'postText',
 'contributors',
 'contributorsIDs',
 'coordinates',
 'createdAt',
 'created_at',
 'currentUserRetweetId',
 'displayTextRangeEnd',
 'displayTextRangeStart',
 'display_text_range',
 'entities',
 'extended_entities',
 'extended_tweet',
 'favoriteCount',
 'favorite_count',
 'favorited',
 'filter_level',
 'full_text',
 'geo',
 'geoLocation',
 'hashtagEntities',
 'id',
 'id_str',
 'inReplyToScreenName',
 'inReplyToStatusId',
 'inReplyToUserId',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'isFavorited',
 'isPossiblySensitive',
 'isRetweeted',
 'isTruncated',
 'is_quote_status',
 'lang',
 'matching_rules',
 'mediaEntities',
 'metadata',
 'place',
 'possibly_sensitive',
 'possibly_sensitive_appealable',
 'quote_count',
 'quotedStatus',
 'quotedStatusId',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'quoted_status_permalink',
 'reply_count',
 'retweetCount',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'scopes',
 'source',
 'symbolEntities',
 'text',
 'timestamp_ms',
 'topic',
 'truncated',
 'urlEntities',
 'user',
 'userMentionEntities',
 'withheld_in_countries'
    ]
full_df = full_df[cols]
full_df

In [None]:
eventTypes = df['eventType'].unique()
for event in eventTypes:
    eventsDF = df.loc[df['eventType']==event]['eventID'].unique()
    events = full_df.loc[full_df['eventType']==event]['eventID'].unique()
    print(event + ': ' + str(eventsDF.size) + ' : ' + str(events.size))
    print(eventsDF)
    print(events)

In [None]:
#Save organized full_df, This df also goes to analyse_data notebook
full_df.to_json('./Trec_data/org_combined_labeled.json', orient='records', lines=True)

In [None]:
full_df = pd.read_json("./Trec_data/org_combined_labeled.json", orient='records',lines=True)
full_df

In [None]:
#Remove certain non-neccessary columns
cols = ['eventID',
 'eventType',
 'postID',
 'postCategories',
 'postPriority',
 'postText',
 #'contributors',
 #'contributorsIDs',
 #'coordinates', #Not useful for
 #'createdAt', #When there is two of these it tends to be a list of dtype and of objects
 #'created_at',
 #'currentUserRetweetId',
 #'displayTextRangeEnd', #Unsure how these three variables could be useful
 #'displayTextRangeStart',
 #'display_text_range',
 'entities',
 'extended_entities',
 #'extended_tweet',
 'favorite_count',
 #'filter_level', #Low variability
 #'geo',
 #'geoLocation',
 'hashtagEntities',
 'inReplyToScreenName',
 'inReplyToStatusId',
 'inReplyToUserId',
 #'in_reply_to_screen_name',
 #'in_reply_to_status_id',
 #'in_reply_to_status_id_str',
 #'in_reply_to_user_id',
 #'in_reply_to_user_id_str',
 'isFavorited',
 #'isPossiblySensitive', #Applies to 14 rows
 'isRetweeted',
 'isTruncated',
 'is_quote_status',
 'lang',
 'matching_rules',
 'mediaEntities',
 'metadata',
 #'place', #Could be really useful, containes a lot of location data, for now disabled because its too much info and also low reliability
 'possibly_sensitive', #Might have value
 #'possibly_sensitive_appealable',
 #'quote_count',
 #'quotedStatus',
 #'quotedStatusId',
 #'quoted_status',
 #'quoted_status_id',
 #'quoted_status_id_str',
 #'quoted_status_permalink',
 #'reply_count',
 'retweet_count', #Utilize to replace if retweeted, maybe use to double check retweeted_status
 #'retweeted',
 'retweeted_status',
 #'scopes', #Only NaN values
 #'source', No clue how this may be utilized
 #'symbolEntities',
 #'timestamp_ms', #Time series bad
 #'topic', #Same as eventID pretty much, I think
 'truncated', #Useful, mark NaN as 0 and switch to boolean?
 #'urlEntities', #Has information regarding urls
 'user' #Very useful but there is a lot of associated data
 #'userMentionEntities', #Useful somehow i'm sure just don't know right now
 #'withheld_in_countries' #Only relates to 4 tweets
       ]
full_df = full_df[cols]
full_df

In [None]:
full_df.to_json("./Trec_data/Feature_Reduction.json", orient='records',lines=True)

In [None]:
full_df = pd.read_json("./Trec_data/Feature_Reduction.json", orient='records',lines=True)
full_df

In [None]:
# Generate Additional Features
local_tokenizer = TweetTokenizer()
def tokenizer_wrapper(text):
    return local_tokenizer.tokenize(text)

sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    #return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))
    return(parsed_text)


other_features_names = ["num_chars", "num_chars_total", 
                        "num_terms", "num_words", "num_unique_words", "vader neg", "vader pos",
                        "vader neu", "vader compound", 
                        "num_hashtags", "num_mentions", 
                        "num_urls", 
                        "is_retweet", "num_media",
                        "is_verified", 
                        "caps_ratio"]

## Taken from Davidson et al.
def other_features(data):
    new_df = pd.DataFrame()
    
    for index, tweet in tqdm(data.iterrows(), total=data.shape[0], position=0, leave=True):
        #print(tweet)
        """This function takes a string and returns a list of features.
        These include Sentiment scores, Text and Readability scores,
        as well as Twitter specific features"""
        tweet_text = tweet["postText"]

        ##SENTIMENT
        sentiment = sentiment_analyzer.polarity_scores(tweet_text)

        words = local_tokenizer.tokenize(tweet_text) #Get text only

        num_chars = sum(len(w) for w in words) #num chars in words
        num_chars_total = len(tweet_text)
        num_terms = len(tweet_text.split())
        num_words = len(words)
        num_unique_terms = len(set([x.lower() for x in words]))

        caps_count = sum([1 if x.isupper() else 0 for x in tweet_text])
        caps_ratio = caps_count / num_chars_total

        parsed_text = count_twitter_objs(tweet_text) #Count #, @, and http://
        twitter_objs = (parsed_text.count('URLHERE'), parsed_text.count('MENTIONHERE'), parsed_text.count('HASHTAGHERE'))
        num_media = 0
        if "entities" in tweet and tweet["entities"] != None and "media" in tweet["entities"]:
                num_media = len(tweet["entities"]["media"])
        retweet = 0
        if "rt" in words or "retweeted_status" in tweet:
            retweet = 1
        

        has_place = 1 if "coordinates" in tweet else 0

        author = tweet["user"]
        is_verified = 1 if ("verified" in author and author["verified"]) else 0
        
        features = [num_chars, num_chars_total, num_terms, num_words, num_unique_terms,
                    sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                    twitter_objs[2], twitter_objs[1], twitter_objs[0],
                    retweet, num_media, is_verified, caps_ratio]
        
        
        features = [round(x, 4) for x in features]
        
        new_row = {}
        for feature, name in zip(features, other_features_names):
            new_row[name] = feature
        
        #new_row['postID'] = tweet['postID']
        
        new_df = new_df.append(new_row, ignore_index=True)
        
        
    return new_df


In [None]:
#Creating extra features
other_ftr_df = other_features(full_df)
other_ftr_df

In [None]:
#Combine the extra features into original database
featured_df = pd.concat([full_df, other_ftr_df], axis=1)
featured_df

In [None]:
# generate sentence embedding
class SBERT:

    def __init__(self, lang="en"):
        from sentence_transformers import SentenceTransformer
        self.name = "SBERT"
        if lang == "fr":
            self.model = SentenceTransformer(
                "/home/bmazoyer/Dev/pytorch_bert/output/sts_fr_long_multilingual_bert-2019-10-01_15-07-03")
        elif lang == "en": #Does this need to be changed?
            self.model = SentenceTransformer(
                # "bert-large-nli-stsb-mean-tokens"
                "roberta-large-nli-stsb-mean-tokens"
            )
# roberta-large-nli-stsb-mean-tokens
    def compute_vectors(self, data):
        data["postText"] = data.postText.str.slice(0, 500)
        vectors = np.array(self.model.encode(data.postText.tolist()))
        return vectors

In [None]:
sbert=SBERT()

In [None]:
#Ignore for now
v=sbert.compute_vectors(featured_df) #Takes ages
featured_df['vectorized_text']=[item for item in tqdm(v)]
featured_df['vectorized_text']

In [None]:
#Save new df
featured_df.to_json("./Trec_data/Features_Labeled.json", orient='records',lines=True)

In [None]:
featured_df = pd.read_json("./Trec_data/Features_Labeled.json", orient='records',lines=True)
featured_df

In [None]:
#Change priority target, might not be needed
priorityDict = {'Unknown':0.0, 'Low':0.25, 'Medium':.5, 'High':.75, 'Critical':1}
featured_df['regression_priority']=[priorityDict[item] for item in featured_df['postPriority']]
featured_df[['regression_priority', 'postPriority']]

In [None]:
#Change categories to be multiple boolean columns
#categories = featured_df['postCategories'].explode().unique()
#cat_df = pd.DataFrame(index=np.arange(featured_df.shape[0]))
#for cat in categories:
#    name = 'bool_' + cat
#    cat_df[name] = False
#for index, row in tqdm(featured_df.iterrows(), total=data.shape[0], position=0, leave=True):
#    for cat in row['postCategories']:
#        cat_df.loc[index]['bool_' + cat] = True
#featured_df = pd.concat([featured_df, cat_df], axis=1)

In [None]:
#Switch from multiple boolean columns to MultiLabelBinarizer
categories = MultiLabelBinarizer().fit_transform(featured_df['postCategories']) #this should yield 25 in second dimension
print(type(categories))
categories

In [None]:
#Merge with featured_df
featured_df['sparseCategories'] = categories.tolist()
featured_df['sparseCategories']

In [None]:
#Save new df
featured_df.to_json("./Trec_data/Preprocessed_labelled.json", orient='records',lines=True)

In [None]:
#Load new df to skip above processes
featured_df = pd.read_json("./Trec_data/Preprocessed_labelled.json", orient='records',lines=True)
featured_df