In [1]:
%matplotlib inline

import matplotlib.pyplot as plt

In [2]:
import string
import gzip
import json
import re

In [3]:
import numpy as np
import pandas as pd

In [4]:
import scipy
from scipy import interpolate

In [5]:
import sklearn.cluster
import sklearn.feature_extraction 
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS

[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.




In [8]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [9]:
tweet_id_map = {}
with open("../data/2019b-testing.json", "r") as in_file:
    for line in in_file:
        tweet_top = json.loads(line)
        tweet = json.loads(tweet_top["allProperties"]["srcjson"])
        tweet_id_map[np.int64(tweet["id"])] = tweet
        
with open("../data/2019b-training.json", "r") as in_file:
    for line in in_file:
        tweet = json.loads(line)
        tweet_id_map[np.int64(tweet["id"])] = tweet

print("Total Tweet Count:", len(tweet_id_map))

Total Tweet Count: 49147


In [11]:
lang_count_map = {}
for lang in [tweet["lang"] for tweet in tweet_id_map.values()]:
    lang_count_map[lang] = lang_count_map.get(lang, 0) + 1
lang_count_map

{'en': 48776,
 'es': 60,
 'tl': 153,
 'it': 3,
 'fr': 13,
 'pt': 9,
 'und': 95,
 'hi': 10,
 'nl': 3,
 'ca': 2,
 'eu': 1,
 'in': 6,
 'ro': 3,
 'et': 1,
 'de': 1,
 'cs': 1,
 'ht': 1,
 'ja': 4,
 'th': 1,
 'cy': 1,
 'lt': 1,
 'pl': 1,
 'tr': 1}

In [26]:
link_count = 0
domain_count_map = {}
for entity in [tweet["entities"] for tweet in tweet_id_map.values() if "entities" in tweet]:

    if ( len(entity["urls"]) > 0 ):
        link_count += 1
        
        for url in entity["urls"]:
            domain = url["display_url"].lower().partition("/")[0]
            domain_count_map[domain] = domain_count_map.get(domain, 0) + 1
print("Tweets with Links:", link_count)
domain_count_map

Tweets with Links: 23073


{'dlvr.it': 697,
 'cyber4hits.com': 1,
 'gocyber.ca': 1,
 'athabascaadvocate.com': 3,
 'wp.me': 167,
 'ow.ly': 621,
 '511.alberta.ca': 3,
 'wildfire.alberta.ca': 12,
 'bit.ly': 2618,
 'twitter.com': 3674,
 'crweworld.com': 4,
 'wfp.to': 5,
 'trib.al': 205,
 'instagram.com': 617,
 'calgaryherald.com': 13,
 'cbc.ca': 158,
 'emergencyalert.alberta.ca': 27,
 'gobluecircle.com': 1,
 'rdcounty.ca': 1,
 'podcasts.apple.com': 1,
 'globalnews.ca': 88,
 'firesmoke.ca': 9,
 'nzzl.us': 3,
 'edmonton.ctvnews.ca': 42,
 'evnsocialnewswire.ch': 3,
 'edmontonjournal.com': 87,
 'ruq.us': 1,
 'kelownanow.com': 1,
 'spacevac.ca': 1,
 'sapphirepureclean.co.uk': 1,
 'alberta.ca': 10,
 'flic.kr': 9,
 'facebook.com': 207,
 'wsoe.org': 9,
 'ktvz.com': 3,
 'huffingtonpost.ca': 8,
 'greatfallstribune.com': 1,
 'a.msn.com': 94,
 'boysbygirls.co.uk': 2,
 'todayville.com': 2,
 'tmz.com': 3,
 'thestar.com': 16,
 'torstar.co': 8,
 'go.usa.gov': 12,
 'mingooland.com': 2,
 'albertafirebans.ca': 2,
 'ibc.ca': 5,
 'thewe

In [20]:
img_count = 0
for entity in [tweet["entities"] for tweet in tweet_id_map.values() if "entities" in tweet]:

    if ( "media" in entity ):
#         print(entity["media"])
        img_count += 1
print("Tweets with Images:", img_count)

Tweets with Images: 6149


In [23]:
entity

{'symbols': [],
 'urls': [{'expanded_url': 'http://CyG-NewsAgent.net/NewsAd.php?url=http://bit.ly/VPxmU3',
   'indices': [65, 87],
   'display_url': 'CyG-NewsAgent.net/NewsAd.php?urlâ\x80Š',
   'url': 'http://t.co/9ccakAJjE3'}],
 'hashtags': [{'text': 'NewsAd', 'indices': [52, 59]}],
 'user_mentions': []}

In [11]:
tweet_sources_map = {}

for tweet in tweet_id_map.values():
    tweet_sources_map[tweet["source"]] = tweet_sources_map.get(tweet["source"], 0) + 1

print("Sources:", len(tweet_sources_map))
tweet_sources_index = {x:i+1 for i, x in enumerate([x for x in tweet_sources_map.keys()if tweet_sources_map[x] > 10])}

print("Mapped Sources:", len(tweet_sources_index))



Sources: 1011
Mapped Sources: 97


In [12]:
tweet_priority_map = {}
priority_df = pd.read_csv("tweet_to_priority.csv", dtype={"tweet_id": np.int64})
for row in priority_df.itertuples():
    tweet_id = row.tweet_id
    
    tweet_priority_map[tweet_id] = {
        "score": row.score_mean,
        "weight": 1.0 - row.score_std
    }

In [13]:
tweet_category_map = {}
category_df = pd.read_csv("tweet_to_category.csv")

cat_update_map = {
    "ContinuingNews": "News",
    "PastNews": "ContextualInformation",
    "KnownAlready": "OriginalEvent",
    "SignificantEventChange": "NewSubEvent",
}

category_df["category"] = category_df["category"].apply(lambda x: cat_update_map.get(x, x))

for category, tweets in category_df.groupby("category"):
    tweet_category_map[category] = list(tweets["tweet_id"])
    
# Deleted in 2019
del(tweet_category_map["Unknown"])

In [14]:
for category, tweet_ids in tweet_category_map.items():
    retrieved_count = sum([1 if np.int64(tid) in tweet_id_map else 0 in tweet_id_map for tid in tweet_ids])
    print("Category:", category)
    print("\tTweet Count:", len(tweet_ids), "Retrieved Fraction:", retrieved_count/len(tweet_ids))
    
    lang_count_map = {}
    for lang in [tweet_id_map[np.int64(tid)]["lang"] for tid in tweet_ids if int(tid) in tweet_id_map]:
        lang_count_map[lang] = lang_count_map.get(lang, 0) + 1
    print("\t", str(lang_count_map))

Category: Advice
	Tweet Count: 1706 Retrieved Fraction: 1.0
	 {'en': 1705, 'und': 1}
Category: CleanUp
	Tweet Count: 135 Retrieved Fraction: 1.0
	 {'en': 135}
Category: ContextualInformation
	Tweet Count: 1610 Retrieved Fraction: 1.0
	 {'en': 1561, 'hi': 1, 'es': 39, 'pt': 2, 'it': 2, 'fr': 2, 'und': 2, 'tl': 1}
Category: Discussion
	Tweet Count: 2921 Retrieved Fraction: 1.0
	 {'en': 2878, 'und': 5, 'hi': 1, 'es': 27, 'pt': 3, 'it': 1, 'fr': 2, 'tl': 2, 'nl': 2}
Category: Donations
	Tweet Count: 894 Retrieved Fraction: 1.0
	 {'en': 894}
Category: EmergingThreats
	Tweet Count: 1192 Retrieved Fraction: 1.0
	 {'en': 1191, 'fr': 1}
Category: Factoid
	Tweet Count: 3525 Retrieved Fraction: 1.0
	 {'en': 3511, 'tl': 8, 'und': 2, 'ca': 1, 'es': 3}
Category: FirstPartyObservation
	Tweet Count: 4216 Retrieved Fraction: 1.0
	 {'en': 4211, 'hi': 2, 'und': 2, 'tl': 1}
Category: GoodsServices
	Tweet Count: 146 Retrieved Fraction: 1.0
	 {'en': 146}
Category: Hashtags
	Tweet Count: 7159 Retrieved Fract

In [15]:
# But first, read in stopwrods
enStop = stopwords.words('english')

# Skip stop words, retweet signs, @ symbols, and URL headers
stopList = enStop +\
    ["http", "https", "rt", "@", ":", "t.co", "co", "amp", "&amp;", "...", "\n", "\r"]
stopList.extend(string.punctuation)

In [16]:
# def tokenizer_wrapper(text):
#     return [t.lemma_ for t in nlp(text)]

local_tokenizer = TweetTokenizer()
def tokenizer_wrapper(text):
    return local_tokenizer.tokenize(text)

In [17]:
# Generate Additional Features
sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

## Taken from Davidson et al.
def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    tweet_text = tweet["text"]
    
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet_text)
    
    words = local_tokenizer.tokenize(tweet_text) #Get text only
    
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet_text)
    num_terms = len(tweet_text.split())
    num_words = len(words)
    num_unique_terms = len(set([x.lower() for x in words]))
    
    caps_count = sum([1 if x.isupper() else 0 for x in tweet_text])
    caps_ratio = caps_count / num_chars_total
    
    twitter_objs = count_twitter_objs(tweet_text) #Count #, @, and http://
    num_media = 0
    if "entities" in tweet and "media" in tweet["entities"]:
        num_media = len(tweet["entities"]["media"])
    retweet = 0
    if "rt" in words or "retweeted_status" in tweet:
        retweet = 1
        
    has_place = 1 if "coordinates" in tweet else 0
        
    author = tweet["user"]
    is_verified = 1 if ("verified" in author and author["verified"]) else 0
    log_followers = 0
    if "followers_count" in author and author["followers_count"] > 0:
         log_followers = np.log(author["followers_count"])
    log_friends = 0
    if "friends_count" in author and author["friends_count"] > 0:
         log_followers = np.log(author["friends_count"])
    
    features = [num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], 
                sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet, num_media,
                is_verified, 
#                 log_followers, log_friends,
#                 has_place,
                caps_ratio,
               ]

    return [round(x, 4) for x in features]

other_features_names = ["num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos",
                        "vader neu", "vader compound", \
                        "num_hashtags", "num_mentions", 
                        "num_urls", "is_retweet", "num_media",
                        "is_verified", 
#                         "log_followers", "log_friends",
#                         "has_place",
                        "caps_ratio",
                       ]

In [18]:
category_to_label = {c:i+1 for i, c in enumerate(tweet_category_map.keys()) if c != "Irrelevant"}
category_to_label["Irrelevant"] = 0

tweet_id_to_category = {}
for category, tweet_ids in tweet_category_map.items():
    if ( len(tweet_ids) < 5 ):
        print("Skipping category:", category)
        continue
        
    for tweet_id in tweet_ids:
        tweet_id_to_category[np.int64(tweet_id)] = category_to_label[category]
        

In [19]:
# tweet_pairs = [(tweet, tweet_id_to_category[tid]) 
#                for tid, tweet in tweet_id_map.items() if tid in tweet_id_to_category]

tweet_pairs = [(tweet_id_map[np.int64(tweet)], category_to_label[category]) 
               for category, tweet_ids in tweet_category_map.items() for tweet in tweet_ids]

tweet_texts = [tp[0]["text"] for tp in tweet_pairs]

y_data = np.array([tp[1] for tp in tweet_pairs])

In [20]:
print("Tweets with Categories:", y_data.shape[0])

Tweets with Categories: 63346


In [21]:
#Construct tfidf matrix and get relevant scores
vectorizer = joblib.load("../models/2013to2016_tfidf_vectorizer_20190109.pkl")



In [22]:
tfidf = vectorizer.transform(tweet_texts).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

In [23]:
other_ftr_data = np.array([other_features(tweet) for tweet, _ in tweet_pairs])

In [24]:
sources_ftrs = [tweet_sources_index.get(tweet["source"], 0) for tweet, _ in tweet_pairs]

source_ftr_onehot = []
for ftr in sources_ftrs:
    f = np.zeros(len(tweet_sources_index) + 1)
    f[ftr] = 1.0
    source_ftr_onehot.append(f)

sources_ftr_data = np.array(source_ftr_onehot)    

sources_ftr_data.shape

(63346, 98)

In [25]:
X_data = np.concatenate([
    tfidf, 
    other_ftr_data, 
    sources_ftr_data,
], axis=1)

print(X_data.shape, y_data.shape)

(63346, 10114) (63346,)


In [32]:
ftr_names_ = np.concatenate([
    np.array([x for x in vocab]), 
    other_features_names, 
    ["Unknown"] + list(tweet_sources_index.keys()),
])

In [26]:
print("Dictionary Word Count:", len(vocab))
print([x[0] for x in vocab.items()][-10:])

Dictionary Word Count: 10000
['🚨', '🚨 🚨', '🚫', '🚮', '🚶', '🤑', '🤔', '🤔 🤔', '🤗', '🤘']


In [27]:
r_state = 1337

In [28]:
category_number_list = list(category_to_label.values())
d = len(category_number_list)

f1_accum = []
accuracy_accum = []

skf = StratifiedKFold(n_splits=5, random_state=r_state)
for train, test in skf.split(X_data, y_data):

    # Get actual labels
    y_test = y_data[test]

    # Compute Precision-Recall 
    y_infer_local = [category_number_list[np.random.randint(0, d)] for smaple in y_test]
    f1_accum.append(f1_score(y_test, y_infer_local, average="macro"))
    
    accuracy_accum.append(sklearn.metrics.accuracy_score(y_test, y_infer_local))

print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

Accuracy: 0.04170722897885452
F1: 0.03286180635130048


In [29]:
rf_params = {
    'n_estimators': 128, 
    "n_jobs": -1,
    'random_state': r_state,
    'class_weight': "balanced",
    'criterion': 'gini',
    'max_depth': 32,
    'max_features': 113,
    'min_samples_leaf': 2,
    'min_samples_split': 54,
}

nb_params = {
    'alpha': 0.01579145221181444,
    'binarize': 0.7316900686676242,
    'fit_prior': False
}

In [30]:
f1_accum = []
accuracy_accum = []

skf = StratifiedKFold(n_splits=10, random_state=r_state)
for train, test in skf.split(X_data, y_data):

    X_train = X_data[train]
    y_train = y_data[train]
    
    X_test = X_data[test]
    y_test = y_data[test]

    # train
#     fitted_model = RandomForestClassifier(**rf_params)
    fitted_model = BernoulliNB(**nb_params)
    fitted_model.fit(X_train, y_train)

    # Compute Precision-Recall 
    y_infer_local = fitted_model.predict(X_test)
    local_f1 = f1_score(y_test, y_infer_local, average="macro")
    local_score = fitted_model.score(X_test, y_test)
    print("\tAccuracy:", local_score)
    print("\tF1:", local_f1)
    
    f1_accum.append(local_f1)
    accuracy_accum.append(local_score)

print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

	Accuracy: 0.14450047273873307
	F1: 0.1267691676147095
	Accuracy: 0.14992905565189973
	F1: 0.1305630019851192
	Accuracy: 0.15894039735099338
	F1: 0.150074935113354
	Accuracy: 0.15092256741838828
	F1: 0.1328749929130375
	Accuracy: 0.19608771099542516
	F1: 0.14836075142113697
	Accuracy: 0.1670350489422166
	F1: 0.16925834448066449
	Accuracy: 0.16671934260429835
	F1: 0.14916061480819287
	Accuracy: 0.16553359683794466
	F1: 0.15552492118332595
	Accuracy: 0.1075098814229249
	F1: 0.11007156023681368
	Accuracy: 0.13348094259054247
	F1: 0.10624347308560496
Accuracy: 0.15406590165533668
F1: 0.13789017628419592


In [33]:
label_to_category = {j:i for i, j in category_to_label.items()}
for positive_category in category_number_list:
    local_y_data = [1 if y == positive_category else 0 for y in y_data]
    
    fitted_model = RandomForestClassifier(**rf_params)
    fitted_model.fit(X_data, local_y_data)
    
    weights = [(ftr_names_[idx], coef) 
               for idx, coef in enumerate(fitted_model.feature_importances_)]

    tops = sorted(weights, key=lambda x: x[1], reverse=True)[:10]
    
    print("Label:", label_to_category[positive_category])
    print("Score:", fitted_model.score(X_data, local_y_data))
    for token, weight in tops:
        print("\t", token, weight)

Label: Advice
Score: 0.8429577242446248
	 vader neg 0.03439873564865353
	 vader compound 0.03320672930789363
	 vader pos 0.025977430221543932
	 num_words 0.022666039368761142
	 num_chars 0.022366630841430427
	 num_unique_words 0.021612348301868933
	 num_chars_total 0.021036273390178054
	 num_terms 0.019911911583429472
	 vader neu 0.017514663870457827
	 philippines 0.01712507842688468
Label: CleanUp
Score: 0.9944590029362549
	 num_chars 0.03409871488662725
	 num_chars_total 0.03244046335624888
	 vader neu 0.02667578655181617
	 num_terms 0.023532252062691043
	 begins 0.023398190446503903
	 caps_ratio 0.02232172371782861
	 num_unique_words 0.021907634121341904
	 vader compound 0.02174514004076544
	 num_words 0.020815610852640897
	 damage 0.01991325163528457
Label: ContextualInformation
Score: 0.9395384081078522
	 school 0.08053748491842129
	 num_hashtags 0.06523070137524266
	 shooting 0.0643357336462351
	 florida 0.04907706841145317
	 num_chars_total 0.03612528163496518
	 cruz 0.026560912

Label: Volunteer
Score: 0.9926120039150065
	 help 0.04692925407193221
	 vader compound 0.045326814266867375
	 vader neg 0.03789196375679748
	 vader pos 0.031499479729352096
	 need 0.026886772603254484
	 caps_ratio 0.02647019835533224
	 num_mentions 0.024162957675057443
	 num_chars 0.023917752287824313
	 num_chars_total 0.02141937111909905
	 num_unique_words 0.021395286154220896
Label: Weather
Score: 0.872272913838285
	 philippines 0.0676530813277522
	 vader neu 0.03360040963110236
	 vader pos 0.0241171448968388
	 winds 0.021843531334293228
	 num_urls 0.02182602128537618
	 vader neg 0.0196630954726181
	 is_retweet 0.019174489534670117
	 caps_ratio 0.01906215572903833
	 vader compound 0.016818294503683655
	 shooting 0.016318234905852918
Label: Irrelevant
Score: 0.8392952988349699
	 num_hashtags 0.0591957174204993
	 num_urls 0.05590062571065893
	 dallas 0.04735509927492537
	 hurricane 0.0414714808980506
	 caps_ratio 0.028577519073931038
	 shooting 0.021828082476056863
	 is_verified 0.0191

In [34]:
def smapper(x):
    if ( x < 0.75 ):
        return 0
    else:
        return 1

y_data_regress = np.array([smapper(tweet_priority_map[np.int64(tp[0]["id"])]["score"]) 
                           for tp in tweet_pairs])
y_data_weights = np.array([tweet_priority_map[np.int64(tp[0]["id"])]["weight"] 
                           for tp in tweet_pairs])

In [35]:
score_accum = []
f1_accum = []
accuracy_accum = []

rf_priority_params = {
    'random_state': r_state,
    'class_weight': 'balanced',
    'n_estimators': 128, 
    'n_jobs': -1,
    'max_depth': 50,
    'max_features': 14,
    'min_samples_leaf': 33,
    'min_samples_split': 96,
}

nb_priority_params = {
    'alpha': 0.05134305647695325,
    'binarize': 0.045909955637688404,
    'fit_prior': True,
}

skf = StratifiedKFold(n_splits=10, random_state=r_state)
for train, test in skf.split(X_data, y_data_regress):

    X_train = X_data[train]
    y_train = y_data_regress[train]
    y_weight = y_data_weights[train]
    
    X_test = X_data[test]
    y_test = y_data_regress[test]

    # train
    print("\tFitting...")
#     fitted_model = sklearn.linear_model.LinearRegression(n_jobs=4)
#     fitted_model = sklearn.tree.DecisionTreeRegressor(random_state=r_state, max_depth=256)
    fitted_model = BernoulliNB(**nb_priority_params)
#     fitted_model = RandomForestClassifier(**rf_priority_params)
    fitted_model.fit(X_train, y_train, y_weight)

    # Compute score metrics
    r2_score = fitted_model.score(X_test, y_test)
    score_accum.append(r2_score)
    
    # Compute Precision-Recall 
    y_infer_local = fitted_model.predict(X_test)
    f1_accum.append(f1_score(y_test, y_infer_local, average="macro"))
    
    accuracy_accum.append(fitted_model.score(X_test, y_test))
    
    print("\t", r2_score)

print("Score (R^2):", np.mean(score_accum))
print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

	Fitting...
	 0.8737373737373737
	Fitting...
	 0.8442234848484849
	Fitting...
	 0.8803472770323599
	Fitting...
	 0.8871349644830308
	Fitting...
	 0.8976949794758446
	Fitting...
	 0.8425955162614461
	Fitting...
	 0.803283864856331
	Fitting...
	 0.8916956109883171
	Fitting...
	 0.9051152510262077
	Fitting...
	 0.8369119040101042
Score (R^2): 0.8662740226719501
Accuracy: 0.8662740226719501
F1: 0.7321632620234546


In [36]:
def random_search(X_data, y_data, clf, param_dist, n_iter_search=20, r_state=1337):
    # run randomized search
    random_search = RandomizedSearchCV(clf, 
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=10,
                                       scoring="f1_macro",
                                       random_state=r_state,
                                       verbose=2,
                                       n_jobs=16,
                                      )
    
    random_search.fit(X_data, y_data)

    return (random_search.best_score_, random_search.best_params_)

def model_eval_rf(X_data, y_data, n_iter_search=100, r_state=1337):

    clf = RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=r_state)
    
    # specify parameters and distributions to sample from
    param_dist = {
        "max_depth": [2, 4, 8, 16, 32, 64, 128, None],
        "max_features": scipy.stats.randint(1, 512),
        "min_samples_split": scipy.stats.randint(2, 512),
        "min_samples_leaf": scipy.stats.randint(2, 512),
        "criterion": ["gini", "entropy"],
    }
    
    return random_search(X_data, y_data, clf, param_dist, n_iter_search=n_iter_search, r_state=r_state)

def model_eval_nb(X_data, y_data, n_iter_search=100, r_state=1337):

    clf = BernoulliNB()
    
    # specify parameters and distributions to sample from
    param_dist = {
        "alpha": scipy.stats.uniform(),
        "binarize": scipy.stats.uniform(),
        "fit_prior": [True, False],
    }
    
    return random_search(X_data, y_data, clf, param_dist, n_iter_search=n_iter_search, r_state=r_state)

In [37]:
# search_results = model_eval_nb(X_data, y_data_regress, n_iter_search=128)
# search_results

In [38]:
# search_results

In [39]:
# Run baseline

In [43]:
test_tweets = []
with open("../data/2019b-testing.json", "r") as in_file:
    for line in in_file:
        tweet_top = json.loads(line)
        tweet = json.loads(tweet_top["allProperties"]["srcjson"])
        test_tweets.append(tweet)
        
X_test_tfidf = vectorizer.transform([t["text"] for t in test_tweets]).toarray()
X_test_other = np.array([other_features(tweet) for tweet in test_tweets])

sources_ftrs = [tweet_sources_index.get(tweet["source"], 0) for tweet in test_tweets]
source_ftr_onehot = []
for ftr in sources_ftrs:
    f = np.zeros(len(tweet_sources_index) + 1)
    f[ftr] = 1.0
    source_ftr_onehot.append(f)
X_test_sources = np.array(source_ftr_onehot)    

X_test_data = np.concatenate([
    X_test_tfidf, 
    X_test_other, 
    X_test_sources,
], axis=1)

print(X_test_data.shape)

(15000, 10114)


In [44]:
fitted_model = BernoulliNB(**nb_params)
fitted_model.fit(X_data, y_data)

BernoulliNB(alpha=0.01579145221181444, binarize=0.7316900686676242,
      class_prior=None, fit_prior=False)

In [45]:
y_test_labels = fitted_model.predict(X_test_data)

labeled_test_data = list(zip([t["id"] for t in test_tweets], y_test_labels))

id_to_cat_map = {y:x for x,y in category_to_label.items()}

df = pd.DataFrame([{"tweet_id":tup[0], "label": id_to_cat_map[tup[1]]} for tup in labeled_test_data])

df.to_csv("trec2019b_test_results_run_baseline.csv", index=None)

In [46]:

df.groupby("label").count()

Unnamed: 0_level_0,tweet_id
label,Unnamed: 1_level_1
Advice,363
CleanUp,257
ContextualInformation,1913
Discussion,802
Donations,137
EmergingThreats,443
Factoid,1603
FirstPartyObservation,431
GoodsServices,70
Hashtags,554


In [47]:
df["tweet_id"].value_counts()

1122269086338232321    25
1122237890539421697    23
1122258843927445504    19
1122224999845249025    19
1122242433150885888    14
1125943643754377216    12
1126163665248350210    12
1125908981757743105    12
1125868443004227584    12
1122228733144788992    11
1125894485479702528    11
1125861997780852738    11
1126164160624992256    11
1122228151361232896    10
1122236759406981121    10
1122229347119710208    10
1122250898154184704    10
1125921493337169920    10
1126107056778252294    10
1122227608874307585     9
1125949590077956100     9
1122236467651194883     9
1122221393888735232     9
1122237648477786112     9
1122257063336398848     9
1125919362785792000     9
1122250185776746498     9
1122244508702216192     9
1122270464330485761     9
1122247870378315776     8
                       ..
1134427602758725632     1
1122500084720181248     1
1134844467444191232     1
1120301644887904257     1
1134428183493668864     1
1120635938219028480     1
1134463613253513218     1
112223717679

In [48]:
# fitted_pri_model = BernoulliNB(**nb_params)
fitted_pri_model = RandomForestClassifier(**rf_priority_params)
fitted_pri_model.fit(X_data, y_data_regress)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=50, max_features=14,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=33,
            min_samples_split=96, min_weight_fraction_leaf=0.0,
            n_estimators=128, n_jobs=-1, oob_score=False,
            random_state=1337, verbose=0, warm_start=False)

In [49]:
y_test_labels = fitted_pri_model.predict(X_test_data)

labeled_test_data = list(zip([t["id"] for t in test_tweets], y_test_labels))

df = pd.DataFrame([{"tweet_id":tup[0], "priority": tup[1]} for tup in labeled_test_data])

df.to_csv("trec2019b_test_results_priority_run_baseline.csv", index=None)

In [50]:
df["priority"].value_counts()

0    11718
1     3282
Name: priority, dtype: int64

# Test with Multiple Labels

In [51]:
tweet_pairs = []
y_data = []

indexer = {c:i for i, c in enumerate(list(category_to_label.keys()))}
indexer_inv = {i:c for c,i in indexer.items()}
for tweet_id, categories in category_df.groupby("tweet_id"):
    
    tup = (
        tweet_id_map[np.int64(tweet_id)], 
        [indexer[category] for category in categories["category"] if category != 'Unknown']
    )
    tweet_pairs.append(tup)

tweet_texts = [tp[0]["text"] for tp in tweet_pairs]

In [52]:
y_data_ = [tp[1] for tp in tweet_pairs]

def one_hot_y(y_list):
    encoded = [0] * len(indexer)
    for y in y_list:
        encoded[y] = 1
    return encoded

y_data = np.array([one_hot_y(y) for y in y_data_])

In [53]:
tfidf = vectorizer.transform(tweet_texts).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

In [54]:
other_ftr_data = np.array([other_features(tweet) for tweet, _ in tweet_pairs])
other_ftr_data.shape

(26144, 16)

In [62]:
sources_ftrs = [tweet_sources_index.get(tweet["source"], 0) for tweet, _ in tweet_pairs]
source_ftr_onehot = []
for ftr in sources_ftrs:
    f = np.zeros(len(tweet_sources_index) + 1)
    f[ftr] = 1.0
    source_ftr_onehot.append(f)
X_test_sources = np.array(source_ftr_onehot)

In [63]:
X_data = np.concatenate([
    tfidf, 
    other_ftr_data, 
    X_test_sources,
], axis=1)

ftr_names_ = np.concatenate([
    np.array([x for x in vocab]), 
    other_features_names, 
    ["Unknown"] + list(tweet_sources_index.keys()),
])

print(X_data.shape, y_data.shape)

(26144, 10114) (26144, 25)


In [64]:
skf = KFold(n_splits=10, random_state=r_state)
for train, test in skf.split(X_data, y_data):

    X_train = X_data[train]    
    X_test = X_data[test]
    
    y_train = y_data[train]
    y_test = y_data[test]


    # train
    fitted_model = sklearn.multiclass.OneVsRestClassifier(
#         RandomForestClassifier(**rf_params)
        BernoulliNB(**nb_params)
    )
    fitted_model.fit(X_train, y_train)

    # Compute Precision-Recall 
    y_infer_local = fitted_model.predict(X_test)
    local_f1 = f1_score(y_test, y_infer_local, average="weighted")
    local_score = fitted_model.score(X_test, y_test)
    print("\tAccuracy:", local_score)
    print("\tF1:", local_f1)
    
    f1_accum.append(local_f1)
    accuracy_accum.append(local_score)

print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

  'recall', 'true', average, warn_for)


	Accuracy: 0.015296367112810707
	F1: 0.38791710192607903
	Accuracy: 0.0130019120458891
	F1: 0.39642833057968724


  'recall', 'true', average, warn_for)


	Accuracy: 0.012619502868068833
	F1: 0.5142703431992867


  'recall', 'true', average, warn_for)


	Accuracy: 0.021797323135755258
	F1: 0.5433356525055141


  'recall', 'true', average, warn_for)


	Accuracy: 0.010711553175210406
	F1: 0.3919908923692285


  'recall', 'true', average, warn_for)


	Accuracy: 0.029839326702371844
	F1: 0.40033223606947715


  'recall', 'true', average, warn_for)


	Accuracy: 0.05547054322876817
	F1: 0.44991348915080664
	Accuracy: 0.01874521805661821
	F1: 0.46942190245055354


  'recall', 'true', average, warn_for)


	Accuracy: 0.0279265493496557
	F1: 0.4548733602034812
	Accuracy: 0.027161438408569244
	F1: 0.3634248923787077
Accuracy: 0.3043774280138184
F1: 0.5344177990912915


In [65]:
prs = fitted_model.predict(X_test)
prs

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 1],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [66]:
for i, x in enumerate(prs.sum(axis=0)):
    print(i, indexer_inv[i], x)

0 Advice 695
1 CleanUp 161
2 ContextualInformation 266
3 Discussion 416
4 Donations 178
5 EmergingThreats 373
6 Factoid 346
7 FirstPartyObservation 1066
8 GoodsServices 139
9 Hashtags 995
10 InformationWanted 256
11 Location 178
12 MovePeople 201
13 MultimediaShare 467
14 NewSubEvent 261
15 News 724
16 Official 284
17 OriginalEvent 196
18 SearchAndRescue 95
19 Sentiment 925
20 ServiceAvailable 219
21 ThirdPartyObservation 481
22 Volunteer 156
23 Weather 344
24 Irrelevant 848


In [67]:
clf = sklearn.multiclass.OneVsRestClassifier(
#         RandomForestClassifier(**rf_params)
    BernoulliNB(**nb_params)
)

In [68]:
clf.fit(X_data, y_data)

OneVsRestClassifier(estimator=BernoulliNB(alpha=0.01579145221181444, binarize=0.7316900686676242,
      class_prior=None, fit_prior=False),
          n_jobs=None)

In [69]:
y_test_labels = clf.predict(X_test_data)


In [70]:
all_rows = []
for tid, row_labels in zip([t["id"] for t in test_tweets], y_test_labels):
    
    row = [tid] + row_labels.tolist()
    all_rows.append(row)


In [71]:

id_to_cat_map = {y:x for x,y in category_to_label.items()}

df = pd.DataFrame(all_rows, columns=["tweet_id"] + [indexer_inv[i] for i in range(len(indexer_inv))])

df.to_csv("trec2019b_test_results_run_baseline_multi.csv", index=None)

In [None]:
df

# 