In [1]:
%matplotlib inline

import matplotlib.pyplot as plt

In [2]:
import string
import gzip
import json
import re

In [3]:
import numpy as np
import pandas as pd

In [4]:
import scipy
from scipy import interpolate

In [5]:
import sklearn.cluster
import sklearn.feature_extraction 
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS

[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [8]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [9]:
tweet_id_map = {}
with open("../data/2018-testing.json", "r") as in_file:
    for line in in_file:
        tweet = json.loads(line)
        tweet_id_map[np.int64(tweet["id"])] = tweet
        
with open("../data/2018-training.json", "r") as in_file:
    for line in in_file:
        tweet = json.loads(line)
        tweet_id_map[np.int64(tweet["id"])] = tweet

print("Total Tweet Count:", len(tweet_id_map))

Total Tweet Count: 25923


In [10]:
lang_count_map = {}
for lang in [tweet["lang"] for tweet in tweet_id_map.values()]:
    lang_count_map[lang] = lang_count_map.get(lang, 0) + 1
lang_count_map

{'en': 25674,
 'es': 57,
 'tl': 97,
 'it': 3,
 'fr': 13,
 'pt': 9,
 'und': 42,
 'hi': 9,
 'nl': 3,
 'ca': 2,
 'eu': 1,
 'in': 2,
 'ro': 3,
 'et': 1,
 'de': 1,
 'cs': 1,
 'ht': 1,
 'ja': 3,
 'th': 1}

In [71]:
tweet_priority_map = {}
priority_df = pd.read_csv("tweet_to_priority.csv", dtype={"tweet_id": np.int64})
for row in priority_df.itertuples():
    tweet_id = row.tweet_id
    
    tweet_priority_map[tweet_id] = {
        "score": row.score_mean,
        "weight": 1.0 - row.score_std
    }

In [12]:
tweet_category_map = {}
category_df = pd.read_csv("tweet_to_category.csv")
for category, tweets in category_df.groupby("category"):
    tweet_category_map[category] = list(tweets["tweet_id"])

In [17]:
for category, tweet_ids in tweet_category_map.items():
    retrieved_count = sum([1 if np.int64(tid) in tweet_id_map else 0 in tweet_id_map for tid in tweet_ids])
    print("Category:", category)
    print("\tTweet Count:", len(tweet_ids), "Retrieved Fraction:", retrieved_count/len(tweet_ids))
    
    lang_count_map = {}
    for lang in [tweet_id_map[np.int64(tid)]["lang"] for tid in tweet_ids if int(tid) in tweet_id_map]:
        lang_count_map[lang] = lang_count_map.get(lang, 0) + 1
    print("\t", str(lang_count_map))

Category: Advice
	Tweet Count: 1236 Retrieved Fraction: 1.0
	 {'en': 1235, 'und': 1}
Category: CleanUp
	Tweet Count: 61 Retrieved Fraction: 1.0
	 {'en': 61}
Category: ContinuingNews
	Tweet Count: 5286 Retrieved Fraction: 1.0
	 {'en': 5275, 'ja': 1, 'tl': 4, 'und': 2, 'hi': 1, 'fr': 1, 'es': 1, 'pt': 1}
Category: Discussion
	Tweet Count: 2241 Retrieved Fraction: 1.0
	 {'en': 2199, 'und': 5, 'hi': 1, 'es': 27, 'pt': 3, 'it': 1, 'fr': 2, 'tl': 1, 'nl': 2}
Category: Donations
	Tweet Count: 811 Retrieved Fraction: 1.0
	 {'en': 811}
Category: EmergingThreats
	Tweet Count: 732 Retrieved Fraction: 1.0
	 {'en': 731, 'fr': 1}
Category: Factoid
	Tweet Count: 2502 Retrieved Fraction: 1.0
	 {'en': 2496, 'tl': 4, 'und': 1, 'ca': 1}
Category: FirstPartyObservation
	Tweet Count: 3712 Retrieved Fraction: 1.0
	 {'en': 3707, 'hi': 2, 'und': 2, 'tl': 1}
Category: GoodsServices
	Tweet Count: 126 Retrieved Fraction: 1.0
	 {'en': 126}
Category: Hashtags
	Tweet Count: 3261 Retrieved Fraction: 1.0
	 {'en': 324

In [18]:
# But first, read in stopwrods
enStop = stopwords.words('english')

# Skip stop words, retweet signs, @ symbols, and URL headers
stopList = enStop +\
    ["http", "https", "rt", "@", ":", "t.co", "co", "amp", "&amp;", "...", "\n", "\r"]
stopList.extend(string.punctuation)

In [19]:
# def tokenizer_wrapper(text):
#     return [t.lemma_ for t in nlp(text)]

local_tokenizer = TweetTokenizer()
def tokenizer_wrapper(text):
    return local_tokenizer.tokenize(text)

In [20]:
# Generate Additional Features
sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

## Taken from Davidson et al.
def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    tweet_text = tweet["text"]
    
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet_text)
    
    words = local_tokenizer.tokenize(tweet_text) #Get text only
    
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet_text)
    num_terms = len(tweet_text.split())
    num_words = len(words)
    num_unique_terms = len(set([x.lower() for x in words]))
    
    caps_count = sum([1 if x.isupper() else 0 for x in tweet_text])
    caps_ratio = caps_count / num_chars_total
    
    twitter_objs = count_twitter_objs(tweet_text) #Count #, @, and http://
    num_media = 0
    if "entities" in tweet and "media" in tweet["entities"]:
        num_media = len(tweet["entities"]["media"])
    retweet = 0
    if "rt" in words or "retweeted_status" in tweet:
        retweet = 1
        
    has_place = 1 if "coordinates" in tweet else 0
        
    author = tweet["user"]
    is_verified = 1 if ("verified" in author and author["verified"]) else 0
    log_followers = 0
    if "followers_count" in author and author["followers_count"] > 0:
         log_followers = np.log(author["followers_count"])
    log_friends = 0
    if "friends_count" in author and author["friends_count"] > 0:
         log_followers = np.log(author["friends_count"])
    
    features = [num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], 
                sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet, num_media,
                is_verified, 
#                 log_followers, log_friends,
#                 has_place,
                caps_ratio,
               ]

    return [round(x, 4) for x in features]

other_features_names = ["num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos",
                        "vader neu", "vader compound", \
                        "num_hashtags", "num_mentions", 
                        "num_urls", "is_retweet", "num_media",
                        "is_verified", 
#                         "log_followers", "log_friends",
#                         "has_place",
                        "caps_ratio",
                       ]

In [21]:
category_to_label = {c:i+1 for i, c in enumerate(tweet_category_map.keys()) if c != "Irrelevant"}
category_to_label["Irrelevant"] = 0

tweet_id_to_category = {}
for category, tweet_ids in tweet_category_map.items():
    if ( len(tweet_ids) < 5 ):
        print("Skipping category:", category)
        continue
        
    for tweet_id in tweet_ids:
        tweet_id_to_category[np.int64(tweet_id)] = category_to_label[category]
        

In [22]:
# tweet_pairs = [(tweet, tweet_id_to_category[tid]) 
#                for tid, tweet in tweet_id_map.items() if tid in tweet_id_to_category]

tweet_pairs = [(tweet_id_map[np.int64(tweet)], category_to_label[category]) 
               for category, tweet_ids in tweet_category_map.items() for tweet in tweet_ids]

tweet_texts = [tp[0]["text"] for tp in tweet_pairs]

y_data = np.array([tp[1] for tp in tweet_pairs])

In [23]:
print("Tweets with Categories:", y_data.shape[0])

Tweets with Categories: 44849


In [24]:
#Construct tfidf matrix and get relevant scores
vectorizer = joblib.load("../models/2013to2016_tfidf_vectorizer_20190109.pkl")



In [25]:
tfidf = vectorizer.transform(tweet_texts).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

In [26]:
other_ftr_data = np.array([other_features(tweet) for tweet, _ in tweet_pairs])

In [27]:
X_data = np.concatenate([
    tfidf, 
    other_ftr_data, 
], axis=1)

ftr_names_ = np.concatenate([
    np.array([x for x in vocab]), 
    other_features_names, 
])

print(X_data.shape, y_data.shape)

(44849, 10016) (44849,)


In [28]:
print("Dictionary Word Count:", len(vocab))
print([x[0] for x in vocab.items()][-10:])

Dictionary Word Count: 10000
['🚨', '🚨 🚨', '🚫', '🚮', '🚶', '🤑', '🤔', '🤔 🤔', '🤗', '🤘']


In [29]:
r_state = 1337

In [30]:
category_number_list = list(category_to_label.values())
d = len(category_number_list)

f1_accum = []
accuracy_accum = []

skf = StratifiedKFold(n_splits=5, random_state=r_state)
for train, test in skf.split(X_data, y_data):

    # Get actual labels
    y_test = y_data[test]

    # Compute Precision-Recall 
    y_infer_local = [category_number_list[np.random.randint(0, d)] for smaple in y_test]
    f1_accum.append(f1_score(y_test, y_infer_local, average="macro"))
    
    accuracy_accum.append(sklearn.metrics.accuracy_score(y_test, y_infer_local))

print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

Accuracy: 0.03924144671095087
F1: 0.030275592115913286


In [31]:
rf_params = {
    'n_estimators': 128, 
    "n_jobs": -1,
    'random_state': r_state,
    'class_weight': "balanced",
    'criterion': 'gini',
    'max_depth': 32,
    'max_features': 113,
    'min_samples_leaf': 2,
    'min_samples_split': 54,
}

nb_params = {
    'alpha': 0.01579145221181444,
    'binarize': 0.7316900686676242,
    'fit_prior': False
}

In [32]:
f1_accum = []
accuracy_accum = []

skf = StratifiedKFold(n_splits=10, random_state=r_state)
for train, test in skf.split(X_data, y_data):

    X_train = X_data[train]
    y_train = y_data[train]
    
    X_test = X_data[test]
    y_test = y_data[test]

    # train
#     fitted_model = RandomForestClassifier(**rf_params)
    fitted_model = BernoulliNB(**nb_params)
    fitted_model.fit(X_train, y_train)

    # Compute Precision-Recall 
    y_infer_local = fitted_model.predict(X_test)
    local_f1 = f1_score(y_test, y_infer_local, average="macro")
    local_score = fitted_model.score(X_test, y_test)
    print("\tAccuracy:", local_score)
    print("\tF1:", local_f1)
    
    f1_accum.append(local_f1)
    accuracy_accum.append(local_score)

print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

	Accuracy: 0.1206934874416537
	F1: 0.10406664550436698
	Accuracy: 0.1484531493434231
	F1: 0.13848774249549733
	Accuracy: 0.17000891265597148
	F1: 0.16761788094717406
	Accuracy: 0.18159536541889484
	F1: 0.18231993316068515
	Accuracy: 0.17235228539576367
	F1: 0.1548761592074575
	Accuracy: 0.19732441471571907
	F1: 0.14977674892897416
	Accuracy: 0.19058245927248382
	F1: 0.1667543657335771
	Accuracy: 0.14244250948872517
	F1: 0.14104919754416437
	Accuracy: 0.1409740840035746
	F1: 0.12106853485203678
	Accuracy: 0.15776536312849163
	F1: 0.12432007801439689
Accuracy: 0.16221920308647012
F1: 0.14503372863883304


In [30]:
label_to_category = {j:i for i, j in category_to_label.items()}
for positive_category in category_number_list:
    local_y_data = [1 if y == positive_category else 0 for y in y_data]
    
    fitted_model = RandomForestClassifier(**rf_params)
    fitted_model.fit(X_data, local_y_data)
    
    weights = [(ftr_names_[idx], coef) 
               for idx, coef in enumerate(fitted_model.feature_importances_)]

    tops = sorted(weights, key=lambda x: x[1], reverse=True)[:10]
    
    print("Label:", label_to_category[positive_category])
    print("Score:", fitted_model.score(X_data, local_y_data))
    for token, weight in tops:
        print("\t", token, weight)

Label: Advice
Score: 0.8280452183995184
	 vader compound 0.03878311583427708
	 vader neg 0.031161864986553295
	 vader pos 0.03048997873755411
	 num_chars_total 0.028157230317374278
	 vader neu 0.023917431012549534
	 num_chars 0.021564563637216937
	 num_words 0.020558959061376583
	 num_mentions 0.020015097901446472
	 num_terms 0.019692160495374984
	 num_unique_words 0.01916550717550718
Label: CleanUp
Score: 0.9967669290285179
	 num_chars_total 0.034377715407084966
	 num_unique_words 0.03008204835909185
	 caps_ratio 0.026499183091835167
	 num_terms 0.025637179276475963
	 num_chars 0.02543070090461388
	 begins 0.02481314297623263
	 vader neu 0.02346740362332438
	 vader compound 0.02264039259201363
	 vader neg 0.021787222001480516
	 vader pos 0.021718802543273023
Label: ContinuingNews
Score: 0.6506053646681086
	 num_urls 0.052669778192661616
	 vader pos 0.032985426731923013
	 num_chars 0.03276161229930145
	 num_chars_total 0.03234680476958042
	 vader compound 0.027246332171901422
	 caps_ra

In [73]:
def smapper(x):
    if ( x < 0.75 ):
        return 0
    else:
        return 1

y_data_regress = np.array([smapper(tweet_priority_map[np.int64(tp[0]["id"])]["score"]) 
                           for tp in tweet_pairs])
y_data_weights = np.array([tweet_priority_map[np.int64(tp[0]["id"])]["weight"] 
                           for tp in tweet_pairs])

In [74]:
score_accum = []
f1_accum = []
accuracy_accum = []

rf_priority_params = {
    'random_state': r_state,
    'class_weight': 'balanced',
    'n_estimators': 128, 
    'n_jobs': -1,
    'max_depth': 50,
    'max_features': 14,
    'min_samples_leaf': 33,
    'min_samples_split': 96,
}

nb_priority_params = {
    'alpha': 0.05134305647695325,
    'binarize': 0.045909955637688404,
    'fit_prior': True,
}

skf = StratifiedKFold(n_splits=10, random_state=r_state)
for train, test in skf.split(X_data, y_data_regress):

    X_train = X_data[train]
    y_train = y_data_regress[train]
    y_weight = y_data_weights[train]
    
    X_test = X_data[test]
    y_test = y_data_regress[test]

    # train
    print("\tFitting...")
#     fitted_model = sklearn.linear_model.LinearRegression(n_jobs=4)
#     fitted_model = sklearn.tree.DecisionTreeRegressor(random_state=r_state, max_depth=256)
    fitted_model = BernoulliNB(**nb_priority_params)
#     fitted_model = RandomForestClassifier(**rf_priority_params)
    fitted_model.fit(X_train, y_train, y_weight)

    # Compute score metrics
    r2_score = fitted_model.score(X_test, y_test)
    score_accum.append(r2_score)
    
    # Compute Precision-Recall 
    y_infer_local = fitted_model.predict(X_test)
    f1_accum.append(f1_score(y_test, y_infer_local, average="macro"))
    
    accuracy_accum.append(fitted_model.score(X_test, y_test))
    
    print("\t", r2_score)

print("Score (R^2):", np.mean(score_accum))
print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

	Fitting...
	 0.7940258582255907
	Fitting...
	 0.8735785953177257
	Fitting...
	 0.8245261984392419
	Fitting...
	 0.896098104793757
	Fitting...
	 0.8925306577480491
	Fitting...
	 0.8722408026755852
	Fitting...
	 0.8709030100334448
	Fitting...
	 0.8691192865105909
	Fitting...
	 0.8947368421052632
	Fitting...
	 0.8490187332738626
Score (R^2): 0.863677808912311
Accuracy: 0.863677808912311
F1: 0.7375094467912346


In [39]:
def random_search(X_data, y_data, clf, param_dist, n_iter_search=20, r_state=1337):
    # run randomized search
    random_search = RandomizedSearchCV(clf, 
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=10,
                                       scoring="f1_macro",
                                       random_state=r_state,
                                       verbose=2,
                                       n_jobs=16,
                                      )
    
    random_search.fit(X_data, y_data)

    return (random_search.best_score_, random_search.best_params_)

def model_eval_rf(X_data, y_data, n_iter_search=100, r_state=1337):

    clf = RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=r_state)
    
    # specify parameters and distributions to sample from
    param_dist = {
        "max_depth": [2, 4, 8, 16, 32, 64, 128, None],
        "max_features": scipy.stats.randint(1, 512),
        "min_samples_split": scipy.stats.randint(2, 512),
        "min_samples_leaf": scipy.stats.randint(2, 512),
        "criterion": ["gini", "entropy"],
    }
    
    return random_search(X_data, y_data, clf, param_dist, n_iter_search=n_iter_search, r_state=r_state)

def model_eval_nb(X_data, y_data, n_iter_search=100, r_state=1337):

    clf = BernoulliNB()
    
    # specify parameters and distributions to sample from
    param_dist = {
        "alpha": scipy.stats.uniform(),
        "binarize": scipy.stats.uniform(),
        "fit_prior": [True, False],
    }
    
    return random_search(X_data, y_data, clf, param_dist, n_iter_search=n_iter_search, r_state=r_state)

In [None]:
search_results = model_eval_nb(X_data, y_data_regress, n_iter_search=128)
search_results

In [None]:
search_results

In [None]:
# Run baseline

In [75]:
test_tweets = []
with open("../data/2019-testing.json", "r") as in_file:
    for line in in_file:
        tweet = json.loads(line)
        test_tweets.append(tweet)
        
X_test_tfidf = vectorizer.transform([t["text"] for t in test_tweets]).toarray()
X_test_other = np.array([other_features(tweet) for tweet in test_tweets])

X_test_data = np.concatenate([
    X_test_tfidf, 
    X_test_other, 
], axis=1)

print(X_test_data.shape)

(9503, 10016)


In [76]:
fitted_model = BernoulliNB(**nb_params)
fitted_model.fit(X_data, y_data)

BernoulliNB(alpha=0.01579145221181444, binarize=0.7316900686676242,
      class_prior=None, fit_prior=False)

In [77]:
y_test_labels = fitted_model.predict(X_test_data)

labeled_test_data = list(zip([t["id"] for t in test_tweets], y_test_labels))

id_to_cat_map = {y:x for x,y in category_to_label.items()}

df = pd.DataFrame([{"tweet_id":tup[0], "label": id_to_cat_map[tup[1]]} for tup in labeled_test_data])

df.to_csv("trec2019_test_results_run_baseline.csv", index=None)

In [78]:

df.groupby("label").count()

Unnamed: 0_level_0,tweet_id
label,Unnamed: 1_level_1
Advice,535
CleanUp,87
ContinuingNews,567
Discussion,856
Donations,84
EmergingThreats,208
Factoid,317
FirstPartyObservation,861
GoodsServices,58
Hashtags,669


In [79]:
# fitted_pri_model = BernoulliNB(**nb_params)
fitted_pri_model = RandomForestClassifier(**rf_priority_params)
fitted_pri_model.fit(X_data, y_data_regress)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=50, max_features=14,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=33,
            min_samples_split=96, min_weight_fraction_leaf=0.0,
            n_estimators=128, n_jobs=-1, oob_score=False,
            random_state=1337, verbose=0, warm_start=False)

In [80]:
y_test_labels = fitted_pri_model.predict(X_test_data)

labeled_test_data = list(zip([t["id"] for t in test_tweets], y_test_labels))

df = pd.DataFrame([{"tweet_id":tup[0], "priority": tup[1]} for tup in labeled_test_data])

df.to_csv("trec2019_test_results_priority_run_baseline.csv", index=None)

In [81]:
df["priority"].value_counts()

0    5621
1    3882
Name: priority, dtype: int64

# Test with Multiple Labels

In [82]:
tweet_pairs = []
y_data = []

indexer = {c:i for i, c in enumerate(list(category_to_label.keys()))}
indexer_inv = {i:c for c,i in indexer.items()}
for tweet_id, categories in category_df.groupby("tweet_id"):
    
    tup = (
        tweet_id_map[np.int64(tweet_id)], 
        [indexer[category] for category in categories["category"]]
    )
    tweet_pairs.append(tup)

tweet_texts = [tp[0]["text"] for tp in tweet_pairs]

In [83]:
y_data_ = [tp[1] for tp in tweet_pairs]

def one_hot_y(y_list):
    encoded = [0] * len(indexer)
    for y in y_list:
        encoded[y] = 1
    return encoded

y_data = np.array([one_hot_y(y) for y in y_data_])

In [84]:
tfidf = vectorizer.transform(tweet_texts).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

In [85]:
other_ftr_data = np.array([other_features(tweet) for tweet, _ in tweet_pairs])
other_ftr_data.shape

(19046, 16)

In [86]:
X_data = np.concatenate([
    tfidf, 
    other_ftr_data, 
], axis=1)

ftr_names_ = np.concatenate([
    np.array([x for x in vocab]), 
    other_features_names, 
])

print(X_data.shape, y_data.shape)

(19046, 10016) (19046, 25)


In [35]:
skf = KFold(n_splits=10, random_state=r_state)
for train, test in skf.split(X_data, y_data):

    X_train = X_data[train]    
    X_test = X_data[test]
    
    y_train = y_data[train]
    y_test = y_data[test]


    # train
    fitted_model = sklearn.multiclass.OneVsRestClassifier(
#         RandomForestClassifier(**rf_params)
        BernoulliNB(**nb_params)
    )
    fitted_model.fit(X_train, y_train)

    # Compute Precision-Recall 
    y_infer_local = fitted_model.predict(X_test)
    local_f1 = f1_score(y_test, y_infer_local, average="weighted")
    local_score = fitted_model.score(X_test, y_test)
    print("\tAccuracy:", local_score)
    print("\tF1:", local_f1)
    
    f1_accum.append(local_f1)
    accuracy_accum.append(local_score)

print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

	Accuracy: 0.008923884514435695
	F1: 0.3378893864945777
	Accuracy: 0.01837270341207349
	F1: 0.40898713580842294
	Accuracy: 0.008923884514435695
	F1: 0.3792302371501507
	Accuracy: 0.014698162729658792
	F1: 0.5580427641562706


  'recall', 'true', average, warn_for)


	Accuracy: 0.02047244094488189
	F1: 0.5359653014805524


  'recall', 'true', average, warn_for)


	Accuracy: 0.009973753280839895
	F1: 0.42668238632976296
	Accuracy: 0.009978991596638655
	F1: 0.3903126971165963


  'recall', 'true', average, warn_for)


	Accuracy: 0.037815126050420166
	F1: 0.44823249305047735


  'recall', 'true', average, warn_for)


	Accuracy: 0.045168067226890755
	F1: 0.46605167158865374


  'recall', 'true', average, warn_for)


	Accuracy: 0.025210084033613446
	F1: 0.32361054703601894
Accuracy: 0.026613707944485435
F1: 0.29507235518943137


In [36]:
prs = fitted_model.predict(X_test)
prs

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [37]:
for i, x in enumerate(prs.sum(axis=0)):
    print(i, indexer_inv[i], x)

0 Advice 104
1 CleanUp 64
2 ContinuingNews 456
3 Discussion 518
4 Donations 33
5 EmergingThreats 204
6 Factoid 377
7 FirstPartyObservation 439
8 GoodsServices 66
9 Hashtags 258
10 InformationWanted 46
11 KnownAlready 276
12 MovePeople 77
13 MultimediaShare 331
14 Official 111
15 PastNews 181
16 SearchAndRescue 93
17 Sentiment 375
18 ServiceAvailable 91
19 SignificantEventChange 286
20 ThirdPartyObservation 441
21 Unknown 250
22 Volunteer 38
23 Weather 11
24 Irrelevant 851


In [87]:
clf = sklearn.multiclass.OneVsRestClassifier(
#         RandomForestClassifier(**rf_params)
    BernoulliNB(**nb_params)
)

In [88]:
clf.fit(X_data, y_data)

OneVsRestClassifier(estimator=BernoulliNB(alpha=0.01579145221181444, binarize=0.7316900686676242,
      class_prior=None, fit_prior=False),
          n_jobs=None)

In [89]:
y_test_labels = clf.predict(X_test_data)


In [90]:
all_rows = []
for tid, row_labels in zip([t["id"] for t in test_tweets], y_test_labels):
    
    row = [tid] + row_labels.tolist()
    all_rows.append(row)


In [91]:

id_to_cat_map = {y:x for x,y in category_to_label.items()}

df = pd.DataFrame(all_rows, columns=["tweet_id"] + [indexer_inv[i] for i in range(len(indexer_inv))])

df.to_csv("trec2019_test_results_run_baseline_multi.csv", index=None)

In [92]:
df

Unnamed: 0,tweet_id,Advice,CleanUp,ContinuingNews,Discussion,Donations,EmergingThreats,Factoid,FirstPartyObservation,GoodsServices,...,PastNews,SearchAndRescue,Sentiment,ServiceAvailable,SignificantEventChange,ThirdPartyObservation,Unknown,Volunteer,Weather,Irrelevant
0,750271842061914112,0,0,1,1,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
1,750297964963389440,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,750316221552263168,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,750336750241873922,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,750337795735060480,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,750340917777739776,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,750361050344673280,0,0,1,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
7,750371242822086657,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,750391150352093184,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,750432662242861056,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
