In [1]:
%matplotlib inline

import matplotlib.pyplot as plt

In [2]:
import string
import gzip
import json
import re

In [3]:
import numpy as np
import pandas as pd

In [4]:
import scipy
from scipy import interpolate

In [5]:
import sklearn.cluster
import sklearn.feature_extraction 
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS

[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [8]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [9]:
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors

In [10]:
tweet_id_map = {}
with open("../data/2018-testing.json", "r") as in_file:
    for line in in_file:
        tweet = json.loads(line)
        tweet_id_map[np.int64(tweet["id"])] = tweet
        
with open("../data/2018-training.json", "r") as in_file:
    for line in in_file:
        tweet = json.loads(line)
        tweet_id_map[np.int64(tweet["id"])] = tweet

print("Total Tweet Count:", len(tweet_id_map))

Total Tweet Count: 25923


In [11]:
lang_count_map = {}
for lang in [tweet["lang"] for tweet in tweet_id_map.values()]:
    lang_count_map[lang] = lang_count_map.get(lang, 0) + 1
lang_count_map

{'en': 25674,
 'es': 57,
 'tl': 97,
 'it': 3,
 'fr': 13,
 'pt': 9,
 'und': 42,
 'hi': 9,
 'nl': 3,
 'ca': 2,
 'eu': 1,
 'in': 2,
 'ro': 3,
 'et': 1,
 'de': 1,
 'cs': 1,
 'ht': 1,
 'ja': 3,
 'th': 1}

In [32]:
tweet_priority_map = {}
priority_df = pd.read_csv("tweet_to_priority.csv", dtype={"tweet_id": np.int64})
for row in priority_df.itertuples():
    tweet_id = row.tweet_id
    
    tweet_priority_map[tweet_id] = {
        "score": row.score_mean,
        "weight": 1.0 - row.score_std
    }

In [13]:
tweet_category_map = {}
category_df = pd.read_csv("tweet_to_category.csv")
for category, tweets in category_df.groupby("category"):
    tweet_category_map[category] = list(tweets["tweet_id"])

In [14]:
print("Labels:", sum([len(v) for v in tweet_category_map.values()]))

Labels: 44849


In [15]:
for category, tweet_ids in tweet_category_map.items():
    retrieved_count = sum([1 if np.int64(tid) in tweet_id_map else 0 in tweet_id_map for tid in tweet_ids])
    print("Category:", category)
    print("\tTweet Count:", len(tweet_ids), "Retrieved Fraction:", retrieved_count/len(tweet_ids))
    
    lang_count_map = {}
    for lang in [tweet_id_map[np.int64(tid)]["lang"] for tid in tweet_ids if np.int64(tid) in tweet_id_map]:
        lang_count_map[lang] = lang_count_map.get(lang, 0) + 1
    print("\t", str(lang_count_map))

Category: Advice
	Tweet Count: 1236 Retrieved Fraction: 1.0
	 {'en': 1235, 'und': 1}
Category: CleanUp
	Tweet Count: 61 Retrieved Fraction: 1.0
	 {'en': 61}
Category: ContinuingNews
	Tweet Count: 5286 Retrieved Fraction: 1.0
	 {'en': 5275, 'ja': 1, 'tl': 4, 'und': 2, 'hi': 1, 'fr': 1, 'es': 1, 'pt': 1}
Category: Discussion
	Tweet Count: 2241 Retrieved Fraction: 1.0
	 {'en': 2199, 'und': 5, 'hi': 1, 'es': 27, 'pt': 3, 'it': 1, 'fr': 2, 'tl': 1, 'nl': 2}
Category: Donations
	Tweet Count: 811 Retrieved Fraction: 1.0
	 {'en': 811}
Category: EmergingThreats
	Tweet Count: 732 Retrieved Fraction: 1.0
	 {'en': 731, 'fr': 1}
Category: Factoid
	Tweet Count: 2502 Retrieved Fraction: 1.0
	 {'en': 2496, 'tl': 4, 'und': 1, 'ca': 1}
Category: FirstPartyObservation
	Tweet Count: 3712 Retrieved Fraction: 1.0
	 {'en': 3707, 'hi': 2, 'und': 2, 'tl': 1}
Category: GoodsServices
	Tweet Count: 126 Retrieved Fraction: 1.0
	 {'en': 126}
Category: Hashtags
	Tweet Count: 3261 Retrieved Fraction: 1.0
	 {'en': 324

In [16]:
# Skip stop words, retweet signs, @ symbols, and URL headers
stopList = ["http", "https", "rt", "@", ":", "t.co", "co", "amp", "&amp;", "...", "\n", "\r"]
stopList.extend(string.punctuation)
# stopList.extend(stopwords.words("english"))

In [17]:
# def tokenizer_wrapper(text):
#     return [t.lemma_ for t in nlp(text)]

local_tokenizer = TweetTokenizer()
def tokenizer_wrapper(text):
    return local_tokenizer.tokenize(text)

In [18]:
# Generate Additional Features
sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

## Taken from Davidson et al.
def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    tweet_text = tweet["text"]
    
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet_text)
    
    words = local_tokenizer.tokenize(tweet_text) #Get text only
    
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet_text)
    num_terms = len(tweet_text.split())
    num_words = len(words)
    num_unique_terms = len(set([x.lower() for x in words]))
    
    caps_count = sum([1 if x.isupper() else 0 for x in tweet_text])
    caps_ratio = caps_count / num_chars_total
    
    twitter_objs = count_twitter_objs(tweet_text) #Count #, @, and http://
    num_media = 0
    if "entities" in tweet and "media" in tweet["entities"]:
        num_media = len(tweet["entities"]["media"])
    retweet = 0
    if "rt" in words or "retweeted_status" in tweet:
        retweet = 1
        
    has_place = 1 if "coordinates" in tweet else 0
        
    author = tweet["user"]
    is_verified = 1 if ("verified" in author and author["verified"]) else 0
    log_followers = 0
    if "followers_count" in author and author["followers_count"] > 0:
         log_followers = np.log(author["followers_count"])
    log_friends = 0
    if "friends_count" in author and author["friends_count"] > 0:
         log_followers = np.log(author["friends_count"])
    
    features = [num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], 
                sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet, num_media,
                is_verified, 
#                 log_followers, log_friends,
#                 has_place,
                caps_ratio,
               ]

    return [round(x, 4) for x in features]

other_features_names = ["num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos",
                        "vader neu", "vader compound", \
                        "num_hashtags", "num_mentions", 
                        "num_urls", "is_retweet", "num_media",
                        "is_verified", 
#                         "log_followers", "log_friends",
#                         "has_place",
                        "caps_ratio",
                       ]

In [19]:
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    tokenizer=tokenizer_wrapper,
    ngram_range=(1, 1),
    stop_words=stopList, #We do better when we keep stopwords
    use_idf=True,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=10000,
    min_df=4,
    max_df=0.501
    )

In [67]:
category_to_label = {c:i for i, c in enumerate(tweet_category_map.keys()) if c != "Irrelevant"}
category_to_label["Irrelevant"] = -1

tweet_id_to_category = {}
for category, tweet_ids in tweet_category_map.items():
    if ( len(tweet_ids) < 5 ):
        print("Skipping category:", category)
        continue
        
    for tweet_id in tweet_ids:
        tweet_id_to_category[np.int64(tweet_id)] = category_to_label[category]
        

In [68]:
# tweet_pairs = [(tweet, tweet_id_to_category[tid]) 
#                for tid, tweet in tweet_id_map.items() if tid in tweet_id_to_category]

tweet_pairs = [(tweet_id_map[np.int64(tweet)], category_to_label[category]) 
               for category, tweet_ids in tweet_category_map.items() for tweet in tweet_ids]

tweet_texts = [tp[0]["text"] for tp in tweet_pairs]

y_data = np.array([tp[1] for tp in tweet_pairs])

In [69]:
print("Samples:", len(tweet_pairs), len(y_data))

Samples: 44849 44849


In [23]:
def normalize(s):
    """
    Given a text, cleans and normalizes it. Feel free to add your own stuff.
    From: https://www.kaggle.com/mschumacher/using-fasttext-models-for-robust-embeddings
    """
    s = s.lower()

    # Replace numbers and symbols with language
    s = s.replace('&', ' and ')
    s = s.replace('@', ' at ')
    s = s.replace('0', 'zero')
    s = s.replace('1', 'one')
    s = s.replace('2', 'two')
    s = s.replace('3', 'three')
    s = s.replace('4', 'four')
    s = s.replace('5', 'five')
    s = s.replace('6', 'six')
    s = s.replace('7', 'seven')
    s = s.replace('8', 'eight')
    s = s.replace('9', 'nine')

    return s

analyzer = vectorizer.build_analyzer()
def ft_tokenizer(text):
    return [normalize(t) for t in analyzer(text)]

In [24]:
# model_gensim = FastText.load('text_sample_2015_gensim.model')
model_gensim = FastText.load('../models/text_sample_2013to2016_gensim_200.model')
# model_gensim = FastText.load_fasttext_format('../data/cc.en.300.bin')
wvs = model_gensim.wv

In [25]:
def vectorize(sentence):
    tokenized = [normalize(t) for t in analyzer(sentence)]
    
    wv_vecs = []
    for t in tokenized:

        try:
            v = wvs[t]
            norm = np.linalg.norm(v)
            normed_v = (v / norm)
            wv_vecs.append(normed_v)
        except:
            continue
    
    m = np.array(wv_vecs)
    normed_m = np.mean(m, axis=0)

    return normed_m

In [70]:
ft_features_ = [vectorize(s) for s in tweet_texts]
ft_features = np.array([x for x in ft_features_])
ft_features.shape

(44849, 200)

In [71]:
other_ftr_data = np.array([other_features(tweet) for tweet, _ in tweet_pairs])
other_ftr_data.shape

(44849, 16)

In [72]:
X_data = np.concatenate([
    ft_features, 
    other_ftr_data, 
#     pos
], axis=1)

print(X_data.shape, y_data.shape)

(44849, 216) (44849,)


In [29]:
r_state = 1337

In [None]:
rf_params = {
    'n_estimators': 128, 
    "n_jobs": -1,
    'random_state': r_state,
    'class_weight': "balanced",
    'criterion': 'gini',
    'max_depth': 32,
    'max_features': 113,
    'min_samples_leaf': 2,
    'min_samples_split': 54,
}

nb_params = {
    'alpha': 0.6836531055077686,
    'binarize': 0.027689715150536642,
    'fit_prior': True
}

In [30]:
f1_accum = []
accuracy_accum = []

skf = StratifiedKFold(n_splits=10, random_state=r_state)
for train, test in skf.split(X_data, y_data):

    X_train = X_data[train]
    y_train = y_data[train]
    
    X_test = X_data[test]
    y_test = y_data[test]

    # train
    fitted_model = RandomForestClassifier(**rf_params)
#     fitted_model = BernoulliNB(**nb_params)
    fitted_model.fit(X_train, y_train)

    # Compute Precision-Recall 
    y_infer_local = fitted_model.predict(X_test)
    local_f1 = f1_score(y_test, y_infer_local, average="macro")
    local_score = fitted_model.score(X_test, y_test)
    print("\tAccuracy:", local_score)
    print("\tF1:", local_f1)
    
    f1_accum.append(local_f1)
    accuracy_accum.append(local_score)

print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

	Accuracy: 0.11269170926872639
	F1: 0.10193283060329068
	Accuracy: 0.11128421989761851
	F1: 0.10203009633774549
	Accuracy: 0.11720142602495544
	F1: 0.13123838549707853
	Accuracy: 0.1323529411764706
	F1: 0.14218445417061035
	Accuracy: 0.14626532887402452
	F1: 0.14401338264046543
	Accuracy: 0.1629877369007804
	F1: 0.1295520063443928
	Accuracy: 0.15442981477348805
	F1: 0.15127712477894503
	Accuracy: 0.1134181736994865
	F1: 0.13532551924977937
	Accuracy: 0.08109919571045576
	F1: 0.11720975760158868
	Accuracy: 0.1153072625698324
	F1: 0.10071373038158765
Accuracy: 0.12470378088958387
F1: 0.1255477287605484


In [None]:
category_number_list = list(category_to_label.values())
label_to_category = {j:i for i, j in category_to_label.items()}
for positive_category in category_number_list:
    local_y_data = [1 if y == positive_category else 0 for y in y_data]
    
#     fitted_model = RandomForestClassifier(**rf_params)
    fitted_model = BernoulliNB(**nb_params)
    fitted_model.fit(X_data, local_y_data)
   
    print("Label:", label_to_category[positive_category])
    print("Score:", fitted_model.score(X_data, local_y_data))


In [58]:
def random_search(X_data, y_data, clf, param_dist, n_iter_search=20, r_state=1337):
    # run randomized search
    random_search = RandomizedSearchCV(clf, 
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=10,
                                       scoring="f1_macro",
                                       random_state=r_state,
                                       verbose=2,
                                       n_jobs=-1,
                                      )
    
    random_search.fit(X_data, y_data)

    return (random_search.best_score_, random_search.best_params_)

def model_eval_rf(X_data, y_data, n_iter_search=100, r_state=1337):

    clf = RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=r_state)
    
    # specify parameters and distributions to sample from
    param_dist = {
        "max_depth": scipy.stats.randint(2, 8),
        "max_features": scipy.stats.randint(2, min(128, X_data.shape[1])),
        "min_samples_split": scipy.stats.randint(2, 512),
        "min_samples_leaf": scipy.stats.randint(2, 512),
#         "criterion": ["gini", "entropy"],
    }
    
    return random_search(X_data, y_data, clf, param_dist, n_iter_search=n_iter_search, r_state=r_state)

def model_eval_nb(X_data, y_data, n_iter_search=100, r_state=1337):

    clf = BernoulliNB()
    
    # specify parameters and distributions to sample from
    param_dist = {
        "alpha": scipy.stats.uniform(),
        "binarize": scipy.stats.uniform(),
        "fit_prior": [True, False],
    }
    
    return random_search(X_data, y_data, clf, param_dist, n_iter_search=n_iter_search, r_state=r_state)

In [None]:
search_results = model_eval_nb(X_data, y_data, n_iter_search=128)
search_results

In [60]:
search_results

(0.12229702240170447,
 {'alpha': 0.6836531055077686,
  'binarize': 0.027689715150536642,
  'fit_prior': True})

In [33]:
def smapper(x):
    if ( x < 0.75 ):
        return 0
    else:
        return 1

y_data_regress = np.array([smapper(tweet_priority_map[np.int64(tp[0]["id"])]["score"]) 
                           for tp in tweet_pairs])
y_data_weights = np.array([tweet_priority_map[np.int64(tp[0]["id"])]["weight"] 
                           for tp in tweet_pairs])

In [38]:
rf_priority_params = {
    'random_state': r_state,
    'class_weight': 'balanced',
    'n_estimators': 128, 
    'n_jobs': -1,
    'max_depth': 50,
    'max_features': 14,
    'min_samples_leaf': 33,
    'min_samples_split': 96,
}

nb_priority_params = {
    'alpha': 0.05134305647695325,
    'binarize': 0.045909955637688404,
    'fit_prior': True,
}



In [80]:
score_accum = []
f1_accum = []
accuracy_accum = []

skf = StratifiedKFold(n_splits=10, random_state=r_state)
for train, test in skf.split(X_data, y_data_regress):

    X_train = X_data[train]
    y_train = y_data_regress[train]
    y_weight = y_data_weights[train]
    
    X_test = X_data[test]
    y_test = y_data_regress[test]

    # train
    print("\tFitting...")
#     fitted_model = sklearn.linear_model.LinearRegression(n_jobs=4)
#     fitted_model = sklearn.tree.DecisionTreeRegressor(random_state=r_state, max_depth=256)
#     fitted_model = BernoulliNB(**nb_priority_params)
    fitted_model = RandomForestClassifier(**rf_priority_params)
    fitted_model.fit(X_train, y_train, y_weight)

    # Compute score metrics
    r2_score = fitted_model.score(X_test, y_test)
    score_accum.append(r2_score)
    
    # Compute Precision-Recall 
    y_infer_local = fitted_model.predict(X_test)
    f1_accum.append(f1_score(y_test, y_infer_local, average="macro"))
    
    accuracy_accum.append(fitted_model.score(X_test, y_test))
    
    print("\t", r2_score)

print("Score (R^2):", np.mean(score_accum))
print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

	Fitting...
	 0.8642443156486848
	Fitting...
	 0.9333333333333333
	Fitting...
	 0.8849498327759198
	Fitting...
	 0.9549609810479376
	Fitting...
	 0.9369007803790412
	Fitting...
	 0.9531772575250836
	Fitting...
	 0.9306577480490524
	Fitting...
	 0.9088071348940914
	Fitting...
	 0.9471454058876003
	Fitting...
	 0.9092328278322926
Score (R^2): 0.9223409617373038
Accuracy: 0.9223409617373038
F1: 0.8558994805140546


In [None]:
# Run Classifier

In [86]:
test_tweets = []
with open("../data/2019-testing.json", "r") as in_file:
    for line in in_file:
        tweet = json.loads(line)
        test_tweets.append(tweet)
        
X_test_ft_ = [vectorize(s) for s in [t["text"] for t in test_tweets]]
X_test_ft_ = np.array([x for x in X_test_ft_])

X_test_other = np.array([other_features(tweet) for tweet in test_tweets])

X_test_data = np.concatenate([
    X_test_ft_, 
    X_test_other, 
], axis=1)

print(X_test_data.shape)

Found
Found
Found
(9503, 216)


In [74]:
# fitted_model = BernoulliNB(**nb_params)
fitted_model = RandomForestClassifier(**rf_params)
fitted_model.fit(X_data, y_data)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=32, max_features=113,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=2,
            min_samples_split=54, min_weight_fraction_leaf=0.0,
            n_estimators=128, n_jobs=-1, oob_score=False,
            random_state=1337, verbose=0, warm_start=False)

In [79]:
y_test_labels = fitted_model.predict(X_test_data)

labeled_test_data = list(zip([t["id"] for t in test_tweets], y_test_labels))

id_to_cat_map = {y:x for x,y in category_to_label.items()}

df = pd.DataFrame([
    {"tweet_id":np.int64(tup[0]), "label": id_to_cat_map[tup[1]]} for tup in labeled_test_data],
)

df.to_csv("trec2019_test_results_run_fasttext.csv", index=None)

In [80]:
df.groupby("label").count()

Unnamed: 0_level_0,tweet_id
label,Unnamed: 1_level_1
Advice,1014
CleanUp,29
ContinuingNews,321
Discussion,332
Donations,215
EmergingThreats,212
Factoid,487
FirstPartyObservation,205
GoodsServices,17
Hashtags,803


In [81]:
df["tweet_id"].value_counts()

1100622467733901312    3
1101050969494880256    3
1100698297155633153    3
1100807416327950343    3
1100793945758490624    3
1101035588692701185    3
1101144348144660483    3
1100698558691532800    3
1100865703412928518    3
1100448617926668288    3
1100814740408938497    3
1101181539071782914    3
1101396685098487808    2
1101287326767108098    2
1101204327262232576    2
1101520730225815552    2
1101028104473071616    2
1100823316992950275    2
1100915273362800652    2
1100741253556252672    2
1101104831647940614    2
1101150862926405632    2
1101826026299183104    2
1102315424300089344    2
1101185478664118272    2
1102465381183176704    2
1100957207225499648    2
1101696298393194497    2
1101626660280066048    2
1102115769423405056    2
                      ..
751241406820323328     1
751241399123775488     1
751241388960911360     1
751241382619123712     1
751241354873704448     1
751241632922505216     1
727632633912418304     1
751241326528790530     1
751241334728429568     1


In [39]:
fitted_pri_model = RandomForestClassifier(**rf_priority_params)
fitted_pri_model.fit(X_data, y_data_regress)


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=50, max_features=14,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=33,
            min_samples_split=96, min_weight_fraction_leaf=0.0,
            n_estimators=128, n_jobs=-1, oob_score=False,
            random_state=1337, verbose=0, warm_start=False)

In [40]:
y_test_labels = fitted_pri_model.predict(X_test_data)

labeled_test_data = list(zip([t["id"] for t in test_tweets], y_test_labels))

df = pd.DataFrame([{"tweet_id":tup[0], "priority": tup[1]} for tup in labeled_test_data])

df.to_csv("trec2019_test_results_priority_run_fasttext.csv", index=None)

In [41]:
df["priority"].value_counts()

0    8608
1     895
Name: priority, dtype: int64

# Test Multi-Label Method

In [42]:
tweet_pairs = []
y_data = []

indexer = {c:i for i, c in enumerate(list(category_to_label.keys()))}
indexer_inv = {i:c for c,i in indexer.items()}
for tweet_id, categories in category_df.groupby("tweet_id"):
    
    tup = (
        tweet_id_map[np.int64(tweet_id)], 
        [indexer[category] for category in categories["category"]]
    )
    tweet_pairs.append(tup)

tweet_texts = [tp[0]["text"] for tp in tweet_pairs]

In [43]:
y_data_ = [tp[1] for tp in tweet_pairs]

def one_hot_y(y_list):
    encoded = [0] * len(indexer)
    for y in y_list:
        encoded[y] = 1
    return encoded

y_data = np.array([one_hot_y(y) for y in y_data_])

In [44]:
ft_features_ = [vectorize(s) for s in tweet_texts]
ft_features = np.array([x for x in ft_features_])
ft_features.shape

(19046, 200)

In [45]:
other_ftr_data = np.array([other_features(tweet) for tweet, _ in tweet_pairs])
other_ftr_data.shape

(19046, 16)

In [46]:
X_data = np.concatenate([
    ft_features, 
    other_ftr_data, 
], axis=1)

print(X_data.shape, y_data.shape)

(19046, 216) (19046, 25)


In [47]:

rf_params = {
    'n_estimators': 128, 
    "n_jobs": -1,
    'random_state': r_state,
    'class_weight': "balanced",
    'criterion': 'gini',
    'max_depth': 32,
    'max_features': 113,
    'min_samples_leaf': 2,
    'min_samples_split': 54,
}

In [None]:
f1_accum = []
accuracy_accum = []

skf = KFold(n_splits=10, random_state=r_state)
for train, test in skf.split(X_data, y_data):

    X_train = X_data[train]    
    X_test = X_data[test]
    
    y_train = y_data[train]
    y_test = y_data[test]


    # train
    fitted_model = RandomForestClassifier(**rf_params)
    fitted_model.fit(X_train, y_train)

    # Compute Precision-Recall 
    y_infer_local = fitted_model.predict(X_test)
    local_f1 = f1_score(y_test, y_infer_local, average="weighted")
    local_score = fitted_model.score(X_test, y_test)
    print("\tAccuracy:", local_score)
    print("\tF1:", local_f1)
    
    f1_accum.append(local_f1)
    accuracy_accum.append(local_score)

print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

In [None]:
results = fitted_model.predict_proba(X_test)

In [48]:
clf = sklearn.multiclass.OneVsRestClassifier(RandomForestClassifier(**rf_params))

In [None]:
clf.fit(X_train, y_train)

In [None]:
results = clf.predict(X_test)

In [None]:
(results[0])

In [None]:
sklearn.preprocessing.normalize(results[0].reshape(-1,1), norm="l1", axis=0)

In [None]:
skf = KFold(n_splits=10, random_state=r_state)
for train, test in skf.split(X_data, y_data):

    X_train = X_data[train]    
    X_test = X_data[test]
    
    y_train = y_data[train]
    y_test = y_data[test]


    # train
    fitted_model = sklearn.multiclass.OneVsRestClassifier(RandomForestClassifier(**rf_params))
    fitted_model.fit(X_train, y_train)

    # Compute Precision-Recall 
    y_infer_local = fitted_model.predict(X_test)
    local_f1 = f1_score(y_test, y_infer_local, average="weighted")
    local_score = fitted_model.score(X_test, y_test)
    print("\tAccuracy:", local_score)
    print("\tF1:", local_f1)
    
    f1_accum.append(local_f1)
    accuracy_accum.append(local_score)

print("Accuracy:", np.mean(accuracy_accum))
print("F1:", np.mean(f1_accum))

In [None]:
prs = fitted_model.predict(X_test)
prs

In [None]:
for i, x in enumerate(prs.sum(axis=0)):
    print(i, indexer_inv[i], x)

In [50]:
clf = sklearn.multiclass.OneVsRestClassifier(
    RandomForestClassifier(**rf_params)
#     BernoulliNB(**nb_params)
)
clf.fit(X_data, y_data)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=32, max_features=113,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=2,
            min_samples_split=54, min_weight_fraction_leaf=0.0,
            n_estimators=128, n_jobs=-1, oob_score=False,
            random_state=1337, verbose=0, warm_start=False),
          n_jobs=None)

In [51]:
y_test_labels = clf.predict(X_test_data)


In [52]:
all_rows = []
for tid, row_labels in zip([t["id"] for t in test_tweets], y_test_labels):
    
    row = [tid] + row_labels.tolist()
    all_rows.append(row)


id_to_cat_map = {y:x for x,y in category_to_label.items()}

df = pd.DataFrame(all_rows, columns=["tweet_id"] + [indexer_inv[i] for i in range(len(indexer_inv))])

df.to_csv("trec2019_test_results_run_fasttext_multi.csv", index=None)