In [1]:
import warnings; warnings.filterwarnings('ignore')

In [2]:
from growing_instability_lib import *

Using TensorFlow backend.


In [3]:
sample_sub = pd.read_csv('../data/sampleSubmission.csv')
topics = sorted(set(sample_sub.columns.difference(['id'])))

topic2actual = {}
for i in sample_sub.columns:
    if 'id' == i:
        continue
    topic2actual[i] = segment(i)
    
target_columns = sorted(topics)
len(topics)

160

In [4]:
wvmodel = Word2Vec.load('corpus/train_body_data-with_labels_False-retain_special_chars_False.with_test_data.csv.w2v.model')

In [16]:
fsmodel = fasttext.load_model('corpus/train_body_data-with_labels_False-retain_special_chars_False.with_test_data.csv.fasttext.model.bin')

In [5]:
topic2actual

{'activism': ['activism'],
 'afghanistan': ['afghanistan'],
 'aid': ['aid'],
 'algerianhostagecrisis': ['algerian', 'hostage', 'crisis'],
 'alqaida': ['al', 'qaida'],
 'alshabaab': ['al', 'shaba', 'ab'],
 'antiwar': ['antiwar'],
 'arabandmiddleeastprotests': ['arab', 'and', 'middle', 'east', 'protests'],
 'armstrade': ['arms', 'trade'],
 'australianguncontrol': ['australian', 'gun', 'control'],
 'australiansecurityandcounterterrorism': ['australian',
  'security',
  'and',
  'counterterrorism'],
 'bastilledaytruckattack': ['bastille', 'day', 'truck', 'attack'],
 'belgium': ['belgium'],
 'berlinchristmasmarketattack': ['berlin', 'christmas', 'market', 'attack'],
 'bigdata': ['big', 'data'],
 'biometrics': ['biometrics'],
 'bokoharam': ['boko', 'haram'],
 'bostonmarathonbombing': ['boston', 'marathon', 'bombing'],
 'britisharmy': ['british', 'army'],
 'brusselsattacks': ['brussels', 'attacks'],
 'cameroon': ['cameroon'],
 'carers': ['carers'],
 'charliehebdoattack': ['charlie', 'hebdo', 

# Perform exploratory unsupervised discovery of unseen topics

In [6]:
trainingY = pd.read_hdf('training_data_wv_fs_no_stopwords.hdf', 'trainingY')

In [8]:
training_topics_freq = trainingY.sum()
training_topics_freq.head()

activism                     NaN
afghanistan              10032.0
aid                       1891.0
algerianhostagecrisis       58.0
alqaida                   3194.0
dtype: float64

In [9]:
unseen_topics = set(topics).intersection(training_topics_freq.ix[training_topics_freq.isnull()].index)

In [11]:
unseen_topics

{'activism',
 'bastilledaytruckattack',
 'berlinchristmasmarketattack',
 'brusselsattacks',
 'charliehebdoattack',
 'francetrainattack',
 'munichshooting',
 'orlandoterrorattack',
 'parisattacks',
 'peaceandreconciliation',
 'sanbernardinoshooting',
 'tunisiaattack2015',
 'turkeycoupattempt',
 'zikavirus'}

In [181]:
def transform_tfidf_word2vec(tokens, stopwords=[]):
    global wvmodel
    global tfidf
    # This requires wvmodel to be present in the namespace.
    wv_feature_vec = tokens.map(
        lambda x: [w for w in x.split() if (w not in stopwords and w in wvmodel.wv.vocab)]
    ).map(
        lambda x: tfidf[dictionary.doc2bow(x)]
    ).map(
        lambda x: np.array([wvmodel[dictionary.id2token[id]] * w for id, w in x]).mean(axis=0) if len(x) > 0 else np.nan
    )

    return wv_feature_vec


def transform_fasttext(tokens, stopwords=[]):
    global fsmodel
    # This requires fsmodel to be present in the namespace.
    fs_feature_vec = tokens.map(
        lambda x: [w for w in x.split() if (w not in stopwords)]
    ).map(lambda x: np.array([fsmodel[w] for w in x]).mean(axis=0) if len(x) > 0 else np.nan)

    return fs_feature_vec


def transform_word2vec(tokens, stopwords=[]):
    global wvmodel
    # This requires wvmodel to be present in the namespace.
    wv_feature_vec = tokens.map(
        lambda x: [w for w in x.split() if (w not in stopwords and w in wvmodel.wv.vocab)]
    ).map(lambda x: np.array([wvmodel[w] for w in x]).mean(axis=0) if len(x) > 0 else np.nan)

    return wv_feature_vec


def parallel_generate_word_vectors(samp, transformer, stopwords, batch, num_proc):
    with Parallel(n_jobs=num_proc) as parallel:
        dataset = []
        is_break = False
        i = 0

        while not is_break:
            payload = []

            for j in xrange(num_proc):
                t_df = samp[(i + j) * batch: (i + 1 + j) * batch]

                if t_df.empty:
                    is_break = True
                    continue

                payload.append(
                    delayed(transformer)(
                        t_df, stopwords
                    )
                )

            print('Current batch in main thread: {}'.format((i + j) * batch))

            if payload:
                results = parallel(payload)
                dataset.extend(results)
                i += num_proc

    return pd.concat(dataset)


def extract_features_for(df, min_batch=2000, stopwords=[], num_proc=7):
    df_tokens = transform_text(df)
    
    batch = min(df_tokens.shape[0] / num_proc, min_batch)

    print('Computing fs features...')
    fvec = parallel_generate_word_vectors(df_tokens, transform_fasttext, stopwords=stopwords, batch=batch, num_proc=num_proc)

    print('Computing wv features...')
    wvec = parallel_generate_word_vectors(df_tokens, transform_word2vec, stopwords=stopwords, batch=batch, num_proc=num_proc)

    return wvec, fvec

In [12]:
with open('../data/TestData.json') as fl:
    data = json.load(fl)
    test_df = pd.DataFrame(data['TestData']).T
    del(data)

In [132]:
test_transformed_text = transform_text(test_df)

In [18]:
%%time
test_wvec, test_fvec = extract_features_for(test_df)

Computing fs features...
Current batch in main thread: 6498
Current batch in main thread: 14079
Computing wv features...
Current batch in main thread: 6498
Current batch in main thread: 14079
CPU times: user 4.14 s, sys: 440 ms, total: 4.58 s
Wall time: 48.6 s


In [90]:
tpa = pd.DataFrame(topic2actual.items(), columns=['topics', 'actual'])
tpa['wv'] = transform_word2vec(tpa.actual.map(lambda x: ' '.join(x)))
tpa['fs'] = transform_fasttext(tpa.actual.map(lambda x: ' '.join(x)))
unseen_topics2vec = tpa[tpa.topics.isin(unseen_topics)]
unseen_topics2vec = unseen_topics2vec.set_index('topics')

In [91]:
null_index = test_wvec[test_wvec.isnull()].index
if any(null_index):
    test_wvec.ix[null_index] = [[0] * wvmodel.vector_size] * 3
    
null_index = test_fvec[test_fvec.isnull()].index
if any(null_index):
    test_fvec.ix[null_index] = [[0] * fsmodel.dim] * 3

In [92]:
%%time
sim = cosine_similarity(
    np.vstack(test_fvec),
    np.vstack(unseen_topics2vec.fs),
)

for i, j in enumerate(sim.argmax(axis=0)):
    print i, unseen_topics2vec.index[i], j

CPU times: user 28 ms, sys: 4 ms, total: 32 ms
Wall time: 29.9 ms


In [93]:
for i, j in enumerate(sim.argmax(axis=0)):
    print i, unseen_topics2vec.index[i], j

0 berlinchristmasmarketattack 7489
1 munichshooting 6254
2 bastilledaytruckattack 78
3 brusselsattacks 518
4 activism 396
5 peaceandreconciliation 6853
6 parisattacks 5204
7 orlandoterrorattack 518
8 francetrainattack 3187
9 zikavirus 4489
10 charliehebdoattack 45
11 turkeycoupattempt 3585
12 sanbernardinoshooting 2958
13 tunisiaattack2015 518


In [154]:
from gensim.corpora import Dictionary
from gensim.models import tfidfmodel
from gensim.similarities import MatrixSimilarity

In [133]:
%%time
dictionary = Dictionary(test_transformed_text.str.split())
dictionary.id2token = {j: i for i, j in dictionary.token2id.items()}

%%time
corpus = [dictionary.doc2bow(i) for i in test_transformed_text.str.split()]

%%time
tfidf = tfidfmodel.TfidfModel(corpus)

CPU times: user 4.33 s, sys: 56 ms, total: 4.39 s
Wall time: 4.36 s


In [138]:
%%time
corpus = [dictionary.doc2bow(i) for i in test_transformed_text.str.split()]

CPU times: user 3.32 s, sys: 112 ms, total: 3.43 s
Wall time: 3.38 s


In [139]:
%%time
tfidf = tfidfmodel.TfidfModel(corpus)

CPU times: user 428 ms, sys: 24 ms, total: 452 ms
Wall time: 391 ms


In [184]:
%%time
test_tfdifwvec = transform_tfidf_word2vec(test_transformed_text)

null_index = test_tfdifwvec[test_tfdifwvec.isnull()].index
if any(null_index):
    test_tfdifwvec.ix[null_index] = [[0] * wvmodel.vector_size] * 3

CPU times: user 15.4 s, sys: 352 ms, total: 15.8 s
Wall time: 18.9 s


In [195]:
%%time
sim = cosine_similarity(
    np.vstack(test_tfdifwvec),
    np.vstack(transform_tfidf_word2vec(unseen_topics2vec.actual.map(lambda x: ' '.join(x)))),
)

for i, j in enumerate(sim.argmax(axis=0)):
    print i, unseen_topics2vec.index[i], j

0 berlinchristmasmarketattack 7489
1 munichshooting 2129
2 bastilledaytruckattack 6128
3 brusselsattacks 3300
4 activism 6796
5 peaceandreconciliation 6853
6 parisattacks 3514
7 orlandoterrorattack 5885
8 francetrainattack 1246
9 zikavirus 4489
10 charliehebdoattack 45
11 turkeycoupattempt 6163
12 sanbernardinoshooting 1167
13 tunisiaattack2015 805
CPU times: user 80 ms, sys: 36 ms, total: 116 ms
Wall time: 269 ms


In [241]:
sim[:, 0].argsort()[::-1][:50]

array([7489, 3681, 7491, 6267, 4253, 2487, 7490, 7513, 2441, 6818, 6820,
       6892, 7503, 7521, 7535, 3233, 4159, 4166, 7493, 7544, 4183, 6297,
       1032, 7574, 3473, 1121, 6268, 7494, 1757, 7549, 6223, 4178, 6592,
       6319, 7148, 4031, 5159, 7497, 4117,   16, 4122, 4264, 5068, 7515,
       6275, 5279, 5561, 3066,  492, 2297])

In [240]:
sim[:, 0][sim[:, 0].argsort()[::-1][:50]]

array([ 0.6800879 ,  0.68005754,  0.67043153,  0.66684164,  0.66186854,
        0.64642356,  0.64510033,  0.64174383,  0.64172735,  0.63701938,
        0.6367818 ,  0.6356611 ,  0.63448659,  0.63333548,  0.63300472,
        0.63291737,  0.63180659,  0.62803853,  0.62801635,  0.62676484,
        0.62635928,  0.62551549,  0.62338105,  0.62269482,  0.62196284,
        0.62180046,  0.62167733,  0.62059868,  0.61993994,  0.61847115,
        0.61727401,  0.6170171 ,  0.61660353,  0.61580356,  0.61552269,
        0.61528071,  0.61422024,  0.61406879,  0.61374389,  0.61350205,
        0.61338447,  0.61201894,  0.61177827,  0.611455  ,  0.61139836,
        0.61095171,  0.61090377,  0.6105257 ,  0.61028394,  0.61000594])

In [254]:
test_df.iloc[7490].bodyText



In [253]:
test_df[test_df.bodyText.str.contains('Zika')].bodyText.iloc[2]

u'Violent protests at Trump rally in California With the tenor of the general election campaign now clearly defined, Hillary Clinton delivered a stinging rebuke to Donald Trump\u2019s claim to understand international affairs on Thursday. Trump, meanwhile, described Clinton\u2019s clear-headed foreign policy address \u2013 in which she described him as too unstable to be trusted with nuclear launch codes and warned he would take the country down a \u201ctruly dangerous path\u201d \u2013 as \u201cpathetic\u201d and said she should be in jail. Violence later erupted at a Trump rally in California. Reports from San Jose described anti-Trump demonstrators chasing \u2013 and in some cases punching and attacking \u2013 departing Trump supporters, some of whom appeared intent on provoking and fighting. Riot police were deployed to control the crowd. The mayor of San Jose, Sam Liccardo, blamed Trump. \u201cWe don\u2019t appreciate [anyone] utilizing campaign tactics of demagoguery,\u201d he sa

In [245]:
ssim = cosine_similarity(
    np.vstack(test_tfdifwvec)[4489],
    np.vstack(test_tfdifwvec),
)



In [250]:
ssim.argsort()[0][::-1][:20]

array([4489, 7543, 4760, 4881,  471, 5557,  373,  824, 4456, 4549,  352,
        469,  749, 1906, 1092, 6609, 2255,  950, 2888, 1512])

In [252]:
topics

['activism',
 'afghanistan',
 'aid',
 'algerianhostagecrisis',
 'alqaida',
 'alshabaab',
 'antiwar',
 'arabandmiddleeastprotests',
 'armstrade',
 'australianguncontrol',
 'australiansecurityandcounterterrorism',
 'bastilledaytruckattack',
 'belgium',
 'berlinchristmasmarketattack',
 'bigdata',
 'biometrics',
 'bokoharam',
 'bostonmarathonbombing',
 'britisharmy',
 'brusselsattacks',
 'cameroon',
 'carers',
 'charliehebdoattack',
 'chemicalweapons',
 'clusterbombs',
 'cobra',
 'conflictanddevelopment',
 'controversy',
 'criminaljustice',
 'cybercrime',
 'cyberwar',
 'darknet',
 'dataprotection',
 'debate',
 'defence',
 'deflation',
 'drones',
 'drugs',
 'drugspolicy',
 'drugstrade',
 'earthquakes',
 'ebola',
 'economy',
 'egypt',
 'encryption',
 'energy',
 'espionage',
 'ethics',
 'europeanarrestwarrant',
 'europeancourtofhumanrights',
 'events',
 'extradition',
 'famine',
 'farright',
 'firefighters',
 'forensicscience',
 'france',
 'francetrainattack',
 'freedomofspeech',
 'genevaconv