In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import feather
import pickle

from sklearn.metrics import log_loss, f1_score

In [2]:
# Restore the (all_topics, selected_topics) pair stored in the local pickle.
# NOTE: pickle.load can run arbitrary code - only open a topics.bin you trust.
with open('topics.bin', 'rb') as topics_file:
    all_topics, selected_topics = pickle.load(topics_file)

In [3]:
# Read the three pre-split frames from their Feather files in one go.
df_train, df_val, df_test = (
    feather.read_dataframe(path)
    for path in ('df_train.feather', 'df_val.feather', 'df_test.feather')
)

In [4]:
def get_y(df, topics):
    """Build a binary document-by-topic indicator matrix.

    Parameters
    ----------
    df : DataFrame
        Must expose a ``topics`` column whose values are comma-separated
        topic names.
    topics : sequence of str
        Topics to encode; the position of a topic is its column in the result.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(len(df), len(topics))`` where
        ``y[i, j] == 1`` when row ``i`` lists ``topics[j]``. Names that are
        not in ``topics`` are ignored. Rows whose value cannot be split
        (``None`` / ``NaN``) stay all-zero instead of raising.
    """
    topic_idx = {t: i for (i, t) in enumerate(topics)}
    y = np.zeros((len(df), len(topics)), dtype='uint8')

    # A distinct loop name: the previous code rebound the `topics` parameter here.
    for row, doc_topics in enumerate(df.topics):
        if not hasattr(doc_topics, 'split'):
            # Missing value - leave the row empty rather than fail on .split().
            continue
        for t in doc_topics.split(','):
            col = topic_idx.get(t)
            if col is not None:
                y[row, col] = 1
    return y

In [5]:
from sklearn.metrics import log_loss, f1_score

In [6]:
# Stack train and validation rows into one frame with a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_val], ignore_index=True)

In [7]:
# Label matrix for the combined train+validation frame, one column per selected topic.
y_train = get_y(df_all, selected_topics)

In [8]:
# sklearn.cross_validation was removed in scikit-learn 0.20, so prefer
# sklearn.model_selection and only fall back to the legacy module on old installs.
# In both cases `cv` is materialised as a list of (train_idx, val_idx) pairs,
# which keeps `enumerate(cv)` working and lets the splits be iterated repeatedly.
# NOTE(review): the same seed is used in both branches; confirm the resulting
# folds match any cached artefacts that were built with the legacy splitter.
try:
    from sklearn.model_selection import KFold
    cv = list(KFold(n_splits=3, shuffle=True, random_state=1).split(df_all))
except ImportError:
    from sklearn.cross_validation import KFold
    cv = list(KFold(n=len(df_all), n_folds=3, shuffle=True, random_state=1))



In [9]:
# Create the fold column with a placeholder value; the loop over `cv` fills in the real ids.
df_all['fold'] = 0

In [10]:
# Tag every row with the number of the split in which it is held out.
for fold_no, split in enumerate(cv):
    print(fold_no)
    held_out = split[1]
    df_all.loc[held_out, 'fold'] = fold_no

0
1
2


In [11]:
# Sanity check: number of rows assigned to each fold.
df_all.fold.value_counts()

1    59117
0    59117
2    59116
Name: fold, dtype: int64

In [14]:
import os

from elasticsearch import Elasticsearch, helpers

# Address of the Elasticsearch node. The default is the address this notebook
# was written against; set the ES_HOST environment variable to target another
# node without editing the cell.
es_host = os.environ.get('ES_HOST', '172.17.0.2')
es = Elasticsearch(host=es_host)

In [17]:
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Mapping, String, Nested, Integer, Boolean
from elasticsearch_dsl import analyzer, tokenizer

In [20]:
# Open an elasticsearch_dsl connection to the same host the low-level client `es` uses.
con = connections.create_connection(host=es_host)

In [24]:
# Declare the 'dsc_instability' document type and save its mapping to the 'dsc' index.
mapping = Mapping('dsc_instability')
# Document body text, analysed with the 'standard' analyzer.
mapping.field('body', String(analyzer='standard'))
# Fold id stored with each document; the similarity query filters on this field.
mapping.field('fold', Integer(index='not_analyzed'))
mapping.save('dsc')

Mapping('dsc_instability')

In [28]:
import itertools 

def chunk_iterator(iterator, size):
    """Yield successive lists of at most ``size`` items taken from ``iterator``.

    Parameters
    ----------
    iterator : iterable
        Any iterable or iterator; it is consumed lazily, one batch at a time.
    size : int
        Maximum number of items per batch.

    Yields
    ------
    list
        Non-empty batches in input order; the last one may be shorter.
        Nothing is yielded for empty input.
    """
    # islice only makes progress on a real iterator. Handed a list, it would
    # start from the beginning on every pass and the loop would never end.
    iterator = iter(iterator)
    while True:
        batch = list(itertools.islice(iterator, size))
        if not batch:
            return
        yield batch

In [38]:
# Give every document a unique integer id: train+validation rows take
# 0..len(df_all)-1 and the test rows continue directly after them.
df_all['idx'] = np.arange(len(df_all))
df_test['idx'] = np.arange(len(df_all), len(df_all) + len(df_test))

In [35]:
# Bulk-index the train+validation documents, 100 per request, keyed by `idx`
# and carrying their fold id.
for rows in tqdm(chunk_iterator(df_all.itertuples(), 100)):
    helpers.bulk(es, [
        {
            '_id': row.idx,
            '_index': 'dsc',
            '_type': 'dsc_instability',
            '_source': {'body': row.body, 'fold': int(row.fold)},
        }
        for row in rows
    ])

1774it [07:03,  2.67it/s]


In [36]:
# Bulk-index the test documents, 100 per request; they are all stored under
# fold 3, a value none of the train+validation rows receive.
for rows in tqdm(chunk_iterator(df_test.itertuples(), 100)):
    helpers.bulk(es, [
        {
            '_id': row.idx,
            '_index': 'dsc',
            '_type': 'dsc_instability',
            '_source': {'body': row.body, 'fold': 3},
        }
        for row in rows
    ])

76it [00:20,  3.66it/s]


In [52]:
def find_similar(idx, folds, limit=10):
    """Ask Elasticsearch for the documents most similar to document ``idx``.

    A ``more_like_this`` query over the ``body`` field is sent to the ``dsc``
    index, using the already-indexed document with id ``idx`` as the example.
    A ``terms`` clause on ``fold`` (built from ``folds``) is sent as a
    top-level ``filter`` key next to ``query``, not inside the ``filtered``
    clause.

    Parameters
    ----------
    idx : int-like
        Id of the example document; coerced with ``int()``.
    folds : list
        Fold ids placed in the ``terms`` filter.
    limit : int
        Value sent as the request's ``size``.

    Returns
    -------
    list of (int, score)
        One ``(document id, _score)`` pair per hit, in response order.
    """
    example_doc = {
        '_index': 'dsc',
        '_type': 'dsc_instability',
        '_id': int(idx),
    }
    similarity = {
        'more_like_this': {
            'like': example_doc,
            'max_query_terms': 30,
            'fields': ['body'],
        }
    }
    fold_filter = {'bool': {'must': {'terms': {'fold': folds}}}}

    body = {
        'query': {'filtered': {'query': similarity}},
        'filter': fold_filter,
        'fields': ['_id'],
        'size': limit,
    }

    response = es.search(index='dsc', doc_type='dsc_instability', body=body)
    return [(int(hit['_id']), hit['_score']) for hit in response['hits']['hits']]

In [49]:
# Peek at the first row to check the columns now present (including fold and idx).
df_all.iloc[0]

key                                2012a_TrainingData_04253
date                                    2012-05-21 00:00:00
body      One month after relinquishing control of night...
topics                                           drugstrade
fold                                                      2
idx                                                       0
Name: 0, dtype: object

In [55]:
# For each of the three CV folds, the list of the other two folds.
folds_opp = {fold: [other for other in range(3) if other != fold] for fold in range(3)}

In [121]:
# For every train+validation document, fetch up to 50 similar documents
# restricted to the two folds the document itself does not belong to.
knns = []

for row in tqdm(df_all.itertuples()):
    knns.append(find_similar(row.idx, folds_opp[row.fold], limit=50))

177350it [2:38:19, 18.67it/s]


In [122]:
# For every test document, fetch up to 50 similar documents from folds 0-2,
# i.e. from the train+validation documents only.
knns_test = []

for row in tqdm(df_test.itertuples()):
    knns_test.append(find_similar(row.idx, [0, 1, 2], limit=50))

7581it [07:23, 17.11it/s]


In [125]:
# Persist the train and test neighbour lists together as one pickled tuple.
with open('knn_50_train_test.bin', 'wb') as out_file:
    pickle.dump((knns, knns_test), out_file)

In [123]:
# Spot-check one document: print its own topics, then display the rows of its
# stored neighbours so the labels can be compared by eye.
doc_pos = 8543  # previously named `id`, which shadowed the builtin id()
print(df_all.iloc[doc_pos].topics)

a1 = knns[doc_pos]

# Only the neighbour ids are needed here; the scores are not used in this cell.
idx = [i for (i, _) in a1]

df_all.iloc[idx]

religion


Unnamed: 0,key,date,body,topics,fold,idx
96069,2013b_TrainingData_08934,2013-09-15,"As Birmingham Metropolitan College, one of Bri...",religion,0,96069
24573,2013b_TrainingData_23810,2013-09-16,Do you feel we simply don't&nbsp;have enough d...,religion,2,24573
137166,2013b_TrainingData_14936,2013-09-13,A college has abandoned its ban on Muslim face...,"protest,religion",0,137166
119936,2013b_TrainingData_40685,2013-09-17,Imagine that I am sitting outside a cafe in Pa...,"humanrights,religion",2,119936
110168,2006b_TrainingData_07599,2006-10-16,The following correction was printed in the Gu...,"immigration,religion",0,110168
102899,2009a_TrainingData_49322,2009-06-03,You ask me if I regret anything I've written. ...,egypt,0,102899
6036,2006b_TrainingData_00874,2006-05-10,Commons leader Jack Straw revealed today that ...,"immigration,religion",2,6036
152103,2006b_TrainingData_19691,2006-08-10,Jack Straw was looking increasingly isolated y...,religion,0,152103
60958,2013b_TrainingData_51318,2013-07-22,When Youssra's three-and-a-half-year-old son s...,"france,religion",0,60958
116764,2006b_TrainingData_11355,2006-10-20,"The Tory leader, David Cameron, has warned pol...","immigration,religion",2,116764


In [124]:
# Recompute the label matrix (same call as the earlier cell) before scoring the neighbours.
y_train = get_y(df_all, selected_topics)

In [142]:
# Turn each document's neighbour list into per-topic scores from its first k hits:
#   pred_total    - how many of those neighbours carry the topic
#   pred_weighted - sum of the neighbours' scores over those that carry the topic
# pred_total is read by the vote-threshold sweep below. Previously the counts
# were computed (`sol`) but thrown away, so that cell failed on a fresh kernel.
pred_total = []
pred_weighted = []

zeros = np.zeros(y_train.shape[1], dtype='uint8')
k = 3

for a1 in tqdm(knns):
    if len(a1) == 0:
        # No neighbour was returned: predict nothing for this document.
        pred_total.append(zeros)
        pred_weighted.append(zeros)
        continue

    idx = [i for (i, _) in a1[:k]]
    # One-element rows so each neighbour's score scales its whole label row.
    sc = [[s] for (_, s) in a1[:k]]

    y_knn = y_train[idx]
    sol = y_knn.sum(axis=0)
    sol_w = (y_knn * sc).sum(axis=0)

    pred_total.append(sol)
    pred_weighted.append(sol_w)

pred_total = np.array(pred_total)
pred_weighted = np.array(pred_weighted, dtype='float32')

100%|██████████| 177350/177350 [00:07<00:00, 23179.98it/s]


In [136]:
# Sweep an integer threshold t = 1..k over `pred_total` and report micro-averaged F1
# against y_train for each; (f1, t) pairs are collected in `f1s`.
# NOTE(review): `pred_total` is not created in this cell and must already exist when
# it runs - presumably per-topic neighbour vote counts; confirm on a fresh kernel.
f1s = []

for t in range(1, k + 1):
    f1 = f1_score(y_train, pred_total >= t, average='micro')
    print('t=%s, f1=%.4f' % (t, f1))
    f1s.append((f1, t))

t=1, f1=0.6225
t=2, f1=0.7020
t=3, f1=0.6185


In [139]:
# Sweep 11 evenly spaced thresholds from 0 to 1 over `pred_weighted` and report
# micro-averaged F1 against y_train for each. `f1s` is reset here, so the pairs
# from the previous sweep are discarded.
f1s = []

for t in np.linspace(0, 1, 11):
    f1 = f1_score(y_train, pred_weighted >= t, average='micro')
    print('t=%s, f1=%.4f' % (t, f1))
    f1s.append((f1, t))

t=0.0, f1=0.0205
t=0.1, f1=0.6225
t=0.2, f1=0.6263
t=0.3, f1=0.6448
t=0.4, f1=0.6690
t=0.5, f1=0.6905
t=0.6, f1=0.7066
t=0.7, f1=0.7153
t=0.8, f1=0.7177
t=0.9, f1=0.7142
t=1.0, f1=0.7064


In [143]:
# Score-weighted topic votes for the test documents: each of the first k
# neighbours contributes its score to every topic it carries in y_train.
pred_weighted_test = []

zeros = np.zeros(y_train.shape[1], dtype='uint8')
k = 3

for hits in tqdm(knns_test):
    if not hits:
        # No neighbour was returned: predict nothing for this document.
        pred_weighted_test.append(zeros)
        continue

    top = hits[:k]
    rows = [doc_id for (doc_id, _) in top]
    # One-element rows so each score scales the matching neighbour's label row.
    weights = [[score] for (_, score) in top]

    pred_weighted_test.append((y_train[rows] * weights).sum(axis=0))

pred_weighted_test = np.array(pred_weighted_test, dtype='float32')

100%|██████████| 7581/7581 [00:00<00:00, 22034.81it/s]


In [151]:
# Persist the train and test weighted predictions together as one pickled tuple.
with open('knn3_preds_all.bin', 'wb') as out_file:
    pickle.dump((pred_weighted, pred_weighted_test), out_file)