Data format
===

The data itself is already pre-processed (`<s>, </s>` tags, `<unk>` tag, etc.). The punctuation is tokenized (one symbol = one token, no word-punctuation merged tokens). There is only one space symbol between every two adjacent tokens.

UPD: The data contains the common combination "@@ ". This is because the data has been preprocessed with BPE encoding (see: https://arxiv.org/abs/1508.07909 and https://github.com/rsennrich/subword-nmt) in order to reduce the vocabulary size.

In order to properly print a message you should make sure to do the following in Python:

[your string message here].replace('@@ ', '')

NOTE: replace `@@space` by nothing. Don’t forget the [space]!

Broadly speaking, BPE encoding will split words into the most common n-grams to reduce the vocabulary size. The '@@ ' you see are tokens to indicate there was a split. Thus to print the actual word you should replace all occurrences of '@@ ' (including the trailing space) with nothing.


Data fields are separated by one tab character.

    context - context phrase(s) for response, always human generated
    response - one phrase or a few phrases, may be from different speakers
    human-generated - flag if the response is generated by human

Data header: 'id\tcontext\tresponse\thuman-generated\n'

Submission format
===

ROC AUC score.

The file should contain a header and have the following format:

id,human-generated

1,1

8,0

9,1

10,1

We expect the solution file to have 524,342 predictions.

In [24]:
import numpy as np
import pandas as pd
from time import time
from xgboost import XGBClassifier
from IPython.display import clear_output
import pickle
from sklearn.metrics import roc_auc_score

In [2]:
# Number of rows pandas loads per chunk when the data files are streamed.
chunksize = 100000
# Tab-separated input files: the training file carries the labels, the test
# file is the one predictions are produced for.
train_filename = 'data/train.txt'
test_filename = 'data/test.txt'

# Iteration and feature extraction

In [10]:
def run(function, filename, print_every = 5, chunks = None):
    """Stream a tab-separated file chunk by chunk and call `function` on each chunk.

    Every chunk has the BPE split marker '@@ ' removed from all of its string
    cells before it is handed to `function`.

    function:    callable taking one DataFrame chunk; its return value is ignored.
    filename:    path of the tab-separated file to read.
    print_every: a progress line is printed for every `print_every`-th chunk.
    chunks:      stop after this many chunks; None means read the whole file.
                 The limit is checked after a chunk has been processed, so at
                 least one chunk is always handled when the file has data.

    The chunk size comes from the module-level `chunksize`.
    """
    start = time()
    # Counted explicitly so the summary below still works when the reader
    # yields no chunks at all (the loop variable would be unbound then).
    processed = 0
    for i, frame in enumerate(pd.read_csv(filename, chunksize = chunksize, delimiter = '\t')):
        # Undo the BPE word splitting: 'wor@@ d' -> 'word'.
        frame = frame.replace({'@@ ': ''}, regex = True)

        function(frame)
        processed = i + 1

        if i % print_every == 0:
            print('Chunk ' + str(i) + ' over: ' + str(round(time() - start)) + 's', end = '\t')
        if chunks is not None and i >= chunks - 1:
            break
    print('\n' + str(processed) + ' chunks overall')

In [16]:
def extract_features(frame):
    """Build one feature row per (context, response) pair in `frame`.

    Each row is a list of two floats:
      1. the character length of the response,
      2. the number of pieces obtained by splitting the response on ' '.

    frame: anything indexable by 'context' and 'response' that yields two
           iterables of the same length (e.g. a DataFrame with those columns).
    Returns a list of feature rows in input order.
    """
    # Zipping the two columns is faster than itertuples and far faster than
    # iterrows; the context is walked in lockstep but not used yet.
    return [
        [float(len(response)), float(len(response.split(' ')))]
        for _context, response in zip(frame['context'], frame['response'])
    ]

In [37]:
def first_frame(print_filename):
    """Return the first `chunksize` rows of a tab-separated file as a DataFrame.

    The BPE split marker '@@ ' is removed from every string cell, matching
    what `run` does to each chunk it processes.

    print_filename: path of the tab-separated file to read.
    """
    # A bounded read replaces "open a chunk iterator, take one chunk, break":
    # pandas closes the file itself, and a file with no data rows gives an
    # empty frame instead of leaving the result variable unbound.
    frame = pd.read_csv(print_filename, nrows = chunksize, delimiter = '\t')
    return frame.replace({'@@ ': ''}, regex = True)

# Exploration

### Printing

In [34]:
# Load the first chunk of the training file for interactive exploration.
frame = first_frame(train_filename)

In [35]:
# Rows and columns of the chunk loaded above (the variable is `frame`;
# no `chunk` name is defined anywhere in this notebook).
frame.shape

(100000, 4)

In [6]:
%load_ext line_profiler

In [7]:
# Start from empty accumulators and keep only the two text columns that
# extract_features reads; short_frame is the input of the profiling run below.
all_features_list = []
all_labels_list = []
short_frame = frame[['context', 'response']]

In [18]:
%lprun -f extract_features extract_features(short_frame)

### Tags

In [9]:
def write_tags(frame):
    """Collect every angle-bracketed tag token found in the responses of `frame`.

    A token counts as a tag when it starts with '<' and ends with '>'
    (for example '<url>' or '</d>'). Tags are added to the module-level
    `tag_set`, which must exist before this function is called.

    frame: DataFrame chunk with a 'response' column of space-separated tokens.
    """
    # pandas represents empty fields as NaN, which cannot be split into
    # tokens, so rows without a response are skipped.
    responses = frame['response'].dropna()

    for token_list in responses.str.split(' '):
        # startswith/endswith are safe on the empty tokens produced by an
        # empty response or a doubled space; indexing token[0] would raise.
        tag_set.update(
            token for token in token_list
            if token.startswith('<') and token.endswith('>')
        )

In [60]:
# Scan the whole training file and gather the distinct tags used in responses.
# run() requires the file to read as its second argument.
tag_set = set()
run(write_tags, train_filename)

Chunk 0: 2s	Chunk 5: 13s	Chunk 10: 25s	Chunk 15: 36s	Chunk 20: 48s	Chunk 25: 59s	Chunk 30: 70s	Chunk 35: 82s	Chunk 40: 93s	Chunk 45: 105s	Chunk 50: 116s	Chunk 55: 129s	Chunk 60: 140s	Chunk 65: 152s	Chunk 70: 164s	
72 chunks overall


In [67]:
# Tag tokens written out as a literal; presumably the result of the tag scan,
# pinned here so this cell does not require re-running that pass (the
# commented line is the computed alternative).
# tags = list(tag_set)
tags = ['<at>',
        '<first_speaker>',
        '</d>',
        '<cont>',
        '<third_speaker>',
        '<number>',
        '<minor_speaker>',
        '<url>',
        '<second_speaker>',
        '<heart>']

# Training

In [42]:
def write_features_labels(frame):
    """Append one chunk's feature rows and labels to the global accumulators.

    Features are computed by extract_features from the 'context' and
    'response' columns; labels are the chunk's 'human-generated' values.
    Both are appended, in row order, to the module-level lists
    all_features_list and all_labels_list.
    """
    text_columns = frame[['context', 'response']]
    chunk_features = extract_features(text_columns)
    chunk_labels = frame['human-generated'].values.tolist()

    # Growing plain lists is cheaper than appending to a numpy array per chunk.
    all_features_list.extend(chunk_features)
    all_labels_list.extend(chunk_labels)

In [43]:
# Reset the accumulators, stream the whole training file through
# write_features_labels, then turn the collected lists into numpy arrays.
all_features_list = []
all_labels_list = []
run(write_features_labels, train_filename)
all_features = np.array(all_features_list)
all_labels = np.array(all_labels_list)

Chunk 0 over: 1s	Chunk 5 over: 9s	Chunk 10 over: 30s	Chunk 15 over: 53s	Chunk 20 over: 76s	Chunk 25 over: 99s	Chunk 30 over: 122s	Chunk 35 over: 147s	Chunk 40 over: 171s	Chunk 45 over: 201s	Chunk 50 over: 242s	Chunk 55 over: 292s	Chunk 60 over: 324s	Chunk 65 over: 398s	Chunk 70 over: 431s	
72 chunks overall


In [44]:
# Fit an XGBoost classifier, constructed with no arguments (library defaults),
# on the features and labels collected from the training file.
model = XGBClassifier()
model.fit(all_features, all_labels)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [46]:
# Persist the fitted model; the context manager flushes and closes the file
# even if pickling fails part-way.
with open('2_features.pickle.dat', 'wb') as model_file:
    pickle.dump(model, model_file)

In [44]:
# Restore the previously saved model; the context manager closes the file.
# NOTE: unpickling runs arbitrary code from the file, so only load a pickle
# that was produced by this notebook.
with open('2_features.pickle.dat', 'rb') as model_file:
    model = pickle.load(model_file)

# Inference

In [47]:
def extract_features_ids(frame):
    """Append one chunk's feature rows and row ids to the global accumulators.

    Features are computed by extract_features from the 'context' and
    'response' columns; ids are the chunk's 'id' values. Both are appended,
    in row order, to the module-level lists all_features_list and
    all_ids_list.
    """
    text_columns = frame[['context', 'response']]
    chunk_features = extract_features(text_columns)
    chunk_ids = frame['id'].values.tolist()

    # Growing plain lists is cheaper than appending to a numpy array per chunk.
    all_features_list.extend(chunk_features)
    all_ids_list.extend(chunk_ids)

In [51]:
# Reset the accumulators, stream the whole test file through
# extract_features_ids, then turn the collected lists into numpy arrays.
all_features_list = []
all_ids_list = []
run(extract_features_ids, test_filename)
all_features = np.array(all_features_list)
all_ids = np.array(all_ids_list)

Chunk 0 over: 6s	Chunk 5 over: 16s	
6 chunks overall


In [52]:
# NOTE(review): predict() presumably returns hard class labels. The metric
# named in the submission notes is ROC AUC, which ranks scores, so a
# probability output may score better — confirm before changing, since the
# next cell stacks these values together with the ids into one array.
predicted_labels = model.predict(all_features)

In [53]:
# Stack ids and predictions as two rows, then transpose so each row of the
# frame is one (id, prediction) pair under the submission's column names.
output = pd.DataFrame(np.concatenate(([all_ids], [predicted_labels]), axis = 0).T, columns = ['id', 'human-generated'])

In [54]:
# Write the submission file with a header row and without the frame's index column.
output.to_csv('output.csv', index = False)

### AUC on train

In [38]:
# Load the first training chunk again; it is the sample used in this section.
frame = first_frame(train_filename)

In [42]:
# Ground-truth labels of the first training chunk.
truth = frame['human-generated'].values

In [40]:
# Recompute the features for the first training chunk and predict on them.
short_frame = frame[['context', 'response']]
features = np.array(extract_features(short_frame))
pred = model.predict(features)

In [45]:
# Same call as in the previous cell; the execution counters suggest it was
# re-run after the model had been reloaded from disk.
pred = model.predict(features)

In [47]:
# Number of ground-truth labels (one per row of the chunk).
truth.shape

(100000,)