Data format
===

The data itself is already pre-processed (`<s>, </s>` tags, `<unk>` tag, etc.). The punctuation is tokenized (one symbol = one token, no word-punctuation merged tokens). The is only one space symbol between every two adjacent tokens.

UPD: In the data there are common combination "@@ ". This is because the data has been preprocessed with BPE encoding (see: https://arxiv.org/abs/1508.07909 and https://github.com/rsennrich/subword-nmt) in order to reduce the vocabulary size.

In order to properly print a message you should make sure to do the following in Python:

[your string message here].replace(‘@@ ‘, ‘’)

NOTE: replace `@@space` by nothing. Don’t forget the [space]!

Broadly speaking, BPE encoding will split words into the most common n-gram to reduce the vocabulary size. The ‘@@ ’ you see are tokens to indicate there was a split. Thus to print the actual word you should replace all occurrences of '@@' to nothing.


Data fields are separated by one tab character.

    context - context phrase(s) for response, always human generated
    response - one phrase or a few phrases, may be from different speakers
    human-generated - flag if the response is generated by human

Data header: 'id\tcontext\tresponse\thuman-generated\n'

Submission format
===

ROC AUC score.

The file should contain a header and have the following format:

id,human-generated

1,1

8,0

9,1

10,1

We expect the solution file to have 524,342 predictions.

In [109]:
import numpy as np
import pandas as pd
from time import time
from xgboost import XGBClassifier
from IPython.display import clear_output
import pickle
from sklearn.metrics import roc_auc_score
import string
from collections import defaultdict, Counter
import nltk

In [154]:
nltk.download() # download corpora->words, stopwords

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [232]:
chunksize = 50000
train_filename = 'data/sampled_train.txt'
full_train_filename = 'data/train.txt'
test_filename = 'data/test.txt'
eval_filename = 'data/eval.txt'

# Iteration and feature extraction

In [99]:
def run(function, filename, print_every = 5, chunks = None):
    start = time()
    for i, frame in enumerate(pd.read_csv(filename, chunksize = chunksize, delimiter = '\t')):
        frame = frame.replace({'@@ ': ''}, regex = True)
        
        function(frame)
        
        if i % print_every == 0:
            print('Chunk ' + str(i) + ' over: ' + str(round(time() - start)) + 's', end = '\t')
        if chunks is not None:
            if i >= chunks - 1:
                break
    print('\n' + str(i + 1) + ' chunks overall')

In [157]:
vocab = set(w.lower() for w in nltk.corpus.words.words())
stopwords = nltk.corpus.stopwords.words('english')

In [207]:
def extract_features(frame):
    frame_features = []
    
#     for index, row in frame.iterrows():
#     for row in frame.itertuples(): # requires 'dot access' to columns
#     zip is faster than itertuples and a whole lot faster than iterrows
    for context, response in zip(frame['context'], frame['response']):
        features = []
        
        # context length and number of tokens in context are usually useless
        
        response_tokens = response.split(' ')
        context_tokens = context.split(' ')
    
        # feature: response length
        features.append(float(len(response)))

        # feature: number of tokens in the response
        features.append(float(len(response_tokens)))
        
        # features: number of words of length 1, 2, ..., 5
        # counters of length 6+ don't seem to have importance
        lens = Counter(map(len, response_tokens))
        for i in range(5):
            features.append(float(lens[i + 1]))
        
        # features: number of specific tokens from the list in the response
        tokens_to_count = ['<at>', '<number>', '!', '.']
        for token_to_count in tokens_to_count:
            features.append(float(response_tokens.count(token_to_count)))
            
        # features: includes apostrophes, is composed of english letters, is in vocabulary, is a stopword
        # for context and response
        apostrophe_counter = 0
        alpha_counter = 0
        vocab_counter = 0
        stop_counter = 0
        counters = [0] * 8
        for token in response_tokens:
            if "'" in token:
                counters[0] += 1
            if token.isalpha():
                counters[1] += 1
            if token in vocab:
                counters[2] += 1
            if token in stopwords:
                counters[3] += 1
        for token in context_tokens:
            if "'" in token:
                counters[4] += 1
            if token.isalpha():
                counters[5] += 1
            if token in vocab:
                counters[6] += 1
            if token in stopwords:
                counters[7] += 1
        for counter in counters:
            features.append(float(counter))
            
            
        all_tokens = set(context_tokens + response_tokens)
        shared_tokens = set(context_tokens).intersection(response_tokens)

        # feature: number of shared tokens between context and response
        features.append(float(len(shared_tokens)))

        # feature: sum of abs diffs in token shares
        diff = 0
        for token in all_tokens:
            token_stat = abs(float(context_tokens.count(token)) / len(context_tokens) -
                         float(response_tokens.count(token)) / len(response_tokens))
            diff += token_stat
        features.append(diff)
        
        frame_features.append(features)

    return frame_features

In [123]:
def first_frame(print_filename):
    for frame in pd.read_csv(print_filename, chunksize = chunksize, delimiter = '\t'):
        frame = frame.replace({'@@ ': ''}, regex = True)
        break
    return frame

# Exploration

### Printing

In [145]:
frame = first_frame(train_filename)

In [146]:
frame.shape

(50000, 4)

In [150]:
for i in range(50):
    print('Human' if frame.iloc[i]['human-generated'] == 1 else 'Generated')
    print(frame.iloc[i].context)
    print('')
    print(frame.iloc[i].response)
    print('=====')

Generated
<first_speaker> sounds like many of my aie peeps had a crappy day ! hugs to all

<second_speaker> <at> i 'll post sex so might fall asleep . have a good nights sleep too i can do it all day * sunbathing on the sofa instead screams *
=====
Generated
<first_speaker> posting a video soon about the new album . excited nervous to see what you kittens think . any guesses to what the album name is ?

<second_speaker> <at> what is it ?
=====
Human
<first_speaker> so there 's a mouse at work .

<second_speaker> <at> wtf it betta b the <number> w <number> legs & a ponytail ! not <number> legs w a long tail ! coughing umm yea can 't make it in <number> day !
=====
Generated
<first_speaker> in new york concrete jungle wet dream tomato . <second_speaker> <at> go to bed la boy .

<first_speaker> <at> shit seems like that many drinks are in the gym ! ! !
=====
Human
<first_speaker> official event hashtag is soslam <at> <at>

<second_speaker> <at> <at> i 'm all over it ! at dulles , making m

### Feature extraction profiling

In [6]:
%load_ext line_profiler

In [7]:
all_features_list = []
all_labels_list = []
short_frame = frame[['context', 'response']]

In [18]:
%lprun -f extract_features extract_features(short_frame)

### Class balance

In [60]:
def write_counters(frame):
    global human_counter, generated_counter
    labels = frame['human-generated'].values
    labels_sum = labels.sum()
    human_counter += labels_sum
    generated_counter += frame.shape[0] - labels_sum

In [61]:
human_counter = 0
generated_counter = 0
run(write_counters, full_train_filename)

Chunk 0 over: 1s	Chunk 5 over: 5s	Chunk 10 over: 9s	Chunk 15 over: 13s	Chunk 20 over: 16s	Chunk 25 over: 21s	Chunk 30 over: 25s	Chunk 35 over: 29s	Chunk 40 over: 33s	Chunk 45 over: 37s	Chunk 50 over: 41s	Chunk 55 over: 45s	Chunk 60 over: 49s	Chunk 65 over: 53s	Chunk 70 over: 57s	Chunk 75 over: 61s	Chunk 80 over: 65s	Chunk 85 over: 68s	Chunk 90 over: 72s	Chunk 95 over: 76s	Chunk 100 over: 81s	Chunk 105 over: 85s	Chunk 110 over: 89s	Chunk 115 over: 92s	Chunk 120 over: 97s	Chunk 125 over: 101s	Chunk 130 over: 111s	Chunk 135 over: 124s	Chunk 140 over: 138s	
144 chunks overall


In [62]:
print(human_counter, generated_counter)

3595488 3595488


### Tags

In [53]:
def write_tags(frame):
    responses_frame = frame['response']
    
    token_lists = responses_frame.str.split(' ').values
    labels = frame['human-generated'].values
    
    for i, token_list in enumerate(token_lists):
        tag_list = list(filter(lambda token: token[0] == '<' and token[-1] == '>', token_list))
        tag_set = set(tag_list)
        for tag in tag_set:
            tag_count = tag_list.count(tag)
            if labels[i] == 0:
                tag_dict_generated[tag]['occ'] += 1
                tag_dict_generated[tag][tag_count] += 1
                tag_dict_generated[tag]['sum'] += tag_count
            else:
                tag_dict_human[tag]['occ'] += 1
                tag_dict_human[tag][tag_count] += 1
                tag_dict_human[tag]['sum'] += tag_count

In [55]:
tag_dict_human = defaultdict(lambda: defaultdict(int))
tag_dict_generated = defaultdict(lambda: defaultdict(int))
run(write_tags, full_train_filename)

Chunk 0 over: 1s	Chunk 5 over: 18s	Chunk 10 over: 28s	Chunk 15 over: 34s	Chunk 20 over: 40s	Chunk 25 over: 46s	Chunk 30 over: 52s	Chunk 35 over: 59s	Chunk 40 over: 65s	Chunk 45 over: 71s	Chunk 50 over: 77s	Chunk 55 over: 84s	Chunk 60 over: 90s	Chunk 65 over: 96s	Chunk 70 over: 102s	Chunk 75 over: 108s	Chunk 80 over: 114s	Chunk 85 over: 120s	Chunk 90 over: 126s	Chunk 95 over: 133s	Chunk 100 over: 148s	Chunk 105 over: 164s	Chunk 110 over: 179s	Chunk 115 over: 192s	Chunk 120 over: 205s	Chunk 125 over: 219s	Chunk 130 over: 233s	Chunk 135 over: 246s	Chunk 140 over: 259s	
144 chunks overall


Tag statistics saved in analysis_results/tag_analysis.txt

In [110]:
# for token_to_count in tokens_to_count:
#     print('trying ' + token_to_count)
#     model = train(train_filename)
#     auc = estimate_auc(model)
#     print('AUC ' + str(round(auc, 4)))

### Punctuation

In [66]:
def write_punct(frame):
    responses_frame = frame['response']
    
    token_lists = responses_frame.str.split(' ').values
    labels = frame['human-generated'].values
    
    for i, token_list in enumerate(token_lists):
        punct_list = list(filter(lambda token: len(token) == 1 and token in string.punctuation, token_list))
        punct_set = set(punct_list)
        for punct in punct_set:
            punct_count = punct_list.count(punct)
            if labels[i] == 0:
                punct_dict_generated[punct]['occ'] += 1
                punct_dict_generated[punct][punct_count] += 1
                punct_dict_generated[punct]['sum'] += punct_count
            else:
                punct_dict_human[punct]['occ'] += 1
                punct_dict_human[punct][punct_count] += 1
                punct_dict_human[punct]['sum'] += punct_count

In [68]:
punct_dict_human = defaultdict(lambda: defaultdict(int))
punct_dict_generated = defaultdict(lambda: defaultdict(int))
run(write_punct, full_train_filename)

Chunk 0 over: 1s	Chunk 5 over: 5s	Chunk 10 over: 12s	Chunk 15 over: 19s	Chunk 20 over: 25s	Chunk 25 over: 31s	Chunk 30 over: 38s	Chunk 35 over: 45s	Chunk 40 over: 52s	Chunk 45 over: 58s	Chunk 50 over: 65s	Chunk 55 over: 71s	Chunk 60 over: 77s	Chunk 65 over: 82s	Chunk 70 over: 89s	Chunk 75 over: 95s	Chunk 80 over: 101s	Chunk 85 over: 107s	Chunk 90 over: 113s	Chunk 95 over: 119s	Chunk 100 over: 125s	Chunk 105 over: 131s	Chunk 110 over: 140s	Chunk 115 over: 156s	Chunk 120 over: 167s	Chunk 125 over: 173s	Chunk 130 over: 179s	Chunk 135 over: 185s	Chunk 140 over: 191s	
144 chunks overall


Punctuation statistics saved in analysis_results/punc_analysis.txt

# Training

In [233]:
def write_features_labels(frame):
    short_frame = frame[['context', 'response']]
    
    features = extract_features(short_frame)
    labels = frame['human-generated'].values.tolist()
    
    all_features_list.extend(features) # actually more efficient than numpy append
    all_labels_list.extend(labels)

In [234]:
def get_eval():
    frame = first_frame(eval_filename)
    
    short_frame = frame[['context', 'response']]
    features = np.array(extract_features(short_frame))
    
    truth = frame['human-generated'].values
    
    return (features, truth)

In [235]:
def get_model(all_features, all_labels):
    eval_features, eval_labels = get_eval()
    print('Got data for early stopping')
    
    model = XGBClassifier(max_depth = 7, n_estimators = 200)
    model = model.fit(all_features, all_labels, eval_set = [(eval_features, eval_labels)],
                          eval_metric = 'auc', early_stopping_rounds = 5)
    return model

In [None]:
# def auc # custom auc with rounding for correct early stopping

In [236]:
def estimate_auc(model):
    features, truth = get_eval()
    pred = model.predict(features)
    return roc_auc_score(truth, pred)

In [237]:
def train(filename):
    global all_features_list, all_labels_list
    
    all_features_list = []
    all_labels_list = []
    run(write_features_labels, train_filename)
    all_features = np.array(all_features_list)
    all_labels = np.array(all_labels_list)
    
    model = get_model(all_features, all_labels)
    
    return all_features, all_labels, model

In [238]:
model = get_model(all_features, all_labels)

Got data for early stopping
[0]	validation_0-auc:0.707191
Will train until validation_0-auc hasn't improved in 5 rounds.
[1]	validation_0-auc:0.714476
[2]	validation_0-auc:0.715269
[3]	validation_0-auc:0.717607
[4]	validation_0-auc:0.719292
[5]	validation_0-auc:0.720185
[6]	validation_0-auc:0.721829
[7]	validation_0-auc:0.724163
[8]	validation_0-auc:0.725393
[9]	validation_0-auc:0.727011
[10]	validation_0-auc:0.728322
[11]	validation_0-auc:0.728999
[12]	validation_0-auc:0.730659
[13]	validation_0-auc:0.731461
[14]	validation_0-auc:0.73246
[15]	validation_0-auc:0.732988
[16]	validation_0-auc:0.733588
[17]	validation_0-auc:0.734323
[18]	validation_0-auc:0.734759
[19]	validation_0-auc:0.735415
[20]	validation_0-auc:0.735683
[21]	validation_0-auc:0.736361
[22]	validation_0-auc:0.736891
[23]	validation_0-auc:0.737347
[24]	validation_0-auc:0.737745
[25]	validation_0-auc:0.738052
[26]	validation_0-auc:0.738321
[27]	validation_0-auc:0.738598
[28]	validation_0-auc:0.738785
[29]	validation_0-auc

In [219]:
all_features, all_labels, model = train(train_filename)

Chunk 0 over: 26s	Chunk 5 over: 157s	Chunk 10 over: 286s	
14 chunks overall
Got data for early stopping. Now fitting...


TypeError: __init__() got an unexpected keyword argument 'eval_set'

In [240]:
pickle.dump(model, open('6_pruned_early_stopping.pickle.dat', 'wb'))
# model = pickle.load(open('44_features.pickle.dat', 'rb'))

In [247]:
estimate_auc(model)

0.67939750103645391

In [248]:
model.feature_importances_

array([ 0.120897  ,  0.03973741,  0.05414512,  0.04833556,  0.03904026,
        0.03816883,  0.02666589,  0.01748678,  0.01289723,  0.03410213,
        0.02585255,  0.04153837,  0.0411317 ,  0.04984605,  0.05832801,
        0.02614303,  0.06588044,  0.04409458,  0.04990414,  0.06338233,
        0.10242259], dtype=float32)

In [None]:
# min_child_weight, gamma, max_depth => model complexity
# n_estimators

In [None]:
# TODO: initialize data inside for the functions to run and make them return values
# TODO: reduce the number of features and store them in bytes, train on the whole dataset
# TODO: compare distribution with context, implement bag of words and ngrams
# TODO: grid search to tune XGBoost
# TODO: http://www.nltk.org/book/ch05.html syntactical tagging including n-gram

# Inference

In [241]:
def extract_features_ids(frame):
    short_frame = frame[['context', 'response']]
    
    features = extract_features(short_frame)
    ids = frame['id'].values.tolist()
    
    all_features_list.extend(features) # actually more efficient than numpy append()
    all_ids_list.extend(ids)

In [243]:
all_features_list = []
all_ids_list = []
run(extract_features_ids, test_filename)
all_features = np.array(all_features_list)
all_ids = np.array(all_ids_list)

Chunk 0 over: 32s	Chunk 5 over: 165s	Chunk 10 over: 277s	
11 chunks overall


In [244]:
predicted_labels = model.predict(all_features)

In [245]:
output = pd.DataFrame(np.concatenate(([all_ids], [predicted_labels]), axis = 0).T, columns = ['id', 'human-generated'])

In [246]:
output.to_csv('output.csv', index = False)