Data format
===

The data itself is already pre-processed (`<s>, </s>` tags, `<unk>` tag, etc.). The punctuation is tokenized (one symbol = one token, no word-punctuation merged tokens). The is only one space symbol between every two adjacent tokens.

UPD: In the data there are common combination "@@ ". This is because the data has been preprocessed with BPE encoding (see: https://arxiv.org/abs/1508.07909 and https://github.com/rsennrich/subword-nmt) in order to reduce the vocabulary size.

In order to properly print a message you should make sure to do the following in Python:

[your string message here].replace(‘@@ ‘, ‘’)

NOTE: replace `@@space` by nothing. Don’t forget the [space]!

Broadly speaking, BPE encoding will split words into the most common n-gram to reduce the vocabulary size. The ‘@@ ’ you see are tokens to indicate there was a split. Thus to print the actual word you should replace all occurrences of '@@' to nothing.


Data fields are separated by one tab character.

    context - context phrase(s) for response, always human generated
    response - one phrase or a few phrases, may be from different speakers
    human-generated - flag if the response is generated by human

Data header: 'id\tcontext\tresponse\thuman-generated\n'

Submission format
===

ROC AUC score.

The file should contain a header and have the following format:

id,human-generated

1,1

8,0

9,1

10,1

We expect the solution file to have 524,342 predictions.

In [1]:
import numpy as np
import pandas as pd
from time import time
from xgboost import XGBClassifier
from IPython.display import clear_output
import pickle
from sklearn.metrics import roc_auc_score
import string
from collections import defaultdict, Counter
import nltk



In [2]:
# nltk.download() # download corpora->words, stopwords

In [3]:
chunksize = 50000
train_filename = 'data/sampled_train.txt'
full_train_filename = 'data/train.txt'
test_filename = 'data/test.txt'
eval_filename = 'data/eval.txt'

# Iteration and feature extraction

In [4]:
def run(function, filename, print_every = 5, chunks = None):
    start = time()
    for i, frame in enumerate(pd.read_csv(filename, chunksize = chunksize, delimiter = '\t')):
        frame = frame.replace({'@@ ': ''}, regex = True)
        
        function(frame)
        
        if i % print_every == 0:
            print('Chunk ' + str(i) + ' over: ' + str(round(time() - start)) + 's', end = '\t')
        if chunks is not None:
            if i >= chunks - 1:
                break
    print('\n' + str(i + 1) + ' chunks overall')

In [5]:
vocab = set(w.lower() for w in nltk.corpus.words.words())
stopwords = nltk.corpus.stopwords.words('english')

In [6]:
def extract_features(frame):
    frame_features = []
    
#     for index, row in frame.iterrows():
#     for row in frame.itertuples(): # requires 'dot access' to columns
#     zip is faster than itertuples and a whole lot faster than iterrows
    for context, response in zip(frame['context'], frame['response']):
        features = []
        
        # context length and number of tokens in context are usually useless
        
        response_tokens = response.split(' ')
        context_tokens = context.split(' ')
    
        # feature: response length
        features.append(float(len(response)))

        # feature: number of tokens in the response
        features.append(float(len(response_tokens)))
        
        # features: number of words of length 1, 2, ..., 5
        # counters of length 6+ don't seem to have importance
        lens = Counter(map(len, response_tokens))
        for i in range(5):
            features.append(float(lens[i + 1]))
        
        # features: number of specific tokens from the list in the response
        tokens_to_count = ['<at>', '<number>', '!', '.']
        for token_to_count in tokens_to_count:
            features.append(float(response_tokens.count(token_to_count)))
            
        # features: includes apostrophes, is composed of english letters, is in vocabulary, is a stopword
        # for context and response
        apostrophe_counter = 0
        alpha_counter = 0
        vocab_counter = 0
        stop_counter = 0
        counters = [0] * 8
        for token in response_tokens:
            if "'" in token:
                counters[0] += 1
            if token.isalpha():
                counters[1] += 1
            if token in vocab:
                counters[2] += 1
            if token in stopwords:
                counters[3] += 1
        for token in context_tokens:
            if "'" in token:
                counters[4] += 1
            if token.isalpha():
                counters[5] += 1
            if token in vocab:
                counters[6] += 1
            if token in stopwords:
                counters[7] += 1
        for counter in counters:
            features.append(float(counter))
            
            
        all_tokens = set(context_tokens + response_tokens)
        shared_tokens = set(context_tokens).intersection(response_tokens)

        # feature: number of shared tokens between context and response
        features.append(float(len(shared_tokens)))

        # feature: sum of abs diffs in token shares
        diff = 0
        for token in all_tokens:
            token_stat = abs(float(context_tokens.count(token)) / len(context_tokens) -
                         float(response_tokens.count(token)) / len(response_tokens))
            diff += token_stat
        features.append(diff)
        
        frame_features.append(features)

    return frame_features

In [7]:
def first_frame(print_filename):
    for frame in pd.read_csv(print_filename, chunksize = chunksize, delimiter = '\t'):
        frame = frame.replace({'@@ ': ''}, regex = True)
        break
    return frame

# Exploration

### Printing

In [145]:
frame = first_frame(train_filename)

In [146]:
frame.shape

(50000, 4)

In [150]:
for i in range(50):
    print('Human' if frame.iloc[i]['human-generated'] == 1 else 'Generated')
    print(frame.iloc[i].context)
    print('')
    print(frame.iloc[i].response)
    print('=====')

Generated
<first_speaker> sounds like many of my aie peeps had a crappy day ! hugs to all

<second_speaker> <at> i 'll post sex so might fall asleep . have a good nights sleep too i can do it all day * sunbathing on the sofa instead screams *
=====
Generated
<first_speaker> posting a video soon about the new album . excited nervous to see what you kittens think . any guesses to what the album name is ?

<second_speaker> <at> what is it ?
=====
Human
<first_speaker> so there 's a mouse at work .

<second_speaker> <at> wtf it betta b the <number> w <number> legs & a ponytail ! not <number> legs w a long tail ! coughing umm yea can 't make it in <number> day !
=====
Generated
<first_speaker> in new york concrete jungle wet dream tomato . <second_speaker> <at> go to bed la boy .

<first_speaker> <at> shit seems like that many drinks are in the gym ! ! !
=====
Human
<first_speaker> official event hashtag is soslam <at> <at>

<second_speaker> <at> <at> i 'm all over it ! at dulles , making m

### Feature extraction profiling

In [6]:
%load_ext line_profiler

In [7]:
all_features_list = []
all_labels_list = []
short_frame = frame[['context', 'response']]

In [18]:
%lprun -f extract_features extract_features(short_frame)

### Class balance

In [60]:
def write_counters(frame):
    global human_counter, generated_counter
    labels = frame['human-generated'].values
    labels_sum = labels.sum()
    human_counter += labels_sum
    generated_counter += frame.shape[0] - labels_sum

In [61]:
human_counter = 0
generated_counter = 0
run(write_counters, full_train_filename)

Chunk 0 over: 1s	Chunk 5 over: 5s	Chunk 10 over: 9s	Chunk 15 over: 13s	Chunk 20 over: 16s	Chunk 25 over: 21s	Chunk 30 over: 25s	Chunk 35 over: 29s	Chunk 40 over: 33s	Chunk 45 over: 37s	Chunk 50 over: 41s	Chunk 55 over: 45s	Chunk 60 over: 49s	Chunk 65 over: 53s	Chunk 70 over: 57s	Chunk 75 over: 61s	Chunk 80 over: 65s	Chunk 85 over: 68s	Chunk 90 over: 72s	Chunk 95 over: 76s	Chunk 100 over: 81s	Chunk 105 over: 85s	Chunk 110 over: 89s	Chunk 115 over: 92s	Chunk 120 over: 97s	Chunk 125 over: 101s	Chunk 130 over: 111s	Chunk 135 over: 124s	Chunk 140 over: 138s	
144 chunks overall


In [62]:
print(human_counter, generated_counter)

3595488 3595488


### Tags

In [53]:
def write_tags(frame):
    responses_frame = frame['response']
    
    token_lists = responses_frame.str.split(' ').values
    labels = frame['human-generated'].values
    
    for i, token_list in enumerate(token_lists):
        tag_list = list(filter(lambda token: token[0] == '<' and token[-1] == '>', token_list))
        tag_set = set(tag_list)
        for tag in tag_set:
            tag_count = tag_list.count(tag)
            if labels[i] == 0:
                tag_dict_generated[tag]['occ'] += 1
                tag_dict_generated[tag][tag_count] += 1
                tag_dict_generated[tag]['sum'] += tag_count
            else:
                tag_dict_human[tag]['occ'] += 1
                tag_dict_human[tag][tag_count] += 1
                tag_dict_human[tag]['sum'] += tag_count

In [55]:
tag_dict_human = defaultdict(lambda: defaultdict(int))
tag_dict_generated = defaultdict(lambda: defaultdict(int))
run(write_tags, full_train_filename)

Chunk 0 over: 1s	Chunk 5 over: 18s	Chunk 10 over: 28s	Chunk 15 over: 34s	Chunk 20 over: 40s	Chunk 25 over: 46s	Chunk 30 over: 52s	Chunk 35 over: 59s	Chunk 40 over: 65s	Chunk 45 over: 71s	Chunk 50 over: 77s	Chunk 55 over: 84s	Chunk 60 over: 90s	Chunk 65 over: 96s	Chunk 70 over: 102s	Chunk 75 over: 108s	Chunk 80 over: 114s	Chunk 85 over: 120s	Chunk 90 over: 126s	Chunk 95 over: 133s	Chunk 100 over: 148s	Chunk 105 over: 164s	Chunk 110 over: 179s	Chunk 115 over: 192s	Chunk 120 over: 205s	Chunk 125 over: 219s	Chunk 130 over: 233s	Chunk 135 over: 246s	Chunk 140 over: 259s	
144 chunks overall


Tag statistics saved in analysis_results/tag_analysis.txt

In [110]:
# for token_to_count in tokens_to_count:
#     print('trying ' + token_to_count)
#     model = train(train_filename)
#     auc = estimate_auc(model)
#     print('AUC ' + str(round(auc, 4)))

### Punctuation

In [66]:
def write_punct(frame):
    responses_frame = frame['response']
    
    token_lists = responses_frame.str.split(' ').values
    labels = frame['human-generated'].values
    
    for i, token_list in enumerate(token_lists):
        punct_list = list(filter(lambda token: len(token) == 1 and token in string.punctuation, token_list))
        punct_set = set(punct_list)
        for punct in punct_set:
            punct_count = punct_list.count(punct)
            if labels[i] == 0:
                punct_dict_generated[punct]['occ'] += 1
                punct_dict_generated[punct][punct_count] += 1
                punct_dict_generated[punct]['sum'] += punct_count
            else:
                punct_dict_human[punct]['occ'] += 1
                punct_dict_human[punct][punct_count] += 1
                punct_dict_human[punct]['sum'] += punct_count

In [68]:
punct_dict_human = defaultdict(lambda: defaultdict(int))
punct_dict_generated = defaultdict(lambda: defaultdict(int))
run(write_punct, full_train_filename)

Chunk 0 over: 1s	Chunk 5 over: 5s	Chunk 10 over: 12s	Chunk 15 over: 19s	Chunk 20 over: 25s	Chunk 25 over: 31s	Chunk 30 over: 38s	Chunk 35 over: 45s	Chunk 40 over: 52s	Chunk 45 over: 58s	Chunk 50 over: 65s	Chunk 55 over: 71s	Chunk 60 over: 77s	Chunk 65 over: 82s	Chunk 70 over: 89s	Chunk 75 over: 95s	Chunk 80 over: 101s	Chunk 85 over: 107s	Chunk 90 over: 113s	Chunk 95 over: 119s	Chunk 100 over: 125s	Chunk 105 over: 131s	Chunk 110 over: 140s	Chunk 115 over: 156s	Chunk 120 over: 167s	Chunk 125 over: 173s	Chunk 130 over: 179s	Chunk 135 over: 185s	Chunk 140 over: 191s	
144 chunks overall


Punctuation statistics saved in analysis_results/punc_analysis.txt

# Training

### Feature extraction for training

In [8]:
def write_features_labels(frame):
    short_frame = frame[['context', 'response']]
    
    features = extract_features(short_frame)
    labels = frame['human-generated'].values.tolist()
    
    all_features_list.extend(features) # actually more efficient than numpy append
    all_labels_list.extend(labels)

In [9]:
def extract_all_features_labels(filename):
    global all_features_list, all_labels_list
    
    all_features_list = []
    all_labels_list = []
    run(write_features_labels, filename)
    all_features = np.array(all_features_list)
    all_labels = np.array(all_labels_list)
    
    return all_features, all_labels

### Training itself

In [10]:
def get_eval_data():
    frame = first_frame(eval_filename)
    
    short_frame = frame[['context', 'response']]
    features = np.array(extract_features(short_frame))
    
    truth = frame['human-generated'].values
    
    return (features, truth)

In [15]:
def rounded_auc(pred_param, labels_param): # custom auc with rounding for correct early stopping
    pred = 1.0 / (1.0 + np.exp(-pred_param))
    labels = labels_param.get_label()
    score = roc_auc_score(labels, pred)
    return ('auc', round(score, 3))

In [24]:
def train(all_features, all_labels, eval_features, eval_labels, early_stopping_rounds = 3):
    model = XGBClassifier(max_depth = 7, n_estimators = 200)
    model = model.fit(all_features, all_labels, eval_set = [(eval_features, eval_labels)],
                      eval_metric = rounded_auc, early_stopping_rounds = early_stopping_rounds)
    return model

### Fast AUC evaluation

In [28]:
def estimate_auc(model, eval_features, eval_labels):
    pred = model.predict(eval_features)
    return roc_auc_score(eval_labels, pred)

In [14]:
all_features, all_labels = extract_all_features_labels(train_filename)

Chunk 0 over: 27s	Chunk 5 over: 165s	Chunk 10 over: 304s	
14 chunks overall


In [17]:
pickle.dump(all_features, open('all_features.pickle.dat', 'wb'))
# model = pickle.load(open('44_features.pickle.dat', 'rb'))

In [18]:
pickle.dump(all_labels, open('all_labels.pickle.dat', 'wb'))
# model = pickle.load(open('44_features.pickle.dat', 'rb'))

In [None]:
eval_features, eval_labels = get_eval_data()

In [25]:
model = train(all_features, all_labels, eval_features, eval_labels)

[0]	validation_0-auc:0.707
Will train until validation_0-auc hasn't improved in 3 rounds.
[1]	validation_0-auc:0.714
[2]	validation_0-auc:0.715
[3]	validation_0-auc:0.718
[4]	validation_0-auc:0.719
[5]	validation_0-auc:0.72
[6]	validation_0-auc:0.722
[7]	validation_0-auc:0.724
[8]	validation_0-auc:0.725
[9]	validation_0-auc:0.727
[10]	validation_0-auc:0.728
[11]	validation_0-auc:0.729
[12]	validation_0-auc:0.731
[13]	validation_0-auc:0.731
[14]	validation_0-auc:0.732
[15]	validation_0-auc:0.733
[16]	validation_0-auc:0.734
[17]	validation_0-auc:0.734
[18]	validation_0-auc:0.735
[19]	validation_0-auc:0.735
[20]	validation_0-auc:0.736
[21]	validation_0-auc:0.736
[22]	validation_0-auc:0.737
[23]	validation_0-auc:0.737
[24]	validation_0-auc:0.738
[25]	validation_0-auc:0.738
[26]	validation_0-auc:0.738
[27]	validation_0-auc:0.739
[28]	validation_0-auc:0.739
[29]	validation_0-auc:0.739
[30]	validation_0-auc:0.739
Stopping. Best iteration:
[27]	validation_0-auc:0.739



In [26]:
pickle.dump(model, open('7_custom_early_stopping.pickle.dat', 'wb'))
# model = pickle.load(open('44_features.pickle.dat', 'rb'))

In [29]:
estimate_auc(model, eval_features, eval_labels)

0.66327135445737684

In [30]:
model.feature_importances_

array([ 0.13741936,  0.03032258,  0.08322581,  0.03774194,  0.0383871 ,
        0.0316129 ,  0.02129032,  0.01709677,  0.00580645,  0.04774193,
        0.03225806,  0.06709678,  0.03967742,  0.04387097,  0.06806452,
        0.00774194,  0.05258064,  0.04193548,  0.05645161,  0.06193548,
        0.07774194], dtype=float32)

# Inference

In [259]:
def extract_features_ids(frame):
    short_frame = frame[['context', 'response']]
    
    features = extract_features(short_frame)
    ids = frame['id'].values.tolist()
    
    test_features_list.extend(features) # actually more efficient than numpy append()
    test_ids_list.extend(ids)

In [243]:
test_features_list = []
test_ids_list = []
run(extract_features_ids, test_filename)
test_features = np.array(test_features_list)
test_ids = np.array(test_ids_list)

Chunk 0 over: 32s	Chunk 5 over: 165s	Chunk 10 over: 277s	
11 chunks overall


In [244]:
predicted_labels = model.predict(test_features)

In [245]:
output = pd.DataFrame(np.concatenate(([test_ids], [predicted_labels]), axis = 0).T, columns = ['id', 'human-generated'])

In [246]:
output.to_csv('output.csv', index = False)