Data format
===

The data itself is already pre-processed (`<s>, </s>` tags, `<unk>` tag, etc.). The punctuation is tokenized (one symbol = one token, no word-punctuation merged tokens). The is only one space symbol between every two adjacent tokens.

UPD: In the data there are common combination "@@ ". This is because the data has been preprocessed with BPE encoding (see: https://arxiv.org/abs/1508.07909 and https://github.com/rsennrich/subword-nmt) in order to reduce the vocabulary size.

In order to properly print a message you should make sure to do the following in Python:

[your string message here].replace(‘@@ ‘, ‘’)

NOTE: replace `@@space` by nothing. Don’t forget the [space]!

Broadly speaking, BPE encoding will split words into the most common n-gram to reduce the vocabulary size. The ‘@@ ’ you see are tokens to indicate there was a split. Thus to print the actual word you should replace all occurrences of '@@' to nothing.


Data fields are separated by one tab character.

    context - context phrase(s) for response, always human generated
    response - one phrase or a few phrases, may be from different speakers
    human-generated - flag if the response is generated by human

Data header: 'id\tcontext\tresponse\thuman-generated\n'

Submission format
===

ROC AUC score.

The file should contain a header and have the following format:

id,human-generated

1,1

8,0

9,1

10,1

We expect the solution file to have 524,342 predictions.

In [2]:
import numpy as np
import pandas as pd
from time import time
from xgboost import XGBClassifier
from IPython.display import clear_output
import pickle
from sklearn.metrics import roc_auc_score
import string
from collections import defaultdict, Counter
import nltk
from tqdm import tqdm



In [66]:
nltk.download() # download all nltk data

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
chunksize = 50000
train_filename = 'data/sampled_train.txt' # shrunk by sample_train.sh to 10% of the size, random rows
full_train_filename = 'data/train.txt'
test_filename = 'data/test.txt'
eval_filename = 'data/eval.txt' # shrunk by sample_eval.sh to 50000 random rows

# Iteration and feature extraction

In [4]:
def run(function, filename, print_every = 5, chunks = None):
    # run function on every chunk of file
    start = time()
    for i, frame in enumerate(pd.read_csv(filename, chunksize = chunksize, delimiter = '\t')):
        frame = frame.replace({'@@ ': ''}, regex = True)
        
        function(frame)
        
        if i % print_every == 0:
            print('Chunk ' + str(i) + ' over: ' + str(round(time() - start)) + 's', end = '\t')
        if chunks is not None:
            if i >= chunks - 1:
                break
    print('\n' + str(i + 1) + ' chunks overall')

In [5]:
vocab = set(w.lower() for w in nltk.corpus.words.words())
stopwords = nltk.corpus.stopwords.words('english')
possible_tags = [',', '$', "''", '(', ')', ',', '--', '.', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR',
                'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS',
                'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
# syntactical tags

In [6]:
def extract_features(frame): # for a single chunk of data
    frame_features = []
    
#     for index, row in frame.iterrows():
#     for row in frame.itertuples(): # requires 'dot access' to columns
#     zip is faster than itertuples and a whole lot faster than iterrows
    for context, response in zip(frame['context'], frame['response']):
        features = []
        
        response_tokens = response.split(' ')
        context_tokens = context.split(' ')
    
        # feature: response length
        features.append(float(len(response)))

        # feature: number of tokens in the response
        features.append(float(len(response_tokens)))
    
        # feature: context length
        features.append(float(len(context)))

        # feature: number of tokens in the context
        features.append(float(len(context_tokens)))
        
        # features: number of words of length 1, 2, ..., 5 in response
        # counters of length 6+ don't seem to have importance
        lens = Counter(map(len, response_tokens))
        for i in range(5):
            features.append(float(lens[i + 1]))
        
        # features: number of words of length 1, 2, ..., 5 in context
        lens = Counter(map(len, context_tokens))
        for i in range(5):
            features.append(float(lens[i + 1]))
        
        # features: number of specific tokens from the list in the response
        tokens_to_count = ['<at>', '<number>', '!', '.']
        for token_to_count in tokens_to_count:
            features.append(float(response_tokens.count(token_to_count)))
            
        # feature: number of tweets in context
#         new_tweet_marks = ['<first_speaker>', '<second_speaker>', '<third_speaker>', '<minor_speaker>']
# first and third are unimportant so:
        new_tweet_marks = ['<second_speaker>', '<minor_speaker>']
        context_tweet_count = 0
        for mark in new_tweet_marks:
            context_tweet_count += context_tokens.count(mark)
        features.append(float(context_tweet_count))
            
        # features: includes apostrophes, is composed of english letters, is in vocabulary, is a stopword
        # for context and response
        counters = [0] * 8
        for token in response_tokens:
            if "'" in token:
                counters[0] += 1
            if token.isalpha():
                counters[1] += 1
            if token in vocab:
                counters[2] += 1
            if token in stopwords:
                counters[3] += 1
        for token in context_tokens:
            if "'" in token:
                counters[4] += 1
            if token.isalpha():
                counters[5] += 1
            if token in vocab:
                counters[6] += 1
            if token in stopwords:
                counters[7] += 1
        for counter in counters:
            features.append(float(counter))
            
        all_tokens = set(context_tokens + response_tokens)
        shared_tokens = set(context_tokens).intersection(response_tokens)

        # feature: number of shared tokens between context and response
        features.append(float(len(shared_tokens)))

        # feature: sum of abs diffs in token shares
        diff = 0
        for token in all_tokens:
            token_stat = abs(float(context_tokens.count(token)) / len(context_tokens) -
                         float(response_tokens.count(token)) / len(response_tokens))
            diff += token_stat
        features.append(diff)
        
#         # features: counts of syntactical tags (INEFFICIENT)
#         word_tokens = list(filter(lambda token: '<' not in token, response_tokens))
# #         print(word_tokens)
#         if len(word_tokens) > 0:
#             tags = list(zip(*nltk.pos_tag(word_tokens)))[1]
#         else:
#             tags = []
#         tag_features = [float(tags.count(tag)) for tag in possible_tags]
#         features.extend(tag_features)
        
        frame_features.append(features)

    return frame_features

In [7]:
def first_frame(print_filename): # get first chunk in a file as a dataframe for exploration
    for frame in pd.read_csv(print_filename, chunksize = chunksize, delimiter = '\t'):
        frame = frame.replace({'@@ ': ''}, regex = True)
        break
    return frame

# Exploration

### Printing

In [131]:
frame = first_frame(train_filename)

In [132]:
frame.shape

(50000, 4)

In [135]:
# short_frame = frame[['context', 'response']]
# f = extract_features(short_frame)

In [35]:
for i in range(500, 600):
    print('Human' if frame.iloc[i]['human-generated'] == 1 else 'Generated')
    print(frame.iloc[i].context)
    print('')
    print(frame.iloc[i].response)
    print('=====')

Human
<first_speaker> just bottled my first beer from a recipe by me . let 's see how it improves during the next weeks . but it tastes f * * ng amazing just now ! ! ! ! ! <second_speaker> <at> congrats ! and cheers ! and where 's the photo ?

<first_speaker> <at> i 'll upload it tomorrow . my iphone is out of battery
=====
Human
<first_speaker> i don 't even understand how people can dislike katie ?

<second_speaker> <at> blame xfactor they edited all videos to make katie look spoilt and favoured . as we all no she aint like that at all
=====
Human
<first_speaker> voice thread users . if i create multiple identities , give people access to my account , they can edit collaborate simultaneously right ? <second_speaker> <at> yes they can edit , simultaneously ? not sure yet . my students have their own free accounts and can edit a thread .

<first_speaker> <at> but they can 't co - create . i 'm doing this with a group of teachers this week and want them to build one together .
=====
Hum

In [91]:
# # treat apostrophes as one word
# apostrophe_tokens = set()
# for context, response in zip(frame['context'], frame['response']):
#     response_tokens = response.split(' ')
#     context_tokens = context.split(' ')
#     for token in response_tokens:
#         if "'" in token:
#             apostrophe_tokens.add(token)
#     for token in context_tokens:
#         if "'" in token:
#             apostrophe_tokens.add(token)

In [90]:
# apostrophe_tokens

In [102]:
# message = '<second_speaker> <at> faxes are about as relevant as mimeographs . with fewer purple stains on your new hypercolor shirt mom will still be pissed .'
# tokens = message.split(' ')

### Feature extraction profiling

In [6]:
%load_ext line_profiler

In [7]:
all_features_list = []
all_labels_list = []
short_frame = frame[['context', 'response']]

In [18]:
%lprun -f extract_features extract_features(short_frame)

### Class balance

In [60]:
def write_counters(frame):
    global human_counter, generated_counter
    labels = frame['human-generated'].values
    labels_sum = labels.sum()
    human_counter += labels_sum
    generated_counter += frame.shape[0] - labels_sum

In [61]:
human_counter = 0
generated_counter = 0
run(write_counters, full_train_filename)

Chunk 0 over: 1s	Chunk 5 over: 5s	Chunk 10 over: 9s	Chunk 15 over: 13s	Chunk 20 over: 16s	Chunk 25 over: 21s	Chunk 30 over: 25s	Chunk 35 over: 29s	Chunk 40 over: 33s	Chunk 45 over: 37s	Chunk 50 over: 41s	Chunk 55 over: 45s	Chunk 60 over: 49s	Chunk 65 over: 53s	Chunk 70 over: 57s	Chunk 75 over: 61s	Chunk 80 over: 65s	Chunk 85 over: 68s	Chunk 90 over: 72s	Chunk 95 over: 76s	Chunk 100 over: 81s	Chunk 105 over: 85s	Chunk 110 over: 89s	Chunk 115 over: 92s	Chunk 120 over: 97s	Chunk 125 over: 101s	Chunk 130 over: 111s	Chunk 135 over: 124s	Chunk 140 over: 138s	
144 chunks overall


In [62]:
print(human_counter, generated_counter)

3595488 3595488


### Tags

In [53]:
def write_tags(frame):
    responses_frame = frame['response']
    
    token_lists = responses_frame.str.split(' ').values
    labels = frame['human-generated'].values
    
    for i, token_list in enumerate(token_lists):
        tag_list = list(filter(lambda token: token[0] == '<' and token[-1] == '>', token_list))
        tag_set = set(tag_list)
        for tag in tag_set:
            tag_count = tag_list.count(tag)
            if labels[i] == 0:
                tag_dict_generated[tag]['occ'] += 1
                tag_dict_generated[tag][tag_count] += 1
                tag_dict_generated[tag]['sum'] += tag_count
            else:
                tag_dict_human[tag]['occ'] += 1
                tag_dict_human[tag][tag_count] += 1
                tag_dict_human[tag]['sum'] += tag_count

In [55]:
tag_dict_human = defaultdict(lambda: defaultdict(int))
tag_dict_generated = defaultdict(lambda: defaultdict(int))
run(write_tags, full_train_filename)

Chunk 0 over: 1s	Chunk 5 over: 18s	Chunk 10 over: 28s	Chunk 15 over: 34s	Chunk 20 over: 40s	Chunk 25 over: 46s	Chunk 30 over: 52s	Chunk 35 over: 59s	Chunk 40 over: 65s	Chunk 45 over: 71s	Chunk 50 over: 77s	Chunk 55 over: 84s	Chunk 60 over: 90s	Chunk 65 over: 96s	Chunk 70 over: 102s	Chunk 75 over: 108s	Chunk 80 over: 114s	Chunk 85 over: 120s	Chunk 90 over: 126s	Chunk 95 over: 133s	Chunk 100 over: 148s	Chunk 105 over: 164s	Chunk 110 over: 179s	Chunk 115 over: 192s	Chunk 120 over: 205s	Chunk 125 over: 219s	Chunk 130 over: 233s	Chunk 135 over: 246s	Chunk 140 over: 259s	
144 chunks overall


Tag statistics saved in analysis_results/tag_analysis.txt

In [110]:
# for token_to_count in tokens_to_count:
#     print('trying ' + token_to_count)
#     model = train(train_filename)
#     auc = estimate_auc(model)
#     print('AUC ' + str(round(auc, 4)))

### Punctuation

In [66]:
def write_punct(frame):
    responses_frame = frame['response']
    
    token_lists = responses_frame.str.split(' ').values
    labels = frame['human-generated'].values
    
    for i, token_list in enumerate(token_lists):
        punct_list = list(filter(lambda token: len(token) == 1 and token in string.punctuation, token_list))
        punct_set = set(punct_list)
        for punct in punct_set:
            punct_count = punct_list.count(punct)
            if labels[i] == 0:
                punct_dict_generated[punct]['occ'] += 1
                punct_dict_generated[punct][punct_count] += 1
                punct_dict_generated[punct]['sum'] += punct_count
            else:
                punct_dict_human[punct]['occ'] += 1
                punct_dict_human[punct][punct_count] += 1
                punct_dict_human[punct]['sum'] += punct_count

In [68]:
punct_dict_human = defaultdict(lambda: defaultdict(int))
punct_dict_generated = defaultdict(lambda: defaultdict(int))
run(write_punct, full_train_filename)

Chunk 0 over: 1s	Chunk 5 over: 5s	Chunk 10 over: 12s	Chunk 15 over: 19s	Chunk 20 over: 25s	Chunk 25 over: 31s	Chunk 30 over: 38s	Chunk 35 over: 45s	Chunk 40 over: 52s	Chunk 45 over: 58s	Chunk 50 over: 65s	Chunk 55 over: 71s	Chunk 60 over: 77s	Chunk 65 over: 82s	Chunk 70 over: 89s	Chunk 75 over: 95s	Chunk 80 over: 101s	Chunk 85 over: 107s	Chunk 90 over: 113s	Chunk 95 over: 119s	Chunk 100 over: 125s	Chunk 105 over: 131s	Chunk 110 over: 140s	Chunk 115 over: 156s	Chunk 120 over: 167s	Chunk 125 over: 173s	Chunk 130 over: 179s	Chunk 135 over: 185s	Chunk 140 over: 191s	
144 chunks overall


Punctuation statistics saved in analysis_results/punc_analysis.txt

# Training

### Feature extraction for training

In [24]:
def write_features_labels(frame): # append features and labels of current frame to a general list
    short_frame = frame[['context', 'response']]
    
    features = extract_features(short_frame)
    labels = frame['human-generated'].values.tolist()
    
    all_features_list.extend(features) # actually more efficient than numpy append
    all_labels_list.extend(labels)
    
#     print('current len(all_labels_list): ' + str(len(all_labels_list)))

In [9]:
def extract_all_features_labels(filename):
    global all_features_list, all_labels_list
    
    all_features_list = []
    all_labels_list = []
    run(write_features_labels, filename)
    all_features = np.array(all_features_list)
    all_labels = np.array(all_labels_list)
    
    return all_features, all_labels

### Training itself

In [10]:
def get_eval_data():
    frame = first_frame(eval_filename)
    
    short_frame = frame[['context', 'response']]
    features = np.array(extract_features(short_frame))
    
    truth = frame['human-generated'].values
    
    return (features, truth)

In [11]:
def rounded_auc(pred_param, labels_param): # custom auc with rounding for correct early stopping in xgboost
    pred = 1.0 / (1.0 + np.exp(-pred_param))
    labels = labels_param.get_label()
    score = roc_auc_score(labels, pred)
    return ('auc', round(score, 3))

In [12]:
def train(all_features, all_labels, eval_features, eval_labels, early_stopping_rounds = 3):
    model = XGBClassifier(max_depth = 7, n_estimators = 200)
    model = model.fit(all_features, all_labels, eval_set = [(eval_features, eval_labels)],
                      eval_metric = rounded_auc, early_stopping_rounds = early_stopping_rounds)
    return model

### Fast AUC evaluation

In [13]:
def estimate_auc(model, eval_features, eval_labels): # AUC on eval formed from train
    pred = model.predict(eval_features)
    return roc_auc_score(eval_labels, pred)

In [149]:
all_features, all_labels = extract_all_features_labels(train_filename)

current len(all_labels_list): 50000
Chunk 0 over: 77s	current len(all_labels_list): 100000
current len(all_labels_list): 150000
current len(all_labels_list): 200000
current len(all_labels_list): 250000
current len(all_labels_list): 300000
Chunk 5 over: 474s	current len(all_labels_list): 350000
current len(all_labels_list): 400000
current len(all_labels_list): 450000
current len(all_labels_list): 500000
current len(all_labels_list): 550000
Chunk 10 over: 885s	current len(all_labels_list): 600000
current len(all_labels_list): 650000
current len(all_labels_list): 700000

14 chunks overall


In [16]:
# pickle.dump(all_features, open('all_features.pickle.dat', 'wb'))
all_features = pickle.load(open('all_features.pickle.dat', 'rb'))

In [17]:
# pickle.dump(all_labels, open('all_labels.pickle.dat', 'wb'))
all_labels = pickle.load(open('all_labels.pickle.dat', 'rb'))

In [18]:
eval_features, eval_labels = get_eval_data()

In [19]:
model = train(all_features, all_labels, eval_features, eval_labels)

[0]	validation_0-auc:0.711
Will train until validation_0-auc hasn't improved in 3 rounds.
[1]	validation_0-auc:0.72
[2]	validation_0-auc:0.721
[3]	validation_0-auc:0.725
[4]	validation_0-auc:0.727
[5]	validation_0-auc:0.728
[6]	validation_0-auc:0.73
[7]	validation_0-auc:0.733
[8]	validation_0-auc:0.734
[9]	validation_0-auc:0.736
[10]	validation_0-auc:0.737
[11]	validation_0-auc:0.739
[12]	validation_0-auc:0.74
[13]	validation_0-auc:0.741
[14]	validation_0-auc:0.742
[15]	validation_0-auc:0.742
[16]	validation_0-auc:0.743
[17]	validation_0-auc:0.744
[18]	validation_0-auc:0.744
[19]	validation_0-auc:0.745
[20]	validation_0-auc:0.746
[21]	validation_0-auc:0.747
[22]	validation_0-auc:0.748
[23]	validation_0-auc:0.748
[24]	validation_0-auc:0.749
[25]	validation_0-auc:0.75
[26]	validation_0-auc:0.75
[27]	validation_0-auc:0.75
[28]	validation_0-auc:0.751
[29]	validation_0-auc:0.751
[30]	validation_0-auc:0.752
[31]	validation_0-auc:0.752
[32]	validation_0-auc:0.753
[33]	validation_0-auc:0.753
[

In [20]:
pickle.dump(model, open('9_syntactical_features.pickle.dat', 'wb'))
# model = pickle.load(open('.pickle.dat', 'rb'))

In [21]:
estimate_auc(model, eval_features, eval_labels)

0.683648918967105

In [22]:
model.feature_importances_

array([ 0.10119048,  0.01785714,  0.02734375,  0.01171875,  0.04464286,
        0.02641369,  0.01971726,  0.01841518,  0.01897321,  0.01134673,
        0.00985863,  0.01153274,  0.01153274,  0.00855655,  0.01785714,
        0.0109747 ,  0.02176339,  0.01655506,  0.01469494,  0.03627232,
        0.02511161,  0.02548363,  0.03869048,  0.00409226,  0.01227679,
        0.01785714,  0.02604167,  0.0593378 ,  0.06194196,  0.00483631,
        0.        ,  0.00874256,  0.        ,  0.        ,  0.        ,
        0.        ,  0.01636905,  0.0046503 ,  0.00427827,  0.01134673,
        0.00018601,  0.00074405,  0.01488095,  0.01171875,  0.00111607,
        0.0014881 ,  0.        ,  0.00167411,  0.0249256 ,  0.00111607,
        0.        ,  0.00818452,  0.00037202,  0.00427827,  0.01915923,
        0.0031622 ,  0.01357887,  0.        ,  0.00055804,  0.00093006,
        0.00018601,  0.01748512,  0.00037202,  0.01227679,  0.00483631,
        0.0187872 ,  0.00223214,  0.03943452,  0.00446429,  0.00

# Inference

In [25]:
def extract_features_ids(frame):
    short_frame = frame[['context', 'response']]
    
    features = extract_features(short_frame)
    ids = frame['id'].values.tolist()
    
    test_features_list.extend(features) # actually more efficient than numpy append()
    test_ids_list.extend(ids)

In [None]:
test_features_list = []
test_ids_list = []
run(extract_features_ids, test_filename)
test_features = np.array(test_features_list)
test_ids = np.array(test_ids_list)

Chunk 0 over: 80s	Chunk 5 over: 472s	

In [None]:
predicted_labels = model.predict(test_features)

In [None]:
output = pd.DataFrame(np.concatenate(([test_ids], [predicted_labels]), axis = 0).T, columns = ['id', 'human-generated'])

In [None]:
output.to_csv('output.csv', index = False)