In [1]:
####################################################

from __future__ import division

import math
import nltk
import scipy  # https://docs.scipy.org/doc/scipy/reference/spatial.distance.html
              # https://machinelearning1.wordpress.com/2013/04/10/calculating-spatial-distance-metric-in-python/
import gensim
import decimal
import collections
import numpy as np
import pandas as pd
import xgboost as xgb

####################################################

Using TensorFlow backend.


In [2]:
stopwords = set(nltk.corpus.stopwords.words('english'))

####################################################

# convert sentences to vectors

def sent2vec( sentence ):
    """Turn a sentence into a single vector by summing its word embeddings.

    The sentence is converted to `str`, lower-cased, decoded as UTF-8 and
    tokenized with `nltk.word_tokenize`. Tokens that are English stopwords or
    that are not purely alphabetic are dropped. Every remaining token is
    looked up in the notebook-level `word2vec` model; tokens the model does
    not know are skipped.

    Parameters
    ----------
    sentence : object
        The text to embed (anything accepted by `str()`).

    Returns
    -------
    numpy.ndarray
        Element-wise sum of the embeddings that were found, or a zero vector
        of length 300 when no token could be embedded.
    """
    text   = str( sentence ).lower().decode('utf-8')
    tokens = [
        token for token in nltk.word_tokenize( text )
        if token not in stopwords and token.isalpha()
    ]

    word_vector_list = []

    for token in tokens:
        try:
            word_vector_list.append( word2vec[token] )
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is caught so
            # that real failures (and KeyboardInterrupt) are not hidden.
            continue

    if len(word_vector_list) == 0:
        # 300 presumably matches the dimensionality of the embedding model
        # loaded elsewhere in this notebook -- TODO confirm if the model changes.
        return np.zeros(300)

    return np.array( word_vector_list ).sum( axis=0 )

####################################################

In [3]:
####################################################

def braycurtis_distance(row):
    """Bray-Curtis distance between the sentence vectors of a question pair.

    `row` is indexed with the 'question1' and 'question2' labels; the values
    are used as keys into `question1_vector_lookup` and
    `question2_vector_lookup`. Both vectors go through `np.nan_to_num`
    before `scipy.spatial.distance.braycurtis` is applied.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.braycurtis(first_vector, second_vector)

####################################################

In [4]:
####################################################

def canberra_distance(row):
    """Canberra distance between the sentence vectors of a question pair.

    The vectors are fetched from `question1_vector_lookup` /
    `question2_vector_lookup` using `row['question1']` and
    `row['question2']` as keys, cleaned with `np.nan_to_num`, and handed to
    `scipy.spatial.distance.canberra`.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.canberra(first_vector, second_vector)

####################################################

In [5]:
####################################################

def chebyshev_distance(row):
    """Chebyshev distance between the sentence vectors of a question pair.

    Looks up the vectors for `row['question1']` and `row['question2']` in the
    notebook-level lookup dicts, passes each through `np.nan_to_num`, and
    returns the result of `scipy.spatial.distance.chebyshev`.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.chebyshev(first_vector, second_vector)

####################################################

In [6]:
####################################################

def cityblock_distance(row):
    """City-block (Manhattan) distance between the two question vectors.

    The vectors come from `question1_vector_lookup[row['question1']]` and
    `question2_vector_lookup[row['question2']]`; both are run through
    `np.nan_to_num` before `scipy.spatial.distance.cityblock` is called.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.cityblock(first_vector, second_vector)

####################################################

In [7]:
####################################################

def correlation_distance(row):
    """Correlation distance between the two question vectors of `row`.

    Both vectors are read from the notebook-level lookup dicts (keyed by the
    'question1' / 'question2' values of `row`), cleaned with
    `np.nan_to_num`, and compared with `scipy.spatial.distance.correlation`.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.correlation(first_vector, second_vector)

####################################################

In [8]:
####################################################

def cosine_distance(row):
    """Cosine distance between the two question vectors of `row`.

    Fetches the vectors for `row['question1']` and `row['question2']` from
    `question1_vector_lookup` / `question2_vector_lookup`, applies
    `np.nan_to_num` to each, and returns `scipy.spatial.distance.cosine`.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.cosine(first_vector, second_vector)

####################################################

In [9]:
####################################################

def euclidean_distance(row):
    """Euclidean distance between the two question vectors of `row`.

    The vectors are looked up by `row['question1']` and `row['question2']`,
    passed through `np.nan_to_num`, and compared with
    `scipy.spatial.distance.euclidean`.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.euclidean(first_vector, second_vector)

####################################################

In [10]:
####################################################

def hamming_distance(row):
    """Value of `scipy.spatial.distance.hamming` for the two question vectors.

    The vectors are taken from `question1_vector_lookup[row['question1']]`
    and `question2_vector_lookup[row['question2']]` and cleaned with
    `np.nan_to_num` first.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.hamming(first_vector, second_vector)

####################################################

In [11]:
####################################################

def jaccard_distance(row):
    """Value of `scipy.spatial.distance.jaccard` for the two question vectors.

    Each vector is read from its lookup dict using the 'question1' /
    'question2' entry of `row` and passed through `np.nan_to_num` before the
    SciPy call.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.jaccard(first_vector, second_vector)

####################################################

In [12]:
####################################################

def matching_distance(row):
    """Value of `scipy.spatial.distance.matching` for the two question vectors.

    The vectors are fetched from the notebook-level lookup dicts by
    `row['question1']` / `row['question2']` and cleaned with
    `np.nan_to_num` before the SciPy call.

    NOTE(review): `matching` is believed to be deprecated/removed in newer
    SciPy releases -- confirm the installed version still provides it.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.matching(first_vector, second_vector)

####################################################

In [13]:
####################################################

def minkowski_distance(row):
    """Minkowski distance between the two question vectors of `row`.

    The vectors are looked up by `row['question1']` and `row['question2']`
    and cleaned with `np.nan_to_num`. The literal 3 is passed as the third
    positional argument of `scipy.spatial.distance.minkowski`.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.minkowski(first_vector, second_vector, 3)

####################################################

In [14]:
####################################################

def russellrao_distance(row):
    """Value of `scipy.spatial.distance.russellrao` for the two question vectors.

    Both vectors come from the notebook-level lookup dicts (keyed by the
    'question1' / 'question2' entries of `row`) and are passed through
    `np.nan_to_num` first.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.russellrao(first_vector, second_vector)

####################################################

In [15]:
####################################################

def sqeuclidean_distance(row):
    """Squared Euclidean distance between the two question vectors of `row`.

    Looks up the vectors for `row['question1']` and `row['question2']`,
    applies `np.nan_to_num` to each, and returns
    `scipy.spatial.distance.sqeuclidean` of the pair.
    """
    first_vector  = np.nan_to_num(question1_vector_lookup[row['question1']])
    second_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.spatial.distance.sqeuclidean(first_vector, second_vector)

####################################################

In [16]:
stopwords = set(nltk.corpus.stopwords.words('english'))

####################################################

def wordmovers_distance(row):
    """Word Mover's Distance between the two questions of `row`.

    Each question is converted to `str`, lower-cased, split on whitespace and
    stripped of English stopwords; the two token lists are then passed to
    `word2vec.wmdistance`.
    """
    def content_tokens(text):
        # lower-cased whitespace tokens that are not stopwords
        return [token for token in str(text).lower().split() if token not in stopwords]

    first_tokens  = content_tokens(row['question1'])
    second_tokens = content_tokens(row['question2'])

    return word2vec.wmdistance(first_tokens, second_tokens)

####################################################

In [17]:
stopwords = set(nltk.corpus.stopwords.words('english'))

####################################################

def wordmovers_normalized_distance(row):
    """Word Mover's Distance computed with the `word2vec_normalized` model.

    Both questions are converted to `str`, lower-cased, split on whitespace
    and filtered against the English stopword set before being handed to
    `word2vec_normalized.wmdistance`.
    """
    token_lists = []
    for column in ('question1', 'question2'):
        raw_tokens = str(row[column]).lower().split()
        token_lists.append([token for token in raw_tokens if token not in stopwords])

    first_tokens, second_tokens = token_lists

    return word2vec_normalized.wmdistance(first_tokens, second_tokens)

####################################################

In [18]:
####################################################

def question1_skew(row):
    """Skewness (`scipy.stats.skew`) of the question1 sentence vector.

    The vector is read from `question1_vector_lookup[row['question1']]` and
    passed through `np.nan_to_num` before the statistic is computed.
    """
    cleaned_vector = np.nan_to_num(question1_vector_lookup[row['question1']])

    return scipy.stats.skew(cleaned_vector)

####################################################

In [19]:
####################################################

def question2_skew(row):
    """Skewness (`scipy.stats.skew`) of the question2 sentence vector.

    The vector is read from `question2_vector_lookup[row['question2']]` and
    passed through `np.nan_to_num` before the statistic is computed.
    """
    cleaned_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.stats.skew(cleaned_vector)

####################################################

In [20]:
####################################################

def question1_kurtosis(row):
    """Kurtosis (`scipy.stats.kurtosis`) of the question1 sentence vector.

    The vector is read from `question1_vector_lookup[row['question1']]` and
    passed through `np.nan_to_num` before the statistic is computed.
    """
    cleaned_vector = np.nan_to_num(question1_vector_lookup[row['question1']])

    return scipy.stats.kurtosis(cleaned_vector)

####################################################

In [21]:
####################################################

def question2_kurtosis(row):
    """Kurtosis (`scipy.stats.kurtosis`) of the question2 sentence vector.

    The vector is read from `question2_vector_lookup[row['question2']]` and
    passed through `np.nan_to_num` before the statistic is computed.
    """
    cleaned_vector = np.nan_to_num(question2_vector_lookup[row['question2']])

    return scipy.stats.kurtosis(cleaned_vector)

####################################################

In [22]:
from fuzzywuzzy import fuzz

####################################################

def fuzzy_qratio(row):
    """Return `fuzz.QRatio` for the two questions of `row`, each cast to `str`."""
    first_text  = str(row['question1'])
    second_text = str(row['question2'])

    return fuzz.QRatio(first_text, second_text)

####################################################

In [23]:
from fuzzywuzzy import fuzz

####################################################

def fuzzy_WRatio(row):
    """Return `fuzz.WRatio` for the two questions of `row`, each cast to `str`."""
    first_text  = str(row['question1'])
    second_text = str(row['question2'])

    return fuzz.WRatio(first_text, second_text)

####################################################

In [24]:
from fuzzywuzzy import fuzz

####################################################

def fuzzy_partial_ratio(row):
    """Return `fuzz.partial_ratio` for the two questions of `row`, each cast to `str`."""
    first_text  = str(row['question1'])
    second_text = str(row['question2'])

    return fuzz.partial_ratio(first_text, second_text)

####################################################

In [25]:
from fuzzywuzzy import fuzz

####################################################

def fuzzy_partial_token_set_ratio(row):
    """Return `fuzz.partial_token_set_ratio` for the two questions of `row`, each cast to `str`."""
    first_text  = str(row['question1'])
    second_text = str(row['question2'])

    return fuzz.partial_token_set_ratio(first_text, second_text)

####################################################

In [26]:
from fuzzywuzzy import fuzz

####################################################

def fuzzy_partial_token_sort_ratio(row):
    """Return `fuzz.partial_token_sort_ratio` for the two questions of `row`, each cast to `str`."""
    first_text  = str(row['question1'])
    second_text = str(row['question2'])

    return fuzz.partial_token_sort_ratio(first_text, second_text)

####################################################

In [27]:
from fuzzywuzzy import fuzz

####################################################

def fuzzy_token_set_ratio(row):
    """Return `fuzz.token_set_ratio` for the two questions of `row`, each cast to `str`."""
    first_text  = str(row['question1'])
    second_text = str(row['question2'])

    return fuzz.token_set_ratio(first_text, second_text)

####################################################

In [28]:
from fuzzywuzzy import fuzz

####################################################

def fuzzy_token_sort_ratio(row):
    """Return `fuzz.token_sort_ratio` for the two questions of `row`, each cast to `str`."""
    first_text  = str(row['question1'])
    second_text = str(row['question2'])

    return fuzz.token_sort_ratio(first_text, second_text)

####################################################


In [29]:
stops = set(nltk.corpus.stopwords.words('english'))

####################################################

# determine the number of matching words between question1 and question2 using a simple count and normalize

def word_match_simple_count(row):
    """Share of distinct non-stopword tokens that the two questions have in common.

    Each question is cast to `str`, lower-cased and split on whitespace;
    tokens found in `stops` are discarded and duplicates collapse. The
    result is the number of shared tokens, counted once for each question,
    divided by the combined number of distinct tokens of both questions.

    Returns 0 when either question has no token left after stopword removal.
    """
    first_words  = set(token for token in str(row['question1']).lower().split() if token not in stops)
    second_words = set(token for token in str(row['question2']).lower().split() if token not in stops)

    if not (first_words and second_words):
        return 0

    # Every shared token appears in both questions, hence the factor 2.
    overlap = first_words & second_words

    return (2 * len(overlap)) / (len(first_words) + len(second_words))

####################################################

In [30]:
####################################################

# calculate a weight for each word

# If a word frequency is below the minimum count, we ignore the word
# smoothing reduces the impact of rare words

def get_word_weight(count, smoothing, minimum_count):
    """Weight of a word given how often it occurs.

    Parameters
    ----------
    count : number
        Number of occurrences of the word.
    smoothing : number
        Constant added to `count` in the denominator.
    minimum_count : number
        Words with `count` below this threshold get weight 0.

    Returns
    -------
    0 when `count < minimum_count`, otherwise `1 / (count + smoothing)`.
    """
    return 0 if count < minimum_count else 1 / (count + smoothing)

####################################################

In [31]:
stops = set(nltk.corpus.stopwords.words("english"))

####################################################

# determine the number of matching words between question1 and question2 using a per word weight and normalize

def word_match_simple_weight ( row ):
    """Weighted share of non-stopword tokens that the two questions have in common.

    Each question is cast to `str`, lower-cased and split on whitespace;
    tokens found in `stops` are discarded and duplicates collapse. Every
    token is weighted with `word_weights` (weight 0 for unknown tokens). The
    result is the summed weight of the shared tokens, counted once per
    question, divided by the summed weight of all tokens of both questions.

    Returns 0 when either question has no token left after stopword removal,
    and also when the total weight is 0 (every remaining token has weight 0).
    The second case used to evaluate 0/0, producing NaN and a RuntimeWarning.
    """
    question1_words = {}
    question2_words = {}

    for word in str( row['question1'] ).lower().split():
        if word not in stops:
            question1_words[word] = 1

    for word in str( row['question2'] ).lower().split():
        if word not in stops:
            question2_words[word] = 1

    if len(question1_words) == 0 or len(question2_words) == 0:
        return 0

    # Shared tokens are collected from both sides so that numerator and
    # denominator count each question separately.
    shared_weights = [ word_weights.get(word, 0) for word in question1_words if word in question2_words ] + \
                     [ word_weights.get(word, 0) for word in question2_words if word in question1_words ]

    total_weights  = [ word_weights.get(word, 0) for word in question1_words ] + \
                     [ word_weights.get(word, 0) for word in question2_words ]

    total_weight = np.sum( total_weights )

    if total_weight == 0:
        # No token carries any weight: report "no overlap" instead of 0/0.
        return 0

    return np.sum( shared_weights ) / total_weight

####################################################

In [32]:
####################################################

print('Load training and testing dataset')

# Question-pair tables.
training_data = pd.read_csv('/home/ubuntu/mini_train.csv')
testing_data  = pd.read_csv('/home/ubuntu/mini_test.csv')

# Embedding model used as-is.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    '/home/ubuntu/GoogleNews-vectors-negative300.bin',
    binary = True
)

# Second, independently loaded copy of the same file: init_sims(replace=True)
# is applied to this copy only, so `word2vec` above stays untouched.
word2vec_normalized = gensim.models.KeyedVectors.load_word2vec_format(
    '/home/ubuntu/GoogleNews-vectors-negative300.bin',
    binary = True
)
word2vec_normalized.init_sims(replace=True)

# All question texts (question1 column followed by question2 column) as strings.
training_questions = pd.Series(
    training_data['question1'].tolist() + training_data['question2'].tolist()
).astype(str)

testing_questions = pd.Series(
    testing_data['question1'].tolist() + testing_data['question2'].tolist()
).astype(str)

# Sentence vectors keyed by the raw question value, filled from the training
# rows first and the testing rows second.
question1_vector_lookup = {}
question2_vector_lookup = {}

for progress_label, frame in (('training: ', training_data), ('testing: ', testing_data)):
    count = 0
    for index, row in frame.iterrows():
        question1_vector_lookup[row['question1']] = sent2vec(row['question1'])
        question2_vector_lookup[row['question2']] = sent2vec(row['question2'])
        count += 1
        if count % 100000 == 0:
            # progress message every 100000 rows
            print(progress_label + str(count))

####################################################

print('Calculate a weight for each word from the training and testing datasets')

# Occurrences of every lower-cased whitespace token across both datasets.
word_count = collections.defaultdict(int)

for questions in (training_questions, testing_questions):
    for question in questions:
        for word in question.lower().split():
            word_count[word] += 1

word_weights = {
    word: get_word_weight(count, smoothing = 10000, minimum_count = 2)
    for word, count in word_count.items()
}

####################################################

Load training and testing dataset
Calculate a weight for each word from the training and testing datasets


In [33]:
from sklearn.cross_validation import train_test_split

####################################################

print('Prepare training and testing data')

x_train = pd.DataFrame()
x_test  = pd.DataFrame()

####################################################

# Row-wise features computed before the basic length/count features, as
# (column name, feature function) pairs in column order.
#
# The functions index each row by column label (row['question1']), so they
# need a Series per row. `raw=True` is therefore not passed to `apply`: it
# requests bare ndarrays, which cannot be indexed by label.
train_pair_features = [
    ('word_match_simple_count',        word_match_simple_count),
    ('word_match_simple_weight',       word_match_simple_weight),
    ('braycurtis_distance',            braycurtis_distance),
    ('canberra_distance',              canberra_distance),
    ('chebyshev_distance',             chebyshev_distance),
    ('cityblock_distance',             cityblock_distance),
    ('correlation_distance',           correlation_distance),
    ('cosine_distance',                cosine_distance),
    ('euclidean_distance',             euclidean_distance),
    ('hamming_distance',               hamming_distance),
    ('jaccard_distance',               jaccard_distance),
    ('matching_distance',              matching_distance),
    ('minkowski_distance',             minkowski_distance),
    ('russellrao_distance',            russellrao_distance),
    ('sqeuclidean_distance',           sqeuclidean_distance),
    ('wordmovers_distance',            wordmovers_distance),
    ('wordmovers_normalized_distance', wordmovers_normalized_distance),
]

for feature_name, feature_function in train_pair_features:
    x_train[feature_name] = training_data.apply(func = feature_function, axis = 1)
    print('train - ' + feature_name)

# Basic features derived directly from the question strings.
x_train['length_question1']            = training_data['question1'].apply(lambda text: len(str(text)))
x_train['length_question2']            = training_data['question2'].apply(lambda text: len(str(text)))
x_train['length_difference']           = x_train['length_question1'] - x_train['length_question2']
# number of distinct characters, spaces excluded
x_train['number_characters_question1'] = training_data['question1'].apply(lambda text: len(set(str(text).replace(' ', ''))))
x_train['number_characters_question2'] = training_data['question2'].apply(lambda text: len(set(str(text).replace(' ', ''))))
x_train['number_words_question1']      = training_data['question1'].apply(lambda text: len(str(text).split()))
x_train['number_words_question2']      = training_data['question2'].apply(lambda text: len(str(text).split()))
# number of distinct lower-cased whitespace tokens present in both questions
x_train['common_words']                = training_data.apply(
    lambda pair: len(
        set(str(pair['question1']).lower().split()).intersection(
            set(str(pair['question2']).lower().split())
        )
    ),
    axis = 1
)
print('train - basic features')

# Row-wise features computed after the basic ones: vector statistics and
# fuzzy string ratios.
train_vector_and_fuzzy_features = [
    ('skew_question1_vector',          question1_skew),
    ('skew_question2_vector',          question2_skew),
    ('kurtosis_question1_vector',      question1_kurtosis),
    ('kurtosis_question2_vector',      question2_kurtosis),
    ('fuzzy_qratio',                   fuzzy_qratio),
    ('fuzzy_WRatio',                   fuzzy_WRatio),
    ('fuzzy_partial_ratio',            fuzzy_partial_ratio),
    ('fuzzy_partial_token_set_ratio',  fuzzy_partial_token_set_ratio),
    ('fuzzy_partial_token_sort_ratio', fuzzy_partial_token_sort_ratio),
    ('fuzzy_token_set_ratio',          fuzzy_token_set_ratio),
    ('fuzzy_token_sort_ratio',         fuzzy_token_sort_ratio),
]

for feature_name, feature_function in train_vector_and_fuzzy_features:
    x_train[feature_name] = training_data.apply(func = feature_function, axis = 1)
    print('train - ' + feature_name)

####################################################

# First group of row-wise test features, as (column name, feature function)
# pairs in column order.
#
# The functions index each row by column label (row['question1']), so they
# need a Series per row. `raw=True` is therefore not passed to `apply`: it
# requests bare ndarrays, which cannot be indexed by label.
test_pair_features = [
    ('word_match_simple_count',  word_match_simple_count),
    ('word_match_simple_weight', word_match_simple_weight),
    ('braycurtis_distance',      braycurtis_distance),
    ('canberra_distance',        canberra_distance),
    ('chebyshev_distance',       chebyshev_distance),
    ('cityblock_distance',       cityblock_distance),
    ('correlation_distance',     correlation_distance),
    ('cosine_distance',          cosine_distance),
]

for test_feature_name, test_feature_function in test_pair_features:
    x_test[test_feature_name] = testing_data.apply(func = test_feature_function, axis = 1)
    print('test - ' + test_feature_name)

x_test['euclidean_distance']       = testing_data.apply (
                                                         func = euclidean_distance, 
                                                         axis = 1, 
                                                         raw  = True
                                                        )
print 'test - euclidean_distance'

x_test['hamming_distance']         = testing_data.apply (
                                                         func = hamming_distance, 
                                                         axis = 1, 
                                                         raw  = True
                                                        )
print 'test - hamming_distance'

x_test['jaccard_distance']         = testing_data.apply (
                                                         func = jaccard_distance, 
                                                         axis = 1, 
                                                         raw  = True
                                                        )
print 'test - jaccard_distance'

x_test['matching_distance']        = testing_data.apply (
                                                         func = matching_distance, 
                                                         axis = 1, 
                                                         raw  = True
                                                        )
print 'test - matching_distance'

x_test['minkowski_distance']       = testing_data.apply (
                                                         func = minkowski_distance, 
                                                         axis = 1, 
                                                         raw  = True
                                                        )
print 'test - minkowski_distance'

x_test['russellrao_distance']      = testing_data.apply (
                                                         func = russellrao_distance, 
                                                         axis = 1, 
                                                         raw  = True
                                                        )
print 'test - russellrao_distance'

x_test['sqeuclidean_distance']     = testing_data.apply (
                                                         func = sqeuclidean_distance, 
                                                         axis = 1, 
                                                         raw  = True
                                                        )
print 'test - sqeuclidean_distance'

x_test['wordmovers_distance']      = testing_data.apply (
                                                         func = wordmovers_distance, 
                                                         axis = 1, 
                                                         raw  = True
                                                        )
print 'test - wordmovers_distance'

x_test['wordmovers_normalized_distance'] = testing_data.apply (
                                                         func = wordmovers_normalized_distance, 
                                                         axis = 1, 
                                                         raw  = True
                                                        )
print 'test - wordmovers_normalized_distance'

x_test['length_question1']               = testing_data.question1.apply(lambda x: len(str(x)))
x_test['length_question2']               = testing_data.question2.apply(lambda x: len(str(x)))
x_test['length_difference']              = x_test['length_question1'] - x_train['length_question2']
x_test['number_characters_question1']    = testing_data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
x_test['number_characters_question2']    = testing_data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
x_test['number_words_question1']         = testing_data.question1.apply(lambda x: len(str(x).split()))
x_test['number_words_question2']         = testing_data.question2.apply(lambda x: len(str(x).split()))
x_test['common_words']                   = testing_data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
print 'test - basic features'

x_test['skew_question1_vector']          = testing_data.apply (
                                                               func = question1_skew,
                                                               axis = 1, 
                                                               raw  = True
                                                              )
print 'test - skew_question1_vector'

x_test['skew_question2_vector']          = testing_data.apply (
                                                               func = question2_skew,
                                                               axis = 1, 
                                                               raw  = True
                                                              )
print 'test - skew_question2_vector'

x_test['kurtosis_question1_vector']      = testing_data.apply (
                                                               func = question1_kurtosis,
                                                               axis = 1, 
                                                               raw  = True
                                                              )
print 'test - kurtosis_question1_vector'

x_test['kurtosis_question2_vector']      = testing_data.apply (
                                                               func = question2_kurtosis,
                                                               axis = 1, 
                                                               raw  = True
                                                              )

print 'test - kurtosis_question2_vector'

x_test['fuzzy_qratio']                   = testing_data.apply (
                                                               func = fuzzy_qratio,
                                                               axis = 1, 
                                                               raw  = True
                                                              )
print 'test - fuzzy_qratio'

x_test['fuzzy_WRatio']                   = testing_data.apply (
                                                               func = fuzzy_WRatio,
                                                               axis = 1, 
                                                               raw  = True
                                                              )
print 'test - fuzzy_WRatio'

x_test['fuzzy_partial_ratio']            = testing_data.apply (
                                                               func = fuzzy_partial_ratio,
                                                               axis = 1, 
                                                               raw  = True
                                                              )
print 'test - fuzzy_partial_ratio'

x_test['fuzzy_partial_token_set_ratio']  = testing_data.apply (
                                                               func = fuzzy_partial_token_set_ratio,
                                                               axis = 1, 
                                                               raw  = True
                                                              )
print 'test - fuzzy_partial_token_set_ratio'

x_test['fuzzy_partial_token_sort_ratio'] = testing_data.apply (
                                                               func = fuzzy_partial_token_sort_ratio,
                                                               axis = 1, 
                                                               raw  = True
                                                              )
print 'test - fuzzy_partial_token_sort_ratio'

x_test['fuzzy_token_set_ratio']          = testing_data.apply (
                                                               func = fuzzy_token_set_ratio,
                                                               axis = 1, 
                                                               raw  = True
                                                              )
print 'test - fuzzy_token_set_ratio'

x_test['fuzzy_token_sort_ratio']         = testing_data.apply (
                                                               func = fuzzy_token_sort_ratio,
                                                               axis = 1, 
                                                               raw  = True
                                                              )
print 'test - fuzzy_token_sort_ratio'

####################################################

y_train                             = training_data['is_duplicate'].values

####################################################

print 'Split the data for training'

x_train, x_valid, y_train, y_valid  = train_test_split (
                                                        x_train,
                                                        y_train, 
                                                        test_size    = 0.2, 
                                                        random_state = 4242
                                                       )

####################################################

print 'Convert data to XGB format'

data_train                          = xgb.DMatrix (
                                                   data  = x_train,  
                                                   label = y_train
                                                  )

data_validate                       = xgb.DMatrix (
                                                   data  = x_valid, 
                                                   label = y_valid
                                                  )

####################################################

data_test                           = xgb.DMatrix (
                                                   data  = x_test
                                                  )

####################################################

Prepare training and testing data
train - word_match_simple_count
train - word_match_simple_weight
train - braycurtis_distance
train - canberra_distance
train - chebyshev_distance
train - cityblock_distance
train - correlation_distance
train - cosine_distance
train - euclidean_distance
train - hamming_distance
train - jaccard_distance
train - matching_distance
train - minkowski_distance
train - russellrao_distance
train - sqeuclidean_distance
train - wordmovers_distance
train - wordmovers_normalized_distance
train - basic features
train - skew_question1_vector
train - skew_question2_vector
train - kurtosis_question1_vector
train - kurtosis_question2_vector
train - fuzzy_qratio
train - fuzzy_WRatio
train - fuzzy_partial_ratio
train - fuzzy_partial_token_set_ratio
train - fuzzy_partial_token_sort_ratio
train - fuzzy_token_set_ratio
train - fuzzy_token_sort_ratio
test - word_match_simple_count
test - word_match_simple_weight
test - braycurtis_distance
test - canberra_distance
test - cheby

  dist = 1.0 - np.dot(um, vm) / (norm(um) * norm(vm))
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


test - jaccard_distance
test - matching_distance
test - minkowski_distance
test - russellrao_distance
test - sqeuclidean_distance
test - wordmovers_distance
test - wordmovers_normalized_distance
test - basic features
test - skew_question1_vector
test - skew_question2_vector
test - kurtosis_question1_vector
test - kurtosis_question2_vector
test - fuzzy_qratio
test - fuzzy_WRatio
test - fuzzy_partial_ratio
test - fuzzy_partial_token_set_ratio
test - fuzzy_partial_token_sort_ratio
test - fuzzy_token_set_ratio
test - fuzzy_token_sort_ratio
Split the data for training
Convert data to XGB format


In [34]:
print x_test.head(5)

   word_match_simple_count  word_match_simple_weight  braycurtis_distance  \
0                 0.266667                  0.500031             0.490320   
1                 0.500000                  0.600026             0.250493   
2                 0.444444                  0.571713             0.346363   
3                 0.000000                  0.000000             0.503057   
4                 0.800000                  1.000000             0.000000   

   canberra_distance  chebyshev_distance  cityblock_distance  \
0         165.240416            1.488525          114.431046   
1         116.091678            0.649414           53.327644   
2         132.997571            0.693359           62.257957   
3         171.692092            0.619873           54.527912   
4           0.000000            0.000000            0.000000   

   correlation_distance  cosine_distance  euclidean_distance  \
0          3.972893e-01     3.954516e-01            8.468845   
1          1.090954e-01 

In [35]:
####################################################

print 'Execute the XGBoost model'

XGB_parameters                                 = {}
XGB_parameters['objective']                    = 'binary:logistic'
XGB_parameters['eval_metric']                  = 'logloss'
XGB_parameters['eta']                          = 0.02
XGB_parameters['max_depth']                    = 4

XGB_watchlist                                  = [
                                                  (data_train,      'train'), 
                                                  (data_validate,   'valid')
                                                 ]

XGB_booster = xgb.train (
                         params                = XGB_parameters, 
                         dtrain                = data_train, 
                         num_boost_round       = 1000, 
                         evals                 = XGB_watchlist, 
                         early_stopping_rounds = 50, 
                         verbose_eval          = 10
                        )

####################################################

Execute the XGBoost model
[0]	train-logloss:0.68077	valid-logloss:0.687372
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.576782	valid-logloss:0.645439
[20]	train-logloss:0.495736	valid-logloss:0.627704
[30]	train-logloss:0.434344	valid-logloss:0.62306
[40]	train-logloss:0.386656	valid-logloss:0.618616
[50]	train-logloss:0.346757	valid-logloss:0.617644
[60]	train-logloss:0.31217	valid-logloss:0.621845
[70]	train-logloss:0.284605	valid-logloss:0.619451
[80]	train-logloss:0.259496	valid-logloss:0.621326
[90]	train-logloss:0.237657	valid-logloss:0.626243
Stopping. Best iteration:
[48]	train-logloss:0.354595	valid-logloss:0.615391



In [36]:
####################################################

print 'Make predictions and create submission file'

predictions                    = XGB_booster.predict( data_test )

submission                     = pd.DataFrame()

submission['test_id']          = testing_data['test_id']
submission['is_duplicate']     = predictions

submission.to_csv (
                   path_or_buf = 'XGBOOST_submission_to_kaggle.csv', 
                   index       = False
                  )

####################################################

Make predictions and create submission file
