In [1]:
from __future__ import division

import nltk
import collections
import numpy as np
import pandas as pd
import xgboost as xgb



In [2]:
# English stop words from NLTK's stopwords corpus, stored in a set so that the
# `word not in stops` checks in the feature functions are constant-time lookups.
stops = set(nltk.corpus.stopwords.words('english'))

####################################################

# compute the share of distinct non-stop words that question1 and question2 have in common,
# using a simple count normalized by the number of distinct non-stop words in both questions

def word_match_simple_count ( row ):
    """Return the normalized count of non-stop words shared by both questions.

    Parameters
    ----------
    row : mapping with 'question1' and 'question2' entries
        Each entry is converted with str(), lower-cased and split on whitespace.

    Returns
    -------
    0 when either question has no words left after stop-word removal,
    otherwise (2 * number of shared distinct words) /
    (distinct words in question1 + distinct words in question2).
    """

    def distinct_words ( text ):
        # str() makes non-string values (e.g. NaN) tokenizable instead of raising
        return set( token for token in str( text ).lower().split() if token not in stops )

    first_words  = distinct_words( row['question1'] )
    second_words = distinct_words( row['question2'] )

    if not first_words or not second_words:
        return 0

    # a shared word is counted once for each question, hence the factor of two
    shared_total = 2 * len( first_words & second_words )

    return shared_total / ( len(first_words) + len(second_words) )

####################################################

In [3]:
####################################################

# calculate a weight for each word

# Words whose frequency is below the minimum count get a weight of 0 (they are ignored).
# Smoothing is added to the count in the denominator, which damps the weight of rare words.

def get_word_weight ( count, smoothing, minimum_count ):
    """Return the weight of a word that occurs `count` times.

    Parameters
    ----------
    count         : number of occurrences of the word
    smoothing     : constant added to the count in the denominator
    minimum_count : words with fewer occurrences than this get weight 0

    Returns
    -------
    0 if count < minimum_count, otherwise 1 / (count + smoothing).
    """

    return 0 if count < minimum_count else 1 / (count + smoothing)

####################################################

In [4]:
# Rebuilds the same NLTK English stop-word set that the earlier cell already assigned to
# `stops`; the value is identical, so this only re-binds the name.
stops = set(nltk.corpus.stopwords.words("english"))

####################################################

# compute the weighted share of distinct non-stop words that question1 and question2 have in common,
# using a per-word weight normalized by the total weight of the distinct non-stop words in both questions

def word_match_simple_weight ( row ):
    """Return the weighted share of non-stop words common to both questions.

    Each distinct non-stop word contributes its weight from the module-level
    `word_weights` mapping (0 for words missing from the mapping).

    Parameters
    ----------
    row : mapping with 'question1' and 'question2' entries
        Each entry is converted with str(), lower-cased and split on whitespace.

    Returns
    -------
    0 when either question has no words left after stop-word removal, or when
    the total weight of the remaining words is zero; otherwise
    (weight of shared words, counted once per question) /
    (weight of question1 words + weight of question2 words).
    """

    def distinct_words ( text ):
        # str() makes non-string values (e.g. NaN) tokenizable instead of raising
        return set( word for word in str( text ).lower().split() if word not in stops )

    def weight_of ( words ):
        return np.sum( [ word_weights.get(word, 0) for word in words ] )

    question1_words = distinct_words( row['question1'] )
    question2_words = distinct_words( row['question2'] )

    if not question1_words or not question2_words:
        return 0

    total_weight = weight_of( question1_words ) + weight_of( question2_words )

    # All remaining words may carry zero weight (too rare, or unknown). Dividing would
    # then be 0 / 0, producing NaN and a RuntimeWarning, so report "no match" instead.
    if total_weight == 0:
        return 0

    # a shared word is counted once for each question, hence the factor of two
    shared_weight = 2 * weight_of( question1_words & question2_words )

    return shared_weight / total_weight

####################################################

In [5]:
####################################################

# print('...') with a single argument behaves the same on Python 2 and also parses on Python 3
print('Load training and testing dataset')

# NOTE: absolute, machine-specific paths; adjust them to wherever the CSV files live
training_data      = pd.read_csv( '/home/ubuntu/train.csv' )
testing_data       = pd.read_csv( '/home/ubuntu/test.csv' )

# Stack question1 and question2 into one Series per dataset; astype(str) turns
# non-string entries (such as missing values) into text so they can be tokenized below.
training_questions = pd.Series (
                                training_data['question1'].tolist() +
                                training_data['question2'].tolist()
                               ).astype(str)

testing_questions  = pd.Series (
                                testing_data['question1'].tolist() +
                                testing_data['question2'].tolist()
                               ).astype(str)

####################################################

print('Calculate a weight for each word from the training and testing datasets')

# Occurrences of every lower-cased, whitespace-separated token across both datasets
word_count         = collections.defaultdict(int)

for questions in ( training_questions, testing_questions ):
    for question in questions:
        for word in question.lower().split():
            word_count[word] += 1

# Words seen fewer than minimum_count times get weight 0; the others get
# 1 / (count + smoothing), see get_word_weight.
word_weights       = {word : get_word_weight (
                                              count,
                                              smoothing     = 10000,
                                              minimum_count = 2
                                             ) for word, count in word_count.items()}

####################################################

Load training and testing dataset
Calculate a weight for each word from the training and testing datasets


In [6]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer model_selection and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

####################################################

print('Prepare training and testing data')

x_train                             = pd.DataFrame()
x_test                              = pd.DataFrame()

####################################################

# The feature functions read row['question1'] / row['question2'] by column label, so each
# row must be passed as a Series. raw = True asks pandas to pass plain ndarrays instead,
# which cannot be indexed by label, so it is not used here.
word_match_features                 = [
                                       ( 'word_match_simple_count',  word_match_simple_count  ),
                                       ( 'word_match_simple_weight', word_match_simple_weight )
                                      ]

for feature_name, feature_function in word_match_features:

    x_train[feature_name]           = training_data.apply (
                                                           func = feature_function,
                                                           axis = 1
                                                          )

    x_test[feature_name]            = testing_data.apply (
                                                          func = feature_function,
                                                          axis = 1
                                                         )

####################################################

y_train                             = training_data['is_duplicate'].values

####################################################

print('Split the data for training')

# Hold out 20% of the labelled rows for validation. x_train / y_train are re-bound to the
# remaining 80%; the fixed random_state keeps the split reproducible.
x_train, x_valid, y_train, y_valid  = train_test_split (
                                                        x_train,
                                                        y_train,
                                                        test_size    = 0.2,
                                                        random_state = 4242
                                                       )
####################################################

print('Convert data to XGB format')

data_train                          = xgb.DMatrix (
                                                   data  = x_train,
                                                   label = y_train
                                                  )

data_validate                       = xgb.DMatrix (
                                                   data  = x_valid,
                                                   label = y_valid
                                                  )

####################################################

# The test set has no labels, so only the features are wrapped
data_test                           = xgb.DMatrix (
                                                   data  = x_test
                                                  )

####################################################

Prepare training and testing data




Split the data for training
Convert data to XGB format


In [7]:
####################################################

# print('...') with a single argument behaves the same on Python 2 and also parses on Python 3
print('Execute the XGBoost model')

# Binary classifier evaluated with log loss; small learning rate (eta) and shallow trees
XGB_parameters                                 = {
                                                  'objective'   : 'binary:logistic',
                                                  'eval_metric' : 'logloss',
                                                  'eta'         : 0.02,
                                                  'max_depth'   : 4
                                                 }

# Evaluated in this order every round; the captured log shows the last entry
# ('valid') being the one used for early stopping.
XGB_watchlist                                  = [
                                                  (data_train,      'train'),
                                                  (data_validate,   'valid')
                                                 ]

# At most 1000 boosting rounds, stopping once the validation metric has not improved
# for 50 rounds; progress is printed every 10 rounds.
XGB_booster = xgb.train (
                         params                = XGB_parameters,
                         dtrain                = data_train,
                         num_boost_round       = 1000,
                         evals                 = XGB_watchlist,
                         early_stopping_rounds = 50,
                         verbose_eval          = 10
                        )

####################################################

Execute the XGBoost model
[0]	train-logloss:0.686068	valid-logloss:0.68614
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.628352	valid-logloss:0.62904
[20]	train-logloss:0.587523	valid-logloss:0.588683
[30]	train-logloss:0.557667	valid-logloss:0.55919
[40]	train-logloss:0.535323	valid-logloss:0.537145
[50]	train-logloss:0.518344	valid-logloss:0.520413
[60]	train-logloss:0.505285	valid-logloss:0.507558
[70]	train-logloss:0.495187	valid-logloss:0.497627
[80]	train-logloss:0.487309	valid-logloss:0.489892
[90]	train-logloss:0.481158	valid-logloss:0.483873
[100]	train-logloss:0.476305	valid-logloss:0.479135
[110]	train-logloss:0.472419	valid-logloss:0.475329
[120]	train-logloss:0.469358	valid-logloss:0.472327
[130]	train-logloss:0.466893	valid-logloss:0.469915
[140]	train-logloss:0.464902	valid-logloss:0.467964
[150]	train-logloss:0.463318	valid-logloss:0.466407
[160]

In [8]:
####################################################

# print('...') with a single argument behaves the same on Python 2 and also parses on Python 3
print('Make predictions and create submission file')

# NOTE(review): the booster was trained with early stopping; depending on the xgboost
# version, predict() may use every trained round rather than only the best one - confirm.
predictions                    = XGB_booster.predict( data_test )

# Columns are assigned one by one so the CSV column order is test_id, is_duplicate
submission                     = pd.DataFrame()

submission['test_id']          = testing_data['test_id']
submission['is_duplicate']     = predictions

# index = False keeps the DataFrame's row index out of the file
submission.to_csv (
                   path_or_buf = 'XGBOOST_submission_to_kaggle.csv',
                   index       = False
                  )

####################################################

Make predictions and create submission file
