# Description

This model tries to infer whether two questions are duplicates by checking whether the difference between the number of tokens in the two questions is greater than a threshold.

With a threshold of 20, the result for the training set is that 97% of the question pairs whose difference is greater than 20 are not duplicates.

## Links

* Creating your model: http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/
* GridSearchCV logging execution: http://stackoverflow.com/questions/24121018/sklearn-gridsearch-how-to-print-out-progress-during-the-execution
* Python Machine Learning: https://www.amazon.com/Python-Machine-Learning-Sebastian-Raschka/dp/1783555130

# The model

In [1]:
from sklearn.base import BaseEstimator, ClassifierMixin

class ThresholdClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier that labels a value 1 when it exceeds a threshold.

    Nothing is learned from the data: ``threshold`` is a hyper-parameter
    that is supplied by the caller or chosen by a parameter search.

    Parameters
    ----------
    threshold : number, default=0
        Inputs strictly greater than this value are predicted as 1; all
        other inputs are predicted as 0.

    NOTE(review): this notebook uses label 1 for "is duplicate", while its
    description says that large token-count differences mostly indicate
    non-duplicates. Confirm that ``>`` is the intended polarity.
    """

    def __init__(self, threshold=0):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record the possible output labels and return ``self``.

        ``X`` and ``y`` are accepted for API compatibility and are not
        inspected; there are no parameters to estimate.
        """
        import numpy as np

        # scikit-learn recognises a fitted estimator by the presence of
        # attributes ending in an underscore, so expose the label set.
        self.classes_ = np.array([0, 1])
        return self

    def predict(self, X, y=None):
        """Return integer 0/1 labels for ``X``.

        Parameters
        ----------
        X : array-like
            Values to compare against ``threshold``. Lists, NumPy arrays
            and pandas objects are accepted.
        y : ignored
            Present only to keep the original signature.

        Returns
        -------
        numpy.ndarray
            1 where the value is greater than ``threshold``, else 0. A
            single-column 2-D input is flattened so that one label per
            sample is returned; any other shape is compared element-wise
            and keeps its shape.
        """
        import numpy as np

        values = np.asarray(X)
        # A (n_samples, 1) feature matrix should yield n_samples labels.
        if values.ndim == 2 and values.shape[1] == 1:
            values = values.ravel()
        return (values > self.threshold).astype('int')

## Example

In [2]:
import numpy as np

# Demo instance with the decision threshold set to 10.
classifier = ThresholdClassifier(threshold=10)

In [3]:
# None of 1, 2, 3 is greater than the threshold of 10, so every label is 0.
classifier.predict(np.asarray([1,2,3]))

array([0, 0, 0])

# Training

#### reading

In [4]:
import pandas as pd

# Load the training pairs, using the ``id`` column as the index.
# The builtin ``bool`` replaces ``np.bool``: that name was only an alias of
# the builtin and no longer exists in current NumPy releases.
train = pd.read_csv("../data/train.csv/train.csv", index_col='id', dtype={'is_duplicate': bool})

# Replace missing values with a single space (presumably so that every
# question is a string when it reaches the tokenizer).
train = train.fillna(' ')

#### tokenizing

In [5]:
from nltk.tokenize import RegexpTokenizer

# A token is a run of word characters and apostrophes. The pattern is a raw
# string so that ``\w`` reaches the regex engine instead of being parsed as
# an (invalid) string escape.
tokenizer = RegexpTokenizer(r"[\w']+")

def n_tokens(text):
    """Return how many tokens the module-level ``tokenizer`` finds in ``text``."""
    tokens = tokenizer.tokenize(text)
    return len(tokens)

def distance_n_tokens(row):
    """Return the absolute gap between the token counts of a row's two questions.

    ``row`` is looked up with the keys ``'question1'`` and ``'question2'``.
    """
    count_first = n_tokens(row['question1'])
    count_second = n_tokens(row['question2'])
    return abs(count_first - count_second)

# Compute the token-count gap row by row and store it as a new column.
train['dist_n_tokens'] = train.apply(distance_n_tokens, axis='columns')

#### creating datasets

In [6]:
from sklearn.model_selection import train_test_split

# Single feature: the token-count gap. Target: the duplicate flag as 0/1.
X = train[['dist_n_tokens']]
y = train['is_duplicate'].astype('int')

# Hold out 30% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

#### GridSearchCV

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# One-step pipeline; the step name 'tc' is the prefix used to address the
# classifier's parameters in the grid below.
pipe = Pipeline(steps=[('tc', ThresholdClassifier())])

# Candidate thresholds to evaluate.
param_grid = [
    {'tc__threshold': [2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 100, 200]},
]

# Score every candidate by accuracy with cv=10 cross-validation.
gs = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=10)
gs.fit(X_train, y_train)

# Cross-validated score of the best candidate.
print(gs.best_score_)

0.631240658226


In [12]:
# Show the parameter setting that achieved the best cross-validated score.
print(gs.best_params_)

{'tc__threshold': 100}


# Running with test set

#### reading

In [40]:
import pandas as pd

# Load the test pairs, using the ``test_id`` column as the index.
# The builtin ``bool`` replaces ``np.bool``: that name was only an alias of
# the builtin and no longer exists in current NumPy releases.
test = pd.read_csv("../data/test.csv/test.csv", index_col='test_id', dtype={'is_duplicate': bool})

# Replace missing values with a single space (presumably so that every
# question is a string when it reaches the tokenizer).
test = test.fillna(' ')

#### tokenizing

In [41]:
# Compute the token-count gap row by row and store it as a new column.
test['dist_n_tokens'] = test.apply(distance_n_tokens, axis='columns')

#### creating the dataset

In [42]:
# Keep the feature as a one-column DataFrame (all rows, one selected column).
X = test.loc[:, ['dist_n_tokens']]

#### running the model

In [45]:
# NOTE(review): the threshold is hard-coded to 10 here, whereas the grid
# search output earlier in this notebook reports 100 as the best value;
# confirm which one is intended for the submission.
model = ThresholdClassifier(threshold=10)

# ``.values`` yields the underlying NumPy array; ``DataFrame.as_matrix()``
# was deprecated and later removed from pandas.
y = model.predict(X.values)

In [46]:
# Count how many test pairs received each predicted label.
pd.Series(y.ravel()).value_counts()

0    2116423
1     229373
dtype: int64

In [51]:
# Write the submission file. The column order is pinned explicitly because
# the order of columns built from a dict depends on the pandas/Python
# version in use (sorted keys in older versions, insertion order in newer).
pd.DataFrame(
    {'test_id': test.index, 'is_duplicate': y.ravel()},
    columns=['test_id', 'is_duplicate'],
).to_csv("../submission/threshold_10.csv", index=False)