# Description

These features are based on the differences between the two questions in the number of tokens of certain classes.

# Distance of numbers of tokens

#### functions

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# Tokens are maximal runs of word characters and apostrophes. The pattern is
# a raw string so that `\w` reaches the regex engine unchanged instead of
# relying on Python tolerating an invalid string escape.
tokenizer = RegexpTokenizer(r"[\w']+")
# English stop-word list from the NLTK corpus, used for membership tests.
stops = stopwords.words("english")

def tokenize(row):
    """Tokenize both questions of a row.

    Reads ``row['question1']`` and ``row['question2']`` and runs each through
    the module-level ``tokenizer``.

    Returns a 2-tuple: (tokens of question1, tokens of question2).
    """
    first_question = tokenizer.tokenize(row['question1'])
    second_question = tokenizer.tokenize(row['question2'])
    return (first_question, second_question)

def remove_stops(tokens):
    """Drop stop words from a pair of token lists.

    ``tokens`` is indexed at 0 and 1; each entry is filtered against the
    module-level ``stops`` collection, keeping the original token order.

    Returns a 2-tuple of the filtered lists.
    """
    def keep_non_stops(words):
        # Keep only the words that are absent from the stop-word collection.
        return [word for word in words if word not in stops]

    return (keep_non_stops(tokens[0]), keep_non_stops(tokens[1]))

def n_tokens_non_stop(tokens):
    """Count the non-stop-word tokens of each question.

    ``tokens`` is the pair of token lists (question 1, question 2). The pair
    is passed to ``remove_stops`` as a whole, because that helper expects both
    lists and returns both filtered lists.

    Returns a 2-tuple: (count for question 1, count for question 2).
    """
    non_stop_0, non_stop_1 = remove_stops(tokens)
    return (len(non_stop_0), len(non_stop_1))

def n_common_tokens(tokens):
    """Return how many distinct tokens appear in both ``tokens[0]`` and ``tokens[1]``.

    Duplicates within either side are counted once, since both sides are
    compared as sets.
    """
    first_unique = set(tokens[0])
    second_unique = set(tokens[1])
    shared = first_unique & second_unique
    return len(shared)

def distance(tokens):
    """Return the absolute difference between the lengths of ``tokens[0]`` and ``tokens[1]``."""
    first_len, second_len = len(tokens[0]), len(tokens[1])
    if first_len >= second_len:
        return first_len - second_len
    return second_len - first_len

def analyze_synsets(tokens):
    """Collect the WordNet synsets of every token on each side of a pair.

    For each of ``tokens[0]`` and ``tokens[1]``, every token is looked up with
    ``wordnet.synsets`` and all results are merged into one set.

    Returns a 2-tuple of sets: (synsets for side 0, synsets for side 1).
    An empty token list yields an empty set.
    """
    synsets_first = {
        synset
        for token in tokens[0]
        for synset in wordnet.synsets(token)
    }
    synsets_second = {
        synset
        for token in tokens[1]
        for synset in wordnet.synsets(token)
    }
    return (synsets_first, synsets_second)

def n_common_synsets(synsets):
    """Return how many distinct items are present in both ``synsets[0]`` and ``synsets[1]``."""
    first_unique = set(synsets[0])
    second_unique = set(synsets[1])
    return len(first_unique & second_unique)

In [2]:
def make_features(row):
    """Build the token-count features for one question pair.

    The row is tokenized, stop words are removed, and synsets are collected
    from the non-stop tokens. The returned ``pd.Series`` holds:

    - ``distance_all_tokens``: length difference over all tokens
    - ``distance_non_stops_tokens``: length difference over non-stop tokens
    - ``distance_common_tokens``: number of shared distinct tokens
    - ``distance_common_non_stops_tokens``: number of shared distinct non-stop tokens
    - ``distance_common_synsets``: number of shared synsets
    - ``id``: the row's ``name`` attribute

    Note that the three ``distance_common_*`` entries are counts of shared
    items, not differences, despite their key names.
    """
    tokens = tokenize(row)
    non_stop_tokens = remove_stops(tokens)
    synsets = analyze_synsets(non_stop_tokens)

    return pd.Series({
        'distance_all_tokens': distance(tokens),
        'distance_non_stops_tokens': distance(non_stop_tokens),
        'distance_common_tokens': n_common_tokens(tokens),
        'distance_common_non_stops_tokens': n_common_tokens(non_stop_tokens),
        'distance_common_synsets': n_common_synsets(synsets),
        'id': row.name,
    })

#### dataset

In [3]:
import pandas as pd

# Load the training pairs indexed by 'id', reading is_duplicate as bool, and
# replace missing values with a single space so every question is a string.
train = pd.read_csv(
    "../data/train.csv/train.csv",
    index_col='id',
    dtype={'is_duplicate': 'bool'},
).fillna(' ')

# Displayed as the cell output: (rows, columns) of the loaded table.
train.shape

In [5]:
# Build the training feature table by applying make_features to every row (axis=1).
features = train.apply(make_features, axis=1)

# Training

#### creating datasets

In [6]:
from sklearn.model_selection import train_test_split

# Feature matrix and target; is_duplicate is cast to int for the classifier.
# NOTE(review): make_features stores the row label under 'id', so X carries
# that column as a model input — confirm this is intended.
X, y = features, train['is_duplicate'].astype('int')

# Hold out 30% of the rows for testing; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

#### GridSearchCV

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Single-step pipeline; the step name 'rfc' is the prefix used in param_grid keys.
pipe = Pipeline([('rfc', RandomForestClassifier(random_state=1))])


# Candidate hyper-parameters: 2 depths x 2 forest sizes.
param_grid = [{'rfc__max_depth': [1, 5],
               'rfc__n_estimators': [10, 20]}]

# 5-fold cross-validated search scored by negative log loss.
gs = GridSearchCV(estimator=pipe,
                  param_grid=param_grid,
                  scoring='neg_log_loss',
                  cv=5)

# The search only sees the training split.
gs.fit(X_train, y_train)

# Best cross-validated score found by the search.
print(gs.best_score_)

-0.559718097651


In [8]:
# Show the parameter combination that achieved the best score in the search.
print(gs.best_params_)

{'rfc__n_estimators': 10, 'rfc__max_depth': 5}


#### Training

In [9]:
# Take the best pipeline found by the grid search.
model = gs.best_estimator_

# Refit it on all of X and y, not only on the training split.
model.fit(X, y)

Pipeline(steps=[('rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=1,
            verbose=0, warm_start=False))])

In [10]:
# Count predicted labels on X, the same data the model was just fitted on.
pd.Series(model.predict(X)).value_counts()

0    233210
1    171080
dtype: int64

# Running with test set

#### reading

In [11]:
import pandas as pd

# Load the test pairs indexed by 'test_id' and replace missing values with a
# single space, mirroring how the training data was prepared.
test = pd.read_csv(
    "../data/test.csv/test.csv",
    index_col='test_id',
    dtype={'is_duplicate': 'bool'},
).fillna(' ')

#### features

In [12]:
%%time

# Rebuild `features` for the test set, replacing the training feature table
# that was bound to the same name.
features = test.apply(make_features, axis=1)

Wall time: 29min 35s


#### running the model

In [13]:
# Predict labels for the test features. This rebinds `y`, which previously
# held the training target.
y = model.predict(features)

In [14]:
# Count predicted labels on the test set.
pd.Series(y.ravel()).value_counts()

0    1736094
1     609702
dtype: int64

In [15]:
# Write the submission file: one row per test_id with its predicted label,
# gzip-compressed and without the DataFrame index.
pd.DataFrame({
    'test_id': test.index,
    'is_duplicate': y.ravel(),
}).to_csv(
    "../submission/all.csv.gz",
    index=False,
    compression='gzip',
)