## Imports

In [1]:
import os

## Config

In [2]:
# Identifier embedded in the file names of the saved feature names and feature matrices.
feature_list_id = 'jaccard_ngrams'

In [3]:
# n-gram sizes to compute features for: 2, 3, 4 and 5 (the upper bound of range() is exclusive).
NGRAM_RANGE = range(2, 6)

In [4]:
# Everything lives under ../data relative to the current working directory.
# Each folder path ends with a path separator so that a file name can be
# appended to it directly.
data_folder = os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')) + os.path.sep
aux_data_folder, preproc_data_folder, features_data_folder = (
    os.path.join(data_folder, subfolder) + os.path.sep
    for subfolder in ('aux', 'preproc', 'features')
)

## Read Data

In [5]:
# Load the raw question pairs; missing values (NaN) are replaced with empty strings.
df_questions_train = pd.read_csv(os.path.join(data_folder, 'train.csv')).fillna('')
df_questions_test = pd.read_csv(os.path.join(data_folder, 'test.csv')).fillna('')

In [6]:
# Load the preprocessed token data for both splits.
# NOTE(review): load_json is defined outside this view; build_features later
# indexes each loaded item by 'question1' / 'question2'.
question_tokens_train = load_json(os.path.join(preproc_data_folder, 'question_tokens_train.json'))
question_tokens_test = load_json(os.path.join(preproc_data_folder, 'question_tokens_test.json'))

## Build Features

In [7]:
def get_char_ngrams(doc, n):
    """
    Collect every contiguous length-`n` slice of `doc`, from left to right.

    Duplicates are kept and the slices are ordered by their start position.
    When `doc` is shorter than `n`, the result is an empty list.
    """
    window_count = len(doc) - n + 1
    ngrams = []
    for start in range(window_count):
        ngrams.append(doc[start:start + n])
    return ngrams

In [8]:
def get_ngram_jaccard_similarities(q1_ngrams, q2_ngrams):
    """
    Compute three overlap scores between two sets of n-grams.

    Args:
        q1_ngrams: Set of n-grams of the first question.
        q2_ngrams: Set of n-grams of the second question.

    Returns:
        A tuple of three floats:
          * size of the intersection divided by the size of the union
            (the Jaccard index),
          * size of the intersection divided by the size of `q1_ngrams`,
          * size of the intersection divided by the size of `q2_ngrams`.

        Empty inputs would otherwise divide by zero, so they are handled
        explicitly: two empty sets yield (1.0, 1.0, 1.0) and exactly one empty
        set yields (0.0, 0.0, 0.0). This is the same convention that
        get_jaccard_similarities applies to questions that are too short.
    """
    q1_is_empty = len(q1_ngrams) == 0
    q2_is_empty = len(q2_ngrams) == 0

    if q1_is_empty and q2_is_empty:
        return 1.0, 1.0, 1.0
    if q1_is_empty or q2_is_empty:
        return 0.0, 0.0, 0.0

    len_intersection = len(q1_ngrams.intersection(q2_ngrams))
    jaccard_index = len_intersection / len(q1_ngrams.union(q2_ngrams))
    jaccard_index_norm_q1 = len_intersection / len(q1_ngrams)
    jaccard_index_norm_q2 = len_intersection / len(q2_ngrams)

    return jaccard_index, jaccard_index_norm_q1, jaccard_index_norm_q2

In [9]:
def get_jaccard_similarities(q1, q2, n):
    """
    Score the character n-gram overlap of two question strings.

    The minimum length is the largest value in NGRAM_RANGE, whatever `n` is:
      * both questions shorter than that  -> (1, 1, 1)
      * exactly one question shorter      -> (0, 0, 0)
      * otherwise the three scores returned by get_ngram_jaccard_similarities
        for the sets of length-`n` character n-grams of `q1` and `q2`.

    NOTE(review): the length guard ignores `n`, so a pair that is long enough
    for small n-grams but shorter than max(NGRAM_RANGE) still gets the fixed
    fallback values. Confirm this is intentional.
    """
    min_length = max(NGRAM_RANGE)
    q1_too_short = len(q1) < min_length
    q2_too_short = len(q2) < min_length

    if q1_too_short and q2_too_short:
        return 1, 1, 1
    if q1_too_short or q2_too_short:
        return 0, 0, 0

    return get_ngram_jaccard_similarities(
        set(get_char_ngrams(q1, n)),
        set(get_char_ngrams(q2, n)),
    )

In [10]:
def build_features(df_questions_original, questions_tokenized):
    """
    Build the matrix of n-gram Jaccard features for a list of question pairs.

    Args:
        df_questions_original: Not used by this function.
        questions_tokenized: Sized iterable of pairs; each pair is indexed by
            'question1' and 'question2' and yields the tokens of that
            question, which are joined with single spaces before scoring.

    Returns:
        A float array with one row per pair and three columns per value in
        NGRAM_RANGE. The columns are grouped by n-gram size, in NGRAM_RANGE
        order, and each group holds the three values returned by
        get_jaccard_similarities.
    """
    features_per_ngram = 3
    total_pairs = len(questions_tokenized)
    total_columns = features_per_ngram * len(NGRAM_RANGE)

    X = np.zeros((total_pairs, total_columns), dtype=float)

    # progressbar is defined outside this view; presumably it yields the
    # wrapped items unchanged while reporting progress.
    for row, pair in progressbar(enumerate(questions_tokenized), size=total_pairs):
        q1_text = ' '.join(pair['question1'])
        q2_text = ' '.join(pair['question2'])

        # Walk the columns left to right, one group of three per n-gram size.
        col = 0
        for n in NGRAM_RANGE:
            X[row, col:col + features_per_ngram] = get_jaccard_similarities(q1_text, q2_text, n)
            col += features_per_ngram

    return X

## Save feature names

In [11]:
# Feature (column) names; starts out empty.
feature_names = []

In [12]:
# Build the list in a single expression so that re-running this cell replaces
# the names instead of appending a second copy of them. The order matches the
# column layout produced by build_features: three scores per n-gram size.
feature_names = [
    name
    for n in NGRAM_RANGE
    for name in (
        f'jaccard_ix_{n}gram',
        f'jaccard_ix_norm_q1_{n}gram',
        f'jaccard_ix_norm_q2_{n}gram',
    )
]

In [13]:
# Write the feature names to the features folder.
save_lines(feature_names, os.path.join(features_data_folder, f'X_train_{feature_list_id}.names'))

## Save features

In [14]:
# Compute the feature matrix for the training pairs.
X_train = build_features(
    df_questions_original=df_questions_train,
    questions_tokenized=question_tokens_train,
)

Widget Javascript not detected.  It may not be installed or enabled properly.


In [15]:
# Store the training feature matrix in the features folder.
save(X_train, os.path.join(features_data_folder, f'X_train_{feature_list_id}.pickle'))

In [16]:
# Compute the feature matrix for the test pairs.
X_test = build_features(
    df_questions_original=df_questions_test,
    questions_tokenized=question_tokens_test,
)

Widget Javascript not detected.  It may not be installed or enabled properly.


In [17]:
# Store the test feature matrix in the features folder.
save(X_test, os.path.join(features_data_folder, f'X_test_{feature_list_id}.pickle'))