## Imports

In [None]:
import os

## Config

In [None]:
# Identifier of this feature set; it is embedded in the names of the
# output files written at the end of this notebook.
feature_list_id = 'simple_summaries'

In [None]:
# Absolute path of the `data` folder that sits next to the current working
# directory's parent. Joining with '' appends the trailing path separator, so
# file names can be concatenated onto the folder strings directly.
data_folder = os.path.join(
    os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')),
    '',
)

# Sub-folders of the data folder, each also ending with a path separator.
aux_data_folder, preproc_data_folder, features_data_folder = (
    os.path.join(data_folder, subfolder, '')
    for subfolder in ('aux', 'preproc', 'features')
)

## Read Data

In [None]:
# Load the raw train and test CSV files (train first, then test) and replace
# every missing value with an empty string.
df_questions_train, df_questions_test = (
    pd.read_csv(data_folder + csv_name).fillna('')
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Tokenized question pairs stored as JSON in the preproc folder.
# build_features() reads the 'id', 'question1' and 'question2' keys of each entry.
train_tokens_path = preproc_data_folder + 'question_tokens_train.json'
test_tokens_path = preproc_data_folder + 'question_tokens_test.json'

question_tokens_train = load_json(train_tokens_path)
question_tokens_test = load_json(test_tokens_path)

## Build Features

In [None]:
def word_share_ratio(q1_tokens, q2_tokens):
    """Return the share of distinct tokens that two questions have in common.

    Computes ``2 * |A & B| / (|A| + |B|)`` (the Sorensen-Dice coefficient),
    where ``A`` and ``B`` are the sets of distinct tokens of each question.

    Args:
        q1_tokens: Iterable of hashable tokens of the first question.
        q2_tokens: Iterable of hashable tokens of the second question.

    Returns:
        A float in ``[0, 1]``. When both questions have no tokens the ratio
        is undefined (0 / 0); ``0.0`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    q1_words = set(q1_tokens)
    q2_words = set(q2_tokens)

    total_words = len(q1_words) + len(q2_words)
    if total_words == 0:
        # Both questions are empty: nothing can be shared.
        return 0.0

    return 2 * len(q1_words & q2_words) / total_words

In [None]:
def word_difference_ratio(q1_tokens, q2_tokens):
    """Return the share of distinct tokens that appear in only one question.

    Computes ``|A ^ B| / (|A| + |B|)``, where ``A`` and ``B`` are the sets of
    distinct tokens of each question and ``^`` is the symmetric difference.

    Args:
        q1_tokens: Iterable of hashable tokens of the first question.
        q2_tokens: Iterable of hashable tokens of the second question.

    Returns:
        A float in ``[0, 1]``. When both questions have no tokens the ratio
        is undefined (0 / 0); ``0.0`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    q1_words = set(q1_tokens)
    q2_words = set(q2_tokens)

    total_words = len(q1_words) + len(q2_words)
    if total_words == 0:
        # Both questions are empty: there are no differing words.
        return 0.0

    return len(q1_words ^ q2_words) / total_words

In [None]:
def build_features(df_questions_original, questions_tokenized):
    """Build the length and word-overlap summary features for question pairs.

    Args:
        df_questions_original: DataFrame with ``question1`` and ``question2``
            string columns. Rows are looked up by index label using each
            pair's ``id`` (assumes the default integer index produced by
            ``read_csv`` -- confirm if the index is ever changed).
        questions_tokenized: Sized iterable of dicts with the keys ``id``,
            ``question1`` and ``question2``; the two question entries are
            token sequences.

    Returns:
        A float array of shape ``(len(questions_tokenized), 9)`` whose
        columns are, in order:

        0. log(shorter question length in characters + 1)
        1. log(longer question length in characters + 1)
        2. log(shorter question length in tokens + 1)
        3. log(longer question length in tokens + 1)
        4. log(character length difference + 1)
        5. log(token length difference + 1)
        6. shorter / longer length in characters (0.0 if both are empty)
        7. shorter / longer length in tokens (0.0 if both are empty)
        8. word difference ratio of the tokens (0.0 if both are empty)
    """
    num_pairs = len(questions_tokenized)
    num_features = 9

    X = np.zeros((num_pairs, num_features), dtype=float)

    for index, pair in progressbar(enumerate(questions_tokenized), size=num_pairs):
        # `DataFrame.ix` was removed in pandas 1.0; `.loc` is its label-based
        # replacement. The row is fetched once instead of once per question.
        row = df_questions_original.loc[pair['id']]
        q1_char_length = len(row.question1)
        q2_char_length = len(row.question2)

        q1_tokens = pair['question1']
        q2_tokens = pair['question2']

        shorter_char_length = min(q1_char_length, q2_char_length)
        longer_char_length = max(q1_char_length, q2_char_length)

        shorter_token_length = min(len(q1_tokens), len(q2_tokens))
        longer_token_length = max(len(q1_tokens), len(q2_tokens))

        # Length of questions (in characters)
        X[index, 0] = np.log(shorter_char_length + 1)
        X[index, 1] = np.log(longer_char_length + 1)

        # Length of questions (in tokens)
        X[index, 2] = np.log(shorter_token_length + 1)
        X[index, 3] = np.log(longer_token_length + 1)

        # Difference of question length (in characters).
        # longer >= shorter by construction, so no abs() is needed.
        X[index, 4] = np.log(longer_char_length - shorter_char_length + 1)

        # Difference of question length (in tokens)
        X[index, 5] = np.log(longer_token_length - shorter_token_length + 1)

        # Ratio of question lengths (in characters).
        # Both questions empty would be 0 / 0; use 0.0, the same value the
        # ratio takes when only one of the questions is empty.
        if longer_char_length > 0:
            X[index, 6] = shorter_char_length / longer_char_length
        else:
            X[index, 6] = 0.0

        # Ratio of question lengths (in tokens), guarded the same way.
        if longer_token_length > 0:
            X[index, 7] = shorter_token_length / longer_token_length
        else:
            X[index, 7] = 0.0

        # Word difference ratio for question tokens.
        # With no tokens on either side the ratio's denominator is zero,
        # so the helper is only called when at least one token exists.
        if longer_token_length > 0:
            X[index, 8] = word_difference_ratio(q1_tokens, q2_tokens)
        else:
            X[index, 8] = 0.0

    return X

## Save feature names

In [None]:
# Column labels of the feature matrix, listed in the same order in which
# build_features() fills columns 0-8.
feature_names = (
    # Log-scaled question lengths.
    [
        'shorter_char_len_log',
        'longer_char_len_log',
        'shorter_token_len_log',
        'longer_token_len_log',
    ]
    # Log-scaled length differences.
    + ['char_len_diff_log', 'token_len_diff_log']
    # Shorter-to-longer length ratios.
    + ['char_len_ratio', 'token_len_ratio']
    # Token set difference ratio.
    + ['word_diff_ratio']
)

In [None]:
# Store the feature names in the features folder, under the same
# `X_train_<feature_list_id>` prefix used for the training feature matrix.
feature_names_path = features_data_folder + f'X_train_{feature_list_id}.names'
save_lines(feature_names, feature_names_path)

## Save features

In [None]:
# Feature matrix for the training question pairs.
X_train = build_features(
    df_questions_original=df_questions_train,
    questions_tokenized=question_tokens_train,
)

In [None]:
# Persist the training feature matrix to the features folder.
X_train_path = features_data_folder + f'X_train_{feature_list_id}.pickle'
save(X_train, X_train_path)

In [None]:
# Feature matrix for the test question pairs.
X_test = build_features(
    df_questions_original=df_questions_test,
    questions_tokenized=question_tokens_test,
)

In [None]:
# Persist the test feature matrix to the features folder.
X_test_path = features_data_folder + f'X_test_{feature_list_id}.pickle'
save(X_test, X_test_path)