## Imports

In [1]:
import datetime
import os

import numpy as np
import pandas as pd

# NOTE(review): `load` and `load_lines` are called in later cells but are not
# imported anywhere in this notebook as shown — confirm which helper module
# provides them and import it here explicitly.

## Config

In [2]:
# Seed constant; presumably meant for seeding RNGs, but no cell shown in this
# notebook reads it.
RANDOM_SEED = 42

In [3]:
# All folder paths end with a path separator so file names can be appended by
# plain string concatenation; joining with a trailing '' component adds it.

# Absolute path of the sibling "data" directory (one level above the working directory).
data_folder = os.path.join(
    os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')),
    '',
)

# Sub-folders of the data directory.
aux_data_folder = os.path.join(data_folder, 'aux', '')
preproc_data_folder = os.path.join(data_folder, 'preproc', '')
features_data_folder = os.path.join(data_folder, 'features', '')
submissions_data_folder = os.path.join(data_folder, 'submissions', '')

## Read Data

In [4]:
# Identifiers of the feature sets to combine. Each id is interpolated into the
# file names read from features_data_folder: X_train_<id>.pickle,
# X_train_<id>.names and X_test_<id>.pickle.
# The order here determines the column order of the stacked matrices.
feature_lists = [
    'simple_summaries',
    'jaccard_ngrams',
    'fuzzy',
    'tfidf_distances',
    'embedding_mean',
    'embedding_normalized_sum',
    'wmd',
    'wordnet_similarity',
    'dasolmar_whq',
    'magic_jturkewitz',
    'oofp_nn_concat_dense_1',
    'oofp_currie32_cnn',
    'oofp_lystdo_lstm',
]

In [5]:
# Load one training feature matrix per feature set, in feature_lists order.
X_train_separate = list(map(
    load,
    (features_data_folder + f'X_train_{set_id}.pickle' for set_id in feature_lists),
))

In [6]:
# Flatten the per-feature-set name files into a single list, keeping the same
# feature-set order used when loading the matrices.
column_names = [
    name
    for set_id in feature_lists
    for name in load_lines(features_data_folder + f'X_train_{set_id}.names')
]

In [7]:
# Display the combined feature-name list (later used as the DataFrame column names).
column_names

['shorter_char_len_log',
 'longer_char_len_log',
 'shorter_token_len_log',
 'longer_token_len_log',
 'char_len_diff_log',
 'token_len_diff_log',
 'char_len_ratio',
 'token_len_ratio',
 'word_diff_ratio',
 'jaccard_ix_2gram',
 'jaccard_ix_norm_q1_2gram',
 'jaccard_ix_norm_q2_2gram',
 'jaccard_ix_3gram',
 'jaccard_ix_norm_q1_3gram',
 'jaccard_ix_norm_q2_3gram',
 'jaccard_ix_4gram',
 'jaccard_ix_norm_q1_4gram',
 'jaccard_ix_norm_q2_4gram',
 'jaccard_ix_5gram',
 'jaccard_ix_norm_q1_5gram',
 'jaccard_ix_norm_q2_5gram',
 'fuzzy_ratio',
 'fuzzy_partial_ratio',
 'fuzzy_token_sort_ratio',
 'fuzzy_token_set_ratio',
 'fuzzy_partial_token_sort_ratio',
 'tfidf_cosine',
 'tfidf_euclidean',
 'emb_mean_cosine',
 'emb_mean_cityblock_log',
 'emb_mean_euclidean',
 'emb_norm_sum_cosine',
 'emb_norm_sum_cityblock_log',
 'emb_norm_sum_euclidean',
 'wmd',
 'wordnet_similarity_raw',
 'wordnet_similarity_brown',
 'das_word_match',
 'das_word_match_2root',
 'das_tfidf_word_match',
 'das_shared_count',
 'das_sto

In [8]:
# Print the inclusive column-index range that each feature set occupies once
# the per-set matrices are stacked horizontally, so a column position can be
# traced back to the feature set it came from.
running_feature_count = 0

for feature_list_id, features in zip(feature_lists, X_train_separate):
    feature_count = features.shape[-1]  # width (last axis) of this feature set
    start_index = running_feature_count
    end_index = start_index + feature_count - 1
    running_feature_count += feature_count

    print(f'{feature_list_id:30s}: {start_index:3d} - {end_index:3d}')

# The names are read from separate *.names files, so verify they line up with
# the matrices before the (expensive) stacking and DataFrame construction.
assert running_feature_count == len(column_names), (
    f'Feature/name mismatch: {running_feature_count} feature columns '
    f'but {len(column_names)} column names'
)

simple_summaries              :   0 -   8
jaccard_ngrams                :   9 -  20
fuzzy                         :  21 -  25
tfidf_distances               :  26 -  27
embedding_mean                :  28 -  30
embedding_normalized_sum      :  31 -  33
wmd                           :  34 -  34
wordnet_similarity            :  35 -  36
dasolmar_whq                  :  37 -  82
magic_jturkewitz              :  83 -  85
oofp_nn_concat_dense_1        :  86 -  86
oofp_currie32_cnn             :  87 -  87
oofp_lystdo_lstm              :  88 -  88


In [9]:
# Stack the per-feature-set matrices side by side; column order follows feature_lists.
X_train = np.hstack(X_train_separate)

In [10]:
# Load the training target vector (attached below as the `is_duplicate` column).
y_train = load(features_data_folder + 'y_train.pickle')

In [11]:
# Report the dimensions of the stacked feature matrix and of the target vector.
for label, array in (('X train:', X_train), ('y train:', y_train)):
    print(label, array.shape)

X train: (404290, 89)
y train: (404290,)


### Train

In [12]:
# Wrap the stacked training matrix in a DataFrame with one named column per feature.
df_X_train = pd.DataFrame(X_train, columns=column_names)

In [13]:
# Add the target as an extra column (modifies df_X_train in place) so the
# features and the label are exported together.
df_X_train['is_duplicate'] = y_train

In [14]:
# Preview the first rows of the combined training frame.
df_X_train.head()

Unnamed: 0,shorter_char_len_log,longer_char_len_log,shorter_token_len_log,longer_token_len_log,char_len_diff_log,token_len_diff_log,char_len_ratio,token_len_ratio,word_diff_ratio,jaccard_ix_2gram,jaccard_ix_norm_q1_2gram,jaccard_ix_norm_q2_2gram,jaccard_ix_3gram,jaccard_ix_norm_q1_3gram,jaccard_ix_norm_q2_3gram,jaccard_ix_4gram,jaccard_ix_norm_q1_4gram,jaccard_ix_norm_q2_4gram,jaccard_ix_5gram,jaccard_ix_norm_q1_5gram,jaccard_ix_norm_q2_5gram,fuzzy_ratio,fuzzy_partial_ratio,fuzzy_token_sort_ratio,fuzzy_token_set_ratio,fuzzy_partial_token_sort_ratio,tfidf_cosine,tfidf_euclidean,emb_mean_cosine,emb_mean_cityblock_log,emb_mean_euclidean,emb_norm_sum_cosine,emb_norm_sum_cityblock_log,emb_norm_sum_euclidean,wmd,wordnet_similarity_raw,wordnet_similarity_brown,das_word_match,das_word_match_2root,das_tfidf_word_match,das_shared_count,das_stops1_ratio,das_stops2_ratio,das_shared_2gram,das_cosine,das_words_hamming,das_diff_stops_r,das_len_q1,das_len_q2,das_diff_len,das_caps_count_q1,das_caps_count_q2,das_diff_caps,das_len_char_q1,das_len_char_q2,das_diff_len_char,das_len_word_q1,das_len_word_q2,das_diff_len_word,das_avg_word_len1,das_avg_word_len2,das_diff_avg_word,das_q1_how,das_q2_how,das_how_both,das_q1_what,das_q2_what,das_what_both,das_q1_which,das_q2_which,das_which_both,das_q1_who,das_q2_who,das_who_both,das_q1_where,das_q2_where,das_where_both,das_q1_when,das_q2_when,das_when_both,das_q1_why,das_q2_why,das_why_both,magic_jt_q1_freq,magic_jt_q2_freq,magic_jt_freq_ratio,oofp_nn_concat_dense_1,oofp_currie32_cnn,oofp_lystdo_lstm,is_duplicate
0,4.060443,4.204693,2.197225,2.302585,2.302585,0.693147,0.863636,0.888889,0.066667,0.878788,0.878788,1.0,0.829268,0.85,0.971429,0.813953,0.833333,0.972222,0.813953,0.833333,0.972222,0.94,0.96,0.93,1.0,0.92,0.021345,0.206614,0.025058,1.985271,0.459552,0.025058,1.399438,0.223864,0.557206,0.956546,0.929319,0.37256,0.610377,0.571429,4.0,1.0,1.2,0.416667,0.795192,0.785714,0.2,66.0,57.0,9.0,1.0,1.0,0.0,53.0,46.0,7.0,14.0,12.0,2.0,3.785714,3.833333,0.047619,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.5,0.158443,0.096349,0.183166,0
1,3.951244,4.488636,2.079442,2.484907,3.637586,1.609438,0.579545,0.636364,0.333333,0.5,0.931034,0.519231,0.4,0.83871,0.433333,0.362319,0.78125,0.403226,0.333333,0.727273,0.380952,0.72,0.83,0.67,0.91,0.86,0.229795,0.67793,0.07732,2.690113,0.975774,0.07732,1.864741,0.393243,2.110534,0.584377,0.736721,0.256523,0.506481,0.181818,2.0,1.0,0.333333,0.052632,0.410927,0.076923,0.666667,51.0,88.0,37.0,5.0,5.0,0.0,44.0,76.0,32.0,8.0,13.0,5.0,5.5,5.846154,0.346154,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,3.0,2.666667,0.166921,0.015903,0.010785,0
2,4.094345,4.304065,2.197225,2.302585,2.70805,0.693147,0.808219,0.888889,0.529412,0.415385,0.5625,0.613636,0.337838,0.471698,0.543478,0.298701,0.433962,0.489362,0.225,0.346154,0.391304,0.7,0.68,0.71,0.71,0.75,0.742052,1.218238,0.096765,2.657482,0.964824,0.096765,1.955503,0.439921,2.095801,0.443592,0.340505,0.171507,0.414134,0.222222,2.0,1.333333,1.0,0.045455,0.340883,0.142857,0.333333,73.0,59.0,14.0,5.0,5.0,0.0,60.0,50.0,10.0,14.0,10.0,4.0,4.285714,5.0,0.714286,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,0.720308,0.392852,0.1149,0
3,3.931826,4.189655,2.079442,2.772589,2.772589,2.197225,0.769231,0.466667,0.875,0.095238,0.2,0.153846,0.012195,0.030303,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.45,0.6,0.28,0.3,0.38,1.0,1.414214,0.460786,3.375753,2.034251,0.460786,2.661311,0.959985,3.840113,0.244038,0.217445,0.0,0.0,0.0,0.0,1.5,0.8,0.0,0.0,0.0,0.7,50.0,65.0,15.0,4.0,1.0,3.0,40.0,57.0,17.0,11.0,9.0,2.0,3.636364,6.333333,2.69697,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.00664,0.006851,0.000118,0
4,3.688879,4.343805,1.94591,2.564949,3.637586,1.94591,0.513158,0.5,0.555556,0.333333,0.375,0.75,0.192308,0.234375,0.517241,0.120482,0.153846,0.357143,0.083333,0.109375,0.259259,0.54,0.64,0.51,0.71,0.79,0.736555,1.213718,0.223193,3.115394,1.556873,0.223193,2.326733,0.668121,3.242191,0.554741,0.394289,0.0,0.0,0.0,0.0,0.3,0.4,0.0,0.0,0.076923,0.1,76.0,39.0,37.0,1.0,1.0,0.0,64.0,33.0,31.0,13.0,7.0,6.0,4.923077,4.714286,0.208791,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,3.0,0.006794,0.012832,0.012638,0


In [15]:
# Export the combined training frame (features plus `is_duplicate`) to CSV.
# The row index is written as a leading "id" column; floats use 6 decimals.
train_csv_path = features_data_folder + 'X_train_all_features.csv'
df_X_train.to_csv(
    train_csv_path,
    index=True,
    index_label='id',
    header=True,
    float_format='%.6f',
)

### Test

In [16]:
# Load the test matrix of every feature set (same order as for training) and
# stack them side by side in one step.
X_test = np.hstack(list(map(
    load,
    (features_data_folder + f'X_test_{set_id}.pickle' for set_id in feature_lists),
)))

In [17]:
# Wrap the stacked test matrix in a DataFrame using the same column names as
# the training frame; no label column is added for the test set.
df_X_test = pd.DataFrame(X_test, columns=column_names)

In [18]:
# Export the combined test features to CSV.
# The row index is written as a leading "id" column; floats use 6 decimals.
test_csv_path = features_data_folder + 'X_test_all_features.csv'
df_X_test.to_csv(
    test_csv_path,
    index=True,
    index_label='id',
    header=True,
    float_format='%.6f',
)