## Imports

In [1]:
import datetime
import os

import numpy as np
import pandas as pd

## Config

In [2]:
# Named as a random seed; none of the visible cells in this notebook read it.
# NOTE(review): confirm whether it is still needed here.
RANDOM_SEED = 42

In [3]:
# Absolute path of the `data` directory that sits next to the current working
# directory's parent, with a trailing separator.
data_folder = os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')) + os.path.sep

# Every sub-folder path also ends with a separator, so file names can be
# appended to it by plain string concatenation.
aux_data_folder, preproc_data_folder, features_data_folder, submissions_data_folder = (
    os.path.join(data_folder, subfolder) + os.path.sep
    for subfolder in ('aux', 'preproc', 'features', 'submissions')
)

## Read Data

In [4]:
# Identifiers of the precomputed feature sets to combine. Each identifier is
# substituted into `X_train_<id>.pickle` / `X_test_<id>.pickle` when loading.
# The order matters: the matrices are stacked in this order and `column_names`
# is matched to the stacked columns purely by position.
feature_lists = [
    'simple_summaries',
    'fuzzy',
    'tfidf_distances',
    'embedding_mean',
    'embedding_normalized_sum',
    'wmd',
    'oofp_nn_concat_dense_1',
]

In [5]:
# One loaded training feature object per entry of `feature_lists`, kept in the
# same order so the blocks can be stacked positionally later.
# NOTE(review): `load` is not defined in any visible cell; the `.pickle`
# suffix suggests pickle deserialisation, so only load trusted files.
X_train_separate = [
    load(f'{features_data_folder}X_train_{list_name}.pickle')
    for list_name in feature_lists
]

In [6]:
# Report the inclusive column range that each feature list occupies once all
# blocks are placed next to each other.
running_feature_count = 0

for feature_list_id, features in zip(feature_lists, X_train_separate):
    # The size of the last axis is the number of columns this block contributes.
    block_width = features.shape[-1]
    start_index, end_index = running_feature_count, running_feature_count + block_width - 1
    print(f'{feature_list_id:30s}: {start_index:3d} - {end_index:3d}')
    running_feature_count += block_width

simple_summaries              :   0 -   8
fuzzy                         :   9 -  13
tfidf_distances               :  14 -  15
embedding_mean                :  16 -  18
embedding_normalized_sum      :  19 -  21
wmd                           :  22 -  22
oofp_nn_concat_dense_1        :  23 -  23


In [7]:
# Stack the per-list training blocks horizontally, in `feature_lists` order,
# into a single feature matrix.
X_train = np.hstack(X_train_separate)

In [8]:
# Training labels; they are attached to the feature frame further down as the
# `is_duplicate` column.
y_train = load(features_data_folder + 'y_train.pickle')

In [9]:
# Sanity check: show the dimensions of the stacked features and of the labels.
print(f'X train: {X_train.shape}')
print(f'y train: {y_train.shape}')

X train: (404290, 24)
y train: (404290,)


## Annotate columns

In [10]:
# Column names grouped by the feature list that produces them. The groups are
# written in the same order as `feature_lists`, because the names are matched
# to the stacked matrix columns purely by position.
_column_name_groups = [
    ('simple_summaries', [
        'shorter_char_len_log', 'longer_char_len_log',
        'shorter_token_len_log', 'longer_token_len_log',
        'char_len_diff_log', 'token_len_diff_log',
        'char_len_ratio', 'token_len_ratio',
        'word_diff_ratio',
    ]),
    ('fuzzy', [
        'fuzzy_ratio', 'fuzzy_partial_ratio',
        'fuzzy_token_sort_ratio', 'fuzzy_token_set_ratio',
        'fuzzy_partial_token_sort_ratio',
    ]),
    ('tfidf_distances', [
        'tfidf_cosine', 'tfidf_euclidean',
    ]),
    ('embedding_mean', [
        'emb_mean_cosine', 'emb_mean_cityblock_log', 'emb_mean_euclidean',
    ]),
    ('embedding_normalized_sum', [
        'emb_norm_sum_cosine', 'emb_norm_sum_cityblock_log', 'emb_norm_sum_euclidean',
    ]),
    ('wmd', [
        'wmd',
    ]),
    ('oofp_nn_concat_dense_1', [
        'oofp_nn_concat_dense_1',
    ]),
]

# Flatten the groups into the single positional list of column labels.
column_names = [
    column_name
    for _group_id, group_columns in _column_name_groups
    for column_name in group_columns
]

### Train

In [11]:
# Wrap the stacked training matrix in a DataFrame, labelling its columns
# positionally with `column_names`.
df_X_train = pd.DataFrame(X_train, columns=column_names)

In [12]:
# Add the labels as an extra column so features and target end up in one file.
df_X_train['is_duplicate'] = y_train

In [13]:
# Write the labelled training frame to CSV with a header row; the row index is
# written as a column named `id`, and floats are formatted with `%.8f`.
df_X_train.to_csv(
    f'{features_data_folder}X_train_all_features.csv',
    index=True,
    index_label='id',
    header=True,
    float_format='%.8f',
)

### Test

In [14]:
# Load the test counterpart of every feature list and stack the blocks
# horizontally in `feature_lists` order, mirroring the training matrix layout.
# NOTE(review): `load` is not defined in any visible cell; the `.pickle`
# suffix suggests pickle deserialisation, so only load trusted files.
X_test = np.hstack([
    load(f'{features_data_folder}X_test_{list_name}.pickle')
    for list_name in feature_lists
])

In [15]:
# Wrap the stacked test matrix in a DataFrame, labelling its columns
# positionally with `column_names`. No label column is added for the test set.
df_X_test = pd.DataFrame(X_test, columns=column_names)

In [16]:
# Write the test frame to CSV with a header row; the row index is written as a
# column named `id`, and floats are formatted with `%.8f`.
df_X_test.to_csv(
    f'{features_data_folder}X_test_all_features.csv',
    index=True,
    index_label='id',
    header=True,
    float_format='%.8f',
)