## Imports

In [None]:
import datetime
import os

## Config

In [None]:
# Fixed seed constant for reproducibility. NOTE(review): no cell visible in
# this notebook reads it — confirm whether it is still needed.
RANDOM_SEED = 42

In [None]:
# Absolute path of the `data` directory that sits next to the current working
# directory's parent. Every folder string below ends with a path separator so
# that file names can be appended by plain concatenation; the empty final
# component passed to os.path.join is what produces that trailing separator.
data_folder = os.path.join(
    os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')),
    '',
)

# Sub-folders of the data directory, one per kind of artifact.
aux_data_folder = os.path.join(data_folder, 'aux', '')
preproc_data_folder = os.path.join(data_folder, 'preproc', '')
features_data_folder = os.path.join(data_folder, 'features', '')
submissions_data_folder = os.path.join(data_folder, 'submissions', '')

## Read Data

In [None]:
# Identifiers of the feature sets to combine. Each identifier is interpolated
# into the file names `X_train_<id>.pickle`, `X_train_<id>.names` and
# `X_test_<id>.pickle` under `features_data_folder`. The order of this list
# fixes the column order of the combined train and test matrices.
feature_lists = [
    'simple_summaries',
    'fuzzy',
    'tfidf_distances',
    'embedding_mean',
    'embedding_normalized_sum',
    'wmd',
    'wordnet_similarity',
    'oofp_nn_concat_dense_1',
]

In [None]:
# One loaded training feature block per entry of `feature_lists`, kept in the
# same order. `features_data_folder` already ends with a path separator, so
# the file name is appended to it directly.
X_train_separate = [
    load(f'{features_data_folder}X_train_{list_id}.pickle')
    for list_id in feature_lists
]

In [None]:
# Collect the column names of every feature set into one flat list, in the
# same order as `feature_lists`, so that they line up with the horizontally
# stacked feature blocks.
column_names = []
for feature_list_id in feature_lists:
    column_names += load_lines(f'{features_data_folder}X_train_{feature_list_id}.names')

In [None]:
# Show the collected column names as the cell output for a visual check.
column_names

In [None]:
# Print, for every feature set, the inclusive range of column indices it will
# occupy once all blocks are stacked side by side.
running_feature_count = 0

for feature_list_id, features in zip(feature_lists, X_train_separate):
    # A block starts where the previous one stopped; its width is the size of
    # the last axis, and its final column is one before the updated total.
    start_index = running_feature_count
    running_feature_count += features.shape[-1]
    end_index = running_feature_count - 1

    print(f'{feature_list_id:30s}: {start_index:3d} - {end_index:3d}')

In [None]:
# Join the per-feature-set blocks into a single training matrix. For 2-D
# inputs np.hstack concatenates along the second axis (columns).
# NOTE(review): assumes every block is 2-D with the same number of rows — confirm.
X_train = np.hstack(X_train_separate)

In [None]:
# Load the training targets; they are stored as the `is_duplicate` column of
# the exported training table further down.
y_train = load(features_data_folder + 'y_train.pickle')

In [None]:
# Sanity check: print the shapes of the combined features and of the targets.
# NOTE(review): the row counts are expected to match — verify in the output.
print('X train:', X_train.shape)
print('y train:', y_train.shape)

### Train

In [None]:
# Wrap the combined training matrix in a DataFrame labelled with the collected
# column names. pandas requires the number of names to equal the number of
# columns in `X_train`.
df_X_train = pd.DataFrame(X_train, columns=column_names)

In [None]:
# Store the targets alongside the features so both are exported in one file.
# The test table built later has no such column.
df_X_train['is_duplicate'] = y_train

In [None]:
# Preview the first rows of the training table as the cell output.
df_X_train.head()

In [None]:
# Export the training table (features plus `is_duplicate`) to CSV.
df_X_train.to_csv(
    features_data_folder + 'X_train_all_features.csv',
    header=True,  # write the column names as the first row
    index=True,  # write the row index as the first column...
    index_label='id',  # ...under the header `id`
    float_format='%.8f',  # floats are written with 8 decimal places
)

### Test

In [None]:
# Load the test feature block of every feature set, in `feature_lists` order,
# and stack them horizontally into a single test matrix.
# `features_data_folder` already ends with a path separator.
X_test = np.hstack([
    load(f'{features_data_folder}X_test_{list_id}.pickle')
    for list_id in feature_lists
])

In [None]:
# Wrap the combined test matrix in a DataFrame, reusing the column names read
# from the training `.names` files.
# NOTE(review): assumes the test blocks have the same columns in the same order as the training blocks — confirm.
df_X_test = pd.DataFrame(X_test, columns=column_names)

In [None]:
# Export the test table to CSV with the same options as the training export.
df_X_test.to_csv(
    features_data_folder + 'X_test_all_features.csv',
    header=True,  # write the column names as the first row
    index=True,  # write the row index as the first column...
    index_label='id',  # ...under the header `id`
    float_format='%.8f',  # floats are written with 8 decimal places
)