## Imports

In [None]:
import os

In [None]:
from gensim.models.wrappers.fasttext import FastText

In [None]:
from scipy.spatial.distance import cosine, euclidean, cityblock, jaccard

## Config

In [None]:
# Identifier of this feature list; it is interpolated into the names of the
# output files written at the end of this notebook (`X_train_<id>.names`,
# `X_train_<id>.pickle`, `X_test_<id>.pickle`).
feature_list_id = 'embedding_mean'

In [None]:
# Absolute path of the `data` directory that sits next to the current working
# directory (i.e. under its parent). A trailing separator is kept so that file
# names can be appended with plain string concatenation.
data_folder = (
    os.path.abspath(os.path.join(os.curdir, os.pardir, 'data'))
    + os.path.sep
)

# Sub-folders of the data directory, each with the same trailing separator.
aux_data_folder, preproc_data_folder, features_data_folder = (
    os.path.join(data_folder, subfolder) + os.path.sep
    for subfolder in ('aux', 'preproc', 'features')
)

## Read Data

In [None]:
def _read_questions(file_name):
    """Read a CSV file from the data folder, replacing missing cells with ''."""
    return pd.read_csv(data_folder + file_name).fillna('')


df_questions_train = _read_questions('train.csv')
df_questions_test = _read_questions('test.csv')

In [None]:
def _load_question_tokens(file_name):
    """Load a JSON file from the preprocessed-data folder via `load_json`."""
    return load_json(preproc_data_folder + file_name)


question_tokens_train = _load_question_tokens('question_tokens_train.json')
question_tokens_test = _load_question_tokens('question_tokens_test.json')

In [None]:
# Load the word vectors stored in word2vec format in the aux data folder.
# The resulting object is what `build_features` indexes by token and whose
# `.vocab` it checks for membership.
embedding_model = FastText.load_word2vec_format(aux_data_folder + 'quora_filtered_no_stopwords.vec')

## Build Features

In [None]:
def _mean_embedding(tokens, embedding_model, word_vector_dim):
    """Average the embedding vectors of the tokens found in the vocabulary.

    Returns an all-zero vector of length `word_vector_dim` when none of the
    tokens is present in `embedding_model.vocab`.
    """
    vocab = embedding_model.vocab
    vectors = [embedding_model[token] for token in tokens if token in vocab]
    if not vectors:
        return np.zeros(word_vector_dim)
    return np.mean(vectors, axis=0)


def build_features(questions_tokenized, embedding_model):
    """Compute distances between the mean word vectors of each question pair.

    Args:
        questions_tokenized: Sized iterable of mappings, each with
            'question1' and 'question2' entries holding iterables of tokens.
        embedding_model: Word-vector model supporting `model[token]`,
            membership tests on `model.vocab`, and a `vector_size` attribute.

    Returns:
        A float array of shape (number of pairs, 3) whose columns are:
        0 - cosine distance between the two mean vectors,
        1 - log(1 + Manhattan distance) between the two mean vectors,
        2 - Euclidean distance between the two mean vectors.

    NOTE(review): a question with no in-vocabulary tokens is represented by an
    all-zero vector, for which cosine distance is mathematically undefined;
    column 0 then holds whatever scipy returns (presumably NaN) - confirm that
    downstream consumers handle it.
    """
    num_pairs = len(questions_tokenized)
    # Ask the model for its dimensionality instead of probing a hard-coded
    # word, which raises KeyError when that word is not in the vocabulary.
    word_vector_dim = embedding_model.vector_size
    num_features = 3

    X = np.zeros((num_pairs, num_features), dtype=float)

    for index, pair in progressbar(enumerate(questions_tokenized), size=num_pairs):
        q1_mean = _mean_embedding(pair['question1'], embedding_model, word_vector_dim)
        q2_mean = _mean_embedding(pair['question2'], embedding_model, word_vector_dim)

        # Cosine distance between average word vectors
        X[index, 0] = cosine(q1_mean, q2_mean)

        # Manhattan distance between average word vectors
        X[index, 1] = np.log(cityblock(q1_mean, q2_mean) + 1)

        # Euclidean distance between average word vectors
        X[index, 2] = euclidean(q1_mean, q2_mean)

    return X

## Save feature names

In [None]:
# Names of the feature matrix columns, in the order build_features fills them:
# cosine, log-scaled Manhattan (cityblock), Euclidean.
feature_names = ['emb_mean_cosine', 'emb_mean_cityblock_log', 'emb_mean_euclidean']

In [None]:
# Persist the feature names to `X_train_<feature_list_id>.names` in the
# features folder.
save_lines(feature_names, features_data_folder + f'X_train_{feature_list_id}.names')

## Save features

In [None]:
# Feature matrix for the training question pairs.
X_train = build_features(
    questions_tokenized=question_tokens_train,
    embedding_model=embedding_model,
)

In [None]:
# Persist the training feature matrix to `X_train_<feature_list_id>.pickle`
# in the features folder.
save(X_train, features_data_folder + f'X_train_{feature_list_id}.pickle')

In [None]:
# Feature matrix for the test question pairs.
X_test = build_features(
    questions_tokenized=question_tokens_test,
    embedding_model=embedding_model,
)

In [None]:
# Persist the test feature matrix to `X_test_<feature_list_id>.pickle`
# in the features folder.
save(X_test, features_data_folder + f'X_test_{feature_list_id}.pickle')