## Imports

In [None]:
import os

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, euclidean

## Config

In [None]:
# Identifier of this feature list; it is interpolated into the feature-names file name.
feature_list_id = "tfidf_distances"

In [None]:
# Project data directories. Each path keeps a trailing separator (the empty
# last component passed to os.path.join) so that file names can be appended
# with plain string concatenation.
data_folder = os.path.join(
    os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')),
    '',
)
aux_data_folder = os.path.join(data_folder, 'aux', '')
preproc_data_folder = os.path.join(data_folder, 'preproc', '')
features_data_folder = os.path.join(data_folder, 'features', '')

## Read Data

In [None]:
# Load the question-pair token files for the train and test splits, in that order.
question_tokens_train, question_tokens_test = (
    load_json(preproc_data_folder + file_name)
    for file_name in ('question_tokens_train.json', 'question_tokens_test.json')
)

In [None]:
# Collect every distinct question text (tokens re-joined with single spaces)
# seen on either side of any pair, across both the train and the test split.
all_unique_questions = {
    ' '.join(pair[side])
    for pairs in (question_tokens_train, question_tokens_test)
    for pair in pairs
    for side in ('question1', 'question2')
}

## Train TF-IDF Vectorizer

In [None]:
# Materialize the unique questions in sorted order. Iterating a set of strings
# directly yields an arbitrary order that can change between interpreter runs,
# which would make the saved document list non-reproducible.
documents = sorted(all_unique_questions)

In [None]:
# Write the training corpus for the vectorizer to the preprocessed-data folder.
save_lines(
    documents,
    os.path.join(preproc_data_folder, 'unique_documents.txt'),
)

In [None]:
# Word-level unigram TF-IDF model over the question texts.
vectorizer = TfidfVectorizer(
    # Text decoding and normalization.
    encoding='utf-8',
    strip_accents='unicode',
    lowercase=True,
    # Tokenization: single words only.
    analyzer='word',
    ngram_range=(1, 1),
    # Weighting: sublinear term frequency, smoothed IDF, L2-normalized rows.
    sublinear_tf=True,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
)

In [None]:
# Fit the TF-IDF model on every unique question text from both train and test.
vectorizer.fit(documents)

In [None]:
# Persist the fitted vectorizer next to the other preprocessed artifacts.
# (The file name has no placeholders, so no string formatting is needed.)
save(vectorizer, preproc_data_folder + 'unique_questions_all_tfidf_vectorizer.pickle')

## Vectorize Train and Test Sets, Compute Distances

In [None]:
# Number of question pairs in each split.
num_pairs_train, num_pairs_test = map(
    len, (question_tokens_train, question_tokens_test)
)

In [None]:
# Pre-allocate the zero-filled feature matrices: one row per question pair,
# two float columns (filled in later with the two distances).
X_train, X_test = (
    np.zeros((row_count, 2), dtype=float)
    for row_count in (num_pairs_train, num_pairs_test)
)

In [None]:
# Sanity check: report the shape of each feature matrix.
for label, matrix in (('X train:', X_train), ('X test: ', X_test)):
    print(label, matrix.shape)

In [None]:
# Fill X_train row by row: column 0 is the cosine distance and column 1 the
# Euclidean distance between the TF-IDF vectors of the two questions in a pair.
for index, pair in progressbar(enumerate(question_tokens_train), size=num_pairs_train):
    pair_documents = [' '.join(pair['question1']), ' '.join(pair['question2'])]
    pair_dtm = vectorizer.transform(pair_documents)

    # SciPy's distance functions require 1-D vectors (recent releases raise
    # "Input vector should be 1-D" otherwise); a densified matrix row is 2-D
    # with a leading axis of length 1, so flatten it first.
    q1_doc_vec = pair_dtm[0, :].toarray().ravel()
    q2_doc_vec = pair_dtm[1, :].toarray().ravel()

    # NOTE(review): if a question has no in-vocabulary tokens its vector is all
    # zeros and the cosine distance is undefined (presumably NaN) — confirm that
    # downstream consumers handle this.
    X_train[index, 0] = cosine(q1_doc_vec, q2_doc_vec)
    X_train[index, 1] = euclidean(q1_doc_vec, q2_doc_vec)

In [None]:
# Persist the train feature matrix. The file name is derived from
# feature_list_id so it cannot drift from the feature-names file name.
save(X_train, features_data_folder + f'X_train_{feature_list_id}.pickle')

In [None]:
# Fill X_test row by row: column 0 is the cosine distance and column 1 the
# Euclidean distance between the TF-IDF vectors of the two questions in a pair.
for index, pair in progressbar(enumerate(question_tokens_test), size=num_pairs_test):
    pair_documents = [' '.join(pair['question1']), ' '.join(pair['question2'])]
    pair_dtm = vectorizer.transform(pair_documents)

    # SciPy's distance functions require 1-D vectors (recent releases raise
    # "Input vector should be 1-D" otherwise); a densified matrix row is 2-D
    # with a leading axis of length 1, so flatten it first.
    q1_doc_vec = pair_dtm[0, :].toarray().ravel()
    q2_doc_vec = pair_dtm[1, :].toarray().ravel()

    # NOTE(review): if a question has no in-vocabulary tokens its vector is all
    # zeros and the cosine distance is undefined (presumably NaN) — confirm that
    # downstream consumers handle this.
    X_test[index, 0] = cosine(q1_doc_vec, q2_doc_vec)
    X_test[index, 1] = euclidean(q1_doc_vec, q2_doc_vec)

In [None]:
# Persist the test feature matrix. The file name is derived from
# feature_list_id so it cannot drift from the feature-names file name.
save(X_test, features_data_folder + f'X_test_{feature_list_id}.pickle')

## Save Feature Names

In [None]:
# Column names of the feature matrices, in column order (0: cosine, 1: Euclidean).
feature_names = ['tfidf_cosine', 'tfidf_euclidean']

In [None]:
# Write the feature names, one per line, to the names file for this feature list.
save_lines(
    feature_names,
    os.path.join(features_data_folder, f'X_train_{feature_list_id}.names'),
)