## Imports

In [1]:
import os

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, euclidean

## Config

In [3]:
data_folder = os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')) + os.path.sep
aux_data_folder = os.path.join(data_folder, 'aux') + os.path.sep
preproc_data_folder = os.path.join(data_folder, 'preproc') + os.path.sep
features_data_folder = os.path.join(data_folder, 'features') + os.path.sep

## Read Data

In [4]:
question_tokens_train = load_json(preproc_data_folder + 'question_tokens_train.json')
question_tokens_test = load_json(preproc_data_folder + 'question_tokens_test.json')

In [5]:
all_unique_questions = set(' '.join(pair['question1']) for pair in question_tokens_train) \
    .union(set(' '.join(pair['question2']) for pair in question_tokens_train)) \
    .union(set(' '.join(pair['question1']) for pair in question_tokens_test)) \
    .union(set(' '.join(pair['question2']) for pair in question_tokens_test))

## Train TF-IDF Vectorizer

In [6]:
documents = list(all_unique_questions)

In [7]:
save_lines(documents, preproc_data_folder + 'unique_documents.txt')

In [8]:
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    analyzer='word',
    strip_accents='unicode',
    ngram_range=(1, 1),
    lowercase=True,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

In [9]:
vectorizer.fit(documents)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [10]:
save(vectorizer, preproc_data_folder + 'unique_questions_all_tfidf_vectorizer.pickle'.format())

## Vectorize Train and Test Sets, Compute Distances

In [11]:
num_pairs_train = len(question_tokens_train)
num_pairs_test = len(question_tokens_test)

In [12]:
X_train = np.zeros((num_pairs_train, 2), dtype=float)
X_test = np.zeros((num_pairs_test, 2), dtype=float)

In [13]:
print('X train:', X_train.shape)
print('X test: ', X_test.shape)

X train: (404290, 2)
X test:  (2345796, 2)


In [14]:
for index, pair in progressbar(enumerate(question_tokens_train), size=num_pairs_train):
    pair_documents = [' '.join(pair['question1']), ' '.join(pair['question2'])]
    pair_dtm = vectorizer.transform(pair_documents)
    q1_doc_vec = pair_dtm[0, :].toarray()
    q2_doc_vec = pair_dtm[1, :].toarray()
    
    X_train[index, 0] = cosine(q1_doc_vec, q2_doc_vec)
    X_train[index, 1] = euclidean(q1_doc_vec, q2_doc_vec)

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


In [15]:
save(X_train, features_data_folder + 'X_train_tfidf_distances.pickle')

In [16]:
for index, pair in progressbar(enumerate(question_tokens_test), size=num_pairs_test):
    pair_documents = [' '.join(pair['question1']), ' '.join(pair['question2'])]
    pair_dtm = vectorizer.transform(pair_documents)
    q1_doc_vec = pair_dtm[0, :].toarray()
    q2_doc_vec = pair_dtm[1, :].toarray()
    
    X_test[index, 0] = cosine(q1_doc_vec, q2_doc_vec)
    X_test[index, 1] = euclidean(q1_doc_vec, q2_doc_vec)

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


In [17]:
save(X_test, features_data_folder + 'X_test_tfidf_distances.pickle')