## Imports

In [1]:
from scipy.spatial.distance import cosine, euclidean, mahalanobis

In [2]:
from scipy.sparse import csr_matrix, dok_matrix

## Config

In [29]:
# Identifier of this feature list; it is interpolated into the file names of
# the saved feature names and the train/test feature matrices further down.
feature_list_id = 'magic_stas_avito'

## Read Data

In [4]:
# Load the train and test question pairs; missing values become empty strings
# so that every question cell holds a string.
df_questions_train, df_questions_test = (
    pd.read_csv(data_folder + csv_name).fillna('')
    for csv_name in ('train.csv', 'test.csv')
)

## Build Features

Extract hashes

In [5]:
# Short aliases for the raw frames (same objects, no copy is made).
train_orig, test_orig = df_questions_train, df_questions_test

In [6]:
# Collect every question text (both columns, train and test) into one
# single-column frame so that each distinct text gets one integer id.
df1 = train_orig[['question1']].copy()
df2 = train_orig[['question2']].rename(columns={'question2': 'question1'})
df1_test = test_orig[['question1']].copy()
df2_test = test_orig[['question2']].rename(columns={'question2': 'question1'})

# A single pd.concat replaces the chained DataFrame.append calls: append was
# deprecated in pandas 1.4 and removed in pandas 2.0. The stacking order
# (train q1, train q2, test q1, test q2) is kept, so the ids assigned below
# are the same ones the chained appends produced.
train_questions = (
    pd.concat([df1, df2, df1_test, df2_test])
    .drop_duplicates(subset=['question1'])
    .reset_index(drop=True)
)

# Map question text -> row position in the de-duplicated frame. This position
# is the question "hash" used as a row/column index of the co-occurrence matrix.
questions_dict = pd.Series(
    train_questions.index.values,
    index=train_questions.question1.values,
).to_dict()

In [7]:
# Bring train and test to the same column layout and stack them (train first).
train_cp = train_orig.drop(columns=['qid1', 'qid2'])
test_cp = (
    test_orig
    .rename(columns={'test_id': 'id'})
    .assign(is_duplicate=-1)  # test rows have no label; -1 marks them
)
comb = pd.concat([train_cp, test_cp])

# Replace each question text by its integer id from questions_dict.
comb['q1_hash'] = comb['question1'].map(questions_dict)
comb['q2_hash'] = comb['question2'].map(questions_dict)

# How often each id occurs on the question1 side and on the question2 side.
q1_vc = comb['q1_hash'].value_counts().to_dict()
q2_vc = comb['q2_hash'].value_counts().to_dict()


def _total_occurrences(question_hash):
    """Return how many times the id occurs in either question column."""
    return q1_vc.get(question_hash, 0) + q2_vc.get(question_hash, 0)


# Frequency features: total number of appearances of each question.
comb['q1_freq'] = comb['q1_hash'].map(_total_occurrences)
comb['q2_freq'] = comb['q2_hash'].map(_total_occurrences)

Build cooccurrence matrix

In [8]:
# Path of the cached co-occurrence matrix; the next cell loads it from here
# when the file exists and writes it here after building it otherwise.
saved_matrix_path = aux_data_folder + 'magic_stas_avito_cooccurrence_csr.pickle'

In [9]:
# Build (or reload) a symmetric 0/1 matrix whose entry [a, b] is 1 when the
# questions with ids a and b appear together in at least one train/test pair.
#
# NOTE(review): the cache is keyed only by its path; if the input CSVs or the
# id assignment change, delete the file so the matrix is rebuilt.
# NOTE(review): `load`/`save` are helpers defined outside this view
# (presumably pickle-based, given the extension) - only load trusted files.
if os.path.exists(saved_matrix_path):
    cooccurrence_csr = load(saved_matrix_path)
else:
    # dtype 'b' is a signed byte, which is enough for 0/1 flags.
    cooccurrence = dok_matrix((len(questions_dict), len(questions_dict)), dtype='b')
    # Iterate over the two id columns directly instead of DataFrame.iterrows():
    # iterrows materialises a full Series per row although only two values are read.
    for q1_hash, q2_hash in progressbar(
        zip(comb['q1_hash'], comb['q2_hash']),
        size=len(comb),
    ):
        # Set both directions so the matrix stays symmetric.
        cooccurrence[q1_hash, q2_hash] = 1
        cooccurrence[q2_hash, q1_hash] = 1
    # CSR gives cheap row extraction for the distance features.
    cooccurrence_csr = cooccurrence.tocsr(copy=True)
    save(cooccurrence_csr, saved_matrix_path)

Extract distance features

In [10]:
from joblib import Parallel, delayed

In [11]:
# One [q1_hash, q2_hash] list per row of comb, in row order.
hash_pairs = comb.loc[:, ['q1_hash', 'q2_hash']].values.tolist()

In [12]:
def cosine_distance(hash_pair):
    """Cosine distance between the co-occurrence rows of two questions.

    Parameters
    ----------
    hash_pair : sequence of two ints
        Row indices (question ids) into the global ``cooccurrence_csr``.

    Returns
    -------
    float
        ``1 - cos(angle)`` between the two rows, clipped to ``[0, 2]``;
        ``nan`` when either row is all zeros (the angle is undefined).

    Notes
    -----
    The rows are kept sparse. The previous implementation called
    ``.todense()``, which returns a 2-D ``np.matrix`` that
    ``scipy.spatial.distance.cosine`` rejects ("Input vector should be 1-D")
    in current SciPy, and which allocated a dense vector as long as the
    whole question vocabulary for every pair.
    """
    # Cast away from the byte storage dtype so products and sums cannot overflow.
    row_a = cooccurrence_csr.getrow(hash_pair[0]).astype('float64')
    row_b = cooccurrence_csr.getrow(hash_pair[1]).astype('float64')

    dot_product = float(row_a.multiply(row_b).sum())
    squared_norms = (
        float(row_a.multiply(row_a).sum()) * float(row_b.multiply(row_b).sum())
    )
    if squared_norms == 0.0:
        return float('nan')

    distance = 1.0 - dot_product / squared_norms ** 0.5
    # Guard against tiny excursions outside [0, 2] caused by rounding.
    return min(max(distance, 0.0), 2.0)

In [13]:
def euclidean_distance(hash_pair):
    """Euclidean distance between the co-occurrence rows of two questions.

    Parameters
    ----------
    hash_pair : sequence of two ints
        Row indices (question ids) into the global ``cooccurrence_csr``.

    Returns
    -------
    float
        The L2 norm of the difference of the two rows.

    Notes
    -----
    The rows are kept sparse. The previous implementation called
    ``.todense()``, which returns a 2-D ``np.matrix`` that
    ``scipy.spatial.distance.euclidean`` rejects ("Input vector should be
    1-D") in current SciPy, and which allocated a dense vector as long as
    the whole question vocabulary for every pair.
    """
    # Cast away from the byte storage dtype so the squared sum cannot overflow.
    row_a = cooccurrence_csr.getrow(hash_pair[0]).astype('float64')
    row_b = cooccurrence_csr.getrow(hash_pair[1]).astype('float64')

    difference = row_a - row_b
    squared_sum = float(difference.multiply(difference).sum())
    return squared_sum ** 0.5

In [14]:
# Compute one euclidean distance per question pair on 4 workers.
# The results come back in the same order as hash_pairs.
distance_tasks = (
    delayed(euclidean_distance)(pair)
    for pair in progressbar(hash_pairs, size=len(hash_pairs), every=500)
)
euclidean_distances = Parallel(n_jobs=4)(distance_tasks)

In [21]:
# Keep the raw list of distances in the auxiliary data folder so the
# computation above does not have to be repeated to get them back.
# NOTE(review): `save` is defined outside this view - presumably pickles the object.
save(euclidean_distances, aux_data_folder + 'magic_stas_euclidean.pickle')

In [22]:
# Turn the list of distances into a single-column feature matrix.
X = np.reshape(np.array(euclidean_distances), (-1, 1))

In [23]:
# comb was stacked as train rows followed by test rows, so cutting X at the
# train length separates the two feature sets again.
X_train, X_test = np.split(X, [len(df_questions_train)])

## Save feature names

In [24]:
# Column names for X_train / X_test. X has a single column built from
# euclidean_distances, so exactly one name is active.
feature_names = [
    # The cosine feature is currently disabled (no cosine distances are computed).
#     'magic_stas_avito_cosine',
    'magic_stas_avito_euclidean',
]

In [25]:
# Store the feature names under the X_train_<feature_list_id>.names path.
# NOTE(review): `save_lines` is defined outside this view - presumably writes one name per line.
save_lines(feature_names, features_data_folder + f'X_train_{feature_list_id}.names')

## Save features

In [26]:
# Store the train feature matrix (one row per train pair, one column per feature name).
save(X_train, features_data_folder + f'X_train_{feature_list_id}.pickle')

In [27]:
# Store the test feature matrix (the rows of X that follow the train rows).
save(X_test, features_data_folder + f'X_test_{feature_list_id}.pickle')