## Imports

In [None]:
from joblib import Parallel, delayed

In [None]:
from scipy.spatial.distance import cosine, euclidean, cityblock

In [None]:
from scipy.sparse import csr_matrix, dok_matrix

In [None]:
from sklearn.decomposition import TruncatedSVD

## Config

In [None]:
# Identifier of this feature list; it is embedded in the name of the saved .names file.
feature_list_id = 'magic_stas_svd'

## Read Data

In [None]:
# Load the train and test question pairs; missing values are replaced by empty strings.
df_questions_train, df_questions_test = (
    pd.read_csv(data_folder + csv_name).fillna('')
    for csv_name in ('train.csv', 'test.csv')
)

## Build Features

### Extract hashes

In [None]:
# Short aliases for the raw train/test frames (no copy is made here).
train_orig, test_orig = df_questions_train, df_questions_test

In [None]:
# Collect every question text from both columns of train and test into a single
# one-column frame so that each distinct question can be given one integer id.
df1 = train_orig[['question1']].copy()
df2 = train_orig[['question2']].copy()
df1_test = test_orig[['question1']].copy()
df2_test = test_orig[['question2']].copy()

# Give the question2 frames the same column name so the frames stack into one column.
df2.rename(columns={'question2': 'question1'}, inplace=True)
df2_test.rename(columns={'question2': 'question1'}, inplace=True)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in the
# same order (train q1, train q2, test q1, test q2), so the resulting ids are unchanged.
train_questions = pd.concat([df1, df2, df1_test, df2_test])
train_questions.drop_duplicates(subset=['question1'], inplace=True)

# After the reset, the positional index is the question's id ("hash").
train_questions.reset_index(inplace=True, drop=True)
questions_dict = pd.Series(train_questions.index.values, index=train_questions.question1.values).to_dict()

In [None]:
# Train frame without its qid columns.
train_cp = train_orig.drop(['qid1', 'qid2'], axis=1)

# Test rows get a placeholder label of -1 and their id column renamed to 'id'.
test_cp = test_orig.assign(is_duplicate=-1).rename(columns={'test_id': 'id'})

# Train rows first, then test rows.
comb = pd.concat([train_cp, test_cp])

# Replace each question text by its integer id from questions_dict.
comb['q1_hash'] = comb.question1.map(questions_dict)
comb['q2_hash'] = comb.question2.map(questions_dict)

# How often each id appears in the first / second position of a pair.
q1_vc = comb['q1_hash'].value_counts().to_dict()
q2_vc = comb['q2_hash'].value_counts().to_dict()


def _total_occurrences(question_hash):
    """Return how many times the id occurs in either position across all pairs."""
    return q1_vc.get(question_hash, 0) + q2_vc.get(question_hash, 0)


# Map to frequency space
comb['q1_freq'] = comb['q1_hash'].map(_total_occurrences)
comb['q2_freq'] = comb['q2_hash'].map(_total_occurrences)

### Build cooccurrence matrix

In [None]:
# Cache file for the co-occurrence matrix: loaded when it exists, written after a fresh build.
saved_matrix_path = aux_data_folder + 'magic_stas_avito_cooccurrence_csr.pickle'

In [None]:
# Build the question co-occurrence matrix unless a cached copy already exists on disk.
if not os.path.exists(saved_matrix_path):
    n_questions = len(questions_dict)
    # Square matrix indexed by question id on both axes, stored as signed bytes.
    cooccurrence = dok_matrix((n_questions, n_questions), dtype='b')
    for i, row in progressbar(comb.iterrows(), size=len(comb)):
        q1_idx, q2_idx = row['q1_hash'], row['q2_hash']
        # Mark the pair in both directions so the matrix is symmetric.
        cooccurrence[q1_idx, q2_idx] = 1
        cooccurrence[q2_idx, q1_idx] = 1
    cooccurrence_csr = cooccurrence.tocsr(copy=True)
    save(cooccurrence_csr, saved_matrix_path)
else:
    cooccurrence_csr = load(saved_matrix_path)

### Decompose the cooccurrence matrix

In [None]:
# Number of SVD components; also embedded in the feature names and output file names.
svd_dim = 150

In [None]:
# Truncated SVD with svd_dim components; random_state is fixed so reruns give the same result.
svd = TruncatedSVD(n_components=svd_dim, n_iter=30, random_state=42)

In [None]:
# Fit the SVD on the co-occurrence matrix and project it; rows of X_svd are later
# looked up by question hash.
X_svd = svd.fit_transform(cooccurrence_csr)

In [None]:
# Sanity check: inspect the shape of the projected matrix.
X_svd.shape

In [None]:
# Plot the cumulative explained-variance ratio over the SVD components.
plt.plot(np.cumsum(svd.explained_variance_ratio_))

### Compute distances between question pairs

In [None]:
# One [q1_hash, q2_hash] list per row of comb, in comb's row order.
hash_pairs = comb[['q1_hash', 'q2_hash']].values.tolist()

In [None]:
def cosine_distance(hash_pair):
    """Return the cosine distance between the SVD rows of a question pair.

    ``hash_pair[0]`` and ``hash_pair[1]`` are used as row indices into the
    module-level ``X_svd`` matrix.
    """
    first_vec = X_svd[hash_pair[0]]
    second_vec = X_svd[hash_pair[1]]
    return cosine(first_vec, second_vec)

In [None]:
def euclidean_distance(hash_pair):
    """Return the Euclidean distance between the SVD rows of a question pair.

    ``hash_pair[0]`` and ``hash_pair[1]`` are used as row indices into the
    module-level ``X_svd`` matrix.
    """
    first_vec = X_svd[hash_pair[0]]
    second_vec = X_svd[hash_pair[1]]
    return euclidean(first_vec, second_vec)

In [None]:
def cityblock_distance(hash_pair):
    """Return the city-block (L1) distance between the SVD rows of a question pair.

    ``hash_pair[0]`` and ``hash_pair[1]`` are used as row indices into the
    module-level ``X_svd`` matrix.
    """
    first_vec = X_svd[hash_pair[0]]
    second_vec = X_svd[hash_pair[1]]
    return cityblock(first_vec, second_vec)

In [None]:
# Cosine distance for every question pair, computed in parallel (n_jobs=-1).
cosine_jobs = (
    delayed(cosine_distance)(hash_pair)
    for hash_pair in progressbar(hash_pairs, size=len(hash_pairs), every=500)
)
cosine_distances = Parallel(n_jobs=-1)(cosine_jobs)

In [None]:
# Euclidean distance for every question pair, computed in parallel (n_jobs=-1).
euclidean_jobs = (
    delayed(euclidean_distance)(hash_pair)
    for hash_pair in progressbar(hash_pairs, size=len(hash_pairs), every=500)
)
euclidean_distances = Parallel(n_jobs=-1)(euclidean_jobs)

In [None]:
# City-block distance for every question pair, computed in parallel (n_jobs=-1).
cityblock_jobs = (
    delayed(cityblock_distance)(hash_pair)
    for hash_pair in progressbar(hash_pairs, size=len(hash_pairs), every=500)
)
cityblock_distances = Parallel(n_jobs=-1)(cityblock_jobs)

In [None]:
# Histogram of the cosine distances over all pairs.
pd.DataFrame(cosine_distances).plot.hist()

In [None]:
# Histogram of the Euclidean distances over all pairs.
pd.DataFrame(euclidean_distances).plot.hist()

In [None]:
# Histogram of the city-block distances over all pairs.
pd.DataFrame(cityblock_distances).plot.hist()

In [None]:
# Feature matrix: one row per question pair, columns ordered cosine, euclidean, cityblock.
X = np.transpose(
    np.vstack((cosine_distances, euclidean_distances, cityblock_distances))
)

In [None]:
# Sanity check: inspect the shape of the feature matrix (pairs x three distances).
X.shape

## Save feature names

In [None]:
# Feature names are derived from feature_list_id (rather than repeating the literal
# prefix) so they stay in sync with the .names file name built from the same
# identifier. The order matches the columns of X: cosine, euclidean, cityblock.
feature_names = [
    f'{feature_list_id}_{svd_dim}_{metric}'
    for metric in ('cosine', 'euclidean', 'cityblock')
]

In [None]:
# Write the feature names to the .names file for this feature list and SVD size
# (save_lines is defined elsewhere; presumably it writes one name per line).
save_lines(feature_names, features_data_folder + f'X_train_{feature_list_id}_{svd_dim}.names')

## Save features

In [None]:
# Train rows come first in X because comb is train followed by test. The file name
# uses feature_list_id so it always matches the .names file of the same feature list.
save(X[:len(df_questions_train)], features_data_folder + f'X_train_{feature_list_id}_{svd_dim}.pickle')

In [None]:
# The rows after the train block are the test pairs (comb is train followed by test).
# The file name uses feature_list_id so it stays consistent with the .names file.
save(X[len(df_questions_train):], features_data_folder + f'X_test_{feature_list_id}_{svd_dim}.pickle')