In [1]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('data/train.csv', 
                       dtype={
                           'question1': np.str,
                           'question2': np.str
                       })
df_test = pd.read_csv('data/test.csv', 
                      dtype={
                          'question1': np.str,
                          'question2': np.str
                      })
df_train = df_train.fillna(' ')
df_test = df_test.fillna(' ')

In [2]:
qs = set(df_train['question1']) | set(df_train['question2']) | set(df_test['question1']) | set(df_test['question2'])
qs = list(qs)
q2id = dict(zip(qs, range(len(qs))))

In [3]:
df_train['qid1'] = df_train['question1'].apply(q2id.get)
df_train['qid2'] = df_train['question2'].apply(q2id.get)
df_test['qid1'] = df_test['question1'].apply(q2id.get)
df_test['qid2'] = df_test['question2'].apply(q2id.get)

In [4]:
from scipy.sparse import coo_matrix, csr_matrix

m = coo_matrix((np.ones(2 * (df_train.shape[0] + df_test.shape[0])),
                (pd.concat((df_train['qid1'], df_test['qid1'], df_train['qid2'], df_test['qid2']), axis=0).values,
                 pd.concat((df_train['qid2'], df_test['qid2'], df_train['qid1'], df_test['qid1']), axis=0).values)),
               shape=(len(qs), len(qs)))
m = csr_matrix(m)

In [5]:
from sklearn.preprocessing import normalize

def cosine_similarities(mat):
    col_normed_mat = normalize(mat.tocsc(), axis=0)
    return col_normed_mat.T * col_normed_mat

def jaccard_similarities(mat):
    cols_sum = mat.getnnz(axis=0)
    ab = mat.T * mat

    # for rows
    aa = np.repeat(cols_sum, ab.getnnz(axis=0))
    # for columns
    bb = cols_sum[ab.indices]

    similarities = ab.copy()
    similarities.data /= (aa + bb - ab.data)

    return similarities

In [6]:
%%time
m_cos = cosine_similarities(m)
m_jac = jaccard_similarities(m)

CPU times: user 3.99 s, sys: 256 ms, total: 4.24 s
Wall time: 4.28 s




In [7]:
m_cos = m_cos.todok()
m_jac = m_jac.todok()

In [8]:
%%time
def devil_cos(row):
    return m_cos[row['qid1'], row['qid2']]

def devil_jaccard(row):
    return m_jac[row['qid1'], row['qid2']]

df_train['devil_cos'] = df_train.apply(devil_cos, axis=1, raw=True)
df_train['devil_jaccard'] = df_train.apply(devil_jaccard, axis=1, raw=True)
df_test['devil_cos'] = df_test.apply(devil_cos, axis=1, raw=True)
df_test['devil_jaccard'] = df_test.apply(devil_jaccard, axis=1, raw=True)

CPU times: user 3min 22s, sys: 316 ms, total: 3min 23s
Wall time: 3min 23s


In [9]:
df_train[['devil_cos', 'devil_jaccard']].to_csv("data/train_devil.csv", index=False)
df_test[['devil_cos', 'devil_jaccard']].to_csv("data/test_devil.csv", index=False)

In [10]:
inter = (set(df_train['question1']) | set(df_train['question2'])) & (set(df_test['question1']) | set(df_test['question2']))
len(inter)

112162