In [1]:
%matplotlib inline

In [2]:
from kaggle_quora_question_pairs_common import *

dataset.hdf
sample_submission.csv
sample_submission.csv.zip
test.csv
test.csv.zip
train.csv
train.csv.zip





In [3]:
%%time
train_df, test_df = load_train_test()
unique_questions = get_unique_questions(train_df, test_df, include_test=True)

CPU times: user 6.27 s, sys: 320 ms, total: 6.59 s
Wall time: 6.94 s


In [91]:
def tokenize_text(text):
    return text.str.replace('?', '').replace('.', '').str.replace('\W', ' ').str.split()

In [4]:
%%time
tokenized_questions = unique_questions.str.replace('?', '').replace('.', '').str.replace('\W', ' ').str.split()

CPU times: user 28.9 s, sys: 964 ms, total: 29.9 s
Wall time: 29.6 s


In [5]:
def train_word2vec(
    tokenized_questions,
    pre_trained_model='GoogleNews-vectors-negative300.bin.gz',
    size=300,
    iter=20,
    min_count=1,
    negative=10,
    workers=7,
    min_alpha=0.0001,
    window=5,
    binary=True,
):
    # https://github.com/RaRe-Technologies/gensim/issues/1245
    # List of tokenized questions.
    # e.g. ['What', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india']
    # pre_trained_model can be any pre trained model that gensim accepts, e.g., Glove or GoogleNews word2vec

    # Initialize model
    word_vectors = Word2Vec(
        size=size, iter=iter, min_count=min_count, negative=negative, workers=workers,
        min_alpha=min_alpha, window=window,
    )

    # Initialize vocab
    word_vectors.build_vocab(tokenized_questions)

    # Initialize vectors in local model with with vectors from pre-trained model with overlapping vocabulary.
    # Set `lockf` to 1 for re-training
    word_vectors.intersect_word2vec_format(pre_trained_model, lockf=1, binary=binary)

    # Adjust pre-trained vectors to adapt its distribution with that of the local data via retraining.
    word_vectors.train(tokenized_questions)

    return word_vectors

In [8]:
# %%time
# start = datetime.now()
# word_vectors = train_word2vec(tokenized_questions)
# with open('word_vectors_done.lock', 'w') as fl:
#     fl.write('{}'.format((datetime.now() - start).total_seconds()))

CPU times: user 2h 6min, sys: 1min 42s, total: 2h 7min 42s
Wall time: 26min 8s


In [9]:
# %%time
# start = datetime.now()
# word_vectors_glove = train_word2vec(tokenized_questions, pre_trained_model='word2vec.glove.6B.300d.txt', binary=False)
# with open('word_vectors_glove.lock', 'w') as fl:
#     fl.write('{}'.format((datetime.now() - start).total_seconds()))

CPU times: user 2h 14min 7s, sys: 1min 47s, total: 2h 15min 54s
Wall time: 26min 50s


In [14]:
# %%time
# # Basic cleaning, removal of . and ? and replacing of other non alnum with space.
# word_vectors.save('word2vec_google_news_basic_cleaning.model')
# word_vectors_glove.save('word2vec_glove_basic_cleaning.model')

CPU times: user 3.21 s, sys: 388 ms, total: 3.6 s
Wall time: 3.96 s


In [12]:
len(word_vectors.wv.vocab) == len(word_vectors_glove.wv.vocab)

True

In [137]:
np.array([np.array([word_vectors[i] for i in st]).mean(axis=0) if st else np.zeros(300) for st in my_sentences[3285:3290]]).shape

(5, 300)

In [17]:
%%time

def get_word2vec_rep(model, raw_sentences):
    tokenized_sentences = raw_sentences.str.replace('?', '').replace('.', '').str.replace('\W', ' ').str.split()
    return np.array(
        [
            np.array([model[i] for i in st if i in model.wv.vocab]).mean(axis=0) if st else 
            np.random.randn(model.vector_size) for st in tokenized_sentences
        ]
    )

cos_dist = einsum_pairwise_cos_sim(
    np.array([np.array([word_vectors[i] for i in st]).mean(axis=0) if st else np.random.randn(300) for st in tokenized_questions[:1000]]),
    np.array([np.array([word_vectors[i] for i in st]).mean(axis=0) if st else np.random.randn(300) for st in tokenized_questions[1000:2000]]),
#     binary_input=False
)

CPU times: user 72 ms, sys: 0 ns, total: 72 ms
Wall time: 72.8 ms


In [1]:
# print_question_pairs(train_df.head(20), sample=False)

In [19]:
%%time
cos_dist_gn = einsum_pairwise_cos_sim(
    get_word2vec_rep(word_vectors, train_df.head(1000).question1),
    get_word2vec_rep(word_vectors, train_df.head(1000).question2)
)

cos_dist_glove = einsum_pairwise_cos_sim(
    get_word2vec_rep(word_vectors_glove, train_df.head(1000).question1),
    get_word2vec_rep(word_vectors_glove, train_df.head(1000).question2)
)

CPU times: user 160 ms, sys: 0 ns, total: 160 ms
Wall time: 157 ms


In [63]:
cos_dist_gn[205:210], train_df[205:210].is_duplicate

(array([ 0.78194499,  0.77948368,  0.73246109,  0.78725821,  1.        ], dtype=float32),
 205    0
 206    0
 207    0
 208    0
 209    1
 Name: is_duplicate, dtype: int64)

In [60]:
log_loss(train_df.head(210).is_duplicate, np.nan_to_num(cos_dist_gn[:210]))

nan

In [23]:
np.corrcoef(cos_dist_glove, train_df.head(1000).is_duplicate)

array([[ 1.        ,  0.34524814],
       [ 0.34524814,  1.        ]])

In [24]:
word_vectors.similarity('mail', 'email')

0.58339193749247253

In [75]:
word_vectors.most_similar('numpy')

[('urllib', 0.5968994498252869),
 ('NumPy', 0.5179972648620605),
 ('urllib2', 0.5177142024040222),
 ('vpython', 0.5115652680397034),
 ('Ironpython', 0.4793962836265564),
 ('LAPACK', 0.47677332162857056),
 ('dotfiles', 0.46835094690322876),
 ('BCM94352', 0.46812641620635986),
 ('XPlanner', 0.46804100275039673),
 ('hashable', 0.4638808071613312)]

In [3]:
# input_file     training file path (required)
# output         output file path (required)
# lr             learning rate [0.05]
# lr_update_rate change the rate of updates for the learning rate [100]
# dim            size of word vectors [100]
# ws             size of the context window [5]
# epoch          number of epochs [5]
# min_count      minimal number of word occurences [5]
# neg            number of negatives sampled [5]
# word_ngrams    max length of word ngram [1]
# loss           loss function {ns, hs, softmax} [ns]
# bucket         number of buckets [2000000]
# minn           min length of char ngram [3]
# maxn           max length of char ngram [6]
# thread         number of threads [12]
# t              sampling threshold [0.0001]
# silent         disable the log output from the C++ extension [1]
# encoding       specify input_file encoding [utf-8]

# Includes test data in unique_questions
# pd.Series(unique_questions[:100000]).to_csv('questions_data_1000.csv', index=False)
# NUM_PROC = 7
# model = fasttext.skipgram('questions_data.csv', 'model_full_data', dim=300, epoch=30, thread=NUM_PROC, word_ngrams=2)
# print model.words # list of words in dictionary

In [4]:
log_max_mem_usage()
model = fasttext.load_model('model_full_data.bin')
log_max_mem_usage()

Current all-time max memory: 136 MB
Current all-time max memory: 3345 MB


In [5]:
cosine_similarity(model["she is so India"], model["she's so India"])

array([[ 0.78911119]])

In [6]:
s1 = "she's so beautiful"
s2 = "she is pretty"
cosine_similarity(
    np.sum([model[i] for i in s1.split()], axis=0),
    np.sum([model[i] for i in s2.split()], axis=0),
#     model["she's so beautiful"]
)

array([[ 0.56863613]])

In [76]:
import gensim

In [78]:
def train_tfidf_lsi_models(tokenized_questions, num_topics=300, min_doc_freq=1, skip_terms=['']):
    MIN_DOC_FREQ = min_doc_freq
    NUM_TOPICS = num_topics

    dictionary = (
        gensim.corpora.Dictionary(
            tokenized_questions
        )
    )

    skip_terms = skip_terms

    skip_ids = [dictionary.token2id[t] for t in skip_terms if t in dictionary.token2id]
    low_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq < MIN_DOC_FREQ]

    dictionary.filter_tokens(low_freq_ids + skip_ids)
    dictionary.compactify()

    uid_corpus = (
        {
            uid: dictionary.doc2bow(
                tq
            ) for uid, tq in enumerate(tokenized_questions)
        }
    )

    tfidf = gensim.models.TfidfModel(uid_corpus.values(), id2word=dictionary)
    lsi = gensim.models.LsiModel(tfidf[uid_corpus.values()], id2word=dictionary, num_topics=NUM_TOPICS)

    return lsi, tfidf, uid_corpus

In [None]:
# %%time
# ~4 hours to run
# lsi, tfidf, uid_corpus = train_tfidf_lsi_models(tokenized_questions)

In [None]:
# %%time
# lsi.save('lsi_basic_cleaning.model')
# tfidf.save('tfidf_basic_cleaning.model')

In [104]:
lsi[tfidf[uid_corpus.values()[:2]]]

<gensim.interfaces.TransformedCorpus at 0x7fdb6b980750>

In [130]:
def lsi_transform(text, tfidf, lsi):
    # text should be a dataframe row of string
    db = [lsi.id2word.doc2bow(tokens) for tokens in tokenize_text(text)]
    return np.array([zip(*c)[1] for c in lsi[tfidf[db]]])

In [132]:
%%time
np.corrcoef(
    train_df.head(1000).is_duplicate,
    einsum_pairwise_cos_sim(
        lsi_transform(train_df.head(1000).question1, tfidf, lsi),
        lsi_transform(train_df.head(1000).question2, tfidf, lsi)
    )
)

CPU times: user 308 ms, sys: 12 ms, total: 320 ms
Wall time: 305 ms


array([[ 1.        ,  0.30663122],
       [ 0.30663122,  1.        ]])

In [119]:
tc = lsi_transform(train_df.head(10).question2, tfidf, lsi)

In [139]:
for i in tfidf[[lsi.id2word.doc2bow(tokens) for tokens in tokenize_text(train_df.head(1).question1)]][0]:
    print tfidf.id2word[i[0]], i

in (30934, 0.1417583469020468)
What (32845, 0.04838275493169238)
guide (48775, 0.3832127702788639)
step (48851, 0.6850161373260468)
is (54136, 0.06737383155472171)
india (75303, 0.247632307647888)
by (105421, 0.18565958961538986)
the (107952, 0.049689737134541064)
invest (120282, 0.2923475709978697)
to (125689, 0.07284383639019286)
market (152422, 0.2739097420428594)
share (159705, 0.3047377966227719)


4789032