In [1]:
%matplotlib inline

In [5]:
from kaggle_quora_question_pairs_common import *

dataset.hdf
sample_submission.csv
sample_submission.csv.zip
test.csv
test.csv.zip
train.csv
train.csv.zip





In [8]:
train_df, test_df = load_train_test()
train_df.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [9]:
log_max_mem_usage()
train_df.shape

Current all-time max memory: 859 MB


(404290, 6)

In [27]:
%%time

include_test = True
unique_questions = get_unique_questions(train_df, test_df, include_test=include_test)

log_max_mem_usage()

Current all-time max memory: 13089 MB
CPU times: user 976 ms, sys: 8 ms, total: 984 ms
Wall time: 980 ms


In [38]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


stops = stopwords.words('english')
morphy_tag = {
    'NN': wordnet.NOUN,
    'JJ': wordnet.ADJ,
    'VB': wordnet.VERB,
    'VBN': wordnet.VERB,
    'RB': wordnet.ADV,
    'RBR': wordnet.ADV
}


def get_wordnet_pos(treebank_tag, default=wordnet.NOUN):
#     http://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
#     http://stackoverflow.com/questions/7706696/how-can-i-best-determine-the-correct-capitalization-for-a-word
    tag = default
    
    if treebank_tag.startswith('J'):
        tag = wordnet.ADJ
    elif treebank_tag.startswith('V'):
        tag = wordnet.VERB
    elif treebank_tag.startswith('N'):
        tag = wordnet.NOUN
    elif treebank_tag.startswith('R'):
        tag = wordnet.ADV
    
    return tag


def get_lem_tag(tag):
    return get_wordnet_pos(tag, default=wordnet.NOUN)


lemmatizer = WordNetLemmatizer()

def lemmatize_word(word_tag):
    word, tag = word_tag
    tag = get_lem_tag(tag)
    return lemmatizer.lemmatize(word, pos=tag)


def lemmatize_words(words_tags):
    lemmas = []
    for word_tag in words_tags:
        lemmas.append(lemmatize_word(word_tag))
        
    return lemmas

In [None]:
corpus = .str.replace("'", '').str.replace('\W', ' ').str.lower().dropna()
lemmatized_corpus = corpus.str.split().map(lambda x: lemmatize_words(pos_tag(x)))

words = [
    word for response in lemmatized_corpus for word in response if word not in stops
]


In [375]:
import re


SPECIAL_TOKENS = {
    'quoted': 'quoted_item',
    'non-ascii': 'non_ascii_word',
    'undefined': 'something'
}

def clean_string(text, return_lower=True):
    
    def pad_str(s):
        return ' ' + s + ' '
    
    # Empty question
    
    if type(text) != str or text=='':
        return ''

    # preventing first and last word being ignored by regex    
    text = ' ' + text + ' '
    
    
    text = re.sub('\[math\].*\[\/math\]', 'mathformula', text)

    # Replace weird chars in text
    
    text = re.sub("’", "'", text) # special single quote
    text = re.sub("`", "'", text) # special single quote
    text = re.sub("“", '"', text) # special double quote
    text = re.sub("？", "?", text) 
    text = re.sub("…", " ", text) 
    text = re.sub("é", "e", text) 
    
    # Clean shorthands
    
    text = re.sub("\'s ", " is ", text) # we have cases like "Sam is" or "Sam's" (i.e. his) these two cases aren't separable, I choose to compromise are kill "'s" directly
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not ", text)
    text = re.sub(" i'm ", " I am ", text, flags=re.IGNORECASE)
    text = re.sub("\'re", " are ", text)
    text = re.sub("\'d", " would ", text)
    text = re.sub("\'ll", " will ", text)
    text = re.sub("e\.g\.", " eg ", text, flags=re.IGNORECASE)
    text = re.sub("b\.g\.", " bg ", text, flags=re.IGNORECASE)
    text = re.sub(r"(\W|^)([0-9]+)[kK](\W|$)", r"\1\g<2>000\3", text) # better regex provided by @armamut
    text = re.sub("e-mail", " email ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?U\.S\.A\.", " America ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?United State(s)?", " America ", text, flags=re.IGNORECASE)
    text = re.sub("\(s\)", " ", text, flags=re.IGNORECASE)
    text = re.sub("[c-fC-F]\:\/", " disk ", text)
    
#     # replace the float numbers with a random number, it will be parsed as number afterward, and also been replaced with word "number"
    
#     text = re.sub('[0-9]+\.[0-9]+', " 87 ", text)
    
    # remove comma between numbers, i.e. 15,000 -> 15000
    
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
    
#     # all numbers should separate from words, this is too aggressive
    
#     def pad_number(pattern):
#         matched_string = pattern.group(0)
#         return pad_str(matched_string)
#     text = re.sub('[0-9]+', pad_number, text)
    
    # add padding to punctuations and special chars, we still need them later
    
    text = re.sub('\$', " dollar ", text)
    text = re.sub('\%', " percent ", text)
    text = re.sub('\&', " and ", text)
    
    def pad_pattern(pattern):
        matched_string = pattern.group(0)
        return pad_str(matched_string)
    text = re.sub('''[\!\?\@\^\+\*\/\,\~\|\`\=\:\;\.\#\\\\'\"]''', pad_pattern, text) 
        
    text = re.sub('[^\x00-\x7F]+', pad_str('specialchar'), text) # replace non-ascii word with special word
#     text = text.decode('utf-8')
    
    # indian dollar
    
    text = re.sub("(?<=[0-9])rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(" rs(?=[0-9])", " rs ", text, flags=re.IGNORECASE)
    
    # clean text rules get from : https://www.kaggle.com/currie32/the-importance-of-cleaning-text
    
    text = re.sub(r" (the[\s]+|The[\s]+)?US(A)? ", " America ", text)
    text = re.sub(r" UK ", " England ", text, flags=re.IGNORECASE)
    text = re.sub(r" india ", " India ", text)
    text = re.sub(r" switzerland ", " Switzerland ", text)
    text = re.sub(r" china ", " China ", text)
    text = re.sub(r" chinese ", " Chinese ", text) 
    text = re.sub(r" imrovement ", " improvement ", text, flags=re.IGNORECASE)
    text = re.sub(r" intially ", " initially ", text, flags=re.IGNORECASE)
    text = re.sub(r" quora ", " Quora ", text, flags=re.IGNORECASE)
    text = re.sub(r" dms ", " direct messages ", text, flags=re.IGNORECASE)  
    text = re.sub(r" demonitization ", " demonetization ", text, flags=re.IGNORECASE) 
    text = re.sub(r" actived ", " active ", text, flags=re.IGNORECASE)
    text = re.sub(r" kms ", " kilometers ", text, flags=re.IGNORECASE)
    text = re.sub(r" cs ", " computer science ", text, flags=re.IGNORECASE) 
    text = re.sub(r" upvote", " up vote", text, flags=re.IGNORECASE)
    text = re.sub(r" iPhone ", " phone ", text, flags=re.IGNORECASE)
    text = re.sub(r" \0rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(r" calender ", " calendar ", text, flags=re.IGNORECASE)
    text = re.sub(r" ios ", " operating system ", text, flags=re.IGNORECASE)
    text = re.sub(r" gps ", " GPS ", text, flags=re.IGNORECASE)
    text = re.sub(r" gst ", " GST ", text, flags=re.IGNORECASE)
    text = re.sub(r" programing ", " programming ", text, flags=re.IGNORECASE)
    text = re.sub(r" bestfriend ", " best friend ", text, flags=re.IGNORECASE)
    text = re.sub(r" dna ", " DNA ", text, flags=re.IGNORECASE)
    text = re.sub(r" III ", " 3 ", text)
    text = re.sub(r" banglore ", " Banglore ", text, flags=re.IGNORECASE)
    text = re.sub(r" J K ", " JK ", text, flags=re.IGNORECASE)
    text = re.sub(r" J\.K\. ", " JK ", text, flags=re.IGNORECASE)
    
    # typos identified with my eyes
    
    text = re.sub(r" quikly ", " quickly ", text)
    text = re.sub(r" unseccessful ", " unsuccessful ", text)
    text = re.sub(r" demoniti[\S]+ ", " demonetization ", text, flags=re.IGNORECASE)
    text = re.sub(r" demoneti[\S]+ ", " demonetization ", text, flags=re.IGNORECASE)  
    text = re.sub(r" addmision ", " admission ", text)
    text = re.sub(r" insititute ", " institute ", text)
    text = re.sub(r" connectionn ", " connection ", text)
    text = re.sub(r" permantley ", " permanently ", text)
    text = re.sub(r" sylabus ", " syllabus ", text)
    text = re.sub(r" sequrity ", " security ", text)
    text = re.sub(r" undergraduation ", " undergraduate ", text) # not typo, but GloVe can't find it
#     text = re.sub(r"(?=[a-zA-Z])ig ", "ing ", text)
    text = re.sub(r" latop", " laptop", text)
    text = re.sub(r" programmning ", " programming ", text)  
    text = re.sub(r" begineer ", " beginner ", text)  
    text = re.sub(r" qoura ", " Quora ", text)
    text = re.sub(r" wtiter ", " writer ", text)  
    text = re.sub(r" litrate ", " literate ", text)  

    # the single 's' in this stage is 99% of not clean text, just kill it
    text = re.sub(' s ', " ", text)
    text = re.sub('\W', " ", text)
    
    # reduce extra spaces into single spaces
    text = re.sub('[\s]+', " ", text)
    text = text.strip()
    
    return text if not return_lower else text.lower()


def clean_tokenize_lemmatize(text, return_tokens=True, return_lower=True):
    tokens = lemmatize_words(pos_tag(word_tokenize(clean_string(text, return_lower))))
    
    if not return_tokens:
        tokens = ' '.join(tokens)

    return tokens

In [410]:
unique_questions[unique_questions.str.contains("n't ")]

58         I was suddenly logged off Gmail. I can't remem...
118        What are some mind-blowing computer tools that...
211        How headphones work as an Antenna to play FM r...
241        There are 8 balls. 7 of them weigh the same. 1...
270               Why aren't there more apps like Word Lens?
281        How do I get over a friend with whom I haven't...
315        What does this saying mean; "Don't trust every...
329        How do I see "sent invitations" on Linkedin if...
342        Why do some Japanese guys try to look feminine...
405        What are the signs that a guy has feelings for...
418                          Why can't I stop watching porn?
435        Should I learn AngularJS 1.5 instead of Angula...
436                   Why can't we fall asleep on some days?
469        My new Xbox one S can't connect to internet du...
521              Why can't I feel remorse or empathy at all?
534        If you have me, you want to share me. If you s...
665        Why hasn't Ga

In [411]:
unique_questions[unique_questions.str.contains("couldn ")]

537942              Is backward couldn time travel possible?
538136     Why does the couldn Facebook "add friend" butt...
538636     Career couldn Advice: What are the career opti...
541201     What is the couldn best way to use a credit card?
545840     Are couldn sable German Shepherds found in India?
546363     What's the best Windows for gaming, couldn XP,...
547614                              Do couldn people change?
547817     What is the reason for demonetization of Rs. 5...
547888     Is there a way to lock downloaded apps like Wh...
550341     How can I come first in a debate couldn to be ...
552154                        How do couldn I learn quickly?
553527                            What is couldn horsepower?
557836     What would be the role couldn of chemical engi...
559165               Will Pokémon GO launch couldn in India?
559652     What are the key couldn performance indicators...
561653     How do I find an Euler Circuit couldn in a gra...
561993              How 

In [412]:
unique_questions[unique_questions.str.contains("wouldn ")]

537446     My gpa wouldn is very bad.. but i have done a ...
537606     What would happen if the Earth slowly stopped ...
543366     What is the difference between a file system w...
543694     Where should I start wouldn from for CAT prepa...
547329       Is it viable wouldn to flirt in the book store?
548011                                Does God wouldn exist?
548268     Who are the top wouldn writers on Quora globally?
549239      Do you know Michiru Morisaki the wouldn AV Star?
550293     My question is about wouldn Axiomatic set theo...
551401     Does wouldn Mad Max: Fury Road have post-credi...
552536               D2k best institute in wouldn hyderabad?
553478      Who are the best wouldn neurologists in Chennai?
553575     How can I as a medical student manage my time ...
554123               How do wouldn I live a enjoy full life?
554727     What wouldn has been your experience with Muhu...
556919            How wouldn do LED fluorescent lights work?
557284     How do wouldn

In [414]:
unique_questions[unique_questions.str.contains("didn ")]

537988           What's the best didn iOS speed-reading app?
538558     Instagram (product): How do I reactivate my In...
541460     Have didn you ever get rid of a bad habit, rea...
543736         Why do we didn use NaOH in organic reactions?
543914     Why is marble classified as an igneous didn rock?
544944           Can drinking milk cause didn kidney stones?
547071     Which homemade didn foods should be given to a...
548887                      What is the best didn job board?
548983                           How earn didn money online?
551336             How is castor oil used didn to grow hair?
553900                How can didn I learn English speaking?
555560     I am placed in Infosys, TCS, Cognizant and Wip...
556465     What is the purpose of levying additional exci...
558067     What are the standard courses in a five didn c...
559468     Do didn you think time travel can really be po...
560588              How didn do I convert MBOX to PST files?
563063        How many U

In [409]:
unique_questions[551534]

'Why is the world mustn so crazy about Pokemon go?'

In [327]:
unique_questions[unique_questions.str.contains('\[math\].*\[\/math\]')].shape

(7420,)

In [369]:
train_df.head(10).question1.map(clean_tokenize_lemmatize)

0    [what, be, the, step, by, step, guide, to, inv...
1    [what, be, the, story, of, kohinoor, koh, i, n...
2    [how, can, i, increase, the, speed, of, my, in...
3    [why, be, i, mentally, very, lonely, how, can,...
4    [which, one, dissolve, in, water, quickly, sug...
5    [astrology, i, be, a, capricorn, sun, cap, moo...
6                              [should, i, buy, tiago]
7                [how, can, i, be, a, good, geologist]
8    [when, do, you, use, specialchar, instead, of,...
9    [motorola, company, can, i, hack, my, charter,...
Name: question1, dtype: object

In [374]:
train_df.head(10).question1.map(clean_tokenize_lemmatize)[8]

['when', 'do', 'you', 'use', 'specialchar', 'instead', 'of', 'specialchar']

In [348]:
re.findall(r'[^\x00-\x7F]+', train_df.head(10).question1[8])  #  # .map(clean_tokenize_lemmatize)

['\xe3\x82\xb7', '\xe3\x81\x97']

In [314]:
%%time
symbols = Counter()

for l in unique_questions.head(100000).map(
    lambda x: clean_tokenize_lemmatize(x, return_tokens=False)
).str.replace('[a-z0-9\?\ ]', ''):
#     print l
    symbols.update(l)

symbols.most_common()

CPU times: user 1min 8s, sys: 324 ms, total: 1min 8s
Wall time: 1min 8s


In [315]:
symbols.most_common()

[(u',', 12588),
 (u'`', 10772),
 (u'.', 8460),
 (u'-', 4624),
 (u')', 3456),
 (u'(', 3444),
 (u'/', 2805),
 (u':', 1883),
 (u"'", 1874),
 (u'+', 779),
 (u']', 441),
 (u'[', 440),
 (u'^', 306),
 (u'\\', 301),
 (u'=', 268),
 (u'\u201d', 210),
 (u'}', 210),
 (u'{', 208),
 (u'!', 141),
 (u'*', 106),
 (u';', 99),
 (u'_', 98),
 (u'#', 89),
 (u'\u20b9', 86),
 (u'\u2018', 54),
 (u'|', 48),
 (u'<', 47),
 (u'>', 40),
 (u'\u2013', 37),
 (u'@', 34),
 (u'~', 23),
 (u'\u221a', 18),
 (u'\u0bcd', 14),
 (u'\xa3', 13),
 (u'\xb0', 12),
 (u'\xe1', 12),
 (u'\xd7', 12),
 (u'\u0bb2', 11),
 (u'\xed', 11),
 (u'\u0435', 11),
 (u'\u043d', 11),
 (u'\u0430', 10),
 (u'\xe3', 10),
 (u'\u093e', 9),
 (u'\u2014', 9),
 (u'\u043e', 8),
 (u'\u094d', 8),
 (u'\xfc', 8),
 (u'\u0930', 7),
 (u'\u26aa', 7),
 (u'\xf6', 7),
 (u'\u2033', 7),
 (u'\u0939', 7),
 (u'\u0441', 7),
 (u'\u0bae', 7),
 (u'\u0442', 7),
 (u'\u0bc1', 6),
 (u'\u20ac', 5),
 (u'\u2212', 5),
 (u'\xe4', 5),
 (u'\u094b', 5),
 (u'\u043b', 5),
 (u'\u3044', 5),
 (u'\u0

In [358]:
train_df[train_df.is_duplicate == 1]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1
15,15,31,32,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,1
16,16,33,34,What does manipulation mean?,What does manipulation means?,1
18,18,37,38,Why are so many Quora users posting questions ...,Why do people ask Quora questions which can be...,1
20,20,41,42,Why do rockets look white?,Why are rockets and boosters painted white?,1
29,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1


In [377]:
batch_size = 100000
partitions = (unique_questions.shape[0] + batch_size) / batch_size
partitions

48

In [390]:
# All function should have a global data and only the indices are passed by the parallelizer.
def dump_preprocessed_text(indices):
    unique_questions.ix[indices].map(lambda x: clean_tokenize_lemmatize(x, False)).to_csv(
        'corpus/lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv', index=False, mode='a', sep='$',
        encoding='utf-8'
    )
    return [1]

def parallel_func_void(data_index, func, batch, num_proc):
    batch_size = batch
    partitions = (data_index.shape[0] + batch_size) / batch_size    
    
    with Parallel(n_jobs=num_proc) as parallel:
        dataset = []
        is_break = False
        i = 0

        while not is_break:
            payload = []
            
            for j in xrange(num_proc):
                t_df = data_index[(i + j) * batch: (i + 1 + j) * batch]

                if len(t_df) <= 0:
                    is_break = True
                    continue

                payload.append(delayed(func)(t_df))
            print('Current batch in main thread: {}'.format((i + j) * batch))

            if payload:
                results = parallel(payload)
                dataset.extend(results)
                i += num_proc

#     return pd.concat(dataset)



In [391]:
%%time
parallel_func_void(unique_questions.index, dump_preprocessed_text, batch=20000, num_proc=7)

Current batch in main thread: 120000
Current batch in main thread: 260000
Current batch in main thread: 400000
Current batch in main thread: 540000
Current batch in main thread: 680000
Current batch in main thread: 820000
Current batch in main thread: 960000
Current batch in main thread: 1100000
Current batch in main thread: 1240000
Current batch in main thread: 1380000
Current batch in main thread: 1520000
Current batch in main thread: 1660000
Current batch in main thread: 1800000
Current batch in main thread: 1940000
Current batch in main thread: 2080000
Current batch in main thread: 2220000
Current batch in main thread: 2360000
Current batch in main thread: 2500000
Current batch in main thread: 2640000
Current batch in main thread: 2780000
Current batch in main thread: 2920000
Current batch in main thread: 3060000
Current batch in main thread: 3200000
Current batch in main thread: 3340000
Current batch in main thread: 3480000
Current batch in main thread: 3620000
Current batch in ma

TypeError: cannot concatenate a non-NDFrame object

In [277]:
# %%time
# print 'Total expected partitions to process: {}'.format(partitions)

# for ix, batch in enumerate(np.array_split(unique_questions, partitions)):
#     batch.map(lambda x: clean_tokenize_lemmatize(x, False)).to_csv(
#         'corpus/lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv', index=False, mode='a', sep='$$$$$',
#         encoding='utf-8'
#     )
#     print 'Processed partition: {}'.format(ix + 1)

Total expected partitions to process: 958
Processed partition: 1
Processed partition: 2
Processed partition: 3
Processed partition: 4
Processed partition: 5
Processed partition: 6
Processed partition: 7
Processed partition: 8
Processed partition: 9
Processed partition: 10
Processed partition: 11
Processed partition: 12
Processed partition: 13
Processed partition: 14
Processed partition: 15
Processed partition: 16
Processed partition: 17
Processed partition: 18
Processed partition: 19
Processed partition: 20
Processed partition: 21
Processed partition: 22
Processed partition: 23
Processed partition: 24
Processed partition: 25
Processed partition: 26
Processed partition: 27
Processed partition: 28
Processed partition: 29
Processed partition: 30
Processed partition: 31
Processed partition: 32
Processed partition: 33
Processed partition: 34
Processed partition: 35
Processed partition: 36
Processed partition: 37
Processed partition: 38
Processed partition: 39
Processed partition: 40
Process

Processed partition: 332
Processed partition: 333
Processed partition: 334
Processed partition: 335
Processed partition: 336
Processed partition: 337
Processed partition: 338
Processed partition: 339
Processed partition: 340
Processed partition: 341
Processed partition: 342
Processed partition: 343
Processed partition: 344
Processed partition: 345
Processed partition: 346
Processed partition: 347
Processed partition: 348
Processed partition: 349
Processed partition: 350
Processed partition: 351
Processed partition: 352
Processed partition: 353
Processed partition: 354
Processed partition: 355
Processed partition: 356
Processed partition: 357
Processed partition: 358
Processed partition: 359
Processed partition: 360
Processed partition: 361
Processed partition: 362
Processed partition: 363
Processed partition: 364
Processed partition: 365
Processed partition: 366
Processed partition: 367
Processed partition: 368
Processed partition: 369
Processed partition: 370
Processed partition: 371


Processed partition: 660
Processed partition: 661
Processed partition: 662
Processed partition: 663
Processed partition: 664
Processed partition: 665
Processed partition: 666
Processed partition: 667
Processed partition: 668
Processed partition: 669
Processed partition: 670
Processed partition: 671
Processed partition: 672
Processed partition: 673
Processed partition: 674
Processed partition: 675
Processed partition: 676
Processed partition: 677
Processed partition: 678
Processed partition: 679
Processed partition: 680
Processed partition: 681
Processed partition: 682
Processed partition: 683
Processed partition: 684
Processed partition: 685
Processed partition: 686
Processed partition: 687
Processed partition: 688
Processed partition: 689
Processed partition: 690
Processed partition: 691
Processed partition: 692
Processed partition: 693
Processed partition: 694
Processed partition: 695
Processed partition: 696
Processed partition: 697
Processed partition: 698
Processed partition: 699


In [394]:
q = pd.read_csv('corpus/lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv', delimiter='$', header=None)

In [399]:
q[0].fillna('xxx').str.split().map(len).describe()

count    4.789030e+06
mean     1.140584e+01
std      6.088868e+00
min      1.000000e+00
25%      8.000000e+00
50%      1.000000e+01
75%      1.400000e+01
max      2.500000e+02
Name: 0, dtype: float64

In [13]:
%%time
from gensim.models import KeyedVectors

wvmodel = KeyedVectors.load_word2vec_format(
    '/home/avsolatorio/WORK/kaggle/pre-trained-models/GoogleNews-vectors-negative300.bin.gz', binary=True
)

CPU times: user 1min 7s, sys: 972 ms, total: 1min 8s
Wall time: 1min 8s


In [101]:
import enchant

In [102]:
d = enchant.Dict('en-us')

In [145]:
d.check('bedraggled')

True

In [382]:
len(wvmodel.vocab)

3000000

In [492]:
q

Unnamed: 0,0
0,can india make jet engine indigenously
1,what be the best and high pr social bookmarkin...
2,what be the best poem of all time in telugu
3,what be a way to obtain a blacklist of porn si...
4,how do we know if we be narcissistic
5,how do private bus operator in india make prof...
6,how much would a website like www picaroworld ...
7,i spend lot of time myself so i could finger o...
8,how do men feel about woman
9,in japanese what be the meaning of ka


In [490]:
noise_stops = []
for i, j in wvmodel.most_similar('couldn', topn=100):
    try:
        s = unique_questions[unique_questions.str.contains(u'{} '.format(i))].shape[0]
        if s:
            print i
            noise_stops.append(i)
    except:
        print 'err', i
        pass

didn
weren
wasn
shouldn
hadn
wouldn
hasn
aren
doesn
isn
couldnt
tI
t
err ***_ing
err ac_**
cant
err *_cked
did'nt


In [491]:
ls corpus

lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv
lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv.word2vec.glove.6B.300d.txt.epochs-2.w2v.model
lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv.word2vec.glove.6B.300d.txt.epochs-2.w2v.model.syn1neg.npy
lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv.word2vec.glove.6B.300d.txt.epochs-2.w2v.model.wv.syn0.npy
lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv.word2vec.glove.6B.300d.txt.epochs-30.w2v.model
lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv.word2vec.glove.6B.300d.txt.epochs-30.w2v.model.syn1neg.npy
lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv.word2vec.glove.6B.300d.txt.epochs-30.w2v.model.wv.syn0.npy
lemmatized-normalized-cleaned-data-without-special-chars.with_test.csv.word2vec.glove.6B.300d.txt.epochs-5.w2v.model
lemmatized-normalized-cleaned-data-without-special-