In [1]:
import numpy
import pandas
import os
import gc
import matplotlib.pyplot as plot
import unicodedata
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
# This is upposed to enhance default pandas display
pandas.set_option('display.width',120)


## Read text files 

In [2]:
def load_dataframe(file_name,title):
    print ('load %s' % file_name)
    df = pandas.read_pickle('../PandasStore/'+file_name+'.pkl')
    print('%s contains %d lines' % (title,len(df)))
    df.metadata={'title':title}
    return df

train_dataframe = load_dataframe('clean_training','Train')
challenge_dataframe = load_dataframe('clean_challenge','Challenge')

load clean_training
Train contains 404290 lines
load clean_challenge
Challenge contains 2345796 lines


In [3]:
train_dataframe.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
challenge_dataframe.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


## Compute TDF-IDF

In [5]:
all_train_questions = list(train_dataframe['question1'])+list(train_dataframe['question2'])

In [6]:
tfidfer = TfidfVectorizer()
all_train_tfidfs = tfidfer.fit_transform(all_train_questions)


In [7]:
len(tfidfer.get_feature_names())

86040

In [8]:
all_train_word2tfidf = dict(zip(tfidfer.get_feature_names(), tfidfer.idf_))

In [9]:
import spacy
from tqdm import tqdm
spacy_nlp = spacy.load('en_core_web_sm')

In [17]:
import time

import ray

def do_nlp(questions,task_name):
    i = 0
    start = time.time()
    for question in questions:
        i = i+1
        if i% 1000 == 0 :
            pos = spacy_nlp(question,disable=["parser"])
    return {
        'TaskName': task_name,
        'NbRows': i,
        'Duration': time.time()-start}


@ray.remote
def ray_do_nlp(questions,task_name):
    return do_nlp(questions,task_name)

#lost = do_nlp(train_dataframe['question1'])
#lost = do_nlp(train_dataframe['question2'])
lost = do_nlp(challenge_dataframe['question1'],'train_question1')
lost
#lost = do_nlp(challenge_dataframe['question2'])

# do_nlp(train_dataframe['question2'])

{'TaskName': 'train_question1',
 'NbRows': 2345796,
 'Duration': 11.199678659439087}

In [19]:
ray.shutdown()
ray.init(include_dashboard=False,num_cpus=8)

global_start = time.time()
ret = ray.get([
    ray_do_nlp.remote(train_dataframe['question1'],'train_question1'),
    ray_do_nlp.remote(train_dataframe['question1'],'train_question1'),
    ray_do_nlp.remote(train_dataframe['question1'],'train_question1'),
    ray_do_nlp.remote(train_dataframe['question1'],'train_question1'),
    ray_do_nlp.remote(train_dataframe['question1'],'train_question1'),
    ray_do_nlp.remote(train_dataframe['question1'],'train_question1'),
    ray_do_nlp.remote(train_dataframe['question1'],'train_question1'),
    ray_do_nlp.remote(train_dataframe['question1'],'train_question1'),
    ])
duration = time.time()-global_start
ray.shutdown()
print ('Global time %d %s' % (duration,ret))

2020-09-03 16:46:20,410	INFO resource_spec.py:223 -- Starting Ray with 29.05 GiB memory available for workers and up to 14.55 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
Global time 42 [{'TaskName': 'train_question1', 'NbRows': 404290, 'Duration': 5.490828275680542}, {'TaskName': 'train_question1', 'NbRows': 404290, 'Duration': 5.430059432983398}, {'TaskName': 'train_question1', 'NbRows': 404290, 'Duration': 5.475620746612549}, {'TaskName': 'train_question1', 'NbRows': 404290, 'Duration': 5.369138717651367}, {'TaskName': 'train_question1', 'NbRows': 404290, 'Duration': 5.568267583847046}, {'TaskName': 'train_question1', 'NbRows': 404290, 'Duration': 5.649268388748169}, {'TaskName': 'train_question1', 'NbRows': 404290, 'Duration': 5.711150407791138}, {'TaskName': 'train_question1', 'NbRows': 404290, 'Duration': 5.497262477874756}]


In [18]:
ray.shutdown()
ray.init(include_dashboard=False,num_cpus=8)

global_start = time.time()
ret = ray.get([
    ray_do_nlp.remote(train_dataframe['question1'],'train_question1'),
    ray_do_nlp.remote(train_dataframe['question2'],'train_question2'),
    ray_do_nlp.remote(challenge_dataframe['question1'],'challenge_question1'),
    ray_do_nlp.remote(challenge_dataframe['question2'],'challenge_question2'),
    ])
duration = time.time()-global_start
ray.shutdown()
print ('Global time %d %s' % (duration,ret))

2020-09-03 16:44:03,031	INFO resource_spec.py:223 -- Starting Ray with 29.1 GiB memory available for workers and up to 14.56 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
Global time 38 [{'TaskName': 'train_question1', 'NbRows': 404290, 'Duration': 3.5681653022766113}, {'TaskName': 'train_question2', 'NbRows': 404290, 'Duration': 3.471158027648926}, {'TaskName': 'challenge_question1', 'NbRows': 2345796, 'Duration': 15.67287802696228}, {'TaskName': 'challenge_question2', 'NbRows': 2345796, 'Duration': 15.658888816833496}]


In [13]:
ray.shutdown()



In [16]:
with ThreadPoolExecutor() as executor:
    results = executor.map(do_nlp,[train_dataframe['question1'],train_dataframe['question1']])
    for result in results:
        print(result)

    

404290
404290


In [20]:
for question in tqdm(train_dataframe['question1']):
    pos = spacy_nlp(question,disable=["parser"])

0%|          | 746/404290 [00:03<34:36, 194.30it/s]


KeyboardInterrupt: 