In [2]:
import numpy
import pandas
import os
import gc
import matplotlib.pyplot as plot
import unicodedata
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
# This is upposed to enhance default pandas display
pandas.set_option('display.width',120)


## Read text files 

In [3]:
def load_dataframe(file_name,title):
    print ('load %s' % file_name)
    df = pandas.read_pickle('../PandasStore/'+file_name+'.pkl')
    print('%s contains %d lines' % (title,len(df)))
    df.metadata={'title':title}
    return df

train_dataframe = load_dataframe('clean_training','Train')
challenge_dataframe = load_dataframe('clean_challenge','Challenge')

load clean_training
Train contains 404290 lines
load clean_challenge
Challenge contains 2345796 lines


In [4]:
train_dataframe.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
challenge_dataframe.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


## Compute TDF-IDF

In [6]:
all_train_questions = list(train_dataframe['question1'])+list(train_dataframe['question2'])

In [7]:
tfidfer = TfidfVectorizer()
all_train_tfidfs = tfidfer.fit_transform(all_train_questions)


In [8]:
len(tfidfer.get_feature_names())

86040

In [9]:
all_train_word2tfidf = dict(zip(tfidfer.get_feature_names(), tfidfer.idf_))

In [10]:
import spacy
from tqdm import tqdm
spacy_nlp = spacy.load('en_core_web_sm')

In [11]:
import time

import ray

def do_partial_nlp(questions,task_name):
    i = 0
    start = time.time()
    for question in tqdm(questions):
        i = i+1
        if i% 10000 == 0 :
            pos = spacy_nlp(question,disable=["parser"])
    return {
        'TaskName': task_name,
        'NbRows': i,
        'Duration':  round(time.time()-start,2)}

def do_full_nlp_with_progress(questions,task_name):
    start = time.time()
    for question in tqdm(questions):
            pos = spacy_nlp(question,disable=["parser"])
    return {
        'TaskName': task_name,
        'NbRows': len(questions),
        'Duration': round(time.time()-start,2)}

def do_full_nlp(questions,task_name):
    start = time.time()
    for question in questions:
            pos = spacy_nlp(question,disable=["parser"])
    return {
        'TaskName': task_name,
        'NbRows': len(questions),
        'Duration': round(time.time()-start,2)}

@ray.remote
def ray_do_nlp(questions,task_name):
    return do_nlp(questions,task_name)


detailed_durations = do_partial_nlp(train_dataframe['question1'],'train_question1')
print('\nTask time %.2f s' % detailed_durations['Duration'])
detailed_durations
# lost = do_full_nlp_with_progress(challenge_dataframe['question1'],'train_question1')
#lost
#lost = do_nlp(challenge_dataframe['question2'])

# do_nlp(train_dataframe['question2'])

100%|██████████| 404290/404290 [00:00<00:00, 1041096.55it/s]
Task time 0.39 s



{'TaskName': 'train_question1', 'NbRows': 404290, 'Duration': 0.39}

In [12]:
def show_progress(task_name,pos,block_size,nb_pos):
    step = block_size/nb_pos
    step_cur = int(pos/step)
    return task_name+':'+'|'.ljust(step_cur,'.')+'*'.ljust(nb_pos-step_cur,'.')+'|'

def do_range_nlp(questions,task_name_prefix,nb_blocks,num_block):
    start = time.time()
    nb_steps_in_progress = 20
    task_name =  task_name_prefix+':'+str(num_block)+'-'+str(nb_blocks)
    block_size = int(round(len(questions)/nb_blocks,0))
    min_row = block_size*num_block
    top_progress = int(block_size/nb_steps_in_progress)
    if num_block >= (nb_blocks-1):
        max_row = len(questions)
    else:
        max_row = block_size*(num_block+1)-1
    print( 'Block from %d to %d' % (min_row,max_row))
    pos = 0
    for i in range(min_row,max_row):
            pos = pos + 1
            spacy_nlp(questions[i],disable=["parser"])
            if pos % top_progress == 0:
                print(show_progress(task_name,pos,block_size,nb_steps_in_progress))
    return {
        'TaskName': task_name,
        'NbRows': max_row-min_row,
        'Duration': round(time.time()-start,2)}

@ray.remote
def ray_do_range_nlp(questions,task_name_prefix,nb_blocks,num_block):
    return do_range_nlp(questions,task_name_prefix,nb_blocks,num_block)


#info = do_range_nlp(train_dataframe['question1'],'pouet',10,0)
# info = do_range_nlp(train_dataframe['question1'],'pouet',10,1)
# info = do_range_nlp(train_dataframe['question1'],'pouet',10,2)
# info = do_range_nlp(train_dataframe['question1'],'pouet',10,3)
# info = do_range_nlp(train_dataframe['question1'],'pouet',10,4)
# info = do_range_nlp(train_dataframe['question1'],'pouet',10,5)
# info = do_range_nlp(train_dataframe['question1'],'pouet',10,6)
# info = do_range_nlp(train_dataframe['question1'],'pouet',10,7)
# info = do_range_nlp(train_dataframe['question1'],'pouet',10,8)
# info = do_range_nlp(train_dataframe['question1'],'pouet',10,9)

len(train_dataframe['question1'])



404290

In [None]:
info

In [13]:
ray.shutdown()
ray.init(num_cpus=6)
# ray.init(include_dashboard=False,num_cpus=6)

global_start = time.time()
ret = ray.get([
    ray_do_range_nlp.remote(challenge_dataframe['question1'],'challenge_question1',6,0),
    ray_do_range_nlp.remote(challenge_dataframe['question1'],'challenge_question1',6,1),
    ray_do_range_nlp.remote(challenge_dataframe['question1'],'challenge_question1',6,2),
    ray_do_range_nlp.remote(challenge_dataframe['question1'],'challenge_question1',6,3),
    ray_do_range_nlp.remote(challenge_dataframe['question1'],'challenge_question1',6,4),
    ray_do_range_nlp.remote(challenge_dataframe['question1'],'challenge_question1',6,5),
    ])
duration = time.time()-global_start
ray.shutdown()
print ('Global time %d %s' % (duration,ret))

2020-09-04 08:47:30,475	INFO resource_spec.py:223 -- Starting Ray with 18.95 GiB memory available for workers and up to 9.49 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-09-04 08:47:31,047	INFO services.py:1191 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m
[2m[36m(pid=28006)[0m Block from 0 to 390965
[2m[36m(pid=28009)[0m Block from 390966 to 781931
[2m[36m(pid=28001)[0m Block from 781932 to 1172897
[2m[36m(pid=28003)[0m Block from 1172898 to 1563863
[2m[36m(pid=28010)[0m Block from 1563864 to 1954829
[2m[36m(pid=28008)[0m Block from 1954830 to 2345796


KeyboardInterrupt: 

In [15]:
ray.shutdown()

