# Multiprocessing

**Note:** this notebook does not work on Windows.

In [1]:
from __future__ import print_function, unicode_literals

from collections import defaultdict
import pkg_resources
pkg_resources.require('SpaCy<=2.1.3')
import spacy
import neuralcoref
import pandas as pd
import multiprocessing as mp
import numpy as np
import tqdm
import time
import pickle

# Named-entity labels of interest. process_emails() adds one 0/1 indicator
# column per label, set by comparing the label against the `label_` of every
# entity the spaCy pipeline finds in an e-mail.
ENTITIES_OF_INTEREST = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE',
'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

def apply_pipeline(text):
    """Pass ``text`` through the module-level ``pipe`` callable and return the result.

    Thin wrapper intended for use with ``Pool.map``.
    """
    # NOTE(review): `pipe` is not defined in the visible part of this
    # notebook -- confirm it exists (possibly `nlp` was intended).
    return pipe(text)

def load_processed_data(data="emails_processed.csv", nrows=5000):
    """Read the processed e-mail CSV and derive a two-column subject frame.

    Args:
        data: Path (or any source accepted by ``pandas.read_csv``) of the CSV.
        nrows: Maximum number of rows to read from the file.

    Returns:
        A tuple ``(base_df, focused_df)``: the frame as read, and a frame
        holding only its ``'Message-ID'`` and ``'Subject'`` columns.
    """
    wanted_columns = ['Message-ID', 'Subject']
    full_frame = pd.read_csv(data, nrows=nrows)
    return full_frame, full_frame[wanted_columns]


def clean_threads(subject_df, subject_col='Subject'):
    """Return a string-typed copy of ``subject_df`` with reply/forward markers removed.

    Every column is converted to ``str``. In ``subject_col`` the markers
    ``'Re:'``, ``'re:'``, ``'RE:'`` and ``'FW:'`` are removed wherever they
    occur (in that order), then surrounding whitespace is stripped.

    Args:
        subject_df: DataFrame containing the subject column.
        subject_col: Name of the column holding e-mail subjects.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # astype() returns a new frame, so the assignment below never touches
    # the caller's data.
    subject_df = subject_df.astype(str)

    # Operate on the column itself. Writing to the row objects yielded by
    # iterrows() is not guaranteed to reach the frame (pandas may hand out
    # copies), so the cleaned subjects could silently be lost.
    cleaned = subject_df[subject_col]
    for marker in ('Re:', 're:', 'RE:', 'FW:'):
        # regex=False: plain substring replacement, same as str.replace.
        cleaned = cleaned.str.replace(marker, '', regex=False)
    subject_df[subject_col] = cleaned.str.strip()

    return subject_df


def get_threads(subject_df, base_df, subject_col='Subject', content_col='content'):
    """Concatenate the content of all e-mails that share a cleaned subject.

    Args:
        subject_df: DataFrame with the subject column; cleaned with
            ``clean_threads`` before grouping.
        base_df: DataFrame holding the e-mail bodies. It is looked up with
            the index labels of ``subject_df``, so both frames must share
            the same index.
        subject_col: Name of the subject column in ``subject_df``.
        content_col: Name of the body column in ``base_df``.

    Returns:
        A ``defaultdict(list)`` mapping each cleaned subject to the bodies
        of its e-mails joined with single spaces.
    """
    # Forward subject_col: previously the default 'Subject' was always used
    # for cleaning, which broke any call with a different column name.
    subject_df = clean_threads(subject_df, subject_col=subject_col)

    # Map each cleaned subject to the index labels of its e-mails.
    labels_by_subject = subject_df.groupby(subject_col).groups

    # Kept as defaultdict(list) so lookups of unknown subjects behave as
    # they did before (they yield an empty list).
    threads = defaultdict(list)
    for subject, labels in labels_by_subject.items():
        threads[subject] = ' '.join(
            base_df.loc[label, content_col] for label in labels
        )

    return threads


def parallelize_df(df, func, save=False, save_file_name='', n_workers=None):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel worker processes.

    The frame is split into ``n_workers`` consecutive chunks, ``func`` is
    mapped over them with a ``multiprocessing.Pool`` and the returned frames
    are concatenated in the original order.

    Args:
        df: DataFrame to process.
        func: Picklable callable taking a DataFrame chunk and returning a
            DataFrame.
        save: If true, pickle the result to ``save_file_name`` instead of
            returning it.
        save_file_name: Target path used when ``save`` is true.
        n_workers: Number of worker processes and chunks. Defaults to one
            less than the CPU count, but at least 1.

    Returns:
        The concatenated DataFrame when ``save`` is false, otherwise ``None``.

    Raises:
        ValueError: If ``n_workers`` is given and is smaller than 1.
    """
    if n_workers is None:
        # Leave one core free, but never go below one worker: on a
        # single-core machine `cores - 1` gave zero partitions and
        # np.array_split raised ValueError.
        n_workers = max(1, mp.cpu_count() - 1)
    elif n_workers < 1:
        raise ValueError('n_workers must be at least 1')

    # Split row positions instead of passing the frame to np.array_split,
    # which relies on NumPy treating the DataFrame like an array (reported
    # as deprecated by recent pandas releases). Chunk sizes are unchanged.
    position_chunks = np.array_split(np.arange(len(df)), n_workers)
    df_split = [df.iloc[positions] for positions in position_chunks]

    pool = mp.Pool(n_workers)
    try:
        df = pd.concat(pool.map(func, df_split))
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()

    if save:
        df.to_pickle(save_file_name)
        print('Saved to disk!')
    else:
        return df


def process_neuralcoref(df):
    """Store the coreference-resolved text of each e-mail in ``df``.

    The ``'content'`` column is streamed through the module-level ``nlp``
    pipeline in batches of 150, and each document's ``_.coref_resolved``
    value is written to the ``'neuralcoref_content'`` column of its row.

    Args:
        df: DataFrame with a ``'content'`` column; modified in place.

    Returns:
        The same DataFrame.
    """
    # The target column is not created up front; the code relies on the
    # `.at` assignment below to add it.
    texts = df['content'].tolist()
    resolved_docs = nlp.pipe(texts, batch_size=150)

    for row_label, doc in zip(df.index, resolved_docs):
        df.at[row_label, 'neuralcoref_content'] = doc._.coref_resolved

    return df


def process_emails(df, entity_list=ENTITIES_OF_INTEREST):
    """Run the spaCy pipeline on every e-mail and flag which entity types occur.

    For each row, the ``'content'`` text is parsed with the module-level
    ``nlp`` pipeline, the resulting document is stored in
    ``'processed_content'``, and one column per label in ``entity_list`` is
    set to 1 if the document contains an entity with that label, else 0.

    Args:
        df: DataFrame with a ``'content'`` column; modified in place.
        entity_list: Entity labels to flag. The default list is only read,
            never mutated.

    Returns:
        The same DataFrame with the added columns.
    """
    if 'processed_content' not in df:
        df['processed_content'] = None

    # Only the index labels are needed; iterrows() would build an unused
    # Series for every row.
    for row_label in df.index:
        doc = nlp(df.at[row_label, 'content'])
        df.at[row_label, 'processed_content'] = doc

        # Collect the labels once per e-mail instead of rebuilding the list
        # for every entity type being checked.
        labels_found = {ent.label_ for ent in doc.ents}
        for entity in entity_list:
            df.at[row_label, entity] = 1 if entity in labels_found else 0

    return df

In [2]:
# Load the spaCy English model; `nlp` is the module-level pipeline that
# process_emails() and process_neuralcoref() call.
nlp = spacy.load('en_core_web_sm')

# Read the first 5000 rows of the processed e-mail CSV (default file name).
base_df, focused_df = load_processed_data(nrows=5000)

In [3]:
# Time the entity-flagging pass over base_df, run across worker processes.
start_time = time.time()
processed_df = parallelize_df(base_df, process_emails)
# Elapsed wall-clock time in minutes, rounded to two decimals.
print("{min} minutes".format(min = round((time.time()-start_time)/60,2)))

3.85 minutes


Run times recorded for this step, by number of rows loaded:

- 5000 rows: 1.64 min
- 10000 rows: 4.32 min

In [None]:
# Keep only the e-mails in which a PERSON entity was flagged.
# NOTE(review): the neuralcoref cell below passes processed_df, not this
# subset -- confirm which one was intended.
neuralcoref_df = processed_df.loc[processed_df['PERSON'] == 1]

In [4]:
# Time the coreference pass; the timer also covers adding neuralcoref to
# the pipeline.
start_time = time.time()
neuralcoref.add_to_pipe(nlp)

# nlp.remove_pipe('tagger')
# nlp.remove_pipe('ner')
# nlp.remove_pipe('parser')

# save=True: the result is pickled to 'test.pkl' and nothing is returned.
parallelize_df(processed_df, process_neuralcoref, True, 'test.pkl')
# Elapsed wall-clock time in minutes, rounded to two decimals.
print("{min} minutes".format(min = round((time.time()-start_time)/60,2)))

Saved to disk!
27.83 minutes


### Alternative approach: also works, but is far less flexible

In [None]:
# with mp.Pool(processes=2) as pool:
#     threads_processed = pool.map(apply_pipeline, [threads[key] for key in threads.keys()])

# Run the pipeline over every thread text with a progress bar; the results
# are discarded.
# NOTE(review): `threads` is not defined in the visible cells -- presumably
# the dict returned by get_threads(); confirm.
# `Pool` was never imported on its own (only `mp` is), so the bare name
# raised NameError; use mp.Pool.
pool = mp.Pool(processes=8)
try:
    for _ in tqdm.tqdm(pool.imap(nlp, list(threads.values())), total=len(threads)):
        pass
finally:
    # Release the worker processes even if a task fails.
    pool.close()
    pool.join()