# Multiprocessing 

In [5]:
from __future__ import print_function, unicode_literals

from collections import defaultdict
import pkg_resources
pkg_resources.require('SpaCy==2.1.0')
import spacy
import neuralcoref
import pandas as pd
import multiprocessing
import numpy as np
import tqdm
import time

def apply_pipeline(text):
    """Run ``text`` through the module-level ``pipe`` callable and return the result.

    NOTE(review): ``pipe`` is not defined in the visible part of this file;
    presumably it is meant to be the loaded spaCy pipeline -- confirm.
    """
    return pipe(text)

def load_processed_data(data="emails_processed.csv", nrows=5000):
    """Read the processed e-mail CSV and return it with a two-column view.

    Args:
        data: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        nrows: Maximum number of rows to read.

    Returns:
        A ``(base_df, focused_df)`` tuple: the frame as read, and the
        selection of its ``Message-ID`` and ``Subject`` columns.
    """
    wanted_columns = ['Message-ID', 'Subject']
    full_frame = pd.read_csv(data, nrows=nrows)
    return full_frame, full_frame[wanted_columns]


def clean_threads(subject_df, subject_col='Subject'):
    """Return a string-typed copy of ``subject_df`` with reply/forward tags removed.

    All columns are cast to ``str`` first. In ``subject_col`` every
    occurrence of ``'Re:'``, ``'re:'``, ``'RE:'`` and ``'FW:'`` is removed
    (in that order) and surrounding whitespace is stripped.

    The cleaning is done with vectorised string methods rather than by
    assigning into the rows yielded by ``iterrows()``: those rows may be
    copies, in which case the assignments never reach the frame.

    Args:
        subject_df: DataFrame containing ``subject_col``.
        subject_col: Name of the column holding the e-mail subject.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    subject_df = subject_df.astype(str)

    cleaned = subject_df[subject_col]
    for tag in ('Re:', 're:', 'RE:', 'FW:'):
        # Literal match: the tags are plain text, not regex patterns.
        cleaned = cleaned.str.replace(tag, '', regex=False)

    subject_df[subject_col] = cleaned.str.strip()
    return subject_df


def get_threads(subject_df, base_df, subject_col='Subject', content_col='content'):
    """Join the contents of all e-mails that share a cleaned subject.

    Args:
        subject_df: DataFrame with the subject column; its row labels are
            used to look up rows in ``base_df``.
        base_df: DataFrame holding ``content_col`` for the same row labels.
        subject_col: Name of the subject column in ``subject_df``.
        content_col: Name of the text column in ``base_df``.

    Returns:
        A ``defaultdict(list)`` mapping each cleaned subject to one string:
        the ``content_col`` values of its rows joined with single spaces.
    """
    # Forward subject_col so a non-default subject column is cleaned too.
    subject_df = clean_threads(subject_df, subject_col)

    # Maps each cleaned subject to the row labels that carry it.
    rows_by_subject = subject_df.groupby(subject_col).groups

    thread_dict_processed = defaultdict(list)
    for subject, row_labels in rows_by_subject.items():
        thread_dict_processed[subject] = ' '.join(base_df.loc[row_labels, content_col])

    return thread_dict_processed



In [6]:
# Load the spaCy 'en_core_web_sm' model; other cells refer to it as the global `nlp`.
nlp = spacy.load('en_core_web_sm')
# Adding neuralcoref to the pipeline is left disabled here.
# neuralcoref.add_to_pipe(nlp)

# Read up to 1000 rows of the default CSV, then build one joined text per cleaned subject.
base_df, focused_df = load_processed_data(nrows=1000)
threads = get_threads(focused_df, base_df)

In [18]:
import numpy as np
import multiprocessing 
 
def parallelize(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    Args:
        df: pandas DataFrame (or Series) to split by position.
        func: Picklable callable that takes one chunk and returns a pandas
            object; the per-chunk results are concatenated in order.

    Returns:
        ``pd.concat`` of the results of ``func`` over all chunks.

    Raises:
        Whatever ``func`` raises in a worker; the pool is shut down first.
    """
    # Leave one core free, but never drop to zero workers on a single-core host.
    workers = max(1, multiprocessing.cpu_count() - 1)
    # No point in more chunks than rows; an empty input still gets one chunk.
    n_chunks = max(1, min(workers, len(df)))

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which newer pandas deprecates.
    positions = np.array_split(np.arange(len(df)), n_chunks)
    data_split = [df.iloc[chunk] for chunk in positions]

    pool = multiprocessing.Pool(n_chunks)
    try:
        result = pd.concat(pool.map(func, data_split))
    finally:
        # Release the workers even when func fails.
        pool.close()
        pool.join()
    return result

In [14]:
def process_data(df):
    """Fill ``df['processed_content']`` with ``nlp`` applied to each row's ``content``.

    The column is created (filled with ``None``) when it is missing. The
    frame is modified in place and also returned.
    """
    target = 'processed_content'
    if target not in df:
        df[target] = None

    for label in df.index:
        df.at[label, target] = nlp(df.at[label, 'content'])

    return df

In [None]:
# Run process_data over the base_df[0:15] slice, split across worker processes.
parallelize(base_df[0:15], process_data)

In [None]:
# Time how long nlp.pipe takes over every value in base_df['content'].
start = time.time()
docs_processed = list(
    nlp.pipe(base_df['content'].tolist(), batch_size=1000, n_threads=5)
)
print('Time elapsed: {} sec'.format(time.time() - start))

In [None]:
# Earlier variant, kept for reference:
# with mp.Pool(processes=2) as pool:
#     threads_processed = pool.map(apply_pipeline, [threads[key] for key in threads.keys()])

# Run `nlp` over every thread text in 8 worker processes with a progress bar.
# The results are consumed only to drive the bar; nothing is kept.
thread_texts = list(threads.values())
# `Pool` is reached through the module: only `multiprocessing` itself is imported.
pool = multiprocessing.Pool(processes=8)
try:
    # Size the bar from the data instead of a hard-coded count.
    for _ in tqdm.tqdm(pool.imap(nlp, thread_texts), total=len(thread_texts)):
        pass
finally:
    pool.close()
    pool.join()