In [1]:
#!/usr/bin/env python
# coding: utf8
"""
Example of multi-processing with Joblib and getting entity relations by thread.

Prerequisites: pip install joblib
"""
from __future__ import print_function, unicode_literals

from pathlib import Path
from joblib import Parallel, delayed
from functools import partial
import pkg_resources
pkg_resources.require('SpaCy==2.1.0')
import plac
import spacy
from spacy.util import minibatch
import neuralcoref
import pandas as pd


@plac.annotations(
    model=("Model name (needs tagger)", "positional", None, str),
    n_jobs=("Number of workers", "option", "n", int),
    batch_size=("Batch-size for each process", "option", "b", int),
    limit=("Limit of entries from the dataset", "option", "l", int),
)

def main(model="en_core_web_sm", n_jobs=5, batch_size=100, limit=100000):
    nlp = spacy.load(model)  # load spaCy model
    print("Loaded model '%s'" % model)
    # load and pre-process emails
    print("Loading data...")
    base_df, focused_df = load_process_data()
    texts = focused_df.ix[:,0].tolist()
    print("Processing texts...")
    partitions = minibatch(texts, size=batch_size)
    executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
    do = delayed(partial(transform_texts, nlp))
    tasks = (do(i, batch) for i, batch in enumerate(partitions))
    executor(tasks)    


def load_process_data(data = "emails_processed.csv"):
    base_df = pd.read_csv(data)
    base_df = base_df.set_index('Message-ID', drop=True)
    base_df = base_df[0:1000]
    focused_df = base_df[['Subject']]
    return base_df, focused_df
    
    
def transform_texts(nlp, batch_id, texts):
    print(nlp.pipe_names)
    out_path = Path(output_dir) / ("%d.txt" % batch_id)
    print("Processing batch", batch_id)
    process_threads(subject_df = emails_df[['Subject']], base_df = emails_df)
    print("Processed {} emails in batch {}".format(len(texts), batch_id))

    
def process_threads(subject_df, base_df, subject_col='Subject', content_col='content'):
    subject_df = clean_threads(subject_df)
    
    # Group by subject
    subject_dict = subject_df.groupby(subject_col).groups
    
    # Get content for emails in same thread
    thread_dict = defaultdict(list)
    thread_dict_processed = defaultdict(list)
    for key, value in subject_dict.items():
        for value in subject_dict[key]:
            thread_dict[key].append(base_df.loc[value][content_col])

    for key in thread_dict.keys():
        thread_dict_processed[key] = ' '.join(thread_dict[key])
    
    # Pass processed strings to SpaCy pipeline
    for key in thread_dict_processed.keys():
        thread_dict_processed[key] = nlp(thread_dict_processed[key])
    
    return thread_dict_processed
    
    
def clean_threads(subject_df, subject_col='Subject'):
    subject_df = subject_df.astype(str)
    
    for index, row in subject_df.iterrows():
        row[subject_col] = row[subject_col].replace('Re:', '')
        row[subject_col] = row[subject_col].replace('re:', '')
        row[subject_col] = row[subject_col].replace('RE:', '')
        row[subject_col] = row[subject_col].replace('FW:', '')
        row[subject_col] = row[subject_col].strip()
    return subject_df


In [None]:
main()


Loaded model 'en_core_web_sm'
Loading data...


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Processing texts...
