In [1]:
import spacy
import spacy.cli
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from joblib import Parallel, delayed

In [2]:
# Load the pickled speeches (post-2010 Hansard, per the filename) into `df`.
# Later cells treat `df` as a pandas DataFrame with a `speech` column.
# NOTE: unpickling can execute arbitrary code — only load a file you trust.
filename = 'hansard-speeches-post2010.pkl'
with open(filename, 'rb') as f:
    df = pickle.load(f)

In [8]:
# Ask spaCy to use the GPU if one is available; otherwise it stays on the CPU.
spacy.prefer_gpu()
# Load the small English pipeline with the "parser" component disabled
# (presumably for speed: the visible cells only read lemmas from the docs).
# NOTE(review): any other components the model ships stay enabled — consider
# disabling those that no cell needs.
nlp = spacy.load('en_core_web_sm', disable=["parser"])

In [13]:
def lemmatize(doc, stop_words=nlp.Defaults.stop_words):
    '''Return the lowercase lemmas of the content tokens in a spaCy doc.

    A token is kept only when it is purely alphabetic (``token.is_alpha``)
    and its lowercased text is not in ``stop_words`` (by default the stop
    words of the loaded ``nlp`` pipeline). Kept tokens are returned in
    document order as ``token.lemma_.lower()``.
    '''
    kept = []
    for tok in doc:
        # Drop anything that is not purely alphabetic (numbers, punctuation, ...).
        if not tok.is_alpha:
            continue
        # Stop-word filtering is done on the surface form, not on the lemma.
        if tok.text.lower() in stop_words:
            continue
        kept.append(tok.lemma_.lower())
    return kept

def process_chunk(texts, batch_size=100):
    '''Lemmatize a collection of texts.

    The texts are streamed through ``nlp.pipe`` in batches of ``batch_size``
    and every resulting doc is passed to ``lemmatize``. Returns a list with
    one entry per text, each entry being that text's list of lemmas.
    '''
    docs = nlp.pipe(texts, batch_size=batch_size)
    return [lemmatize(parsed) for parsed in docs]

In [None]:
# The helpers below parallelize lemmatization across CPU processes; this approach is not supported when running on the GPU.

def chunker(iterable, total_length, chunksize):
    '''Lazily split a sliceable sequence into consecutive chunks.

    Returns a generator (not a list) that yields ``iterable[start:start + chunksize]``
    for every ``start`` in ``range(0, total_length, chunksize)``; the final
    chunk may be shorter than ``chunksize``.
    '''
    # The start offsets are computed up front; only the slicing is deferred.
    starts = range(0, total_length, chunksize)
    return (iterable[start:start + chunksize] for start in starts)

def flatten(list_of_lists):
    '''Concatenate the items of every sublist into one flat list (one level deep).'''
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat

def process_parallel(texts, total_length=None, chunksize=100):
    '''Lemmatize ``texts`` in parallel across worker processes.

    ``texts`` is cut into consecutive slices of ``chunksize`` items by
    ``chunker``, every slice is submitted to ``process_chunk`` as a separate
    joblib task, and the per-slice results are concatenated with ``flatten``.

    Parameters
    ----------
    texts : sliceable collection of texts
        Must support ``texts[a:b]``; must also support ``len()`` when
        ``total_length`` is omitted.
    total_length : int, optional
        Chunk start offsets run from 0 up to (not including) this value.
        Defaults to ``len(texts)``, i.e. the whole collection.
    chunksize : int
        Number of texts handed to each task.

    Returns
    -------
    list
        One list of lemmas per processed text.
    '''
    # Callers almost always want the whole collection, so derive the length
    # instead of forcing them to pass it by hand.
    if total_length is None:
        total_length = len(texts)
    # n_jobs=-1 asks joblib to use all available CPUs.
    executor = Parallel(n_jobs=-1, backend='multiprocessing', prefer='processes')
    do = delayed(process_chunk)
    tasks = (do(chunk) for chunk in chunker(texts, total_length=total_length, chunksize=chunksize))
    result = executor(tasks)
    # Each task returns a list of lemma lists; merge them into one list.
    return flatten(result)

In [15]:
# Benchmark: time lemmatizing a random sample of 10,000 speeches.
# df.sample() sits inside the timed statement, so a fresh (unseeded) sample
# is drawn on every timing loop.
%timeit a = process_chunk(df.sample(10000).speech, batch_size=1000)

45.1 s ± 556 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# Extrapolate the benchmark (45.1 s per 10,000 speeches) to every row of df,
# converting seconds to minutes: estimated runtime of the full pass.
((len(df)/10000)*45.1)/60

47.945509333333334

In [17]:
# Lemmatize every speech in a single process (process_parallel is not used
# here) and store the per-speech lemma lists in the 'lemmas' column of df.
df['lemmas'] = process_chunk(df.speech, batch_size=500)

In [19]:
# Persist df (including the 'lemmas' column once the cell above has run) to a
# separate pickle file, so the input file is left untouched.
with open('hansard-speeches-post2010-lemmatized.pkl', 'wb') as f:
    pickle.dump(df, f)