In [1]:
import spacy
import spacy.cli
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from joblib import Parallel, delayed
import gc

In [2]:
# Read the pickled speeches DataFrame from disk.
# NOTE: pickle.load can execute arbitrary code, so only point this at a trusted file.
with open('speeches.pkl', mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

In [3]:
# Load the small English spaCy pipeline used by the lemmatization helpers below.
# Optional GPU switch, left disabled:
# spacy.prefer_gpu()
# The trailing comment on the load call is a disabled variant that passes disable=["parser"].
nlp = spacy.load('en_core_web_sm')#, disable=["parser"])

# Lemmatize

In [4]:
# "rose" is a common procedural word indicating an MP standing up to request to speak,
# so it is added in place to nlp.Defaults.stop_words (the default stop-word
# collection that lemmatize() filters against).
nlp.Defaults.stop_words.add("rose")

In [5]:
def lemmatize(doc, stop_words=nlp.Defaults.stop_words):
    '''Return the lowercase lemmas of the tokens kept from a spacy doc

    A token is kept only if token.is_alpha is true and its lowercased text
    does not appear in stop_words.

    doc: iterable of spacy tokens (e.g. a Doc)
    stop_words: container of words to drop; defaults to nlp.Defaults.stop_words,
        looked up once when this function is defined
    returns: list of lowercase lemma strings, in token order
    '''
    kept = []
    for token in doc:
        # skip anything that is not purely alphabetic (numbers, punctuation, ...)
        if not token.is_alpha:
            continue
        # the stop-word check is done on the surface form, not on the lemma
        if token.text.lower() in stop_words:
            continue
        kept.append(token.lemma_.lower())
    return kept

In [6]:
def process_chunk(texts, batch_size=50):
    '''Run the spacy pipeline over a chunk of texts and lemmatize every doc

    texts: iterable of strings handed to nlp.pipe
    batch_size: forwarded unchanged to nlp.pipe
    returns: list with one list of lemmas per text
    '''
    docs = nlp.pipe(texts, batch_size=batch_size)
    return [lemmatize(doc) for doc in docs]

def chunker(iterable, chunksize):
    '''Lazily split a sliceable sequence into consecutive chunks

    iterable: object supporting len() and slicing
    chunksize: maximum number of items per chunk
    returns: generator yielding slices of iterable; the last one may be shorter
    '''
    # the range is built eagerly, so a bad chunksize fails at call time
    starts = range(0, len(iterable), chunksize)
    return (iterable[start: start + chunksize] for start in starts)

def flatten(list_of_lists):
    '''Concatenate the items of every sublist into a single flat list, in order'''
    flat = []
    for sublist in list_of_lists:
        flat.extend(sublist)
    return flat

def process_parallel(texts, chunksize=100, n_jobs=-1):
    '''Lemmatize texts in parallel, one process_chunk task per chunk

    texts: sliceable sequence of strings, split up with chunker
    chunksize: number of texts per task
    n_jobs: forwarded to joblib.Parallel
    returns: flat list with one list of lemmas per text
    '''
    pool = Parallel(n_jobs=n_jobs, backend='multiprocessing', prefer='processes')
    # each task lemmatizes one chunk; the per-chunk lists are merged afterwards
    per_chunk = pool(
        delayed(process_chunk)(chunk)
        for chunk in chunker(texts, chunksize=chunksize)
    )
    return flatten(per_chunk)

In [7]:
# for running on GPU - no parallelization
# df['lemmas'] = process_chunk(df.speech, batch_size=500)

In [8]:
%%time
# CPU path - parallelization: lemmatize df.speech with n_jobs=15 workers,
# in chunks of up to 100 texts per task, and store the result as a new 'lemmas' column
df['lemmas'] = process_parallel(df.speech, chunksize=100, n_jobs=15)

CPU times: user 37.1 s, sys: 6.18 s, total: 43.3 s
Wall time: 18min 26s


In [9]:
# Remember the row count before rows without lemmas are filtered out
raw_len = len(df)
raw_len

609719

In [10]:
# Keep only the rows whose list of lemmas is non-empty
df = df.loc[df['lemmas'].map(len) > 0]

In [11]:
# Number of rows removed by the empty-lemma filter
print(raw_len - len(df))
# The same count as a fraction of the original row count, rounded to 3 decimals
print(np.around((raw_len - len(df)) / raw_len, 3))

11487
0.019


1.9% of contributions have no lemmas

In [12]:
# Persist the DataFrame, including its 'lemmas' column, for later use
with open('speeches-lemmatized.pkl', mode='wb') as out_file:
    pickle.dump(df, out_file)