# Memory Benchmarks

1. Excluding unused spaCy components from the pipeline
2. Streaming texts from disk and streaming results back to disk
3. String column dtypes: `object` (Python `str`) -> `string[pyarrow]` -> `category`

In [1]:
%load_ext memory_profiler
import spacy

In [2]:
%%memit
# Baseline: load `en_core_web_sm` with no components excluded. The %%memit
# magic reports the peak memory and the increment caused by this cell.
nlp = spacy.load('en_core_web_sm')

peak memory: 285.05 MiB, increment: 77.94 MiB


In [3]:
%%memit
# NOTE: pandas is imported inside this measured cell, so the reported
# increment covers the import as well as the CSV read.
import pandas as pd

# Read only the 'text' column of the first 10_000 rows and keep it as a Series.
texts = pd.read_csv('~/Downloads/Geolocated_places_climate_with_LGA_and_remoteness_with_text.csv', usecols=['text'],
                    nrows=10_000).loc[:, 'text']

peak memory: 346.45 MiB, increment: 61.41 MiB


In [4]:
%%memit
# Run the full pipeline (all components included) over every text and
# materialise all resulting docs in a list, so they are all held in memory
# at the same time.
docs = list(nlp.pipe(texts))

peak memory: 1278.61 MiB, increment: 932.16 MiB


In [5]:
# List the component names of the full pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
%%memit
# Load the same model again, this time excluding each of the listed components.
# NOTE(review): presumably this leaves only tokenisation active — confirm
# against the spaCy docs.
nlp_lite = spacy.load('en_core_web_sm', exclude=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])

peak memory: 1290.73 MiB, increment: 12.11 MiB


In [7]:
%%memit
# Same workload as the full-pipeline run, but through `nlp_lite`; the results
# are again all collected into a list.
docs2 = list(nlp_lite.pipe(texts))

peak memory: 1381.58 MiB, increment: 92.88 MiB


# Persisting to Disk (Stream)

In [8]:
import re

# Strip every '\n' character from each text, then preview the first result.
# NOTE(review): newlines are deleted rather than replaced with a space, so the
# words on either side of a line break end up joined — confirm this is intended.
newline = re.compile('\n')
processed = texts.apply(lambda tweet: newline.sub('', tweet))
processed.iloc[0]

'<TWEET> "Merry Crisis", "You cannot eat money", "Coal bludger", just some of the messages on show at the Solidarity Sit-Down outside Parliament House. The activists protesting over climate change inaction, they say is contributing to catastrophic bushfire conditions. @9NewsAdel <https://t.co/6qaEHbIXy5> </TWEET>'

In [9]:
# Write one text per line as plain UTF-8.
# `Series.to_csv(..., sep=' ')` is deliberately not used: the CSV writer wraps
# any field containing the separator (a space) or a double quote in quotes and
# doubles embedded quotes, so the file would not contain the raw text that is
# later read back line by line.
with open('/tmp/Geo_texts.txt', 'w', encoding='utf-8') as h:
    h.writelines(f"{text}\n" for text in processed)

In [10]:
# %%memit
def stream(path: str, encoding: str = "utf-8"):
    """Lazily yield the lines of a text file, one at a time.

    The file is read incrementally, so the whole file is never loaded at once.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            rather than the platform-dependent default of ``open``.

    Yields:
        Each line of the file as a ``str`` with its trailing newline removed,
        so that the line terminator is not passed on as part of the text.
    """
    with open(path, 'r', encoding=encoding) as h:
        for line in h:
            # Text mode normalises line endings to '\n'; drop only that
            # terminator and keep any other whitespace in the line.
            yield line.rstrip('\n')


from tqdm import tqdm

In [11]:
%%memit
# Feed the pipeline from the line generator and discard each doc as soon as it
# is produced; nothing is accumulated in a list here.
for doc in nlp_lite.pipe(stream("/tmp/Geo_texts.txt")):
    continue

peak memory: 1385.61 MiB, increment: 1.58 MiB


In [12]:
%%memit
# Stream in and stream out: for every doc produced from the input file, write
# len(doc) on its own line of the output file. No docs are retained.
# NOTE(review): the output filename is spelled "processeed" — likely a typo;
# confirm nothing else relies on this path before renaming it.
with open("/tmp/Geo_texts_processeed.txt", 'w') as h:
    for doc in nlp_lite.pipe(stream("/tmp/Geo_texts.txt")):
        h.write(f"{len(doc)}\n")

peak memory: 1385.77 MiB, increment: 0.16 MiB


# Categorical dtype

In [21]:
%%memit
# Load only the 'tweet_lga' column (the column that is supposed to be
# categorical) from the first 10_000 rows; the result stays a DataFrame.
tweet_lgas = pd.read_csv('~/Downloads/Geolocated_places_climate_with_LGA_and_remoteness_with_text.csv', usecols=['tweet_lga'],
                    nrows=10_000)

peak memory: 1380.27 MiB, increment: -2.22 MiB


In [22]:
# Deep memory usage of the column as loaded (pandas' default storage for str).
per_column_bytes = tweet_lgas.memory_usage(deep=True)
obj_bytes = per_column_bytes.loc['tweet_lga']

f"Tweet_lga (np.object) uses {obj_bytes/1_000_000} mbytes."

'Tweet_lga (np.object) uses 0.696822 mbytes.'

In [23]:
# Same measurement after casting the frame to the pyarrow-backed string dtype.
arrow_backed = tweet_lgas.astype(dtype="string[pyarrow]")
str_bytes = arrow_backed.memory_usage(deep=True).loc['tweet_lga']
f"Tweet_lga (pyarrow) uses {str_bytes/1_000_000} mbytes."

'Tweet_lga (pyarrow) uses 0.166822 mbytes.'

In [24]:
# Same measurement after casting the frame to the categorical dtype.
as_category = tweet_lgas.astype(dtype='category')
cat_bytes = as_category.memory_usage(deep=True).loc['tweet_lga']
f"Tweet_lga (categorical) uses {cat_bytes/1_000_000} mbytes."

'Tweet_lga (categorical) uses 0.021616 mbytes.'

In [25]:
# How many times more bytes the object column uses than the categorical one.
shrink_factor = obj_bytes / cat_bytes
f"From obj -> categorical: ~{shrink_factor: .2f}x"

'From obj -> categorical: ~ 32.24x'

In [31]:
# The sample held 10_000 rows, so 10x10^6 rows is a 1_000x scale-up; converting
# bytes to mbytes divides by 1_000_000, which leaves a net division by 1_000.
# 32 is the rounded obj -> categorical ratio computed earlier in this section.
# NOTE(review): this assumes categorical memory grows linearly with the row
# count — confirm before relying on the estimate.
scaled_obj_mbytes = obj_bytes / 1_000
f"That was for {10_000} rows. Scaling that to 10x10^6 rows would be {scaled_obj_mbytes} mbytes in obj; {scaled_obj_mbytes / 32} mbytes in categorical"

'That was for 10000 rows. Scaling that to 10x10^6 rows would be 696.822 mbytes in obj; 21.7756875 mbytes in categorical'