<a href="https://colab.research.google.com/github/aneof/Cheat-sheets/blob/master/Pandas_memory_leak_tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A quick test for memory leaks when deleting or overwriting large DataFrames. The leaks were mostly observed in Google Cloud AI Platform Notebooks.

In [None]:
import pandas as pd
import os
import gc

In [None]:
# Load the large frame that every test cell below consumes as `in_df`.
# Each test ends by rebinding `in_df` to an empty DataFrame, so this cell has
# to be re-run before starting the next test.
# replace with large (>5gb) file of your choice
# (tests 2-6 read an 'html' column, so the file must provide one)
in_df = pd.read_parquet('Data/CafeMedia/cafemedia_parquet/html/day=2021-01-26.parquet')
print(in_df.shape)  # sanity check that the load produced the expected frame
gc.collect()  # run a collection pass before any measurement starts

In [None]:
# test 1 (works)
# Baseline: drop the loaded frame without touching it first.
# Expects `in_df` to still hold the frame produced by the load cell.
# "(works)" is the author's recorded outcome for this sequence.

del in_df  # unbind the name from the large frame
in_df=pd.DataFrame()  # rebind the name to an empty frame
gc.collect()  # run a full collection pass

In [None]:
# test 2 (leaks 6gb)
# Same teardown as test 1, but a derived column is first built with
# Series.apply. Needs `in_df` freshly loaded, with an 'html' column.
# "(leaks 6gb)" is the author's recorded outcome for this sequence.

# Slice every 'html' value to its first 1000 elements
# (presumably characters of an HTML string - confirm against the data).
in_df['temp'] = in_df['html'].apply(lambda x: x[:1000])

del in_df  # unbind the name from the large frame
in_df=pd.DataFrame()  # rebind the name to an empty frame
gc.collect()  # run a full collection pass

In [None]:
# test 3 (leaks 6gb)
# Variant of test 2 that builds the derived column through an intermediate
# Python list instead of Series.apply. Needs `in_df` freshly loaded, with an
# 'html' column. "(leaks 6gb)" is the author's recorded outcome.

# Copy the column's values into a plain list; this list is a separate name
# that outlives `in_df`.
htmls = list(in_df['html'].values)

# Slice every value to its first 1000 elements and store them as a new column.
in_df['temp'] = [html[:1000] for html in htmls]

del in_df  # unbind the name from the large frame
in_df=pd.DataFrame()  # rebind the name to an empty frame
gc.collect()  # run a full collection pass

# FIX (colab)
# `del in_df` does not touch `htmls`, so the list is released explicitly here.
# NOTE(review): "(colab)" presumably means this was enough on Colab - confirm.
del htmls
htmls = []

In [None]:
# test 4 (leaks everything)
# Variant of test 3 that removes the derived column before deleting the frame.
# Needs `in_df` freshly loaded, with an 'html' column. Unlike test 3, `htmls`
# is not released at the end of this cell.
# "(leaks everything)" is the author's recorded outcome.

# Copy the column's values into a plain list that outlives `in_df`.
htmls = list(in_df['html'].values)

# Slice every value to its first 1000 elements and store them as a new column.
in_df['temp'] = [html[:1000] for html in htmls]

del in_df['temp']  # drop the derived column first
del in_df  # then unbind the name from the frame
in_df=pd.DataFrame()  # rebind the name to an empty frame
gc.collect()  # run a full collection pass

# Garbage collection hotfix

In [None]:
# WARNING: THIS MAY BREAK PANDAS FUNCTIONALITY
# DON'T USE IT IF PANDAS ERRORS ARE ENCOUNTERED AFTERWARDS

# monkeypatches.py

# Solving memory leak problem in pandas
# https://github.com/pandas-dev/pandas/issues/2659#issuecomment-12021083
#
# Installs a pd.DataFrame.__del__ that calls libc's malloc_trim(0) every time
# a DataFrame is finalized. Running this cell more than once is safe: a re-run
# replaces the previous patch instead of wrapping it.
import pandas as pd
from ctypes import cdll, CDLL
import sys

try:
    # A single load is enough. The malloc_trim(0) call doubles as a probe, so
    # a libc without that symbol (AttributeError) is handled like a missing
    # libc (OSError).
    libc = CDLL("libc.so.6")
    libc.malloc_trim(0)
except (OSError, AttributeError):
    libc = None

# Finalizer that was in place before this patch, if any. On a re-run,
# pd.DataFrame.__del__ is the wrapper installed by the previous run, so it is
# unwrapped here; otherwise the wrappers would pile up on every re-run.
__old_del = getattr(pd.DataFrame, '__del__', None)
__old_del = getattr(__old_del, '_wrapped_del', __old_del)


def __new_del(self, _wrapped_del=__old_del, _libc=libc):
    """Run the previous finalizer (if any), then call ``malloc_trim(0)``.

    The collaborators are bound as default arguments when the function is
    defined, so the finalizer never looks them up through module globals.
    Those globals are rebound when the cell is re-run (which previously made
    the wrapper call itself recursively) and may be unavailable while the
    interpreter shuts down.
    """
    if _wrapped_del:
        _wrapped_del(self)
    if _libc is not None:
        # Ask the allocator to release free heap memory back to the OS.
        _libc.malloc_trim(0)


# Record what was wrapped so that a later re-run can unwrap this patch.
__new_del._wrapped_del = __old_del

if libc:
    print('Applying monkeypatch for pd.DataFrame.__del__', file=sys.stderr)
    pd.DataFrame.__del__ = __new_del
else:
    print('Skipping monkeypatch for pd.DataFrame.__del__: libc or malloc_trim() not found', file=sys.stderr)

In [None]:
# test 5 (with garbage collection hotfix) (leaks everything)
# Test 3's list-based column build, run after the hotfix cell that patches
# pd.DataFrame.__del__. Needs `in_df` freshly loaded, with an 'html' column.
# `htmls` is not released in this cell.
# "(leaks everything)" is the author's recorded outcome.

# Copy the column's values into a plain list that outlives `in_df`.
htmls = list(in_df['html'].values)

# Slice every value to its first 1000 elements and store them as a new column.
in_df['temp'] = [html[:1000] for html in htmls]

# First pass: drop the large frame.
del in_df
in_df=pd.DataFrame()
gc.collect()
# Second pass: drop the empty frame created by the first pass.
# NOTE(review): presumably repeated so the patched __del__ runs once more
# after the collection above - confirm the intent.
del in_df
in_df=pd.DataFrame()
gc.collect()

In [None]:
# test 6 (with garbage collection hotfix) (works great)
# Test 2's Series.apply column build, run after the hotfix cell that patches
# pd.DataFrame.__del__. Needs `in_df` freshly loaded, with an 'html' column.
# "(works great)" is the author's recorded outcome.

# Slice every 'html' value to its first 1000 elements
# (presumably characters of an HTML string - confirm against the data).
in_df['temp'] = in_df['html'].apply(lambda x: x[:1000])

# First pass: drop the large frame.
del in_df
in_df=pd.DataFrame()
gc.collect()
# Second pass: drop the empty frame created by the first pass.
# NOTE(review): presumably repeated so the patched __del__ runs once more
# after the collection above - confirm the intent.
del in_df
in_df=pd.DataFrame()
gc.collect()