<a href="https://colab.research.google.com/github/aneof/Cheat-sheets/blob/master/Pandas_memory_leak_tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A quick test for memory leaks when deleting or overwriting large dataframes after a column modification. It was mostly an issue in Google Cloud AI Platform Notebooks. 

In [None]:
import pandas as pd
import os
import gc

In [None]:
# Source for cleaning dataframe memory
# https://stackoverflow.com/questions/39100971/how-do-i-release-memory-used-by-a-pandas-dataframe/49144260#49144260

In [None]:
# Load the dataset used by every test below into `in_df`.
# Replace the path with a large (>5 GB) parquet file of your choice.
# NOTE: each test ends by rebinding `in_df` to an empty DataFrame, so re-run
# this cell before any test that reads in_df['html'].
in_df = pd.read_parquet('Data/CafeMedia/cafemedia_parquet/html/day=2021-01-26.parquet')
# Show the frame's shape as a quick confirmation that the load worked
print(in_df.shape)

In [None]:
# test 1 (works as long as there is no editing)
# Baseline: the frame is dropped without having been modified.

# Remove the name, rebind it to an empty frame, then force a collection pass
del in_df
in_df=pd.DataFrame()
gc.collect()

In [None]:
# test 2 (leaks 6gb on GCP)
# there's probably a leftover reference that doesn't allow memory to be freed
# Needs a freshly loaded `in_df` that still has its 'html' column.

# Column modification under test: add 'temp' via Series.apply, keeping the
# first 1000 elements of each 'html' value (characters, assuming strings)
in_df['temp'] = in_df['html'].apply(lambda x: x[:1000])

# Drop the modified frame, rebind the name to an empty frame, force a collection
del in_df
in_df=pd.DataFrame()
gc.collect()

In [None]:
# test 3 (leaks 6gb on GCP)
# Builds the 'temp' column from a plain Python list instead of Series.apply.
# Needs a freshly loaded `in_df` that still has its 'html' column.

# Copy the column's values into a list; the list references every value
htmls = list(in_df['html'].values)

# Add 'temp' holding the first 1000 elements of each value
in_df['temp'] = [html[:1000] for html in htmls]

# Drop the modified frame, rebind the name to an empty frame, force a collection
del in_df
in_df=pd.DataFrame()
gc.collect()

# FIX (colab)
# Release the list too: while `htmls` exists it still references the values
# that came from the 'html' column.
del htmls
htmls = []

In [None]:
# test 4 (leaks everything on GCP)
# Same list-based modification, but the 'temp' column is deleted explicitly
# before the frame itself is dropped.
# Needs a freshly loaded `in_df` that still has its 'html' column.
# NOTE(review): `htmls` is never deleted in this cell, so after the cleanup it
# still references every value taken from the 'html' column. That alone may
# explain the "leaks everything" result -- confirm by adding `del htmls`.

# Copy the column's values into a list; the list references every value
htmls = list(in_df['html'].values)

# Add 'temp' holding the first 1000 elements of each value
in_df['temp'] = [html[:1000] for html in htmls]

# Remove the added column first, then the frame, then force a collection
del in_df['temp']
del in_df
in_df=pd.DataFrame()
gc.collect()

In [None]:
# A way to list large objects
import sys

# Names that IPython defines by itself, plus this list (created right here)
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Pair every remaining public global with its sys.getsizeof() value and show
# the pairs ordered from largest to smallest
sorted(
    (
        (name, sys.getsizeof(globals().get(name)))
        for name in dir()
        if not (
            name.startswith('_')
            or name in sys.modules
            or name in ipython_vars
        )
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

# Garbage collection hotfix

In [None]:
# WARNING: THIS MAY BREAK PANDAS FUNCTIONALITY
# DON'T USE IT IF PANDAS ERRORS ARE ENCOUNTERED AFTERWARDS

# monkeypatches.py

# Solving memory leak problem in pandas
# https://github.com/pandas-dev/pandas/issues/2659#issuecomment-12021083
# Basically adds a __del__ to pd.DataFrame (under CPython) that calls glibc's malloc_trim(0), asking the allocator to return freed heap memory to the OS

import pandas as pd
from ctypes import cdll, CDLL
import sys
# Probe for glibc: malloc_trim() is a glibc extension, so both loading
# "libc.so.6" and resolving the symbol can fail on other platforms.
try:
    libc = CDLL("libc.so.6")
    # Doubles as a check that the symbol exists and trims the heap once now.
    libc.malloc_trim(0)
except (OSError, AttributeError):
    # OSError: the library could not be loaded.
    # AttributeError: the library has no malloc_trim symbol.
    libc = None

# Finalizer currently installed on DataFrame (None when there is none).
__old_del = getattr(pd.DataFrame, '__del__', None)
# When this cell is re-run, the installed finalizer is our own wrapper. Chain
# to whatever it wrapped instead of stacking one more malloc_trim() per run.
if getattr(__old_del, '_malloc_trim_patch', False):
    __old_del = __old_del._wrapped_del


def __new_del(self):
    """Finalizer for pd.DataFrame: run the previous one, then trim the heap.

    Calls the finalizer that was in place before the patch (if any) and then
    asks glibc, via malloc_trim(0), to hand free heap memory back to the OS.
    """
    if __old_del:
        __old_del(self)
    # Guard so the finalizer never raises when libc is unavailable.
    if libc is not None:
        libc.malloc_trim(0)


# Markers used to recognise (and unwrap) the patch when the cell is re-run.
__new_del._malloc_trim_patch = True
__new_del._wrapped_del = __old_del

if libc:
    print('Applying monkeypatch for pd.DataFrame.__del__', file=sys.stderr)
    pd.DataFrame.__del__ = __new_del
else:
    print('Skipping monkeypatch for pd.DataFrame.__del__: libc or malloc_trim() not found', file=sys.stderr)

In [None]:
# test 5 (with garbage collection hotfix) (leaks everything on GCP)
# List-based modification followed by two cleanup rounds.
# Assumes the monkeypatch cell has been run and `in_df` was freshly loaded.
# NOTE(review): `htmls` is never deleted in this cell, so it still references
# every value taken from the 'html' column after both cleanup rounds; this may
# be why the hotfix does not help here -- confirm by adding `del htmls`.

# Copy the column's values into a list; the list references every value
htmls = list(in_df['html'].values)

# Add 'temp' holding the first 1000 elements of each value
in_df['temp'] = [html[:1000] for html in htmls]

# Cleanup round 1: drop the modified frame and force a collection
del in_df
in_df=pd.DataFrame()
gc.collect()
# Cleanup round 2: drop the empty placeholder frame and collect again
del in_df
in_df=pd.DataFrame()
gc.collect()

In [None]:
# test 6 (with garbage collection hotfix) (works great everywhere)
# gc needs to be called twice on GCP for some reason. It clears half the memory
# per call
# Assumes the monkeypatch cell has been run and `in_df` was freshly loaded.

# Column modification under test: add 'temp' via Series.apply, keeping the
# first 1000 elements of each 'html' value (characters, assuming strings)
in_df['temp'] = in_df['html'].apply(lambda x: x[:1000])

# Cleanup round 1: drop the modified frame and force a collection
del in_df
in_df=pd.DataFrame()
gc.collect()
# Cleanup round 2: drop the empty placeholder frame and collect again.
# NOTE(review): with the hotfix installed, deleting a DataFrame presumably
# triggers another malloc_trim(0) through __del__ -- verify that this is what
# frees the remaining memory.
del in_df
in_df=pd.DataFrame()
gc.collect()