In [2]:
import gc
import numpy as np
import pandas as pd

%%time
# inspired by https://www.kaggle.com/code/renatoreggiani/reduce-memory-usage-zzzs-cmi
# with tweaks determined by the selected polars loading strategy
# tweaks inspired by https://github.com/softhints/Pandas-Tutorials/blob/master/column/3.check-dtype-column-columns-pandas-dataframe.ipynb

def reduce_mem_usage(df, silent=True, allow_categorical=True):
    """ 
    Iterates through all the columns of a dataframe and downcasts the data type
     to reduce memory usage. Can also factorize categorical columns to integer dtype.
    """
    def _downcast_numeric(df, col, allow_categorical=allow_categorical):
        """
        Downcast a numeric series into either the smallest possible int dtype or a specified float dtype.
        """
        if not pd.api.types.is_sparse(df[col].dtype) and \
            not pd.api.types.is_numeric_dtype(df[col].dtype) and \
               not pd.api.types.is_datetime64_any_dtype(df[col].dtype):
            if not allow_categorical:
                codes, uniques = df[col].factorize()
                df[col] = pd.Series(data=codes, index=df[col].index)
                df[col] = _downcast_numeric(df, col)
        else:
            col_type = df[col].dtype
            if col_type != object and not pd.api.types.is_datetime64_any_dtype(col_type):
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float16)

    if silent is False:
        start_mem = np.sum(df.memory_usage()) / 1024 ** 2
        print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    for col in df.columns:
        _downcast_numeric(df, col, allow_categorical)
    if silent is False:
        end_mem = np.sum(df.memory_usage()) / 1024 ** 2
        print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
        print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df


def shrink_mem_new_cols(matrix, oldcols=None, allow_categorical=False):
    # Calls reduce_mem_usage on columns which have not yet been optimized
    if oldcols is not None:
        newcols = matrix.columns.difference(oldcols)
    else:
        newcols = matrix.columns
    matrix.loc[:,newcols] = reduce_mem_usage(matrix.loc[:,newcols], allow_categorical=allow_categorical)
    oldcols = matrix.columns  # This is used to track which columns have already been downcast
    return matrix, oldcols


def list_if_not(s, dtype=str):
    # Puts a variable in a list if it is not already a list
    if type(s) not in (dtype, list):
        raise TypeError
    if (s != "") & (type(s) is not list):
        s = [s]
    return s

# Usage

In [5]:
# Create dataframe, 
d = {'col1': [1, 2, 3], 'col2': [4, 5, 6], 'col2': [7, 8, 9]}
df = pd.DataFrame(data=d)

# Reduce mem usage of every dataframe's column by it's downcasting
df = reduce_mem_usage(df, silent=False)
oldcols = df.columns
gc.collect()

# Add new column and then reduce 
df['col4'] = [10, 11, 12]
df, oldcols = shrink_mem_new_cols(df, oldcols)  # Use this function periodically to downcast dtypes to save memory
gc.collect()

Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 23.9%


5