Skip to content

Commit

Permalink
fix(util.pandas.optimise_df): Stricter processing for unsigned data types.
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronmussig committed Apr 17, 2022
1 parent bc4945d commit 64063ff
Showing 1 changed file with 30 additions and 5 deletions.
35 changes: 30 additions & 5 deletions magna/util/pandas.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,39 @@
import numpy as np
import pandas as pd


def optimise_df(df: pd.DataFrame, integers: bool = True, floats: bool = True):
    """Optimise a Pandas DataFrame by using the smallest possible data type.

    The DataFrame is modified in place; nothing is returned.

    Float columns are downcast with ``pd.to_numeric(..., downcast='float')``.
    Integer columns are cast to the narrowest NumPy integer type whose range
    contains both the column minimum and maximum: an unsigned type when the
    minimum is non-negative, a signed type otherwise.

    Args:
        df: The Pandas DataFrame to optimise.
        integers: Whether to optimise the integers.
        floats: Whether to optimise the floats.

    Raises:
        ValueError: If no integer type can hold the range of a column.
    """
    if floats:
        float_cols = df.select_dtypes('float').columns
        # Nothing to assign when the frame has no float columns.
        if len(float_cols) > 0:
            df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float')

    if integers:
        # Candidate types, ordered narrowest to widest.
        sint_info = [np.iinfo(t) for t in (np.int8, np.int16, np.int32, np.int64)]
        uint_info = [np.iinfo(t) for t in (np.uint8, np.uint16, np.uint32, np.uint64)]

        for int_col in df.select_dtypes('integer').columns:
            column = df[int_col]

            # min()/max() of an empty column are NaN, which fails every range
            # check below; there is nothing to shrink, so leave it untouched.
            if column.empty:
                continue

            # Plain Python ints keep the bound comparisons exact, independent
            # of NumPy's scalar promotion rules.
            col_min, col_max = int(column.min()), int(column.max())

            # Determine which data types to use
            candidates = uint_info if col_min >= 0 else sint_info

            # Set the data type
            for info in candidates:
                if info.min <= col_min and col_max <= info.max:
                    # Skip the copy when the column is already minimal.
                    if column.dtype != info.dtype:
                        df[int_col] = column.astype(info.dtype)
                    break
            else:
                raise ValueError(f'Could not determine data type for column {int_col}')

0 comments on commit 64063ff

Please sign in to comment.