In [2]:
import pandas as pd

file_name = 'data_files/data.xlsx'
#updates_file = 'updates.xlsx'

df = pd.read_excel(file_name, sheet_name='Data')

In [4]:
import numpy as np

def optimize_dataframe(df):
    """
    This function optimizes the data types of columns in a pandas dataframe for more efficient memory usage.
    
    Parameters:
    df (pandas.DataFrame): The input dataframe to optimize.
    
    Returns:
    df_optimized (pandas.DataFrame): The optimized dataframe.
    """
    
    # Get the initial memory usage of the dataframe
    initial_memory = df.memory_usage().sum() / (1024**2)
    print(f"Initial memory usage: {initial_memory:.2f} MB")
    
    # Get the data types of each column
    dtypes = df.dtypes
    
    # Create a dictionary to store the optimized data types
    optimized_dtypes = {}
    
    # Loop through each column
    for col, dtype in dtypes.iteritems():
        
        # Convert integer columns to smaller integer types if possible
        if dtype == 'int64':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                optimized_dtypes[col] = 'int8'
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                optimized_dtypes[col] = 'int16'
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                optimized_dtypes[col] = 'int32'
            else:
                optimized_dtypes[col] = 'int64'
        
        # Convert float columns to smaller float types if possible
        elif dtype == 'float64':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                optimized_dtypes[col] = 'float16'
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                optimized_dtypes[col] = 'float32'
            else:
                optimized_dtypes[col] = 'float64'
        
        # Keep non-numeric columns as is
        else:
            optimized_dtypes[col] = dtype
    
    # Optimize the dataframe by changing the data types of columns
    df_optimized = df.astype(optimized_dtypes)
    
    # Get the optimized memory usage of the dataframe
    optimized_memory = df_optimized.memory_usage().sum() / (1024**2)
    print(f"Optimized memory usage: {optimized_memory:.2f} MB")
    
    return df_optimized

In [5]:
better = optimize_dataframe(df)

Initial memory usage: 50.06 MB
Optimized memory usage: 45.13 MB


  for col, dtype in dtypes.iteritems():
