In [1]:
import pandas as pd
import numpy as np
import string
import polars as pl

In [2]:
# Shape of each generated dataset. write_datasets_to_disk reads num_rows,
# num_cols and str_lenght as module-level globals.
num_rows = 2000000  # rows per dataset
num_cols = 5  # number of random-string columns added by add_string_data
# Characters per random string. NOTE(review): the name is misspelled
# ("lenght"); write_datasets_to_disk references this exact spelling, so a
# rename has to change both places together.
str_lenght = 6
# NOTE(review): not read by any visible cell -- write_datasets_to_disk takes its
# own `num_dataframes` argument and is called with a literal value.
num_dataframes = 1

def make_df(num_rows: int, num_dtype, cat_dtype, num_numeric_cols: int = 5, num_cat_cols: int = 3):
    """Build a random DataFrame of categorical and integer columns.

    The categorical columns come first (``cat_col_0``, ``cat_col_1``, ...),
    followed by the numerical columns (``num_col_0``, ``num_col_1``, ...).

    Parameters
    ----------
    num_rows : int
        Number of rows. Also used as the exclusive upper bound of the random
        integers, so numerical values lie in ``[0, num_rows)``.
    num_dtype
        dtype the numerical columns are cast to (e.g. ``np.int64``).
    cat_dtype
        dtype the categorical columns are cast to (e.g. ``'category'``).
    num_numeric_cols : int, default 5
        Number of numerical columns to generate.
    num_cat_cols : int, default 3
        Number of categorical columns to generate; each value is drawn from
        ``'A'``, ``'B'``, ``'C'``.

    Returns
    -------
    pandas.DataFrame
        Frame with ``num_rows`` rows and ``num_cat_cols + num_numeric_cols``
        columns.
    """
    # Integers are drawn before categories (the same order as before the
    # column counts became parameters) so that seeding NumPy's global RNG
    # keeps producing the same data.
    numerical_data = np.random.randint(num_rows, size=(num_rows, num_numeric_cols))
    numerical_headers = [f'num_col_{item}' for item in range(num_numeric_cols)]

    categories = np.random.choice(['A', 'B', 'C'], size=(num_rows, num_cat_cols))
    categorical_headers = [f'cat_col_{item}' for item in range(num_cat_cols)]

    # Build each typed part once and join them side by side, instead of
    # creating an empty frame and coercing dtypes through column assignment.
    cat_frame = pd.DataFrame(categories, columns=categorical_headers).astype(cat_dtype)
    num_frame = pd.DataFrame(numerical_data, columns=numerical_headers).astype(num_dtype)
    return pd.concat([cat_frame, num_frame], axis=1)

def add_string_data(df, num_rows, num_cols, string_length, dtype):
    """Append ``num_cols`` columns of random ASCII-letter strings to ``df``.

    The columns are named ``str_col_0`` ... ``str_col_{num_cols - 1}`` and each
    value is ``string_length`` characters long. ``df`` is modified in place
    and also returned, so the function can be used with ``DataFrame.pipe``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend (mutated).
    num_rows : int
        Number of string rows to generate.
    num_cols : int
        Number of string columns to add.
    string_length : int
        Number of characters in every generated string.
    dtype
        dtype passed to the DataFrame that holds the generated strings.
    """
    column_names = [f'str_col_{idx}' for idx in range(num_cols)]

    # Draw every character in one call: each row holds the letters of all its
    # strings back to back.
    alphabet = list(string.ascii_letters)
    letters = np.random.choice(alphabet, size=(num_rows, num_cols * string_length))

    # Reinterpret each run of `string_length` single-character cells as one
    # fixed-width unicode string.
    fixed_width = f'U{string_length}'
    words = letters.view(fixed_width).reshape(num_rows, num_cols)

    df[column_names] = pd.DataFrame(words, dtype=dtype)
    return df

def write_datasets_to_disk(num_dataframes):
    """Generate ``num_dataframes`` random datasets and write each to Parquet.

    Each dataset is built by ``make_df`` (int64 numerical columns, categorical
    columns as ``'category'``) and extended by ``add_string_data`` with
    object-dtype string columns. Its size comes from the module-level
    ``num_rows``, ``num_cols`` and ``str_lenght`` values.

    Files are written as ``dataset_<i>.parquet`` (relative paths) for ``i`` in
    ``range(num_dataframes)``.

    Parameters
    ----------
    num_dataframes : int
        Number of datasets to generate and write.
    """
    # Generate and write one dataset at a time, so only a single frame is held
    # in memory, and use a plain loop rather than a comprehension that exists
    # only for its side effects.
    for item in range(num_dataframes):
        key = f'dataset_{item}.parquet'
        df = (
            make_df(num_rows, np.int64, 'category')
            .pipe(add_string_data, num_rows, num_cols, str_lenght, 'object')
        )
        df.to_parquet(key)


In [3]:
# Generate a single random dataset and write it to 'dataset_0.parquet'.
write_datasets_to_disk(1)

In [4]:
# Load the same Parquet file once per pandas dtype backend, then build a
# polars DataFrame from the pyarrow-backed frame, so the cells below can
# compare the memory footprint of the three representations.
df_np = pd.read_parquet('dataset_0.parquet', dtype_backend='numpy_nullable')
df_pa = pd.read_parquet('dataset_0.parquet', dtype_backend='pyarrow')
df_pl = pl.DataFrame(df_pa)

## Sjekk minnebruk per enkeltvariabel

In [5]:
# Average bytes per row for each column (numpy_nullable backend): deep memory
# usage of every column divided by the number of rows.
np.divide(df_np.memory_usage(deep=True), len(df_np))

Index         0.000066
cat_col_0     1.000141
cat_col_1     1.000141
cat_col_2     1.000141
num_col_0     9.000000
num_col_1     9.000000
num_col_2     9.000000
num_col_3     9.000000
num_col_4     9.000000
str_col_0    63.000000
str_col_1    63.000000
str_col_2    63.000000
str_col_3    63.000000
str_col_4    63.000000
dtype: float64

In [6]:
# Average bytes per row for each column (pyarrow backend): deep memory usage
# of every column divided by the number of rows.
np.divide(df_pa.memory_usage(deep=True), len(df_pa))

Index         0.000066
cat_col_0     4.000120
cat_col_1     4.000120
cat_col_2     4.000120
num_col_0     8.125000
num_col_1     8.125000
num_col_2     8.125000
num_col_3     8.125000
num_col_4     8.125000
str_col_0    10.000000
str_col_1    10.000000
str_col_2    10.000000
str_col_3    10.000000
str_col_4    10.000000
dtype: float64

In [7]:
# Average bytes per row for each polars column: the estimated size of every
# column divided by the number of rows.
{col: np.divide(df_pl[col].estimated_size(), len(df_pl)) for col in df_pl.columns}

{'cat_col_0': 4.0000175,
 'cat_col_1': 4.0000175,
 'cat_col_2': 4.0000175,
 'num_col_0': 8.0,
 'num_col_1': 8.0,
 'num_col_2': 8.0,
 'num_col_3': 8.0,
 'num_col_4': 8.0,
 'str_col_0': 14.000004,
 'str_col_1': 14.000004,
 'str_col_2': 14.000004,
 'str_col_3': 14.000004,
 'str_col_4': 14.000004}

## Total minnebruk

In [8]:
# Total deep memory usage of the numpy_nullable-backed frame (index included).
df_np.memory_usage(deep=True).sum()

726000978

In [9]:
# Total deep memory usage of the pyarrow-backed frame (index included).
df_pa.memory_usage(deep=True).sum()

205250852

In [10]:
# Estimated total size of the polars frame.
df_pl.estimated_size()

244000145