In [17]:
import os
import numpy as np
import pandas as pd
from faker import Faker
import pyarrow as pa
import pyarrow.parquet as pq
import os

# Create the output directory for generated files up front; with exist_ok=True
# this is a no-op when the directory is already there.
os.makedirs(name="sample_data", exist_ok=True)

In [18]:

def generate_events(num_rows, output_path, chunk_size=1_000_000, seed=42):
    """Write synthetic login events to a Parquet file, one row group per chunk.

    Rows are generated ``chunk_size`` at a time, so only one chunk is held in
    memory at once. Every chunk is appended to ``output_path`` through a single
    ``ParquetWriter`` whose schema is taken from the first chunk.

    Columns written: ``row_id`` (1-based, contiguous across chunks),
    ``login_id``, ``device_type``, ``login_start_time``, ``login_end_time``
    (start plus 0..4h), ``estimated_country`` and ``login_successful_flag``
    (True with probability 0.85).

    Parameters
    ----------
    num_rows : int
        Total number of rows to generate; must be >= 0. With 0 no chunk is
        produced, so no Parquet file is created (the parent directory still is).
    output_path : str
        Destination Parquet file. Its parent directory is created if missing.
    chunk_size : int, default 1_000_000
        Maximum number of rows per chunk / row group; must be >= 1.
    seed : int, default 42
        Seed passed to both the NumPy generator and ``Faker.seed``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1 or ``num_rows`` is negative.
    """
    # Fail fast, before any I/O: a chunk_size of 0 would make the loop below
    # spin forever (each pass adds 0 rows), and a negative one only fails later
    # inside NumPy with an unrelated-looking message.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")
    if num_rows < 0:
        raise ValueError(f"num_rows must be >= 0, got {num_rows!r}")

    rng = np.random.default_rng(seed)
    fake = Faker()
    # Seeding is done on the Faker class, exactly as before, so the generated
    # pools stay the same for a given seed.
    Faker.seed(seed)

    # Pools to sample from (built once, reused by every chunk)
    login_id_pool = np.array([fake.user_name() for _ in range(27_564)], dtype=object)
    device_pool   = np.array(['ios', 'android', 'ipad', 'desktop', 'laptop', 'pager', 'other'], dtype=object)
    country_pool  = np.array([fake.country() for _ in range(40)], dtype=object)

    # Time bounds (sample integer seconds)
    t0 = np.datetime64('2019-01-01T00:00:00')
    t1 = np.datetime64('2022-12-31T23:59:59')
    total_secs = int((t1 - t0) / np.timedelta64(1, 's'))

    # Ensure parent folder exists ("." when output_path has no directory part)
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

    writer = None
    written = 0

    try:
        while written < num_rows:
            # The last chunk may be shorter than chunk_size.
            n = min(chunk_size, num_rows - written)

            # Start times: uniform over [t0, t1], inclusive of both ends
            start_offsets = rng.integers(0, total_secs + 1, size=n, dtype=np.int64)
            login_start = t0 + start_offsets.astype('timedelta64[s]')

            # Non-negative duration up to 4h
            dur_secs = rng.integers(0, 4 * 3600 + 1, size=n, dtype=np.int64)
            login_end = login_start + dur_secs.astype('timedelta64[s]')

            # NOTE: the order of the rng calls below is part of the seeded
            # output; reordering the columns' draws changes the generated data.
            df = pd.DataFrame({
                # row ids continue where the previous chunk stopped (1-based)
                'row_id': np.arange(written + 1, written + 1 + n, dtype=np.int64),
                'login_id': login_id_pool[rng.integers(0, len(login_id_pool), size=n)],
                'device_type': device_pool[rng.integers(0, len(device_pool), size=n)],
                'login_start_time': login_start.astype('datetime64[ns]'),
                'login_end_time': login_end.astype('datetime64[ns]'),
                'estimated_country': country_pool[rng.integers(0, len(country_pool), size=n)],
                'login_successful_flag': rng.random(n) < 0.85,
            })

            # Smaller memory / file size
            df['device_type'] = df['device_type'].astype('category')
            df['estimated_country'] = df['estimated_country'].astype('category')

            # Append chunk as a row group; the writer is created lazily because
            # its schema comes from the first converted table.
            table = pa.Table.from_pandas(df, preserve_index=False)
            if writer is None:
                writer = pq.ParquetWriter(output_path, table.schema, compression='snappy')
            writer.write_table(table)

            written += n
    finally:
        # Always close the writer so the Parquet footer is flushed, even when
        # a chunk fails part-way through.
        if writer is not None:
            writer.close()

# ---- Example usage (start small to test) ----
# One million rows fit in a single chunk at the default chunk_size.
generate_events(
    num_rows=1_000_000,
    output_path='sample_data/login_events_np.parquet',
)




In [21]:
# Sanity check: report the row count, then preview the first five rows.
# NOTE(review): no visible cell imports duckdb, so it is imported here to keep
# this cell runnable on a fresh kernel; move it to the top import cell if preferred.
import duckdb

# Only a cell's last expression is displayed, so the count must be printed
# explicitly or it is silently dropped.
row_count = duckdb.sql("SELECT COUNT(*) FROM 'sample_data/login_events_np.parquet'").fetchone()[0]
print(f"row count: {row_count:,}")
duckdb.sql("SELECT * FROM 'sample_data/login_events_np.parquet' LIMIT 5").df()

Unnamed: 0,row_id,login_id,device_type,login_start_time,login_end_time,estimated_country,login_successful_flag
0,1,irios,ios,2019-05-11 09:29:43,2019-05-11 11:51:42,Belgium,True
1,2,hillkimberly,pager,2022-02-04 17:59:41,2022-02-04 19:06:01,Saint Barthelemy,True
2,3,adavidson,ios,2021-08-14 07:53:44,2021-08-14 10:48:22,Austria,True
3,4,pinedakenneth,pager,2020-10-03 04:50:00,2020-10-03 06:01:54,Cayman Islands,True
4,5,michaelsummers,ipad,2020-09-24 15:14:46,2020-09-24 18:37:36,Turks and Caicos Islands,True
