In [1]:
import os
import math
import struct
from collections import Counter
from tqdm import tqdm

In [None]:
# === Parameters ===
# Inputs: one tab-separated text file per day, looked up under input_dir.
input_dir = "./"
input_days = [f"day_{day}" for day in range(3)]

# A categorical value must occur at least min_freq times to get its own
# vocabulary index.
min_freq = 1

# Directory that receives the generated files; created up front if missing.
output_dir = "./"
os.makedirs(output_dir, exist_ok=True)

# Row layout: 1 label column, then dense_dim dense columns, then cat_dim
# categorical columns.
dense_dim, cat_dim = 13, 26

# === Step 1: Count categorical value frequencies ===
# One Counter per categorical column, keyed by the column value parsed as a
# hexadecimal integer.
cat_counters = [Counter() for _ in range(cat_dim)]
num_fields = 1 + dense_dim + cat_dim  # label + dense + categorical columns

print("Pass 1: Counting categorical value frequencies...")
for fname in input_days:
    file_path = os.path.join(input_dir, fname)
    with open(file_path, 'r') as f:
        for line in tqdm(f, desc=f"Scanning {fname}"):
            # Remove only the line terminator. A bare strip() also eats
            # leading/trailing tabs, so a row whose last columns are empty
            # would split into too few fields and be silently skipped.
            fields = line.rstrip('\n').split('\t')
            if len(fields) < num_fields:
                continue  # truncated row: not enough columns
            cat_fields = fields[1 + dense_dim : num_fields]
            for i, val in enumerate(cat_fields):
                try:
                    cat_int = int(val, 16)
                except ValueError:
                    # Empty or non-hex values all share key 0. Only ValueError
                    # is caught so Ctrl-C still interrupts this long scan.
                    cat_int = 0
                cat_counters[i][cat_int] += 1

# === Step 2: Build vocab mappings ===
print("Building vocabularies...")
# For every categorical column, number the retained values 1, 2, 3, ... in the
# order the column's counter yields them; index 0 stays free for <unk>.
cat_vocabs = []
for col in range(cat_dim):
    kept_values = (
        value
        for value, count in cat_counters[col].items()
        if count >= min_freq
    )
    cat_vocabs.append(
        {value: slot for slot, value in enumerate(kept_values, start=1)}
    )


Pass 1: Counting categorical value frequencies...


Scanning day_0: 195841983it [47:44, 68359.16it/s]
Scanning day_1: 199563535it [49:51, 66708.48it/s]
Scanning day_2: 196792019it [50:00, 65592.39it/s]


Building vocabularies...
Pass 2: Processing and writing binary files...


Processing day_0: 21it [00:00, 27035.11it/s]


ValueError: math domain error

In [5]:

# === Step 3: Encode every row and write the binary output files ===
# Three parallel files are produced, one record per accepted input row:
#   labels.bin       1 unsigned byte  (the label)
#   dense.bin        dense_dim C floats, log(max(x, 0) + 1) of each dense value
#   categorical.bin  cat_dim C ints, vocabulary index of each categorical value
#                    (0 when the value is not in the vocabulary)
# Values are packed in native byte order, exactly as per-value 'B'/'f'/'i'
# packing would produce.
row_width = 1 + dense_dim + cat_dim  # label + dense + categorical columns
label_packer = struct.Struct('B')
dense_packer = struct.Struct(f"{dense_dim}f")
cat_packer = struct.Struct(f"{cat_dim}i")

print("Pass 2: Processing and writing binary files...")
# The with-statement closes all three files even if a row fails to parse.
with open(os.path.join(output_dir, "labels.bin"), "wb") as label_bin, \
     open(os.path.join(output_dir, "dense.bin"), "wb") as dense_bin, \
     open(os.path.join(output_dir, "categorical.bin"), "wb") as cat_bin:
    for fname in input_days:
        file_path = os.path.join(input_dir, fname)
        with open(file_path, 'r') as f:
            for line in tqdm(f, desc=f"Processing {fname}"):
                # Remove only the line terminator. A bare strip() also eats
                # leading/trailing tabs, so a row whose last columns are empty
                # would split into too few fields and be silently skipped.
                fields = line.rstrip('\n').split('\t')
                # Skip truncated rows and rows without a label.
                if len(fields) < row_width or fields[0] == '':
                    continue

                # Encode the whole row before writing anything, so an error in
                # one column cannot leave the three files out of step.

                # === Label ===
                label_bytes = label_packer.pack(int(fields[0]))  # 1 byte

                # === Dense features ===
                dense_vals = []
                for d in fields[1 : 1 + dense_dim]:
                    val = int(d) if d != '' else 0
                    if val < 0:
                        val = 0  # clamp so the log argument stays >= 1
                    dense_vals.append(math.log(val + 1))
                dense_bytes = dense_packer.pack(*dense_vals)

                # === Categorical features ===
                cat_ids = []
                for i, val in enumerate(fields[1 + dense_dim : row_width]):
                    try:
                        cat_int = int(val, 16)
                    except ValueError:
                        # Empty or non-hex values all share key 0. Only
                        # ValueError is caught so Ctrl-C still interrupts.
                        cat_int = 0
                    cat_ids.append(cat_vocabs[i].get(cat_int, 0))
                cat_bytes = cat_packer.pack(*cat_ids)

                label_bin.write(label_bytes)
                dense_bin.write(dense_bytes)
                cat_bin.write(cat_bytes)

print("Done! Binary files saved to:", output_dir)

Pass 2: Processing and writing binary files...


Processing day_0: 195841983it [1:15:02, 43495.87it/s]
Processing day_1: 199563535it [1:16:43, 43349.61it/s]
Processing day_2: 196792019it [1:15:42, 43325.48it/s]

Done! Binary files saved to: ./



