Generating data for ~50mb csv file [1_000_000 rows]

In [None]:
import pandas as pd
import os
import random
from datetime import datetime
import numpy as np
from datetime import datetime

random.seed(42)

def generate_mock_csv(file_path, num_rows=1000000):
    log_id = np.random.randint(1, high=100_0000, size=(num_rows), dtype=int)
    user_id = np.random.randint(1, high=100_000, size=(num_rows), dtype=int)
    watch_time = np.random.randint(1, 180, size=(num_rows), dtype=int)

    random_seconds = np.random.randint(0, 24 * 60 * 60, size=num_rows)
    
    # Fast vectorized conversion using numpy datetime64
    base_timestamp = np.datetime64(datetime.now())
    timestamp = base_timestamp + random_seconds.astype('timedelta64[s]')
    timestamp = timestamp.reshape(-1)
    
    # ip_address = np.array([fake.ipv4() for _ in range(num_rows)]).reshape(-1,1)

    ip_address = np.random.randint(1,high=256, size=(num_rows),dtype=int)

    df = pd.DataFrame({
        "log_id": log_id,
        "user_id": user_id, 
        "watch_time(min)": watch_time,
        "timestamp": timestamp,
        "ip_address": ip_address
    })    

    df.to_csv(file_path, index=False)

def generate_multiple_files(output_dir, num_files=1):
    os.makedirs(output_dir, exist_ok=True)
    for i in range(1, num_files + 1):
        file_path = os.path.join(output_dir, f"user_activity_{i}.csv")
        # generate_mock_csv(file_path, num_rows=random.randint(3000, 8000))
        generate_mock_csv(file_path, num_rows=1000000)
        print(f"✅ Generated {file_path}")

if __name__ == "__main__":
    generate_multiple_files("data/mock_csvs", num_files=100)


✅ Generated data/mock_csvs/user_activity_1.csv
✅ Generated data/mock_csvs/user_activity_2.csv
✅ Generated data/mock_csvs/user_activity_3.csv
✅ Generated data/mock_csvs/user_activity_4.csv
✅ Generated data/mock_csvs/user_activity_5.csv
✅ Generated data/mock_csvs/user_activity_6.csv
✅ Generated data/mock_csvs/user_activity_7.csv
✅ Generated data/mock_csvs/user_activity_8.csv
✅ Generated data/mock_csvs/user_activity_9.csv
✅ Generated data/mock_csvs/user_activity_10.csv
✅ Generated data/mock_csvs/user_activity_11.csv
✅ Generated data/mock_csvs/user_activity_12.csv
✅ Generated data/mock_csvs/user_activity_13.csv
✅ Generated data/mock_csvs/user_activity_14.csv
✅ Generated data/mock_csvs/user_activity_15.csv
✅ Generated data/mock_csvs/user_activity_16.csv
✅ Generated data/mock_csvs/user_activity_17.csv
✅ Generated data/mock_csvs/user_activity_18.csv
✅ Generated data/mock_csvs/user_activity_19.csv
✅ Generated data/mock_csvs/user_activity_20.csv
✅ Generated data/mock_csvs/user_activity_21.csv
✅

Generating data for ~50mb csv file [10_000_000 rows]

In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime


def generate_mock_csv(file_path, num_rows, id_offset=0):

    log_id = np.random.randint(1, high=10000001, size=num_rows) + id_offset  
    user_id = np.random.randint(1, high=1000001, size=num_rows)
    watch_time = np.random.randint(1, 181, size=num_rows)
    base_ts = np.datetime64(datetime.now())
    random_seconds = np.random.randint(0, 24*60*60, size=num_rows)
    timestamp = (base_ts + random_seconds.astype('timedelta64[s]')).reshape(-1)
    ip_address = np.random.randint(1, 256, size=num_rows)
    
    df = pd.DataFrame({
        "log_id": log_id,
        "user_id": user_id,
        "watch_time(min)": watch_time,
        "timestamp": timestamp,
        "ip_address": ip_address,
    })
    df.to_csv(file_path, index=False)

def generate_multiple_files(output_dir, num_files=100):
    os.makedirs(output_dir, exist_ok=True)
    RANGE_PER_FILE = 1_000_000
    for i in range(1, num_files+1):
        offset = (i - 1) * RANGE_PER_FILE
        file_path = os.path.join(output_dir, f"user_activity_{i}.csv")
        generate_mock_csv(file_path, num_rows=10_000_000, id_offset=offset)
        print(f"✅ Generated {file_path}  (IDs {offset+1}–{offset+RANGE_PER_FILE})")


In [24]:
if __name__ == "__main__":
    generate_multiple_files("data/mock_csvs_final", num_files=100)

✅ Generated data/mock_csvs_final/user_activity_1.csv  (IDs 1–1000000)
✅ Generated data/mock_csvs_final/user_activity_2.csv  (IDs 1000001–2000000)
✅ Generated data/mock_csvs_final/user_activity_3.csv  (IDs 2000001–3000000)
✅ Generated data/mock_csvs_final/user_activity_4.csv  (IDs 3000001–4000000)
✅ Generated data/mock_csvs_final/user_activity_5.csv  (IDs 4000001–5000000)
✅ Generated data/mock_csvs_final/user_activity_6.csv  (IDs 5000001–6000000)
✅ Generated data/mock_csvs_final/user_activity_7.csv  (IDs 6000001–7000000)
✅ Generated data/mock_csvs_final/user_activity_8.csv  (IDs 7000001–8000000)
✅ Generated data/mock_csvs_final/user_activity_9.csv  (IDs 8000001–9000000)
✅ Generated data/mock_csvs_final/user_activity_10.csv  (IDs 9000001–10000000)
✅ Generated data/mock_csvs_final/user_activity_11.csv  (IDs 10000001–11000000)
✅ Generated data/mock_csvs_final/user_activity_12.csv  (IDs 11000001–12000000)
✅ Generated data/mock_csvs_final/user_activity_13.csv  (IDs 12000001–13000000)
✅ Gener