In [None]:
import os

import numpy as np
import pandas as pd

# Seed NumPy's legacy global generator so the synthetic data is reproducible.
np.random.seed(42)

# How many synthetic order rows to create.
num_orders = 200000

# Every draw below consumes the same global random stream, so the columns are
# sampled in a fixed sequence; reordering these statements would change the
# generated values.
customer_ids = np.random.randint(1, 50001, num_orders)   # upper bound is exclusive
product_ids = np.random.randint(1, 5001, num_orders)
day_offsets = np.random.randint(0, 730, num_orders)      # days added to the start date
quantities = np.random.randint(1, 5, num_orders)
discounts = np.round(np.random.uniform(0, 0.5, num_orders), 2)
statuses = np.random.choice(
    ["Delivered", "Cancelled", "Returned"],
    size=num_orders,
    p=[0.85, 0.1, 0.05],
)

# Order dates are the start date shifted by a random whole number of days.
start_date = pd.to_datetime("2022-01-01")
order_dates = start_date + pd.to_timedelta(day_offsets, unit="D")

# Assemble the table; order_id is a simple 1-based running number.
orders = pd.DataFrame({
    "order_id": range(1, num_orders + 1),
    "customer_id": customer_ids,
    "product_id": product_ids,
    "order_date": order_dates,
    "quantity": quantities,
    "discount": discounts,
    "order_status": statuses,
})

# Make sure the target folder exists, then persist the dataset without the index.
os.makedirs("data/raw", exist_ok=True)
orders.to_csv("data/raw/orders.csv", index=False)

# Preview the first rows.
orders.head()

Unnamed: 0,order_id,customer_id,product_id,order_date,quantity,discount,order_status
0,1,15796,2346,2022-06-09,3,0.4,Delivered
1,2,861,989,2022-05-15,4,0.42,Delivered
2,3,38159,422,2023-10-26,3,0.02,Delivered
3,4,44733,1945,2022-02-11,1,0.24,Delivered
4,5,11285,878,2023-01-18,3,0.24,Delivered


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column;
# describe() with default arguments leaves out string columns such as order_status.
orders.describe()

Unnamed: 0,order_id,customer_id,product_id,order_date,quantity,discount
count,200000.0,200000.0,200000.0,200000,200000.0,200000.0
mean,100000.5,24960.785595,2496.91495,2022-12-30 20:03:05.904000,2.501435,0.250007
min,1.0,1.0,1.0,2022-01-01 00:00:00,1.0,0.0
25%,50000.75,12479.0,1247.0,2022-07-01 00:00:00,2.0,0.13
50%,100000.5,24862.0,2493.0,2022-12-30 00:00:00,3.0,0.25
75%,150000.25,37443.25,3742.0,2023-07-01 00:00:00,4.0,0.37
max,200000.0,50000.0,5000.0,2023-12-31 00:00:00,4.0,0.5
std,57735.171256,14416.422474,1441.329641,,1.117257,0.144288


In [None]:
import pandas as pd

# Reload the persisted orders from disk. CSV stores dates as plain text, so
# order_date is parsed on read; without parse_dates the column comes back as
# object (string) dtype and datetime operations (.dt accessors, resampling,
# date arithmetic) would not work on it.
orders = pd.read_csv("data/raw/orders.csv", parse_dates=["order_date"])

# Preview the first rows of the reloaded frame.
orders.head()

Unnamed: 0,order_id,customer_id,product_id,order_date,quantity,discount,order_status
0,1,15796,2346,2022-06-09,3,0.4,Delivered
1,2,861,989,2022-05-15,4,0.42,Delivered
2,3,38159,422,2023-10-26,3,0.02,Delivered
3,4,44733,1945,2022-02-11,1,0.24,Delivered
4,5,11285,878,2023-01-18,3,0.24,Delivered


In [None]:
# (n_rows, n_columns) tuple: a quick sanity check on the size of `orders`.
orders.shape

(200000, 7)

In [None]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      200000 non-null  int64  
 1   customer_id   200000 non-null  int64  
 2   product_id    200000 non-null  int64  
 3   order_date    200000 non-null  object 
 4   quantity      200000 non-null  int64  
 5   discount      200000 non-null  float64
 6   order_status  200000 non-null  object 
dtypes: float64(1), int64(4), object(2)
memory usage: 10.7+ MB


In [None]:
# Count missing values per column (isna is the primary name; isnull is its alias).
orders.isna().sum()

Unnamed: 0,0
order_id,0
customer_id,0
product_id,0
order_date,0
quantity,0
discount,0
order_status,0
