In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Read the raw CSV extracts into DataFrames. The four reads do not depend on
# each other, so their order is irrelevant.
order_products_train = pd.read_csv('../data/raw/order_products__train.csv')
order_products_prior = pd.read_csv('../data/raw/order_products__prior.csv')
products = pd.read_csv('../data/raw/products.csv')
orders = pd.read_csv('../data/raw/orders.csv')

# Report (rows, columns) of the frames in a single call; sep="\n" keeps every
# message on its own line.
print(
    "✓ Data loaded",
    f"Orders: {orders.shape}",
    f"Prior products: {order_products_prior.shape}",
    f"Train products: {order_products_train.shape}",
    sep="\n",
)

✓ Data loaded
Orders: (3421083, 7)
Prior products: (32434489, 4)
Train products: (1384617, 4)


In [2]:
# Distinct user ids that own a row with eval_set 'train' / 'test' in `orders`.
order_eval_set = orders['eval_set']
train_users = orders.loc[order_eval_set == 'train', 'user_id'].unique()
test_users = orders.loc[order_eval_set == 'test', 'user_id'].unique()

print(f"Train users: {len(train_users)}")
print(f"Test users: {len(test_users)}")

# order_id -> user_id lookup, used to attach the user to order lines.
order_to_user_id = orders[['order_id', 'user_id']]

# Every prior order line, now carrying the user_id of its order.
prior_with_users = pd.merge(order_products_prior, order_to_user_id, on='order_id')

# Distinct (user_id, product_id) pairs found in the prior order lines of
# train users.
from_train_user = prior_with_users['user_id'].isin(train_users)
train_user_products = prior_with_users.loc[
    from_train_user, ['user_id', 'product_id']
].drop_duplicates()

print(f"\nTrain user-product pairs: {len(train_user_products)}")

# Distinct (user_id, product_id) pairs whose train order line has
# reordered == 1.
reordered_train_lines = order_products_train.loc[order_products_train['reordered'] == 1]
train_reordered = (
    reordered_train_lines
    .merge(order_to_user_id, on='order_id')
    [['user_id', 'product_id']]
    .drop_duplicates()
)

print(f"Products reordered in train: {len(train_reordered)}")

Train users: 131209
Test users: 75000

Train user-product pairs: 8474661
Products reordered in train: 828824


In [3]:
# Tag every pair in train_reordered as a positive example (label 1).
train_reordered = train_reordered.assign(reordered=1)

# Left-join the positives onto all candidate pairs. Candidates without a match
# get NaN from the join, which is converted to the negative label 0.
train_data = (
    pd.merge(
        train_user_products,
        train_reordered[['user_id', 'product_id', 'reordered']],
        how='left',
        on=['user_id', 'product_id'],
    )
    .assign(reordered=lambda pairs: pairs['reordered'].fillna(0).astype(int))
)

# Size of the labelled set and balance between the two classes.
label_counts = train_data['reordered'].value_counts()
print(f"Training data shape: {train_data.shape}")
print(f"\nClass distribution:")
print(label_counts)
print(f"\nPositive class: {train_data['reordered'].mean()*100:.2f}%")

Training data shape: (8474661, 3)

Class distribution:
reordered
0    7645837
1     828824
Name: count, dtype: int64

Positive class: 9.78%


In [4]:
# Keep only the rows of `orders` that are 'prior' orders of train users.
placed_by_train_user = orders['user_id'].isin(train_users)
is_prior_order = orders['eval_set'] == 'prior'
train_prior_orders = orders[placed_by_train_user & is_prior_order]

print(f"Prior orders for train users: {len(train_prior_orders)}")

# One row per user:
#   user_total_orders            - count of the user's prior orders
#   user_avg_days_between_orders - mean of days_since_prior_order
user_features = train_prior_orders.groupby('user_id').agg(
    user_total_orders=('order_id', 'count'),
    user_avg_days_between_orders=('days_since_prior_order', 'mean'),
)

print(f"\nUser features shape: {user_features.shape}")
print(user_features.head())

Prior orders for train users: 2047377

User features shape: (131209, 2)
         user_total_orders  user_avg_days_between_orders
user_id                                                 
1                       10                     19.555556
2                       14                     15.230769
5                        4                     13.333333
7                       20                     10.684211
8                        3                     30.000000


In [5]:
# Prior order lines that belong to train users.
train_prior_products = prior_with_users.loc[prior_with_users['user_id'].isin(train_users)]

# One row per user:
#   user_total_products - number of prior order lines
#   user_total_reorders - sum of the `reordered` flag over those lines
user_product_features = train_prior_products.groupby('user_id').agg(
    user_total_products=('product_id', 'count'),
    user_total_reorders=('reordered', 'sum'),
)

# Share of the user's order lines that were reorders.
user_product_features['user_reorder_rate'] = user_product_features['user_total_reorders'].div(
    user_product_features['user_total_products']
)

# Mean number of lines per order for each user: count lines per order, look up
# the order's user through an order_id-indexed table, then average per user.
basket_sizes = train_prior_products.groupby('order_id').size()
order_to_user = train_prior_orders.set_index('order_id')[['user_id']]
basket_sizes_df = basket_sizes.rename('basket_size').to_frame().join(order_to_user)
user_basket_size = (
    basket_sizes_df
    .groupby('user_id')['basket_size']
    .mean()
    .rename('user_avg_basket_size')
)

print(f"User product features shape: {user_product_features.shape}")
print(user_product_features.head())
print(f"\nUser basket size shape: {user_basket_size.shape}")
print(user_basket_size.head())

User product features shape: (131209, 3)
         user_total_products  user_total_reorders  user_reorder_rate
user_id                                                             
1                         59                   41           0.694915
2                        195                   93           0.476923
5                         37                   14           0.378378
7                        206                  138           0.669903
8                         49                   13           0.265306

User basket size shape: (131209,)
user_id
1     5.900000
2    13.928571
5     9.250000
7    10.300000
8    16.333333
Name: user_avg_basket_size, dtype: float64


In [6]:
# Per-product aggregates over every prior order line (all users, not only
# train users):
#   product_total_purchases - number of order lines containing the product
#   product_total_reorders  - sum of the `reordered` flag
#   product_unique_users    - number of distinct users who bought it
product_features = prior_with_users.groupby('product_id').agg(
    product_total_purchases=('order_id', 'count'),
    product_total_reorders=('reordered', 'sum'),
    product_unique_users=('user_id', 'nunique'),
)

# Fraction of the product's purchases that were reorders.
product_features['product_reorder_probability'] = product_features['product_total_reorders'].div(
    product_features['product_total_purchases']
)

print(f"Product features shape: {product_features.shape}")
print(product_features.head())
print("\nSummary stats:")
print(product_features.describe())

Product features shape: (49677, 4)
            product_total_purchases  product_total_reorders  \
product_id                                                    
1                              1852                    1136   
2                                90                      12   
3                               277                     203   
4                               329                     147   
5                                15                       9   

            product_unique_users  product_reorder_probability  
product_id                                                     
1                            716                     0.613391  
2                             78                     0.133333  
3                             74                     0.732852  
4                            182                     0.446809  
5                              6                     0.600000  

Summary stats:
       product_total_purchases  product_total_reorders  pro

In [7]:
# Interaction history per (user_id, product_id) pair of train users:
#   up_order_count   - number of prior order lines for the pair
#   up_reorder_count - sum of the `reordered` flag over those lines
user_product_interaction = train_prior_products.groupby(['user_id', 'product_id']).agg(
    up_order_count=('order_id', 'count'),
    up_reorder_count=('reordered', 'sum'),
)

# Share of the pair's purchases that were reorders.
user_product_interaction['up_reorder_rate'] = user_product_interaction['up_reorder_count'].div(
    user_product_interaction['up_order_count']
)

print(f"User-product interaction features shape: {user_product_interaction.shape}")
print(user_product_interaction.head(10))
print("\nSummary:")
print(user_product_interaction.describe())

User-product interaction features shape: (8474661, 3)
                    up_order_count  up_reorder_count  up_reorder_rate
user_id product_id                                                   
1       196                     10                 9         0.900000
        10258                    9                 8         0.888889
        10326                    1                 0         0.000000
        12427                   10                 9         0.900000
        13032                    3                 2         0.666667
        13176                    2                 1         0.500000
        14084                    1                 0         0.000000
        17122                    1                 0         0.000000
        25133                    8                 7         0.875000
        26088                    2                 1         0.500000

Summary:
       up_order_count  up_reorder_count  up_reorder_rate
count    8.474661e+06      8.474661e+06

In [8]:
# Attach order_number to each prior order line and sort so that, inside every
# (user_id, product_id) group, rows run from the lowest to the highest
# order_number.
train_prior_products_sorted = pd.merge(
    train_prior_products,
    train_prior_orders[['order_id', 'order_number']],
    on='order_id',
).sort_values(['user_id', 'product_id', 'order_number'])

# Both features come from the same grouping; because of the sort above,
# first()/last() return the lowest/highest order_number of each pair.
up_order_numbers = train_prior_products_sorted.groupby(['user_id', 'product_id'])['order_number']
first_order_per_up = up_order_numbers.first().rename('up_first_order_number')
last_order_per_up = up_order_numbers.last().rename('up_last_order_number')

print(f"Last order features shape: {last_order_per_up.shape}")
print(last_order_per_up.head(10))
print(f"\nFirst order features shape: {first_order_per_up.shape}")
print(first_order_per_up.head(10))

Last order features shape: (8474661,)
user_id  product_id
1        196           10
         10258         10
         10326          5
         12427         10
         13032         10
         13176          5
         14084          1
         17122          5
         25133         10
         26088          2
Name: up_last_order_number, dtype: int64

First order features shape: (8474661,)
user_id  product_id
1        196           1
         10258         2
         10326         5
         12427         1
         13032         2
         13176         2
         14084         1
         17122         5
         25133         3
         26088         1
Name: up_first_order_number, dtype: int64


In [9]:
# user_id together with the order_number of the row whose eval_set is 'train',
# exposed under the name train_order_number.
train_order_numbers = (
    orders
    .loc[orders['eval_set'] == 'train', ['user_id', 'order_number']]
    .rename(columns={'order_number': 'train_order_number'})
)

print("Train order numbers:")
print(train_order_numbers.head())
print(f"Shape: {train_order_numbers.shape}")

Train order numbers:
    user_id  train_order_number
10        1                  11
25        2                  15
49        5                   5
74        7                  21
78        8                   4
Shape: (131209, 2)


In [10]:
# Put each pair's last purchase order number next to the user's train order
# number.
last_order_with_train = pd.merge(
    last_order_per_up.reset_index(),
    train_order_numbers,
    on='user_id',
)

# Gap, counted in orders, between the pair's last purchase and the train order.
last_order_with_train['up_orders_since_last'] = last_order_with_train['train_order_number'].sub(
    last_order_with_train['up_last_order_number']
)

print("Orders since last purchase:")
print(last_order_with_train.head(10))
print(f"\nShape: {last_order_with_train.shape}")
print("\nSummary:")
print(last_order_with_train['up_orders_since_last'].describe())

Orders since last purchase:
   user_id  product_id  up_last_order_number  train_order_number  \
0        1         196                    10                  11   
1        1       10258                    10                  11   
2        1       10326                     5                  11   
3        1       12427                    10                  11   
4        1       13032                    10                  11   
5        1       13176                     5                  11   
6        1       14084                     1                  11   
7        1       17122                     5                  11   
8        1       25133                    10                  11   
9        1       26088                     2                  11   

   up_orders_since_last  
0                     1  
1                     1  
2                     6  
3                     1  
4                     1  
5                     6  
6                    10  
7              

In [11]:
# Assemble the training matrix: the label frame (user_id, product_id,
# reordered) plus user, product, user-product and temporal features.
#
# The chain restarts from the three base columns, so running this cell a second
# time does not merge the feature tables onto an already-enriched frame (which
# would leave duplicated `_x` / `_y` columns behind).
train_data = train_data[['user_id', 'product_id', 'reordered']]
print(f"Starting with train_data: {train_data.shape}")

# Every merge passes `validate`, so pandas raises MergeError instead of
# silently multiplying rows if a feature table ever holds duplicate keys.

# Merge user features (one row per user on the right-hand side)
train_data = train_data.merge(user_features, on='user_id', how='left', validate='many_to_one')
train_data = train_data.merge(user_product_features, on='user_id', how='left', validate='many_to_one')
train_data = train_data.merge(user_basket_size.to_frame(), on='user_id', how='left', validate='many_to_one')

print(f"After user features: {train_data.shape}")

# Merge product features (one row per product on the right-hand side)
train_data = train_data.merge(product_features, on='product_id', how='left', validate='many_to_one')

print(f"After product features: {train_data.shape}")

# Merge user-product interaction features (one row per pair on both sides)
train_data = train_data.merge(
    user_product_interaction,
    on=['user_id', 'product_id'],
    how='left',
    validate='one_to_one',
)

print(f"After UP interaction: {train_data.shape}")

# Merge temporal features (one row per pair on both sides)
train_data = train_data.merge(
    last_order_with_train[['user_id', 'product_id', 'up_orders_since_last']],
    on=['user_id', 'product_id'],
    how='left',
    validate='one_to_one',
)

print(f"Final training data: {train_data.shape}")
print("\nColumns:")
print(train_data.columns.tolist())

Starting with train_data: (8474661, 3)
After user features: (8474661, 9)
After product features: (8474661, 13)
After UP interaction: (8474661, 16)
Final training data: (8474661, 17)

Columns:
['user_id', 'product_id', 'reordered', 'user_total_orders', 'user_avg_days_between_orders', 'user_total_products', 'user_total_reorders', 'user_reorder_rate', 'user_avg_basket_size', 'product_total_purchases', 'product_total_reorders', 'product_unique_users', 'product_reorder_probability', 'up_order_count', 'up_reorder_count', 'up_reorder_rate', 'up_orders_since_last']


In [12]:
# Number of missing entries in each column of the training frame.
missing_per_column = train_data.isna().sum()
print("Missing values:")
print(missing_per_column)
print("\n")

# Preview of the first ten rows.
preview_rows = train_data.head(10)
print("First 10 rows:")
print(preview_rows)
print("\n")

# dtype of every column.
column_dtypes = train_data.dtypes
print("Data types:")
print(column_dtypes)

Missing values:
user_id                         0
product_id                      0
reordered                       0
user_total_orders               0
user_avg_days_between_orders    0
user_total_products             0
user_total_reorders             0
user_reorder_rate               0
user_avg_basket_size            0
product_total_purchases         0
product_total_reorders          0
product_unique_users            0
product_reorder_probability     0
up_order_count                  0
up_reorder_count                0
up_reorder_rate                 0
up_orders_since_last            0
dtype: int64


First 10 rows:
   user_id  product_id  reordered  user_total_orders  \
0   202279       33120          1                  8   
1   202279       28985          0                  8   
2   202279        9327          0                  8   
3   202279       45918          0                  8   
4   202279       30035          0                  8   
5   202279       17794          0       

In [13]:
# Directory for processed artefacts; created (with parents) when absent.
output_path = Path('../data/processed')
output_path.mkdir(exist_ok=True, parents=True)

# Write the training matrix as CSV without the row index.
train_data.to_csv(output_path.joinpath('train_features.csv'), index=False)

print(
    f"✓ Saved training data: {train_data.shape}",
    f"  File: {output_path / 'train_features.csv'}",
    sep="\n",
)

✓ Saved training data: (8474661, 17)
  File: ..\data\processed\train_features.csv


In [14]:
# Prior order lines of test users and the distinct (user_id, product_id) pairs
# they contain.
test_prior_products = prior_with_users.loc[prior_with_users['user_id'].isin(test_users)]
test_user_products = test_prior_products.loc[:, ['user_id', 'product_id']].drop_duplicates()

print(f"Test user-product pairs: {len(test_user_products)}")

# The test users need the same features as the train users, starting with
# their 'prior' rows from the order table.
placed_by_test_user = orders['user_id'].isin(test_users)
test_order_is_prior = orders['eval_set'] == 'prior'
test_prior_orders = orders[placed_by_test_user & test_order_is_prior]

print(f"Test prior orders: {len(test_prior_orders)}")

Test user-product pairs: 4833292
Test prior orders: 1167497


In [15]:
# Per-user order statistics for test users: number of prior orders and mean of
# days_since_prior_order.
test_user_features = test_prior_orders.groupby('user_id').agg(
    user_total_orders=('order_id', 'count'),
    user_avg_days_between_orders=('days_since_prior_order', 'mean'),
)

# Per-user purchase statistics for test users: number of prior order lines and
# sum of the `reordered` flag.
test_user_product_features = test_prior_products.groupby('user_id').agg(
    user_total_products=('product_id', 'count'),
    user_total_reorders=('reordered', 'sum'),
)

# Share of the user's order lines that were reorders.
test_user_product_features['user_reorder_rate'] = test_user_product_features['user_total_reorders'].div(
    test_user_product_features['user_total_products']
)

# Mean number of lines per order for each test user: count lines per order,
# look up the order's user, then average per user.
test_basket_sizes = test_prior_products.groupby('order_id').size()
test_order_to_user = test_prior_orders.set_index('order_id')[['user_id']]
test_basket_sizes_df = test_basket_sizes.rename('basket_size').to_frame().join(test_order_to_user)
test_user_basket_size = (
    test_basket_sizes_df
    .groupby('user_id')['basket_size']
    .mean()
    .rename('user_avg_basket_size')
)

print(f"Test user features: {test_user_features.shape}")
print(f"Test user product features: {test_user_product_features.shape}")
print(f"Test user basket size: {test_user_basket_size.shape}")

Test user features: (75000, 2)
Test user product features: (75000, 3)
Test user basket size: (75000,)


In [16]:
# Interaction history per (user_id, product_id) pair of test users: number of
# prior order lines and sum of the `reordered` flag.
test_up_interaction = test_prior_products.groupby(['user_id', 'product_id']).agg(
    up_order_count=('order_id', 'count'),
    up_reorder_count=('reordered', 'sum'),
)

# Share of the pair's purchases that were reorders.
test_up_interaction['up_reorder_rate'] = test_up_interaction['up_reorder_count'].div(
    test_up_interaction['up_order_count']
)

# Attach order_number to the test users' prior order lines and sort so each
# pair's rows run from the lowest to the highest order_number.
test_prior_products_sorted = pd.merge(
    test_prior_products,
    test_prior_orders[['order_id', 'order_number']],
    on='order_id',
).sort_values(['user_id', 'product_id', 'order_number'])

# Thanks to the sort, last() yields the highest order_number of each pair.
test_up_order_numbers = test_prior_products_sorted.groupby(['user_id', 'product_id'])['order_number']
test_last_order_per_up = test_up_order_numbers.last().rename('up_last_order_number')

# order_number of the row whose eval_set is 'test', per user.
test_order_numbers = (
    orders
    .loc[orders['eval_set'] == 'test', ['user_id', 'order_number']]
    .rename(columns={'order_number': 'test_order_number'})
)

# Gap, counted in orders, between the pair's last purchase and the test order.
test_last_order_with_next = pd.merge(
    test_last_order_per_up.reset_index(),
    test_order_numbers,
    on='user_id',
)
test_last_order_with_next['up_orders_since_last'] = test_last_order_with_next['test_order_number'].sub(
    test_last_order_with_next['up_last_order_number']
)

print(f"Test UP interaction: {test_up_interaction.shape}")
print(f"Test orders since last: {test_last_order_with_next.shape}")

Test UP interaction: (4833292, 3)
Test orders since last: (4833292, 5)


In [17]:
# Assemble the test matrix: the candidate (user_id, product_id) pairs plus
# user, product, user-product and temporal features.
#
# Starting from a fresh copy of the candidate pairs keeps this cell safe to
# re-run.
test_data = test_user_products.copy()

print(f"Starting with test_data: {test_data.shape}")

# Every merge passes `validate`, so pandas raises MergeError instead of
# silently multiplying rows if a feature table ever holds duplicate keys.

# Merge user features (one row per user on the right-hand side)
test_data = test_data.merge(test_user_features, on='user_id', how='left', validate='many_to_one')
test_data = test_data.merge(test_user_product_features, on='user_id', how='left', validate='many_to_one')
test_data = test_data.merge(test_user_basket_size.to_frame(), on='user_id', how='left', validate='many_to_one')

print(f"After user features: {test_data.shape}")

# Merge product features: the same table that is merged into the training data
test_data = test_data.merge(product_features, on='product_id', how='left', validate='many_to_one')

print(f"After product features: {test_data.shape}")

# Merge user-product interaction features (one row per pair on both sides)
test_data = test_data.merge(
    test_up_interaction,
    on=['user_id', 'product_id'],
    how='left',
    validate='one_to_one',
)

print(f"After UP interaction: {test_data.shape}")

# Merge temporal features (one row per pair on both sides)
test_data = test_data.merge(
    test_last_order_with_next[['user_id', 'product_id', 'up_orders_since_last']],
    on=['user_id', 'product_id'],
    how='left',
    validate='one_to_one',
)

print(f"Final test data: {test_data.shape}")
print("\nColumns:")
print(test_data.columns.tolist())

Starting with test_data: (4833292, 2)
After user features: (4833292, 8)
After product features: (4833292, 12)
After UP interaction: (4833292, 15)
Final test data: (4833292, 16)

Columns:
['user_id', 'product_id', 'user_total_orders', 'user_avg_days_between_orders', 'user_total_products', 'user_total_reorders', 'user_reorder_rate', 'user_avg_basket_size', 'product_total_purchases', 'product_total_reorders', 'product_unique_users', 'product_reorder_probability', 'up_order_count', 'up_reorder_count', 'up_reorder_rate', 'up_orders_since_last']


In [18]:
# Number of missing entries in each column of the test frame.
test_missing_per_column = test_data.isna().sum()
print("Missing values in test:")
print(test_missing_per_column)
print("\n")

# Preview of the first five rows.
test_preview_rows = test_data.head()
print("First 5 rows:")
print(test_preview_rows)
print("\n")

# Shapes of the two feature frames side by side.
print(
    "Feature comparison:",
    f"Train shape: {train_data.shape}",
    f"Test shape: {test_data.shape}",
    sep="\n",
)

Missing values in test:
user_id                         0
product_id                      0
user_total_orders               0
user_avg_days_between_orders    0
user_total_products             0
user_total_reorders             0
user_reorder_rate               0
user_avg_basket_size            0
product_total_purchases         0
product_total_reorders          0
product_unique_users            0
product_reorder_probability     0
up_order_count                  0
up_reorder_count                0
up_reorder_rate                 0
up_orders_since_last            0
dtype: int64


First 5 rows:
   user_id  product_id  user_total_orders  user_avg_days_between_orders  \
0    45082       17330                 10                      7.555556   
1    45082       27407                 10                      7.555556   
2    45082       35419                 10                      7.555556   
3    45082         196                 10                      7.555556   
4    45082       44635      

In [19]:
# Write the test matrix as CSV (no row index) into the directory held by
# `output_path`, which must already be defined when this cell runs.
test_data.to_csv(output_path.joinpath('test_features.csv'), index=False)

# Closing summary. The feature counts subtract the non-feature columns:
# 3 for train (user_id, product_id, reordered), 2 for test (user_id, product_id).
print(
    f"✓ Saved test data: {test_data.shape}",
    f"  File: {output_path / 'test_features.csv'}",
    "\n✓ Feature engineering complete!",
    f"\nReady for modeling:",
    f"  Train: {train_data.shape[0]:,} examples with {train_data.shape[1]-3} features",
    f"  Test: {test_data.shape[0]:,} examples with {test_data.shape[1]-2} features",
    f"  Target: {train_data['reordered'].mean()*100:.2f}% positive class",
    sep="\n",
)

✓ Saved test data: (4833292, 16)
  File: ..\data\processed\test_features.csv

✓ Feature engineering complete!

Ready for modeling:
  Train: 8,474,661 examples with 14 features
  Test: 4,833,292 examples with 14 features
  Target: 9.78% positive class
