In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the processed train/test tables. Per the error message below, they are
# expected to be written by '01_data_prep_and_eda.py'.
try:
    df_train = pd.read_csv('processed_train.csv')
    df_test = pd.read_csv('processed_test.csv')
except FileNotFoundError as e:
    # Re-raise instead of calling exit(): exit() is the interactive-shell
    # helper and, called without an argument, ends the run with status 0,
    # i.e. a missing input file would be reported as success.
    raise FileNotFoundError(
        f"Error: Processed data files not found. Run '01_data_prep_and_eda.py' first. ({e})"
    ) from e

# Keep the test-set ids in their own Series; they are saved to a separate
# file in the last cell.
test_id = df_test['id']
print("Processed data loaded successfully.")

Processed data loaded successfully.


In [3]:
print("\nEngineering Price and Promotion Features...")
def engineer_price_features(df):
    """Add discount and relative-price columns derived from the two price columns.

    Columns written (``df`` is modified in place and also returned):

    * ``discount``       -- ``base_price - checkout_price``
    * ``discount_ratio`` -- ``discount / base_price``
    * ``price_ratio``    -- ``checkout_price / base_price``

    Rows whose ``base_price`` is exactly 0 cannot be divided. For those rows
    ``discount_ratio`` is set to 0 and ``price_ratio`` to 1 ("no discount"),
    so neither ratio carries inf/NaN caused by the division. Rows where a
    price is missing (NaN) keep NaN in the derived columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``base_price`` and ``checkout_price`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the three columns added.

    Raises
    ------
    KeyError
        If ``base_price`` or ``checkout_price`` is missing from ``df``.
    """
    base_price = df['base_price']
    zero_base = base_price == 0
    # Divide by NaN instead of 0 on zero-priced rows; the result for those
    # rows is overwritten with the neutral value right after the division.
    safe_base = base_price.mask(zero_base)

    df['discount'] = base_price - df['checkout_price']
    df['discount_ratio'] = (
        (df['discount'] / safe_base)
        # Kept from the original: any remaining +/-inf (e.g. infinite prices) becomes 0.
        .replace([np.inf, -np.inf], 0)
        .mask(zero_base, 0.0)
    )
    # price_ratio == 1 - discount_ratio, so 1.0 is the value consistent with
    # discount_ratio == 0 on zero-priced rows.
    df['price_ratio'] = (df['checkout_price'] / safe_base).mask(zero_base, 1.0)
    return df

# Run the price/promotion feature step on both splits (train first, then test).
df_train, df_test = (
    engineer_price_features(split) for split in (df_train, df_test)
)




Engineering Price and Promotion Features...


In [4]:
print("Engineering Historical Demand Features...")


def _group_log_mean(train, keys, feature):
    """Return a lookup table with the mean of `log_num_orders` per `keys` group, named `feature`."""
    return train.groupby(keys)['log_num_orders'].mean().rename(feature).reset_index()


def _attach_lookup(df, lookup, keys):
    """Left-merge `lookup` onto `df`, replacing any existing copy of the feature column."""
    feature = lookup.columns[-1]
    # Dropping a stale copy first keeps the cell re-runnable: merging a column
    # that already exists would produce `<feature>_x` / `<feature>_y` instead.
    return df.drop(columns=[feature], errors='ignore').merge(lookup, on=keys, how='left')


# All aggregates are computed from the training split only and then joined
# onto both splits.
# NOTE(review): each training row's aggregate includes that row's own
# `log_num_orders`; consider out-of-fold or past-weeks-only means if this
# leaks the target into validation scores.
center_meal_mean = _group_log_mean(df_train, ['center_id', 'meal_id'], 'center_meal_log_mean_orders')
center_mean = _group_log_mean(df_train, 'center_id', 'center_log_mean_orders')
meal_mean = _group_log_mean(df_train, 'meal_id', 'meal_log_mean_orders')

# Fallback for test rows whose key combination never occurs in the training split.
overall_log_mean = df_train['log_num_orders'].mean()

for lookup, keys in (
    (center_meal_mean, ['center_id', 'meal_id']),
    (center_mean, 'center_id'),
    (meal_mean, 'meal_id'),
):
    df_train = _attach_lookup(df_train, lookup, keys)
    df_test = _attach_lookup(df_test, lookup, keys)

    feature = lookup.columns[-1]
    df_test[feature] = df_test[feature].fillna(overall_log_mean)

print("Historical Demand Features created and NaN handled.")



Engineering Historical Demand Features...
Historical Demand Features created and NaN handled.


In [5]:
print("\nApplying Label Encoding to categorical IDs and types...")

# Columns whose values are replaced by integer codes.
categorical_cols = ['center_id', 'meal_id', 'center_type', 'region_code', 'category', 'cuisine', 'city_code']

for column in categorical_cols:
    # Fit on the string form of train and test values together, so every
    # value present in either split has a code before transforming.
    all_values = pd.concat([df_train[column], df_test[column]], ignore_index=True).astype(str)
    encoder = LabelEncoder().fit(all_values)

    # Overwrite the column with its integer codes in both splits.
    for split in (df_train, df_test):
        split[column] = encoder.transform(split[column].astype(str))

print("Feature Engineering complete.")
print(f"New Training Features: {df_train.columns.tolist()}")



Applying Label Encoding to categorical IDs and types...
Feature Engineering complete.
New Training Features: ['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price', 'emailer_for_promotion', 'homepage_featured', 'num_orders', 'category', 'cuisine', 'city_code', 'region_code', 'center_type', 'op_area', 'log_num_orders', 'discount', 'discount_ratio', 'price_ratio', 'center_meal_log_mean_orders', 'center_log_mean_orders', 'meal_log_mean_orders']


In [None]:
# Write the engineered train/test tables and the test ids to CSV, without the
# index column.
outputs = (
    (df_train, 'final_train_features.csv'),
    (df_test, 'final_test_features.csv'),
    (test_id, 'test_ids.csv'),
)
for table, filename in outputs:
    table.to_csv(filename, index=False)


Final feature sets saved. Proceed to '03_model_training_and_submission.py'.
