<a href="https://colab.research.google.com/github/aayush-jain-dtu/inventory-stock-prediction/blob/main/data_preprocessing_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Load the dataset
df = pd.read_csv('inventory_dataset (3) (1).csv')

In [3]:
# --- Step 1: Column Removal and Initial Date Conversion ---

# Combine year, month, day into a single datetime object
df['order_date'] = pd.to_datetime({'year': df['order_year'], 'month': df['order_month'], 'day': df['order_day']})

# Drop unnecessary identifier columns (order_id, client_id, product_title)
df.drop(columns=['order_id', 'client_id', 'product_title'], inplace=True)

In [4]:
# --- Step 2: Feature Engineering (Trend and Cyclical) ---

# Calculate Days From Start (Trend)
start_date = df['order_date'].min()
df['days_from_start'] = (df['order_date'] - start_date).dt.days

# Calculate Week of Month (Cyclical)
df['week_of_month'] = (df['order_date'].dt.day - 1) // 7 + 1

In [5]:
# --- Step 3: One-Hot Encoding (OHE) ---

# Define columns for OHE (temporal and categorical)
ohe_cols = ['product_category', 'product_id', 'order_month', 'week_of_month']
for col in ohe_cols:
    df[col] = df[col].astype('category')
df = pd.get_dummies(df, columns=ohe_cols, drop_first=False)

# Remove original date components and the datetime object
df.drop(columns=['order_year', 'order_day', 'order_date'], inplace=True)

# Separate features (X) and raw target (y)
X = df.drop(columns=['quantity_ordered'])
y = df['quantity_ordered']

In [6]:
# --- Step 4: Chronological Train/Test Split (90/10 Ratio) ---

split_point = int(len(X) * 0.90)
X_train = X.iloc[:split_point]
X_test = X.iloc[split_point:]
y_train = y.iloc[:split_point] # Raw target values
y_test = y.iloc[split_point:] # Raw target values

In [7]:
# --- Step 5: Scaling (StandardScaler - CRUCIAL for Data Leakage Prevention) ---

numerical_cols = ['days_from_start', 'price_inr', 'current_product_stock']
scaler = StandardScaler()

# Fit scaler ONLY to the Training set (X_train)
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])

# Transform the Test set using the fitted Training parameters
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Scale the target variable y separately
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

# Convert scaled y arrays back to DataFrame for saving
y_train_df = pd.DataFrame(y_train_scaled, columns=['quantity_ordered_scaled'])
y_test_df = pd.DataFrame(y_test_scaled, columns=['quantity_ordered_scaled'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [8]:
# --- Step 6: Save Files to CSV ---

# Reset indices for clean CSV export
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

X_train.to_csv('X_train_features.csv', index=False)
X_test.to_csv('X_test_features.csv', index=False)
y_train_df.to_csv('y_train_target.csv', index=False)
y_test_df.to_csv('y_test_target.csv', index=False)