In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Load the raw Superstore export into a DataFrame; the file is decoded as Latin-1.
df = pd.read_csv(
    "../Data/Raw/Sample - Superstore.csv",
    encoding="latin-1",
)

In [4]:
# Parse the order date in place, then derive one numeric column per calendar
# component (pandas numbers weekdays from Monday=0).
df['Order Date'] = pd.to_datetime(df['Order Date'])
for column_name, dt_attribute in (
    ('Order_Year', 'year'),
    ('Order_Month', 'month'),
    ('Order_DayOfWeek', 'dayofweek'),
):
    df[column_name] = getattr(df['Order Date'].dt, dt_attribute)

# Regression target and the predictor columns used by the model.
target = 'Sales'
features = [
    'Order_Year',
    'Order_Month',
    'Order_DayOfWeek',
    'Ship Mode',
    'Segment',
    'Category',
]

# Keep only the modelling columns and discard any row with a missing value
# in one of them.
df_model = df.loc[:, features + [target]].dropna()


In [None]:
# Split the modelling frame into the predictor matrix and the target vector.
X = df_model[features]
y = df_model[target]

# Column groups, named explicitly so each one is routed through the
# preprocessor on purpose.
categorical_features = ['Ship Mode', 'Segment', 'Category']
numerical_features = ['Order_Year', 'Order_Month', 'Order_DayOfWeek']

# Every predictor must belong to exactly one group. With remainder='drop'
# below, a column missing from both lists would be silently discarded, so
# fail loudly here instead.
assert sorted(categorical_features + numerical_features) == sorted(features), \
    "categorical_features + numerical_features must cover `features` exactly"

# One-hot encode the categorical columns (drop='first' omits the first level
# of each feature) and pass the numeric date parts through unchanged.
# The numeric columns are listed explicitly rather than relying on
# remainder='passthrough', so a column added to `features` later cannot reach
# the regressor unencoded by accident. Output column order is unchanged:
# encoded categoricals first, numeric columns last.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_features),
        ('num', 'passthrough', numerical_features),
    ],
    remainder='drop'
)

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Chain the column preprocessing and the linear regressor so the encoder is
# fitted on the training rows only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training split, then predict the held-out rows
# (Pipeline.fit returns the fitted pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


In [None]:
# Report hold-out performance: coefficient of determination and root mean
# squared error of the predictions against the true test targets.
print("🔍 Regression Results:")
test_r2 = r2_score(y_test, y_pred)
print("R² Score:", test_r2)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", test_rmse)

🔍 Regression Results:
R² Score: 0.04149657746290902
RMSE: 752.4540977622844
