In [53]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.feature_selection import RFE

In [54]:
df = pd.read_csv("superstore_data.csv") 

In [55]:
df.head(10)

Unnamed: 0,Order ID,Order Date,Region,Category,Sub-Category,Sales,Profit,Quantity,Customer Segment
0,ORD1000,2023-04-26,East,Technology,Copiers,811.63,465.5,8,Home Office
1,ORD1001,2023-01-31,East,Furniture,Tables,1237.45,-90.79,3,Consumer
2,ORD1002,2023-03-31,West,Technology,Phones,289.95,121.47,5,Consumer
3,ORD1003,2023-09-29,East,Technology,Accessories,1242.12,295.65,6,Consumer
4,ORD1004,2023-08-30,South,Technology,Phones,207.18,305.4,6,Corporate
5,ORD1005,2023-01-31,Central,Furniture,Chairs,101.48,494.55,1,Corporate
6,ORD1006,2023-04-30,Central,Office Supplies,Binders,678.06,167.33,9,Corporate
7,ORD1007,2023-11-17,Central,Furniture,Tables,653.34,228.3,3,Corporate
8,ORD1008,2023-04-20,Central,Office Supplies,Paper,826.68,488.26,9,Home Office
9,ORD1009,2023-07-20,Central,Office Supplies,Binders,188.26,225.28,5,Home Office


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Order ID          500 non-null    object 
 1   Order Date        500 non-null    object 
 2   Region            500 non-null    object 
 3   Category          500 non-null    object 
 4   Sub-Category      500 non-null    object 
 5   Sales             500 non-null    float64
 6   Profit            500 non-null    float64
 7   Quantity          500 non-null    int64  
 8   Customer Segment  500 non-null    object 
dtypes: float64(2), int64(1), object(6)
memory usage: 35.3+ KB


In [57]:
y = df['Sales']   # target

X = df.drop(columns=[
    'Sales',      # target must be removed
    'Order ID'    # ID columns must be removed
])

Separate Numerical & Categorical Features

In [58]:
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

print("Numerical:", num_cols)
print("Categorical:", cat_cols)

Numerical: Index(['Profit', 'Quantity'], dtype='object')
Categorical: Index(['Order Date', 'Region', 'Category', 'Sub-Category', 'Customer Segment'], dtype='object')


Feature Engineering

Handle Skewed Numeric Features

In [59]:
num_transformer = Pipeline([
    ('power', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler())
])

Encode Categorical Features

In [60]:
cat_transformer = OneHotEncoder(
    drop='first',
    handle_unknown='ignore'
)

In [61]:
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

In [62]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

Feature Selection — RFE

In [63]:
rfe = RFE(
    estimator=LinearRegression(),
    n_features_to_select=10
)

In [64]:
pipeline_rfe = Pipeline([
    ('preprocess', preprocessor),
    ('feature_select', rfe),
    ('model', LinearRegression())
])

Cross-Validation

In [65]:
cv_scores = cross_val_score(
    pipeline_rfe,
    X_train,
    y_train,
    cv=5,
    scoring='r2'
)

cv_scores.mean()



np.float64(-0.07361789674848103)

L1 (Lasso) Feature Selection

In [66]:
lasso_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', Lasso(alpha=0.01))
])

In [67]:
lasso_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,0.01
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [68]:
rfe_score = cross_val_score(
    pipeline_rfe, X_train, y_train, cv=5, scoring='r2'
).mean()

lasso_score = cross_val_score(
    lasso_pipeline, X_train, y_train, cv=5, scoring='r2'
).mean()

rfe_score, lasso_score

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


(np.float64(-0.07361789674848103), np.float64(-0.7942499058914415))

In [69]:
pipeline_rfe.fit(X_train, y_train)
test_score = pipeline_rfe.score(X_test, y_test)

test_score



0.028643354861418335