In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/superstore-dataset/SampleSuperstore.csv


In [2]:
import pandas as pd
df = pd.read_csv('/kaggle/input/superstore-dataset/SampleSuperstore.csv')
print("Dataset loaded successfully!")

Dataset loaded successfully!


In [12]:
df['Sales'].describe()

count     9994.000000
mean       229.858001
std        623.245101
min          0.444000
25%         17.280000
50%         54.490000
75%        209.940000
max      22638.480000
Name: Sales, dtype: float64

In [3]:
print("shape: ", df.shape)
print("null values: ", df.isnull().sum())
print("total null values: ", df.isnull().sum().sum())
print("total duplicate values = ", df.duplicated().sum())

shape:  (9994, 13)
null values:  Ship Mode       0
Segment         0
Country         0
City            0
State           0
Postal Code     0
Region          0
Category        0
Sub-Category    0
Sales           0
Quantity        0
Discount        0
Profit          0
dtype: int64
total null values:  0
total duplicate values =  17


In [4]:
duplicate_cols = []
for cols in df.columns:
    if df[cols].duplicated().sum() > 0:
        duplicate_cols.append(cols)

print(duplicate_cols)

['Ship Mode', 'Segment', 'Country', 'City', 'State', 'Postal Code', 'Region', 'Category', 'Sub-Category', 'Sales', 'Quantity', 'Discount', 'Profit']


In [5]:
df = df.drop_duplicates(subset=duplicate_cols, keep='first')

print("after dropping duplicates:", df.duplicated().sum())

after dropping duplicates: 0


In [6]:
print(df['Discount'].tail())

9989    0.2
9990    0.0
9991    0.2
9992    0.0
9993    0.0
Name: Discount, dtype: float64


In [7]:
df['Discount'] = df['Discount'] * 100
print(df['Discount'].tail())

9989    20.0
9990     0.0
9991    20.0
9992     0.0
9993     0.0
Name: Discount, dtype: float64


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9977 entries, 0 to 9993
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Ship Mode     9977 non-null   object 
 1   Segment       9977 non-null   object 
 2   Country       9977 non-null   object 
 3   City          9977 non-null   object 
 4   State         9977 non-null   object 
 5   Postal Code   9977 non-null   int64  
 6   Region        9977 non-null   object 
 7   Category      9977 non-null   object 
 8   Sub-Category  9977 non-null   object 
 9   Sales         9977 non-null   float64
 10  Quantity      9977 non-null   int64  
 11  Discount      9977 non-null   float64
 12  Profit        9977 non-null   float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1.1+ MB


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

In [12]:
for val in df['Profit']:
  if val > 0:
    df['if_profit'] = 'yes'
  else:
    df['if_profit'] = 'no'

print(df['if_profit'].head())

0    yes
1    yes
2    yes
3    yes
4    yes
Name: if_profit, dtype: object


In [14]:
X = df.drop(
    columns=['Country','State', 'Postal Code', 'Profit', 'Discount'], 
    axis=1)
y = df['Discount']

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.2)

print("X train = ", X_train.shape)
print("X Val = ", X_val.shape)
print("Y train = ", y_train.shape)
print("Y Val = ", y_val.shape)

X train =  (7981, 9)
X Val =  (1996, 9)
Y train =  (7981,)
Y Val =  (1996,)


In [15]:
print("X train = ", X_train.isnull().sum())
print("Y train = ",y_train.isnull().sum())

X train =  Ship Mode       0
Segment         0
City            0
Region          0
Category        0
Sub-Category    0
Sales           0
Quantity        0
if_profit       0
dtype: int64
Y train =  0


In [19]:
print(X_train.columns)

Index(['Ship Mode', 'Segment', 'City', 'Region', 'Category', 'Sub-Category',
       'Sales', 'Quantity', 'if_profit'],
      dtype='object')


In [17]:
cat_cols = ['Ship Mode','Segment', 'City', 'Region','Category','Sub-Category','if_profit']
num_cols = ['Quantity','Sales']

In [18]:
cat_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
    ]
)

num_transformer = Pipeline(
  steps=[
      ('imputer', SimpleImputer(strategy='median')),
      ('scaler', StandardScaler())
  ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
  ],
     remainder='drop'
)

In [20]:
from sklearn.model_selection import RandomizedSearchCV 
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor())
    ]
)

param_dist = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [5,7,10]
}

rscv_rf = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_dist,
    n_iter=10,       # Number of parameter settings that are sampled
    cv=5,            # Number of cross-validation folds
    verbose=1,       # Output progress
    random_state=42,
    n_jobs=-1,       # Use all available CPU cores
    scoring='neg_mean_squared_error' # Optimize for minimizing MSE (negative is used as convention)
)

print("fitting model..")
rscv_rf.fit(X_train, y_train)
print("Best Parameters=", rscv_rf.best_params_)

fitting model..
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best Parameters= {'model__n_estimators': 100, 'model__max_depth': 10}


In [22]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    rscv_rf,
    X,
    y,
    cv=5,
    scoring='r2'
)

print("CV R² Scores:", cv_scores)
print("Average CV R²:", cv_scores.mean())



Fitting 5 folds for each of 9 candidates, totalling 45 fits




Fitting 5 folds for each of 9 candidates, totalling 45 fits




Fitting 5 folds for each of 9 candidates, totalling 45 fits




Fitting 5 folds for each of 9 candidates, totalling 45 fits




Fitting 5 folds for each of 9 candidates, totalling 45 fits




CV R² Scores: [0.6264175  0.58955822 0.6218591  0.5706031  0.62135197]
Average CV R²: 0.6059579788871273




In [23]:
import joblib
joblib.dump(rscv_rf,'model.pkl')
print("model saved!")

model saved!
