In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('amazon_sales_data 2025.csv')
data.head()

Unnamed: 0,Order ID,Date,Product,Category,Price,Quantity,Total Sales,Customer Name,Customer Location,Payment Method,Status
0,ORD0001,14-03-25,Running Shoes,Footwear,60,3,180,Emma Clark,New York,Debit Card,Cancelled
1,ORD0002,20-03-25,Headphones,Electronics,100,4,400,Emily Johnson,San Francisco,Debit Card,Pending
2,ORD0003,15-02-25,Running Shoes,Footwear,60,2,120,John Doe,Denver,Amazon Pay,Cancelled
3,ORD0004,19-02-25,Running Shoes,Footwear,60,3,180,Olivia Wilson,Dallas,Credit Card,Pending
4,ORD0005,10-03-25,Smartwatch,Electronics,150,3,450,Emma Clark,New York,Debit Card,Pending


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Order ID           250 non-null    object
 1   Date               250 non-null    object
 2   Product            250 non-null    object
 3   Category           250 non-null    object
 4   Price              250 non-null    int64 
 5   Quantity           250 non-null    int64 
 6   Total Sales        250 non-null    int64 
 7   Customer Name      250 non-null    object
 8   Customer Location  250 non-null    object
 9   Payment Method     250 non-null    object
 10  Status             250 non-null    object
dtypes: int64(3), object(8)
memory usage: 21.6+ KB


In [4]:
def change_to_date(column):
    data[column] = pd.to_datetime(data[column],infer_datetime_format=True)
    

In [5]:
for i in ['Date']:
    change_to_date(i)

In [6]:
data.dtypes

Order ID                     object
Date                 datetime64[ns]
Product                      object
Category                     object
Price                         int64
Quantity                      int64
Total Sales                   int64
Customer Name                object
Customer Location            object
Payment Method               object
Status                       object
dtype: object

In [7]:
data['date'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year

In [8]:
data.head()

Unnamed: 0,Order ID,Date,Product,Category,Price,Quantity,Total Sales,Customer Name,Customer Location,Payment Method,Status,date,Month,Year
0,ORD0001,2025-03-14,Running Shoes,Footwear,60,3,180,Emma Clark,New York,Debit Card,Cancelled,14,3,2025
1,ORD0002,2025-03-20,Headphones,Electronics,100,4,400,Emily Johnson,San Francisco,Debit Card,Pending,20,3,2025
2,ORD0003,2025-02-15,Running Shoes,Footwear,60,2,120,John Doe,Denver,Amazon Pay,Cancelled,15,2,2025
3,ORD0004,2025-02-19,Running Shoes,Footwear,60,3,180,Olivia Wilson,Dallas,Credit Card,Pending,19,2,2025
4,ORD0005,2025-10-03,Smartwatch,Electronics,150,3,450,Emma Clark,New York,Debit Card,Pending,3,10,2025


In [9]:
data.drop(['Order ID','Date'],axis=1,inplace=True)

In [10]:
data.head()

Unnamed: 0,Product,Category,Price,Quantity,Total Sales,Customer Name,Customer Location,Payment Method,Status,date,Month,Year
0,Running Shoes,Footwear,60,3,180,Emma Clark,New York,Debit Card,Cancelled,14,3,2025
1,Headphones,Electronics,100,4,400,Emily Johnson,San Francisco,Debit Card,Pending,20,3,2025
2,Running Shoes,Footwear,60,2,120,John Doe,Denver,Amazon Pay,Cancelled,15,2,2025
3,Running Shoes,Footwear,60,3,180,Olivia Wilson,Dallas,Credit Card,Pending,19,2,2025
4,Smartwatch,Electronics,150,3,450,Emma Clark,New York,Debit Card,Pending,3,10,2025


In [11]:
from sklearn.preprocessing import LabelEncoder,StandardScaler
le = LabelEncoder()
ss = StandardScaler()

In [12]:
cat = data.select_dtypes(include='object').columns
num = data.select_dtypes(include='int64').columns

In [13]:
for i in cat:
    data[i] = le.fit_transform(data[i])

In [14]:
data.head()

Unnamed: 0,Product,Category,Price,Quantity,Total Sales,Customer Name,Customer Location,Payment Method,Status,date,Month,Year
0,5,3,60,3,180,4,7,2,0,14,3,2025
1,1,2,100,4,400,3,8,2,2,20,3,2025
2,5,3,60,2,120,6,3,0,0,15,2,2025
3,5,3,60,3,180,8,2,1,2,19,2,2025
4,7,2,150,3,450,4,7,2,2,3,10,2025


In [15]:
data.dtypes

Product              int32
Category             int32
Price                int64
Quantity             int64
Total Sales          int64
Customer Name        int32
Customer Location    int32
Payment Method       int32
Status               int32
date                 int32
Month                int32
Year                 int32
dtype: object

In [16]:
X = data.drop(['Total Sales'],axis=1)
y = data['Total Sales']

In [17]:
feature_columns = X.columns.to_list()
feature_columns

['Product',
 'Category',
 'Price',
 'Quantity',
 'Customer Name',
 'Customer Location',
 'Payment Method',
 'Status',
 'date',
 'Month',
 'Year']

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [19]:
preprocessor = Pipeline([
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

In [20]:
models = {'DT':(DecisionTreeRegressor(),
                {
                 'model__max_depth':[3,5,None]   
                }),
         'RF':(RandomForestRegressor(),
                {
                 'model__max_depth':[5,None],
                 'model__n_estimators':[100,200]
                }),
          'XGB':(XGBRegressor(),
                {
                 'model__max_depth':[3,5],
                 'model__n_estimators':[100]
                })
         }

In [22]:
from sklearn.model_selection import GridSearchCV
best_model = None
best_score = 0

for name,(model,params) in models.items():
    pipe = Pipeline([
        ('prep',preprocessor),
        ('model',model)
    ])
    
    grid = GridSearchCV(pipe,param_grid=params,cv=5,scoring='r2')
    grid.fit(X_train,y_train)
    if grid.best_score_>best_score:
        best_score = grid.best_score_
        best_model = grid.best_estimator_

In [23]:
best_score

0.9953497529029847

In [24]:
best_model

In [25]:
joblib.dump(best_model,'amazon_sales_2025.joblib')

['amazon_sales_2025.joblib']

In [26]:
joblib.dump(feature_columns,'amazon_sales.joblib')

['amazon_sales.joblib']

In [None]:
import streamlit as st
import pandas as pd
import joblib

model = joblib.load('amazon_sales_2025.joblib')
features = joblib.load('amazon_sales.joblib')
st.title('Amazon Sales 2025')

user_input={}
for i in features:
    user_input[i] = st.number_input(i,value=0.0)

if st.button('Predict'):
    data = pd.DataFrame([user_input])
    pred = model.predict(data)[0]
    st.success(f'Prediction : {(pred)}')