## Performing Classification on the dataset
#### Predicting the size of the package of the shipped items
#### Using a Stochastic Gradient Descent Classifier


In [None]:
!pip install pycaret 
import pandas as pd 
from sklearn.linear_model import SGDClassifier 
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

Collecting pycaret
  Downloading pycaret-2.3.3-py3-none-any.whl (264 kB)
[?25l[K     |█▎                              | 10 kB 16.4 MB/s eta 0:00:01[K     |██▌                             | 20 kB 21.9 MB/s eta 0:00:01[K     |███▊                            | 30 kB 13.4 MB/s eta 0:00:01[K     |█████                           | 40 kB 9.5 MB/s eta 0:00:01[K     |██████▏                         | 51 kB 5.4 MB/s eta 0:00:01[K     |███████▍                        | 61 kB 6.0 MB/s eta 0:00:01[K     |████████▊                       | 71 kB 5.8 MB/s eta 0:00:01[K     |██████████                      | 81 kB 6.4 MB/s eta 0:00:01[K     |███████████▏                    | 92 kB 4.8 MB/s eta 0:00:01[K     |████████████▍                   | 102 kB 5.2 MB/s eta 0:00:01[K     |█████████████▋                  | 112 kB 5.2 MB/s eta 0:00:01[K     |██████████████▉                 | 122 kB 5.2 MB/s eta 0:00:01[K     |████████████████▏               | 133 kB 5.2 MB/s eta 0:00:01[K

In [None]:
# Load the sales workbook, keeping only the six columns this analysis uses.
df = pd.read_excel(
    "/content/sales_data_sample.xlsx",
    usecols=[
        'QUANTITYORDERED',
        'PRICEEACH',
        'SALES',
        'STATUS',
        'PRODUCTLINE',
        'DEALSIZE',
    ],
)
# Report (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

(2823, 6)


Unnamed: 0,QUANTITYORDERED,PRICEEACH,SALES,STATUS,PRODUCTLINE,DEALSIZE
0,30,95.7,2871.0,Shipped,Motorcycles,Small
1,34,81.35,2765.9,Shipped,Motorcycles,Small
2,41,94.74,3884.34,Shipped,Motorcycles,Medium
3,45,83.26,3746.7,Shipped,Motorcycles,Medium
4,49,100.0,5205.27,Shipped,Motorcycles,Medium


In [None]:
# Boolean mask: True for the orders whose STATUS is exactly 'Shipped'.
is_Shipped = (df['STATUS'] == 'Shipped')
# reset_index() renumbers the kept rows from 0; the previous row labels are
# preserved in a new 'index' column (hence 7 columns instead of 6).
filtered_df = df[is_Shipped].reset_index()
print(filtered_df.shape)
filtered_df.head()

(2617, 7)


Unnamed: 0,index,QUANTITYORDERED,PRICEEACH,SALES,STATUS,PRODUCTLINE,DEALSIZE
0,0,30,95.7,2871.0,Shipped,Motorcycles,Small
1,1,34,81.35,2765.9,Shipped,Motorcycles,Small
2,2,41,94.74,3884.34,Shipped,Motorcycles,Medium
3,3,45,83.26,3746.7,Shipped,Motorcycles,Medium
4,4,49,100.0,5205.27,Shipped,Motorcycles,Medium


In [None]:
# Model inputs: QUANTITYORDERED, PRICEEACH, SALES and the encoded PRODUCTLINE.
# Model target: the encoded DEALSIZE.
# Both text columns are converted to numbers by label encoding, using the
# integer codes of a pandas categorical.
print(filtered_df['PRODUCTLINE'].unique())
for text_col in ('PRODUCTLINE', 'DEALSIZE'):
    as_category = filtered_df[text_col].astype('category')
    filtered_df[text_col] = as_category
    filtered_df[text_col + '_ENCODED'] = as_category.cat.codes
print(filtered_df['PRODUCTLINE_ENCODED'].unique())
# The integer codes replace both original text columns.
filtered_df = filtered_df.drop(columns=['PRODUCTLINE', 'DEALSIZE'])
filtered_df.head()

['Motorcycles' 'Classic Cars' 'Trucks and Buses' 'Vintage Cars' 'Planes'
 'Ships' 'Trains']
[1 0 5 6 2 3 4]


Unnamed: 0,index,QUANTITYORDERED,PRICEEACH,SALES,STATUS,PRODUCTLINE_ENCODED,DEALSIZE_ENCODED
0,0,30,95.7,2871.0,Shipped,1,2
1,1,34,81.35,2765.9,Shipped,1,2
2,2,41,94.74,3884.34,Shipped,1,1
3,3,45,83.26,3746.7,Shipped,1,1
4,4,49,100.0,5205.27,Shipped,1,1


In [None]:
#split the data into x and y 
X = filtered_df[['QUANTITYORDERED','PRICEEACH','SALES','PRODUCTLINE_ENCODED']]
print(X)
y = filtered_df[['DEALSIZE_ENCODED']]
print(y)

      QUANTITYORDERED  PRICEEACH    SALES  PRODUCTLINE_ENCODED
0                  30      95.70  2871.00                    1
1                  34      81.35  2765.90                    1
2                  41      94.74  3884.34                    1
3                  45      83.26  3746.70                    1
4                  49     100.00  5205.27                    1
...               ...        ...      ...                  ...
2612               40      55.69  2227.60                    3
2613               42      97.16  4080.72                    3
2614               20     100.00  2244.40                    3
2615               29     100.00  3978.51                    3
2616               34      62.24  2116.16                    3

[2617 rows x 4 columns]
      DEALSIZE_ENCODED
0                    2
1                    2
2                    1
3                    1
4                    1
...                ...
2612                 2
2613                 1
2614        

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the shuffle, and therefore the split, reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Sanity-check the split sizes: train/test features first, then train/test targets.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(2093, 4)
(524, 4)
(2093, 1)
(524, 1)


In [None]:
# Standardise the features, then fit a linear classifier trained with stochastic
# gradient descent. SGD reshuffles the training data between epochs, so without a
# fixed random_state the fitted model (and the accuracy reported below) changes
# from one run to the next; pinning the seed makes the cell reproducible.
clf = make_pipeline(
    StandardScaler(),
    SGDClassifier(max_iter=1000, tol=1e-3, random_state=42),
)

# y_train is a one-column DataFrame; ravel() flattens it to the 1-D label array.
clf.fit(X_train, y_train.values.ravel())

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier', SGDClassifier())])

In [None]:
# Predict the encoded deal size for every row of the held-out test features.
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# scikit-learn metrics take the ground truth first: accuracy_score(y_true, y_pred).
# Accuracy is symmetric, so the value is unchanged, but keeping the documented
# order avoids silently wrong numbers if this call is adapted to an asymmetric
# metric such as precision or recall.
# y_test is a one-column DataFrame, so flatten it to match the 1-D y_pred array.
print(accuracy_score(y_test.values.ravel(), y_pred))

0.9599236641221374


### ~96% accuracy in predicting the correct package size


# Let's use PyCaret to see the performance metrics of the different classifiers that we can use.

In [None]:
from pycaret.classification import *

In [None]:
# Rebuild the modelling frame from the raw file so the PyCaret section starts
# from a clean state, independent of the frames modified in the cells above.
df = pd.read_excel("/content/sales_data_sample.xlsx",usecols=['QUANTITYORDERED','PRICEEACH','SALES','STATUS','PRODUCTLINE','DEALSIZE'])
is_Shipped = df['STATUS'] == 'Shipped'
# drop=True discards the old row labels instead of storing them in an 'index'
# column. Every column except the target is handed to PyCaret as a feature, so
# a leftover 'index' column would be modelled as if it were a real predictor.
filtered_df = df[is_Shipped].reset_index(drop=True)
# Label-encode the two text columns through pandas' category codes.
filtered_df['PRODUCTLINE_ENCODED'] = filtered_df['PRODUCTLINE'].astype('category').cat.codes
filtered_df['DEALSIZE_ENCODED'] = filtered_df['DEALSIZE'].astype('category').cat.codes
# Drop the original text columns, plus STATUS: after the filter it holds the
# single value 'Shipped' and carries no information for the classifiers.
filtered_df = filtered_df.drop(columns=['PRODUCTLINE', 'DEALSIZE', 'STATUS'])


In [None]:
# Draw a reproducible 80% sample for modelling; the rows that were not sampled
# become the unseen hold-out set. The hold-out must be computed while `data`
# still carries the original row labels, i.e. before its index is renumbered.
data = filtered_df.sample(frac=0.80, random_state=786)
data_unseen = filtered_df.drop(index=data.index).reset_index(drop=True)
data = data.reset_index(drop=True)
print('Data for modeling: ', data.shape, sep='')
print('Unseen data for predictions', data_unseen.shape, sep='')

Data for modeling: (2094, 7)
Unseen data for predictions(523, 7)


In [None]:
# Initialise the PyCaret classification experiment on `data`, with
# DEALSIZE_ENCODED as the column to predict.
# NOTE(review): session_id is presumably the experiment's random seed — confirm against the PyCaret docs.
exp_clf = setup(data=data,target='DEALSIZE_ENCODED',session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,DEALSIZE_ENCODED
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(2094, 7)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
# Compare the classifiers PyCaret offers for this experiment and keep whatever
# compare_models() returns.
# NOTE(review): presumably the top-ranked model of the comparison table — confirm against the PyCaret docs.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.017
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.132
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.619
lightgbm,Light Gradient Boosting Machine,0.9966,1.0,0.9939,0.9967,0.9966,0.9938,0.9938,0.173
knn,K Neighbors Classifier,0.9918,0.9991,0.9869,0.9919,0.9918,0.9851,0.9851,0.121
et,Extra Trees Classifier,0.9884,0.9996,0.969,0.9885,0.9882,0.9787,0.9789,0.486
lr,Logistic Regression,0.985,0.999,0.9895,0.9852,0.985,0.9726,0.9728,0.923
lda,Linear Discriminant Analysis,0.9542,0.994,0.957,0.9561,0.9544,0.9172,0.9178,0.035
nb,Naive Bayes,0.8969,0.9798,0.9245,0.9156,0.9017,0.8189,0.8223,0.018
ridge,Ridge Classifier,0.8676,0.0,0.6957,0.8846,0.8594,0.7503,0.7625,0.017
