In [1]:
# import library
import pandas as pd
import numpy as np
# Load the inventory workbook into a DataFrame.
dataset = pd.read_excel(io='inventory1.xlsx')

In [2]:
dataset.head(n=5)  # preview the first five rows

Unnamed: 0,Order Date,Order ID,Order Quantity,Product Container,Product Name,Product Sub-Category,Sales,Ship Mode
0,1/27/2007,24544,31,Medium Box,Canon MP41DH Printing Calculator,Office Machines,6567,Express Air
1,1/27/2007,24544,39,Large Box,Fellowes Neat Ideas® Storage Cubes,Storage & Organization,1780,Regular Air
2,1/27/2007,24544,15,Jumbo Drum,"Global Stack Chair without Arms, Black",Chairs & Chairmats,578,Delivery Truck
3,1/27/2007,20422,30,Small Pack,Nu-Dell Leatherette Frames,Office Furnishings,611,Regular Air
4,1/27/2007,55937,10,Small Box,"IBM Active Response Keyboard, Black",Computer Peripherals,517,Regular Air


- Here we can observe that not all columns are important for predicting the shipping mode, e.g. Order Date, Product Name and Order ID are of no use for prediction.
- How do we decide which columns are of no use?
        1. Apply common sense. Do the product name and category matter for shipping? Do the order ID or order date matter?
        2. The only things that matter when deciding which shipping mode to use are the quantity, the container and the sales.
        3. Knowing which columns to choose comes from experience, after solving a few problems.
- Hence, ignore the unnecessary columns and continue with the important ones.

In [3]:
# Keep only the columns used for modelling: the predictors plus the 'Ship Mode' target.
selected_columns = ['Order Quantity', 'Product Container', 'Sales', 'Ship Mode']
dataset = dataset.loc[:, selected_columns]
dataset.head()

Unnamed: 0,Order Quantity,Product Container,Sales,Ship Mode
0,31,Medium Box,6567,Express Air
1,39,Large Box,1780,Regular Air
2,15,Jumbo Drum,578,Delivery Truck
3,30,Small Pack,611,Regular Air
4,10,Small Box,517,Regular Air


- Here, Product Container and Ship Mode are categorical variables, i.e. their values are strings (labels or text). We cannot apply mathematical operations to them directly, so we need to convert them using integer encoding or one-hot encoding.
- Generally, a categorical variable takes its values from a fixed set.
- For example, the 'Product Container' column contains only a fixed set of 7 values, repeated many times, as shown in the cell below.

In [4]:
# How often each distinct container type occurs, most frequent first
dataset['Product Container'].value_counts(sort=True, ascending=False)

Small Box     4081
Wrap Bag      1093
Small Pack     894
Jumbo Drum     573
Jumbo Box      490
Large Box      378
Medium Box     344
Name: Product Container, dtype: int64

In [5]:
# Remove every row that has a missing value in any column
dataset = dataset.dropna(axis=0, how='any')
dataset

Unnamed: 0,Order Quantity,Product Container,Sales,Ship Mode
0,31,Medium Box,6567,Express Air
1,39,Large Box,1780,Regular Air
2,15,Jumbo Drum,578,Delivery Truck
3,30,Small Pack,611,Regular Air
4,10,Small Box,517,Regular Air
...,...,...,...,...
7848,37,Wrap Bag,706,Express Air
7849,35,Small Box,1266,Express Air
7850,10,Wrap Bag,31,Regular Air
7851,6,Jumbo Drum,1105,Delivery Truck


In [6]:
# X holds every column except the last one (the first three columns);
# y holds the last column.
target_position = dataset.shape[1] - 1
X = dataset.iloc[:, :target_position].values
y = dataset.iloc[:, target_position].values

# one hot encoding for categorical column -> Product Container
- It will create 7 new columns, one for each label:
        Small Box   
        Wrap Bag      
        Small Pack     
        Jumbo Drum     
        Jumbo Box      
        Large Box      
        Medium Box   
- and set the value to hot (i.e. 1) in the column matching each row's label, leaving the remaining columns as 0, as shown in the image below.
- Reading: https://towardsdatascience.com/columntransformer-in-scikit-for-labelencoding-and-onehotencoding-in-machine-learning-c6255952731b

<img src="class.png">

In [7]:
# library for one hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# [1] is the positional index of the column to encode: 'Product Container' is the
# second column of X (indices start at zero). The other columns pass through unchanged.
# sparse_threshold=0 makes the transformer always return a dense array, so the
# np.array(...) conversion below never receives a sparse matrix.
transform = ColumnTransformer(
    [('encoder', OneHotEncoder(), [1])],
    remainder='passthrough',
    sparse_threshold=0,
)
# `np.str` was removed in NumPy 1.24. The encoded matrix is cast to float because
# the scaler and classifier applied to X later need numeric input, not strings.
X = np.array(transform.fit_transform(X), dtype=float)

In [8]:
# extra code # 1
# Just to show the new column names and their data.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back to the old method on older releases.
if hasattr(transform, 'get_feature_names_out'):
    feature_names = transform.get_feature_names_out()
else:
    feature_names = transform.get_feature_names()
# Strip the '<transformer>__' prefix, then the '<input column>_' part of the one-hot
# names, so encoded columns show only the category (e.g. 'Jumbo Box') while
# passthrough columns keep their positional name (e.g. 'x0').
column = [name.split('__', 1)[-1].split('_', 1)[-1] for name in feature_names]
column

['Jumbo Box',
 'Jumbo Drum',
 'Large Box',
 'Medium Box',
 'Small Box',
 'Small Pack',
 'Wrap Bag',
 'x0',
 'x2']

In [9]:
# extra code # 2: see what the newly encoded data looks like
pd.DataFrame(data=X, columns=column)

Unnamed: 0,Jumbo Box,Jumbo Drum,Large Box,Medium Box,Small Box,Small Pack,Wrap Bag,x0,x2
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,31,6567
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,39,1780
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,15,578
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,30,611
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10,517
...,...,...,...,...,...,...,...,...,...
7848,0.0,0.0,0.0,0.0,0.0,0.0,1.0,37,706
7849,0.0,0.0,0.0,0.0,1.0,0.0,0.0,35,1266
7850,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10,31
7851,0.0,1.0,0.0,0.0,0.0,0.0,0.0,6,1105


In [10]:
# The target label is categorical as well:
# it takes the values Regular Air, Delivery Truck and Express Air.
dataset.loc[:, 'Ship Mode'].value_counts()

Regular Air       5869
Delivery Truck    1063
Express Air        921
Name: Ship Mode, dtype: int64

In [11]:
y # target: the 'Ship Mode' values taken from the last column of the dataset

array(['Express Air', 'Regular Air', 'Delivery Truck', ..., 'Regular Air',
       'Delivery Truck', 'Regular Air'], dtype=object)

In [12]:
# Hence ,convert target label y using label encoder
# it will convert it into 0,1 and 2 
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping first, then apply it to the target.
encoder_y = LabelEncoder().fit(y)
y = encoder_y.transform(y)

In [13]:
# Target after label encoding: integer class codes instead of strings
y

array([1, 2, 0, ..., 2, 0, 2])

- 0 for Delivery Truck
- 1 for Express Air
- 2 for Regular Air

In [14]:
# Extra code #3: print which integer code stands for which shipping mode
for code, label in enumerate(encoder_y.classes_):
    print(code, '->', label)

0 -> Delivery Truck
1 -> Express Air
2 -> Regular Air


In [15]:
# divide the data into training and testing set
# 70% for training and 30% for testing
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.30,
)

In [16]:
# Standard feature Scaling
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same scaling to both splits.
scale = StandardScaler().fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

In [17]:
# build the LogisticRegression model with max_iter=150
# with the default max_iter (i.e. 100) the program gives a convergence warning,
# because the solver does not finish converging within 100 iterations on this dataset.
from sklearn.linear_model import LogisticRegression
# Create the classifier with a raised iteration cap and a fixed seed, then train it.
Logistic_classifier = LogisticRegression(random_state=42, max_iter=150)
Logistic_classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=150, random_state=42)

In [18]:
# Predict the shipping-mode class for every row of the test set
y_pred = Logistic_classifier.predict(X=X_test)

In [19]:
# get the confusion matrix
from sklearn.metrics import confusion_matrix,accuracy_score
# Cross-tabulate the true test labels against the predicted ones
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [20]:
# Confusion matrix: rows are the true classes, columns are the predicted classes
cm

array([[ 322,    0,    0],
       [   0,    0,  272],
       [   0,    0, 1762]], dtype=int64)

In [21]:
# Accuracy: share of test rows whose predicted class equals the true class
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8845500848896435

In [23]:
# Table of actual vs. predicted shipping modes, decoded back to their names
actual_labels = encoder_y.inverse_transform(y_test)
predicted_labels = encoder_y.inverse_transform(y_pred)
df = pd.DataFrame({'y_test': actual_labels, 'y_pred': predicted_labels})

In [24]:
# Widen the pandas display limits so long and wide frames are shown with less truncation.
display_settings = {
    'display.max_rows': 2000,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [25]:
# Show the actual-vs-predicted table built from y_test and y_pred
df

Unnamed: 0,y_test,y_pred
0,Regular Air,Regular Air
1,Delivery Truck,Delivery Truck
2,Regular Air,Regular Air
3,Express Air,Regular Air
4,Regular Air,Regular Air
...,...,...
2351,Delivery Truck,Delivery Truck
2352,Regular Air,Regular Air
2353,Regular Air,Regular Air
2354,Regular Air,Regular Air


In [26]:
# Keep only the rows where the prediction disagrees with the true shipping mode
misclassified = df['y_test'].ne(df['y_pred'])
df[misclassified]

Unnamed: 0,y_test,y_pred
3,Express Air,Regular Air
10,Express Air,Regular Air
15,Express Air,Regular Air
43,Express Air,Regular Air
69,Express Air,Regular Air
74,Express Air,Regular Air
75,Express Air,Regular Air
77,Express Air,Regular Air
78,Express Air,Regular Air
89,Express Air,Regular Air
