In [20]:
%matplotlib inline
import os
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

pd.options.mode.chained_assignment = None 

## Step 1.1: Loading the data, including testing/training split from Project 1
Loading ecommerce data from kaggle (https://www.kaggle.com/prachi13/customer-analytics)

Addressing any missing data issues.

In [8]:
folder = "https://raw.githubusercontent.com/beyenidogan/AdvData/main/Project3/"
file_name="ecommerce_shipping.csv"
file_url=folder+file_name

shipping = pd.read_csv(file_url,error_bad_lines=False)


In [9]:
shipping.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,ReachedonTime_YN
0,1,D,Flight,4,2,177,3,low,F,44,1233.0,1
1,2,F,Flight,4,5,216,2,low,M,59,3088.0,1
2,3,A,Flight,2,2,183,4,low,M,48,3374.0,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177.0,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484.0,1


In [10]:
shipping.shape

(10999, 12)

In [11]:
shipping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   10999 non-null  int64  
 1   Warehouse_block      10999 non-null  object 
 2   Mode_of_Shipment     10999 non-null  object 
 3   Customer_care_calls  10999 non-null  int64  
 4   Customer_rating      10999 non-null  int64  
 5   Cost_of_the_Product  10999 non-null  int64  
 6   Prior_purchases      10999 non-null  int64  
 7   Product_importance   10999 non-null  object 
 8   Gender               10999 non-null  object 
 9   Discount_offered     10999 non-null  int64  
 10  Weight_in_gms        9898 non-null   float64
 11  ReachedonTime_YN     10999 non-null  int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 1.0+ MB


In [12]:
shipping.describe()

Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms,ReachedonTime_YN
count,10999.0,10999.0,10999.0,10999.0,10999.0,10999.0,9898.0,10999.0
mean,5500.0,4.054459,2.990545,210.196836,3.567597,13.373216,3636.579612,0.596691
std,3175.28214,1.14149,1.413603,48.063272,1.52286,16.205527,1636.169391,0.490584
min,1.0,2.0,1.0,96.0,2.0,1.0,1001.0,0.0
25%,2750.5,3.0,2.0,169.0,3.0,4.0,1842.0,0.0
50%,5500.0,4.0,3.0,214.0,3.0,7.0,4150.0,1.0
75%,8249.5,5.0,4.0,251.0,4.0,10.0,5051.0,1.0
max,10999.0,7.0,5.0,310.0,10.0,65.0,7846.0,1.0


## Step 1.2: Preparing the dataset
Making sure that all the appropriate variables are converted to categorical variables (as ordinal or one hot) and any necessary feature scaling is done

In [13]:
shipping['Product_importance_S'] = shipping['Product_importance'].replace(['low','medium','high'],[1,2,3])
shipping.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,ReachedonTime_YN,Product_importance_S
0,1,D,Flight,4,2,177,3,low,F,44,1233.0,1,1
1,2,F,Flight,4,5,216,2,low,M,59,3088.0,1,1
2,3,A,Flight,2,2,183,4,low,M,48,3374.0,1,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177.0,1,2
4,5,C,Flight,2,2,184,3,medium,F,46,2484.0,1,2


In [14]:
shipping = shipping.drop('Product_importance', axis=1)
shipping.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Gender,Discount_offered,Weight_in_gms,ReachedonTime_YN,Product_importance_S
0,1,D,Flight,4,2,177,3,F,44,1233.0,1,1
1,2,F,Flight,4,5,216,2,M,59,3088.0,1,1
2,3,A,Flight,2,2,183,4,M,48,3374.0,1,1
3,4,B,Flight,3,3,176,4,M,10,1177.0,1,2
4,5,C,Flight,2,2,184,3,F,46,2484.0,1,2


In [15]:
shipping.shape

(10999, 12)

In [16]:
y = shipping["ReachedonTime_YN"]
y.value_counts()

1    6563
0    4436
Name: ReachedonTime_YN, dtype: int64

In [17]:
X = shipping.drop("ReachedonTime_YN", axis=1)
X.shape

(10999, 11)

In [18]:
from sklearn.model_selection import train_test_split

# split data and labels into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y,random_state=0)

In [24]:
# Scaling numeric variables and one-hot encoding the categorical variables
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

numeric_features = ['Customer_care_calls','Cost_of_the_Product','Prior_purchases','Discount_offered','Weight_in_gms','Customer_rating','Product_importance_S']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['Warehouse_block','Mode_of_Shipment', 'Gender']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
     
X_train_encoded = preprocessor.fit_transform(X_train)

print(X_train_encoded)

[[-0.05198298 -0.37514254 -0.37098581 ...  0.          1.
   0.        ]
 [-0.05198298  1.28591748 -1.02633698 ...  1.          1.
   0.        ]
 [-0.92990676 -1.66246406  0.93971653 ...  0.          1.
   0.        ]
 ...
 [ 0.82594079 -0.79040755  0.28436536 ...  0.          1.
   0.        ]
 [ 0.82594079  0.45538747  0.93971653 ...  1.          0.
   1.        ]
 [ 0.82594079  0.97446873  0.93971653 ...  1.          0.
   1.        ]]


In [25]:
#Repeating the same transformation for the test dataset
X_test_encoded = preprocessor.fit_transform(X_test)

print(X_test_encoded)

[[-0.03081027  0.11093739 -1.04195107 ...  1.          1.
   0.        ]
 [-0.03081027 -0.95932705 -0.37982156 ...  1.          0.
   1.        ]
 [ 0.83819741 -0.24581742  0.28230795 ...  1.          1.
   0.        ]
 ...
 [ 0.83819741  0.6565624  -0.37982156 ...  1.          1.
   0.        ]
 [ 1.70720509  1.72682684  0.28230795 ...  0.          1.
   0.        ]
 [-0.03081027 -0.5396155   4.255085   ...  0.          1.
   0.        ]]


## Step 2: PCA for Feature Selection

In [29]:
from sklearn.decomposition import PCA
# keep the first two principal components of the data
pca1 = PCA(n_components=2)
# fit PCA model to data
pca1.fit(X_train_encoded)

# transform data onto the first two principal components
X_pca1 = pca1.transform(X_train_encoded)

# plot first vs. second principal component, colored by class
plt.figure(figsize=(8, 8))
mglearn.discrete_scatter(X_pca1[:, 0], X_pca1[:, 1], y_train)
plt.legend(wine.target_names, loc="best")
plt.gca().set_aspect("equal")
plt.xlabel("First principal component")
plt.ylabel("Second principal component")

NameError: name 'mglearn' is not defined

<Figure size 576x576 with 0 Axes>