In [11]:
import os
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

pd.options.mode.chained_assignment = None 

## Step 1: Load your data, including testing/training split from Project 1
Loading ecommerce data from kaggle (https://www.kaggle.com/prachi13/customer-analytics)

Addressing any missing data issues.

In [21]:
folder = "https://raw.githubusercontent.com/beyenidogan/AdvData/main/Project2/"
file_name="ecommerce_shipping.csv"
file_url=folder+file_name

shipping = pd.read_csv(file_url,error_bad_lines=False)


In [22]:
shipping.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,ReachedonTime_YN
0,1,D,Flight,4,2,177,3,low,F,44,1233.0,1
1,2,F,Flight,4,5,216,2,low,M,59,3088.0,1
2,3,A,Flight,2,2,183,4,low,M,48,3374.0,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177.0,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484.0,1


In [23]:
shipping.shape

(10999, 12)

In [24]:
shipping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   10999 non-null  int64  
 1   Warehouse_block      10999 non-null  object 
 2   Mode_of_Shipment     10999 non-null  object 
 3   Customer_care_calls  10999 non-null  int64  
 4   Customer_rating      10999 non-null  int64  
 5   Cost_of_the_Product  10999 non-null  int64  
 6   Prior_purchases      10999 non-null  int64  
 7   Product_importance   10999 non-null  object 
 8   Gender               10999 non-null  object 
 9   Discount_offered     10999 non-null  int64  
 10  Weight_in_gms        9898 non-null   float64
 11  ReachedonTime_YN     10999 non-null  int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 1.0+ MB


In [25]:
shipping.describe()

Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms,ReachedonTime_YN
count,10999.0,10999.0,10999.0,10999.0,10999.0,10999.0,9898.0,10999.0
mean,5500.0,4.054459,2.990545,210.196836,3.567597,13.373216,3636.579612,0.596691
std,3175.28214,1.14149,1.413603,48.063272,1.52286,16.205527,1636.169391,0.490584
min,1.0,2.0,1.0,96.0,2.0,1.0,1001.0,0.0
25%,2750.5,3.0,2.0,169.0,3.0,4.0,1842.0,0.0
50%,5500.0,4.0,3.0,214.0,3.0,7.0,4150.0,1.0
75%,8249.5,5.0,4.0,251.0,4.0,10.0,5051.0,1.0
max,10999.0,7.0,5.0,310.0,10.0,65.0,7846.0,1.0


From Project 1, we knew that only the Weight_in_gms column had any missing values, so addressing the missing records by removing them:

In [26]:
shipping.isnull().values.any()

True

In [27]:
shipping['Weight_in_gms'].isnull().values.any()

True

In [28]:
#checking how many elements are null
shipping['Weight_in_gms'].isnull().values.sum()

1101

In [20]:
#dropping the NA rows
shipping.dropna(subset=['Weight_in_gms'])

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,ReachedonTime_YN
0,1,D,Flight,4,2,177,3,low,F,44,1233.0,1
1,2,F,Flight,4,5,216,2,low,M,59,3088.0,1
2,3,A,Flight,2,2,183,4,low,M,48,3374.0,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177.0,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10993,10994,F,Ship,5,1,257,6,low,F,1,1150.0,0
10994,10995,A,Ship,4,1,252,5,medium,F,1,1538.0,1
10995,10996,B,Ship,4,1,232,5,medium,F,6,1247.0,0
10996,10997,C,Ship,5,4,242,5,low,F,4,1155.0,0


## Step 2: Preparing the dataset
Making sure that all the appropriate variables are converted to categorical variables (as ordinal or one hot) and any necessary feature scaling is done

In [8]:
categorical_features=['Warehouse_block','Mode_of_Shipment', 'Gender','Product_importance', 'ReachedonTime_YN']
shipping[categorical_features] = shipping[categorical_features].astype(str)

shipping[categorical_features].apply(pd.Series.value_counts)

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Gender,Product_importance,ReachedonTime_YN
0,,,,,4436.0
1,,,,,6563.0
A,1833.0,,,,
B,1833.0,,,,
C,1833.0,,,,
D,1834.0,,,,
F,3666.0,,5545.0,,
Flight,,1777.0,,,
M,,,5454.0,,
Road,,1760.0,,,


## Step 3: Examining the target attribute. 

Examining the attribute you are going to predict.

Plotting the distribution of the target attribute in the dataset (e.g., is it Gaussian, uniform, logarithmic). This will help in interpreting the performance of different algorithms on your data.

In [29]:
y = shipping["Customer_rating"]
print(shipping["Customer_rating"].value_counts())

3    2239
1    2235
4    2189
5    2171
2    2165
Name: Customer_rating, dtype: int64


In [10]:
X = shipping.drop("ReachedonTime_YN", axis=1)
print(X.head())

   ID Warehouse_block Mode_of_Shipment  Customer_care_calls  Customer_rating  \
0   1               D           Flight                    4                2   
1   2               F           Flight                    4                5   
2   3               A           Flight                    2                2   
3   4               B           Flight                    3                3   
4   5               C           Flight                    2                2   

   Cost_of_the_Product  Prior_purchases Product_importance Gender  \
0                  177                3                low      F   
1                  216                2                low      M   
2                  183                4                low      M   
3                  176                4             medium      M   
4                  184                3             medium      F   

   Discount_offered  Weight_in_gms  
0                44         1233.0  
1                59         30

In [527]:
# splitting
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# split dataframe and income
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

## Step 6: Applying transformations to the datasets
Applying transformations to the features in the dataset and then plotting the histograms and scatter matrices of the resultant data.

In [577]:
#applying scaler for all attributes
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [578]:
print(X_train2.var())

ID                     1.011149e+07
Customer_care_calls    1.302435e+00
Cost_of_the_Product    2.289624e+03
Prior_purchases        2.280476e+00
Discount_offered       2.694238e+02
Weight_in_gms          2.693780e+06
dtype: float64


In [579]:
X_train2_subset= X_train2[['Customer_care_calls', 'Cost_of_the_Product', 'Prior_purchases','Discount_offered','Weight_in_gms']]

In [589]:
X_train2scaled=pd.DataFrame(scaler.fit_transform(X_train2_subset))
X_train2scaled.columns=X_train2_subset.columns
print(X_train2scaled.var())

Customer_care_calls    1.000121
Cost_of_the_Product    1.000121
Prior_purchases        1.000121
Discount_offered       1.000121
Weight_in_gms          1.000135
dtype: float64


In [583]:
#repeating the same procedure on the test dataset
print(X_test2.var())


ID                     9.994515e+06
Customer_care_calls    1.304938e+00
Cost_of_the_Product    2.371291e+03
Prior_purchases        2.435385e+00
Discount_offered       2.416470e+02
Weight_in_gms          2.627612e+06
dtype: float64


In [584]:
X_test2_subset= X_test2[['Customer_care_calls', 'Cost_of_the_Product', 'Prior_purchases','Discount_offered','Weight_in_gms']]

In [590]:
X_test2scaled=pd.DataFrame(scaler.fit_transform(X_test2_subset))
X_test2scaled.columns=X_test2_subset.columns
print(X_test2scaled.var())

Customer_care_calls    1.000364
Cost_of_the_Product    1.000364
Prior_purchases        1.000364
Discount_offered       1.000364
Weight_in_gms          1.000406
dtype: float64
