In [164]:
import os
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

## Step 1: Find and load the selected dataset for supervised learning. 
Loading ecommerce data from kaggle (https://www.kaggle.com/prachi13/customer-analytics)


In [179]:
folder = "https://raw.githubusercontent.com/beyenidogan/AdvData/main/Project1/"
file_name="ecommerce_shipping.csv"
file_url=folder+file_name

shipping = pd.read_csv(file_url,error_bad_lines=False)


In [180]:
shipping.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,ReachedonTime_YN
0,1,D,Flight,4.0,2,177.0,3,low,F,44,1233.0,1
1,2,F,Flight,4.0,5,216.0,2,low,M,59,3088.0,1
2,3,A,Flight,2.0,2,,4,low,M,48,3374.0,1
3,4,B,Flight,3.0,3,,4,medium,M,10,1177.0,1
4,5,C,Flight,,2,184.0,3,medium,F,46,2484.0,1


## Step 2: Explore the dataset:

In [181]:
shipping.shape

(10999, 12)

Exploring the data using the pandas functions info() and describe(). Note that the describe() function is only summarizing the numeric variables.
In order to understand the categorical variables value_counts() will be used. Note that the attributes Customer_Rating and ReachedonTime_YN are described as a number, but also will be included as categorical.

In [182]:
shipping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   10999 non-null  int64  
 1   Warehouse_block      10999 non-null  object 
 2   Mode_of_Shipment     10999 non-null  object 
 3   Customer_care_calls  9889 non-null   float64
 4   Customer_rating      10999 non-null  int64  
 5   Cost_of_the_Product  9947 non-null   float64
 6   Prior_purchases      10999 non-null  int64  
 7   Product_importance   10999 non-null  object 
 8   Gender               10999 non-null  object 
 9   Discount_offered     10999 non-null  int64  
 10  Weight_in_gms        9898 non-null   float64
 11  ReachedonTime_YN     10999 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 1.0+ MB


In [183]:
shipping.describe()

Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms,ReachedonTime_YN
count,10999.0,9889.0,10999.0,9947.0,10999.0,10999.0,9898.0,10999.0
mean,5500.0,4.055213,2.990545,210.1133,3.567597,13.373216,3636.579612,0.596691
std,3175.28214,1.137665,1.413603,48.158084,1.52286,16.205527,1636.169391,0.490584
min,1.0,2.0,1.0,96.0,2.0,1.0,1001.0,0.0
25%,2750.5,3.0,2.0,169.0,3.0,4.0,1842.0,0.0
50%,5500.0,4.0,3.0,214.0,3.0,7.0,4150.0,1.0
75%,8249.5,5.0,4.0,251.0,4.0,10.0,5051.0,1.0
max,10999.0,7.0,5.0,310.0,10.0,65.0,7846.0,1.0


In [184]:
categorical_features=['Warehouse_block','Mode_of_Shipment', 'Gender','Product_importance', 'ReachedonTime_YN']
shipping[categorical_features] = shipping[categorical_features].astype(str)

shipping[categorical_features].apply(pd.Series.value_counts)

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Gender,Product_importance,ReachedonTime_YN
0,,,,,4436.0
1,,,,,6563.0
A,1833.0,,,,
B,1833.0,,,,
C,1833.0,,,,
D,1834.0,,,,
F,3666.0,,5545.0,,
Flight,,1777.0,,,
M,,,5454.0,,
Road,,1760.0,,,


Although the above table is rather difficult to read, since all the values from 5 different columns are sorted in alphabetical order, it is helpful to see all the distributions by number in one view. 
Note that the output variable (Customer_rating) values are almost equally distributed (~2000 each).

The target feature will be Customer_rating, so separating the other attributes as features:

In [185]:
X = shipping.drop("Customer_rating", axis=1)
print(X.head())

   ID Warehouse_block Mode_of_Shipment  Customer_care_calls  \
0   1               D           Flight                  4.0   
1   2               F           Flight                  4.0   
2   3               A           Flight                  2.0   
3   4               B           Flight                  3.0   
4   5               C           Flight                  NaN   

   Cost_of_the_Product  Prior_purchases Product_importance Gender  \
0                177.0                3                low      F   
1                216.0                2                low      M   
2                  NaN                4                low      M   
3                  NaN                4             medium      M   
4                184.0                3             medium      F   

   Discount_offered  Weight_in_gms ReachedonTime_YN  
0                44         1233.0                1  
1                59         3088.0                1  
2                48         3374.0          

In [186]:
y = shipping["Customer_rating"]
print(y.head())

0    2
1    5
2    2
3    3
4    2
Name: Customer_rating, dtype: int64


## Step 3: Divide into a training set and a testing set. 
Using scikitlearn to divide the data into training and testing sets, making sure that the testing and training sets are balanced in terms of target classes.

In [187]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# split dataframe and income
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

## Step 4: Explore your training set. 

Exploring the train dataset using info() and describe() pandas functions.


In [188]:
X_train

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,ReachedonTime_YN
9537,9538,B,Ship,3.0,275.0,3,medium,M,7,4968.0,0
1832,1833,A,Ship,3.0,166.0,2,medium,F,58,1319.0,1
1687,1688,F,Ship,5.0,258.0,2,medium,F,48,3717.0,1
600,601,D,Ship,6.0,133.0,5,medium,M,32,1367.0,1
6740,6741,A,Ship,6.0,210.0,4,medium,M,9,,0
...,...,...,...,...,...,...,...,...,...,...,...
4859,4860,F,Ship,5.0,231.0,2,low,M,8,5743.0,1
3264,3265,D,Road,4.0,,2,low,M,2,4905.0,0
9845,9846,F,Road,4.0,161.0,3,medium,M,3,4590.0,0
10799,10800,F,Road,4.0,245.0,6,medium,M,2,1727.0,1


In [189]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8249 entries, 9537 to 2732
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   8249 non-null   int64  
 1   Warehouse_block      8249 non-null   object 
 2   Mode_of_Shipment     8249 non-null   object 
 3   Customer_care_calls  7391 non-null   float64
 4   Cost_of_the_Product  7461 non-null   float64
 5   Prior_purchases      8249 non-null   int64  
 6   Product_importance   8249 non-null   object 
 7   Gender               8249 non-null   object 
 8   Discount_offered     8249 non-null   int64  
 9   Weight_in_gms        7431 non-null   float64
 10  ReachedonTime_YN     8249 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 773.3+ KB


In [190]:
X_train.describe()

Unnamed: 0,ID,Customer_care_calls,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms
count,8249.0,7391.0,7461.0,8249.0,8249.0,7431.0
mean,5480.9743,4.059938,209.792655,3.573767,13.606134,3633.454044
std,3179.856598,1.135028,47.892955,1.510124,16.414134,1641.273883
min,1.0,2.0,96.0,2.0,1.0,1002.0
25%,2720.0,3.0,170.0,3.0,4.0,1839.5
50%,5487.0,4.0,213.0,3.0,7.0,4145.0
75%,8225.0,5.0,250.0,4.0,10.0,5065.0
max,10998.0,7.0,310.0,10.0,65.0,7846.0


The distribution (looking at the mean and std variation) of the numeric variables above are fairly similar to the full dataset.

In [191]:
X_train[categorical_features].apply(pd.Series.value_counts)

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Gender,Product_importance,ReachedonTime_YN
0,,,,,3301.0
1,,,,,4948.0
A,1365.0,,,,
B,1386.0,,,,
C,1338.0,,,,
D,1404.0,,,,
F,2756.0,,4187.0,,
Flight,,1326.0,,,
M,,,4062.0,,
Road,,1322.0,,,


In [192]:
y_train.value_counts()

1    1697
3    1684
4    1668
2    1602
5    1598
Name: Customer_rating, dtype: int64

Comparing the counts of target variable (Customer_rating) of the train dataset to the original dataset, it can be said that the split is fairly balanced (all categories are around ~1600 values vs ~2000 in the original dataset).
It is also the case for the other categorical variables, which have proportinal representation in each category values.

## Step 4: Data cleaning. 
Address any missing values in the training set, creating a second, cleaned, version of your dataset. Then applying the same procedure to the test set.

The some options for handling the null values are:
“Get rid of the corresponding samples.”
“Get rid of the whole attribute (column).”
“Set the values to some value (zero, the mean, the median, etc.).”

Note that looking at the info of the full dataset, we already know that the Customer_care_calls, Cost_of_the_Product and Weight_in_gms have less number of non-null values, thus might require more attention.

In [195]:
X_train['ID'].isnull().values.any()

False

In [196]:
X_train['Warehouse_block'].isnull().values.any()

False

In [197]:
X_train['Mode_of_Shipment'].isnull().values.any()

False

In [198]:
X_train['Customer_care_calls'].isnull().values.any()

True

In [200]:
X_train['Cost_of_the_Product'].isnull().values.any()

True

In [201]:
X_train['Prior_purchases'].isnull().values.any()

False

In [202]:
X_train['Product_importance'].isnull().values.any()

False

In [203]:
X_train['Gender'].isnull().values.any()

False

In [204]:
X_train['Discount_offered'].isnull().values.any()

False

In [205]:
X_train['Weight_in_gms'].isnull().values.any()

True

In [206]:
X_train['ReachedonTime_YN'].isnull().values.any()

False

In [208]:
X_test['Warehouse_block'].isnull().values.any()

False

In [209]:
X_test['Mode_of_Shipment'].isnull().values.any()

False

In [210]:
X_test['Customer_care_calls'].isnull().values.any()

True

In [211]:
X_test['Cost_of_the_Product'].isnull().values.any()

True

In [212]:
X_test['Prior_purchases'].isnull().values.any()

False

In [213]:
X_test['Product_importance'].isnull().values.any()

False

In [214]:
X_test['Gender'].isnull().values.any()

False

In [215]:
X_test['Discount_offered'].isnull().values.any()

False

In [216]:
X_train['Weight_in_gms'].isnull().values.any()

True

In [217]:
X_train['ReachedonTime_YN'].isnull().values.any()

False

In [218]:
X_test['Warehouse_block'].isnull().values.any()

False

In [145]:
print(shipping)

          ID Warehouse_block Mode_of_Shipment  Customer_care_calls  \
0          1               D           Flight                    4   
1          2               F           Flight                    4   
2          3               A           Flight                    2   
3          4               B           Flight                    3   
4          5               C           Flight                    2   
...      ...             ...              ...                  ...   
10994  10995               A             Ship                    4   
10995  10996               B             Ship                    4   
10996  10997               C             Ship                    5   
10997  10998               F             Ship                    5   
10998  10999               D             Ship                    2   

       Customer_rating  Cost_of_the_Product  Prior_purchases  \
0                    2                  177                3   
1                    5         

Step 5: Visualize the data in your training set. At a minimum, use the following pandas functions to visualize the data in your Jupyter notebook.

DataFrame.hist
plotting.scatter_matrix()

Step 6: Apply transformations to your data. In your Jupetyr notebook apply, squaring, cubing, logarithmic, and exponentials transformations to two features in your dataset. Plot the histograms and scatter matrices of the resultant data.