# E-Commerce Shipping Data

### Product Shipment Delivered on time or not? To Meet E-Commerce Customer Demand

![image.png](../Images/Shipment.png)

e-commerce 배송의 정시 도착여부 ( 1 : 정시배송, 0 : 정시미배송 )를 판단하시오. <br>
단, 두 개 이상의 모형의 성능을 비교하여 가장 우수한 모형을 선정할 것

### Library & Data Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training features, the test features and the training labels
# (in that order) from the shared Datasets directory.
X_train, X_test, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Datasets/Shipment_X_train.csv',
        '../Datasets/Shipment_X_test.csv',
        '../Datasets/Shipment_y_train.csv',
    )
)

### 1. Data Exploration

In [3]:
# Preview the training feature table.
X_train

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
0,6045,A,Flight,4,3,266,5,high,F,5,1590
1,44,F,Ship,3,1,174,2,low,M,44,1556
2,7940,F,Road,4,1,154,10,high,M,10,5674
3,1596,F,Ship,4,3,158,3,medium,F,27,1207
4,4395,A,Flight,5,3,175,3,low,M,7,4833
...,...,...,...,...,...,...,...,...,...,...,...
6593,8610,F,Road,5,2,221,6,medium,M,4,1952
6594,3390,F,Ship,4,5,256,3,medium,M,10,4504
6595,5150,F,Ship,3,1,217,4,medium,F,1,5761
6596,3950,F,Road,4,5,174,3,medium,F,8,5576


In [4]:
# Preview the test feature table (same columns as the training features).
X_test

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
0,6811,D,Ship,5,2,259,5,low,F,7,1032
1,4320,F,Ship,3,5,133,3,medium,F,4,5902
2,5732,F,Road,3,4,191,5,medium,F,4,4243
3,7429,D,Ship,4,2,221,3,low,M,10,4126
4,2191,D,Flight,4,5,230,2,low,F,38,2890
...,...,...,...,...,...,...,...,...,...,...,...
4396,2610,F,Flight,4,1,157,3,medium,M,31,1712
4397,3406,B,Road,3,5,139,2,medium,M,7,5536
4398,10395,A,Road,4,1,170,10,medium,F,3,5211
4399,3646,B,Ship,3,1,244,3,medium,F,1,5695


In [5]:
# Preview the training labels: ID plus the Reached.on.Time_Y.N target.
y_train

Unnamed: 0,ID,Reached.on.Time_Y.N
0,6045,0
1,44,1
2,7940,1
3,1596,1
4,4395,1
...,...,...
6593,8610,1
6594,3390,1
6595,5150,1
6596,3950,0


In [6]:
# Show column dtypes and non-null counts for the training features.
# NOTE(review): Customer_care_calls is reported as object dtype although the
# preview rows show integers — presumably some rows hold non-numeric text;
# confirm and clean it if it should be treated as a numeric feature.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   6598 non-null   int64 
 1   Warehouse_block      6598 non-null   object
 2   Mode_of_Shipment     6598 non-null   object
 3   Customer_care_calls  6598 non-null   object
 4   Customer_rating      6598 non-null   int64 
 5   Cost_of_the_Product  6598 non-null   int64 
 6   Prior_purchases      6598 non-null   int64 
 7   Product_importance   6598 non-null   object
 8   Gender               6598 non-null   object
 9   Discount_offered     6598 non-null   int64 
 10  Weight_in_gms        6598 non-null   int64 
dtypes: int64(6), object(5)
memory usage: 567.1+ KB


### 2. Data Preprocessing

#### (1) 모델에 불필요한 식별자(ID) 변수 삭제

In [7]:
# ID is only a row key, so it carries no signal for the model.
# The test-set IDs are still needed for the submission file, so keep a copy
# before the column is removed.
ID = X_test['ID'].copy()

# Drop the ID column from the features and from the label table alike.
X_train, X_test, y_train = (
    frame.drop(columns=['ID'])
    for frame in (X_train, X_test, y_train)
)

#### (2) Missing Value

결측값 없음

### 3. Data Modeling

#### (1) One-Hot Encoding

In [8]:
from sklearn.preprocessing import OneHotEncoder

# Categorical features are the object-dtype columns.
# NOTE(review): X_train.info() reports Customer_care_calls as object, so it is
# one-hot encoded here rather than scaled — confirm that this is intended.
X_train_cat = X_train.select_dtypes('object').copy()
X_test_cat = X_test.select_dtypes('object').copy()

# handle_unknown='ignore' encodes a category that appears only in the test
# data as an all-zero row instead of raising during transform.
ohe = OneHotEncoder(handle_unknown='ignore')

# Fit on the training data only so the test set does not influence the
# learned category list.
ohe.fit(X_train_cat)

# The encoder returns a sparse matrix by default; .toarray() densifies it.
# This avoids the `sparse=` keyword (removed in scikit-learn 1.4) as well as
# `sparse_output=` (only available from 1.2), so it runs on any version.
X_train_ohe = ohe.transform(X_train_cat).toarray()
X_test_ohe = ohe.transform(X_test_cat).toarray()

#### (2) Scaling

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Numeric features are every column that is not object-typed.
X_train_num = X_train.select_dtypes(exclude='object').copy()
X_test_num = X_test.select_dtypes(exclude='object').copy()

# Fit the scaler on the training data only, then reuse the fitted scaler
# for both tables so they share one scale.
scaler = MinMaxScaler().fit(X_train_num)

X_train_sca, X_test_sca = (
    scaler.transform(numeric_part)
    for numeric_part in (X_train_num, X_test_num)
)

#### (3) Data Concat & Split

In [10]:
# Place the one-hot columns and the scaled numeric columns side by side.
X_TRAIN = np.hstack((X_train_ohe, X_train_sca))
X_TEST = np.hstack((X_test_ohe, X_test_sca))

# Target as a single Series.
y_TRAIN = y_train['Reached.on.Time_Y.N']

# Sanity check: container types, then shapes.
print(*map(type, (X_TRAIN, X_TEST, y_TRAIN)))
print(*(data.shape for data in (X_TRAIN, X_TEST, y_TRAIN)))

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'pandas.core.series.Series'>
(6598, 24) (4401, 24) (6598,)


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled data for evaluation; stratify on the target
# and fix the seed so the split is reproducible.
holdout_parts = train_test_split(
    X_TRAIN,
    y_TRAIN,
    test_size=0.25,
    stratify=y_TRAIN,
    random_state=1234,
)
xtrain, xtest, ytrain, ytest = holdout_parts

print(*(part.shape for part in (xtrain, xtest, ytrain, ytest)))

(4948, 24) (1650, 24) (4948,) (1650,)


### 4. Modeling

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [13]:
def make_models(xtrain, xtest, ytrain, ytest):
    """Fit every candidate classifier and print one score line per model.

    Candidates, in order: logistic regression (``model1``), decision trees
    (``model2``: unrestricted, then max_depth 3..7), random forests
    (``model3``: default settings, then 500 trees with max_depth 3..7) and
    XGBoost (``model4``).

    Args:
        xtrain: Feature matrix used to fit each model.
        xtest: Feature matrix passed to ``get_scores`` for evaluation.
        ytrain: Labels matching ``xtrain``.
        ytest: Labels matching ``xtest``.

    Returns:
        None. Results are printed, not returned.
    """
    depth_grid = range(3, 8)

    def fit_and_report(estimator, *tag):
        # Fit on the training split, then print the tag followed by the
        # score string produced by get_scores.
        fitted = estimator.fit(xtrain, ytrain)
        print(*tag, get_scores(fitted, xtrain, xtest, ytrain, ytest))

    fit_and_report(LogisticRegression(), 'model1')

    fit_and_report(DecisionTreeClassifier(random_state=0), 'model2')
    for depth in depth_grid:
        fit_and_report(
            DecisionTreeClassifier(max_depth=depth, random_state=0),
            'model2', depth,
        )

    fit_and_report(RandomForestClassifier(random_state=0), 'model3')
    for depth in depth_grid:
        fit_and_report(
            RandomForestClassifier(500, max_depth=depth, random_state=0),
            'model3', depth,
        )

    fit_and_report(XGBClassifier(eval_metric='logloss'), 'model4')

### 5. Model Evaluation

In [14]:
from sklearn.metrics import roc_auc_score

def get_scores(model, xtrain, xtest, ytrain, ytest):
    """Return a two-number score summary for a fitted model.

    The first number is ``model.score(xtrain, ytrain)`` (mean accuracy for
    scikit-learn classifiers). The second is the ROC AUC on the evaluation
    data, computed from the second column of ``model.predict_proba(xtest)``.
    Note that the two numbers come from different data and different metrics.

    Args:
        model: Fitted estimator exposing ``score`` and ``predict_proba``.
        xtrain: Features the training score is computed on.
        xtest: Features the ROC AUC is computed on.
        ytrain: Labels matching ``xtrain``.
        ytest: Labels matching ``xtest``.

    Returns:
        A string holding both values, each formatted with the ``.4`` spec
        and separated by a single space.
    """
    train_score = model.score(xtrain, ytrain)

    # Column 1 of predict_proba — presumably the probability of label 1
    # for a 0/1 target.
    positive_proba = model.predict_proba(xtest)[:, 1]
    holdout_auc = roc_auc_score(ytest, positive_proba)

    return f'{train_score:.4} {holdout_auc:.4}'

In [15]:
# Fit and score every candidate model on the train / hold-out split.
make_models(xtrain, xtest, ytrain, ytest)

model1 0.6504 0.7234
model2 1.0 0.6365
model2 3 0.6841 0.7369
model2 4 0.6902 0.7402
model2 5 0.6975 0.7412
model2 6 0.7043 0.7371
model2 7 0.7074 0.734
model3 1.0 0.7393
model3 3 0.6863 0.743
model3 4 0.6926 0.7488
model3 5 0.7063 0.7469
model3 6 0.7197 0.746
model3 7 0.75 0.7449
model4 0.945 0.7397


In [16]:
# Refit the selected configuration: a random forest with max_depth=4.
# NOTE(review): the depth sweep in make_models used 500 trees, while this
# refit uses the default tree count — confirm which setting is intended.
# NOTE(review): the model is trained on the 75% split only; consider
# refitting on all labelled data before predicting the real test set.
final_model = RandomForestClassifier(max_depth=4, random_state=0)
final_model.fit(xtrain, ytrain)

final_scores = get_scores(final_model, xtrain, xtest, ytrain, ytest)
print('final model', final_scores)

final model 0.6934 0.7486


### 6. Save Result

In [17]:
# Predict class labels for the real test set and write them out next to the
# test IDs saved before the ID column was dropped.
y_pred = final_model.predict(X_TEST)

result = pd.DataFrame({'ID': ID, 'Reached.on.Time_Y.N': y_pred})
result.to_csv("./result.csv", index=False)