# Importing & Loading Data

In [1520]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.utils import class_weight


In [1521]:
df = pd.read_csv('https://raw.githubusercontent.com/ardahk/amex/main/data/final_data.csv')

# Data Preparation & Preprocessing

**Features used for models:**

* 'user_id',
* 'num_of_item',
* 'product_id',
* 'status_cancelled',
* 'age',
* 'state_orders',
* 'postal_code_orders',
* 'traffic_source_orders',
* 'cost',
* 'category',
* 'brand',
* 'retail_price',
* 'department',
* 'postal_code_events',
* 'order_hour',
* 'order_day_of_week',
* 'order_month'

**Target Variable:**
*   status_complete (binary: 1 for completed, 0 for not completed)

In [1522]:
# Full list of columns
list(df.columns)

['order_id',
 'user_id',
 'gender',
 'num_of_item',
 'id_orders',
 'product_id',
 'order_created_at',
 'status_cancelled',
 'status_complete',
 'status_processing',
 'status_returned',
 'status__shipped',
 'age',
 'state_orders',
 'postal_code_orders',
 'traffic_source_orders',
 'cost',
 'category',
 'name',
 'brand',
 'retail_price',
 'department',
 'id_events',
 'sequence_number',
 'created_at',
 'state_events',
 'postal_code_events',
 'browser',
 'traffic_source_events',
 'event_type']

In [1523]:
# Dropping unwanted columns
dropped_columns = ['gender','order_id','event_type','state_events','created_at','name','sequence_number','status_processing','status_returned','status__shipped','id_orders','browser', 'traffic_source_events','id_events']
df.drop(columns=dropped_columns, inplace=True)

In [1524]:
df.head()

Unnamed: 0,user_id,num_of_item,product_id,order_created_at,status_cancelled,status_complete,age,state_orders,postal_code_orders,traffic_source_orders,cost,category,brand,retail_price,department,postal_code_events
0,90067,2,9588,2022-11-27 13:28:30+00:00,0,0,34,South Carolina,29588,Search,22.068,Sleep & Lounge,Calvin Klein,36.0,Women,29588
1,149,2,1238,2021-08-30 08:34:34+00:00,1,0,22,Florida,33317,Facebook,15.51312,Sweaters,Sag Harbor,31.92,Women,33317
2,149,2,5485,2021-09-02 10:08:47+00:00,1,0,22,Florida,33317,Facebook,18.019851,Pants & Capris,Woman Within,34.990002,Women,33317
3,47530,3,5485,2022-10-18 08:58:43+00:00,0,1,32,New York,13021,Search,18.019851,Pants & Capris,Woman Within,34.990002,Women,13021
4,26850,2,5485,2023-03-25 13:52:09+00:00,0,0,13,Massachusetts,1701,Search,18.019851,Pants & Capris,Woman Within,34.990002,Women,1701


In [1525]:
# Changed the "order_created_at" into datetime data type
df['order_created_at'] = pd.to_datetime(df['order_created_at'], errors = 'coerce')

df = df.dropna(subset='order_created_at', how='all')

df['order_created_at']

Unnamed: 0,order_created_at
0,2022-11-27 13:28:30+00:00
1,2021-08-30 08:34:34+00:00
2,2021-09-02 10:08:47+00:00
3,2022-10-18 08:58:43+00:00
4,2023-03-25 13:52:09+00:00
...,...
19691,2023-09-07 07:44:49+00:00
19692,2023-01-22 23:25:21+00:00
19693,2022-09-15 12:46:49+00:00
19694,2024-01-02 10:44:39+00:00


In [1526]:
# Convert 'order_created_at' to datetime object
df.loc[:, 'order_created_at'] = pd.to_datetime(df['order_created_at'])

In [1527]:
# This avoids SettingWithCopyWarning
df = df.copy()

# Extract hour, day of the week, and month from 'order_created_at' and create new columns
df['order_hour'] = df['order_created_at'].dt.hour
df['order_day_of_week'] = df['order_created_at'].dt.dayofweek
df['order_month'] = df['order_created_at'].dt.month

# Drop the original 'order_created_at' column
df.drop(columns='order_created_at', inplace=True)

print(df[['order_hour', 'order_day_of_week', 'order_month']].head())

   order_hour  order_day_of_week  order_month
0          13                  6           11
1           8                  0            8
2          10                  3            9
3           8                  1           10
4          13                  5            3


In [1528]:
# Get a list of all the object datatypes (Strings)
object_columns = list(df.select_dtypes(include='object').columns)
object_columns

['state_orders', 'traffic_source_orders', 'category', 'brand', 'department']

In [1529]:
# Before converting to numeric values
df[object_columns]

Unnamed: 0,state_orders,traffic_source_orders,category,brand,department
0,South Carolina,Search,Sleep & Lounge,Calvin Klein,Women
1,Florida,Facebook,Sweaters,Sag Harbor,Women
2,Florida,Facebook,Pants & Capris,Woman Within,Women
3,New York,Search,Pants & Capris,Woman Within,Women
4,Massachusetts,Search,Pants & Capris,Woman Within,Women
...,...,...,...,...,...
19691,Georgia,Search,Outerwear & Coats,Pearl iZUMi,Men
19692,New York,Search,Swim,Gary Majdell Sport,Men
19693,Illinois,Search,Suits & Sport Coats,Allegra K,Men
19694,Hawaii,Search,Sleep & Lounge,Ed Hardy,Men


In [1530]:
# Preprocessing: Convert categorical features to numeric values
label_encoders = {}
for col in object_columns:
    le = LabelEncoder()
    df.loc[:, col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [1531]:
# After converting to numeric values
df[object_columns]

Unnamed: 0,state_orders,traffic_source_orders,category,brand,department
0,39,4,17,339,1
1,8,2,22,1568,1
2,8,2,13,1992,1
3,31,4,13,1992,1
4,20,4,13,1992,1
...,...,...,...,...,...
19691,9,4,11,1407,0
19692,31,4,23,742,0
19693,12,4,21,74,0
19694,10,4,17,579,0


In [1532]:
# Scaling all large values to be between 0 and 1 for better performance
scaler = MinMaxScaler()

scale_list = ['traffic_source_orders','category','brand','postal_code_events','user_id','product_id','num_of_item', 'age', 'state_orders', 'postal_code_orders', 'cost', 'retail_price', 'order_hour', 'order_day_of_week', 'order_month']

df[scale_list] = scaler.fit_transform(df[scale_list])

In [1533]:
# Normalizing price and retail price using Min-Max Scaling
scaler = MinMaxScaler()
col = ['cost', 'retail_price']
df.loc[:, col] = scaler.fit_transform(df[['cost', 'retail_price']])

In [1534]:
# Final dataset that will be used for the models
df.head()

Unnamed: 0,user_id,num_of_item,product_id,status_cancelled,status_complete,age,state_orders,postal_code_orders,traffic_source_orders,cost,category,brand,retail_price,department,postal_code_events,order_hour,order_day_of_week,order_month
0,0.900739,0.333333,0.329166,0,0,0.37931,0.8125,0.289605,1.0,0.264772,0.68,0.163138,0.211672,1,0.289605,0.565217,1.0,0.909091
1,0.00146,0.333333,0.042382,1,0,0.172414,0.166667,0.327383,0.5,0.186097,0.88,0.754572,0.187669,1,0.327383,0.347826,0.0,0.636364
2,0.00146,0.333333,0.188247,1,0,0.172414,0.166667,0.327383,0.5,0.216184,0.52,0.958614,0.20573,1,0.327383,0.434783,0.5,0.727273
3,0.475322,0.666667,0.188247,0,1,0.344828,0.645833,0.121764,1.0,0.216184,0.52,0.958614,0.20573,1,0.121764,0.347826,0.166667,0.818182
4,0.2685,0.333333,0.188247,0,0,0.017241,0.416667,0.007082,1.0,0.216184,0.52,0.958614,0.20573,1,0.007082,0.565217,0.833333,0.181818


# Initialize Decision Tree Model

In [1535]:
feature_list = df.columns.drop('status_complete').to_list()
feature_list

['user_id',
 'num_of_item',
 'product_id',
 'status_cancelled',
 'age',
 'state_orders',
 'postal_code_orders',
 'traffic_source_orders',
 'cost',
 'category',
 'brand',
 'retail_price',
 'department',
 'postal_code_events',
 'order_hour',
 'order_day_of_week',
 'order_month']

In [1536]:
# Features and target variable
X = df[feature_list]
y = df['status_complete']

In [1537]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1538]:
# Initialize and train the Decision Tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [1539]:
# Make predictions
y_pred = model.predict(X_test)

In [1540]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.719147830605352


# Initialize Neural Network Model

In [1541]:
# Build the neural network model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, input_dim=X_train.shape[1], activation='relu'),  # Input layer
    tf.keras.layers.Dense(32, activation='relu'),  # Hidden layer
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7469 - loss: 0.5498 - val_accuracy: 0.7529 - val_loss: 0.5194
Epoch 2/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7426 - loss: 0.5266 - val_accuracy: 0.7529 - val_loss: 0.5217
Epoch 3/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7444 - loss: 0.5230 - val_accuracy: 0.7529 - val_loss: 0.5178
Epoch 4/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7407 - loss: 0.5260 - val_accuracy: 0.7529 - val_loss: 0.5156
Epoch 5/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7434 - loss: 0.5210 - val_accuracy: 0.7529 - val_loss: 0.5187
Epoch 6/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7400 - loss: 0.5223 - val_accuracy: 0.7529 - val_loss: 0.5166
Epoch 7/10
[1m481/481[0m [32m━━━━━━━