In [10]:
!git clone https://github.com/ardahk/amex.git

Cloning into 'amex'...
remote: Enumerating objects: 216, done.[K
remote: Counting objects: 100% (102/102), done.[K
remote: Compressing objects: 100% (80/80), done.[K
remote: Total 216 (delta 52), reused 58 (delta 22), pack-reused 114 (from 1)[K
Receiving objects: 100% (216/216), 364.96 MiB | 10.33 MiB/s, done.
Resolving deltas: 100% (99/99), done.
Updating files: 100% (48/48), done.


In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [16]:
df = pd.read_csv('amex/data/final_data.csv')

In [17]:
df.head()

Unnamed: 0,order_id,user_id,gender,num_of_item,id_orders,product_id,order_created_at,status_cancelled,status_complete,status_processing,...,retail_price,department,id_events,sequence_number,created_at,state_events,postal_code_events,browser,traffic_source_events,event_type
0,112736,90067,F,2,163681,9588,2022-11-27 13:28:30+00:00,0,0,1,...,36.0,Women,1176673,7,2022-11-27 13:28:30+00:00,South Carolina,29588,Firefox,Adwords,purchase
1,185,149,F,2,279,1238,2021-08-30 08:34:34+00:00,1,0,0,...,31.92,Women,2113,7,2021-08-30 08:34:34+00:00,Florida,33317,Chrome,Organic,purchase
2,185,149,F,2,278,5485,2021-09-02 10:08:47+00:00,1,0,0,...,34.990002,Women,2106,7,2021-09-02 10:08:47+00:00,Florida,33317,IE,Email,purchase
3,59511,47530,F,3,86530,5485,2022-10-18 08:58:43+00:00,0,1,0,...,34.990002,Women,622843,10,2022-10-18 08:58:43+00:00,New York,13021,Chrome,Email,purchase
4,33505,26850,F,2,48710,5485,2023-03-25 13:52:09+00:00,0,0,0,...,34.990002,Women,350593,7,2023-03-25 13:52:09+00:00,Massachusetts,1701,Chrome,Facebook,purchase


In [18]:
df.columns

Index(['order_id', 'user_id', 'gender', 'num_of_item', 'id_orders',
       'product_id', 'order_created_at', 'status_cancelled', 'status_complete',
       'status_processing', 'status_returned', 'status__shipped', 'age',
       'state_orders', 'postal_code_orders', 'traffic_source_orders', 'cost',
       'category', 'name', 'brand', 'retail_price', 'department', 'id_events',
       'sequence_number', 'created_at', 'state_events', 'postal_code_events',
       'browser', 'traffic_source_events', 'event_type'],
      dtype='object')

In [19]:
print(df.shape)
df.dtypes

(19696, 30)


Unnamed: 0,0
order_id,int64
user_id,int64
gender,object
num_of_item,int64
id_orders,int64
product_id,int64
order_created_at,object
status_cancelled,int64
status_complete,int64
status_processing,int64


In [7]:
to_encode = df.select_dtypes('object').columns
to_encode

Index(['gender', 'order_created_at', 'state_orders', 'traffic_source_orders',
       'category', 'name', 'brand', 'department', 'created_at', 'state_events',
       'browser', 'traffic_source_events', 'event_type'],
      dtype='object')

In [8]:
for i in to_encode:
    print(i, ":", df[i].nunique())

gender : 2
order_created_at : 19692
state_orders : 49
traffic_source_orders : 5
category : 26
name : 13503
brand : 2088
department : 2
created_at : 19692
state_events : 49
browser : 5
traffic_source_events : 5
event_type : 1


In [9]:
from sklearn.preprocessing import OneHotEncoder  # Import OneHotEncoder

to_encode = ['gender', 'traffic_source_orders', 'state_events',
       'category', 'department',
       'browser', 'traffic_source_events', 'event_type']

enc = OneHotEncoder(sparse=False)
df_enc = pd.DataFrame(enc.fit_transform(df[to_encode]))
df_enc.columns = enc.get_feature_names_out(to_encode)
df = df.drop(columns = to_encode, axis = 1)
df = df.join(df_enc)
df.head()



Unnamed: 0,order_id,user_id,num_of_item,id_orders,product_id,order_created_at,status_cancelled,status_complete,status_processing,status_returned,...,browser_Firefox,browser_IE,browser_Other,browser_Safari,traffic_source_events_Adwords,traffic_source_events_Email,traffic_source_events_Facebook,traffic_source_events_Organic,traffic_source_events_YouTube,event_type_purchase
0,112736,90067,2,163681,9588,2022-11-27 13:28:30+00:00,0,0,1,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,185,149,2,279,1238,2021-08-30 08:34:34+00:00,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,185,149,2,278,5485,2021-09-02 10:08:47+00:00,1,0,0,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,59511,47530,3,86530,5485,2022-10-18 08:58:43+00:00,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,33505,26850,2,48710,5485,2023-03-25 13:52:09+00:00,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [20]:
df['category']

Unnamed: 0,category
0,Sleep & Lounge
1,Sweaters
2,Pants & Capris
3,Pants & Capris
4,Pants & Capris
...,...
19691,Outerwear & Coats
19692,Swim
19693,Suits & Sport Coats
19694,Sleep & Lounge


In [10]:
#df[['state_events', 'state_orders']]

In [11]:
# from sklearn.preprocessing import LabelEncoder

# # Creating a instance of label Encoder.
# le = LabelEncoder()

# # Using .fit_transform function to fit label
# # encoder and return encoded label
# label = le.fit_transform(df['state_events'])

# le_1 = LabelEncoder()

# # Using .fit_transform function to fit label
# # encoder and return encoded label
# label_1 = le_1.fit_transform(df['state_orders'])

# df.drop(["state_events", "state_orders"], axis=1, inplace=True)

# # Appending the array to our dataFrame
# # with column name 'Purchased'
# df["state_events"] = label
# df['state_orders'] = label_1
# # printing Dataframe
# print(df['state_events'])
# df['state_orders']

In [10]:
df.drop(['order_created_at', 'name', 'brand', 'created_at', 'state_orders'], inplace=True, axis=1)

In [11]:
df.shape

(19696, 112)

In [12]:
y = df['product_id']
X = df.drop(columns = 'product_id', axis=1)

In [13]:
df['product_id'].value_counts()

Unnamed: 0_level_0,count
product_id,Unnamed: 1_level_1
26053,6
9490,6
17164,5
3994,5
10897,5
...,...
8420,1
4345,1
3713,1
7449,1


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 123)

In [14]:
def train_test_DT(X_train, X_test, y_train, y_test, depth, leaf=1): #crit='entropy'
    # YOUR CODE HERE
    #criterion = crit
    model = DecisionTreeClassifier( max_depth = depth, min_samples_leaf = leaf, min_samples_split = 2)
    model.fit(X_train, y_train)
    class_label_predictions = model.predict(X_test)
    acc_score = accuracy_score(y_test, class_label_predictions)

    return acc_score

In [1]:
# YOUR CODE HERE
# depth1= 12
# depth2 = 32

# max_depth_range = [depth1, depth2]
# acc = []

# for md in max_depth_range:
#     # YOUR CODE HERE
#     score = train_test_DT(X_train, X_test, y_train, y_test, md, 1)
#     print('Max Depth=' + str(md) + ', accuracy score: ' + str(score))
#     acc.append(float(score))
acc = []
score = train_test_DT(X_train, X_test, y_train, y_test, 32, 1)
print('Max Depth=' + str(32) + ', accuracy score: ' + str(score))
acc.append(float(score))

NameError: name 'train_test_DT' is not defined

In [None]:
def visualize_accuracy(hyperparam_list, acc_scores_list):
    fig = plt.figure()
    ax = fig.add_subplot(111)

    p = sns.lineplot(x=hyperparam_list, y=acc_scores_list, marker='o')

    ax.set_xlabel('hyperparam')
    ax.set_ylabel('Accuracy')
    plt.show()

In [None]:
visualize_accuracy(max_depth_range, acc)

In [None]:
# next steps:
# add cross validation, grid search