In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the order export; the later cells build the feature frame from `df`.
DATA_PATH = 'ecommerce_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['order_id', 'customer_name', 'product_name', 'category', 'quantity',
       'unit_price', 'total_price', 'discount', 'total_discount',
       'coupon_code', 'payment_method', 'order_status', 'order_date',
       'cost_price', 'profit'],
      dtype='object')

In [4]:
# Preview the first five rows (head's default count, written out explicitly).
df.head(n=5)

Unnamed: 0,order_id,customer_name,product_name,category,quantity,unit_price,total_price,discount,total_discount,coupon_code,payment_method,order_status,order_date,cost_price,profit
0,5e04caf7-7120-4156-9fc3-5727d6434fc8,Daniel Werner,Smartwatch,Toys,9,783.62,7052.58,1319.25,5733.33,FREESHIP,Gift Card,Cancelled,2024-05-08,434.19,1825.62
1,dcc375de-99ff-4e35-9943-079102c54161,Scott Jones,Camera,Beauty,1,685.14,685.14,120.16,564.98,SUMMER50,Gift Card,Refunded,2024-09-15,470.47,94.51
2,18858024-5cfe-49d5-84f8-960533de8d38,Hailey Robinson,Smartphone,Fashion,6,362.93,2177.58,321.44,1856.14,SAVE10,Bitcoin,Refunded,2024-07-17,191.26,708.58
3,da8745c3-1ab4-499f-b061-d90ca4d2151c,Terri Harrington,Headphones,Home,2,235.86,471.72,103.2,368.52,SAVE10,Bitcoin,Refunded,2024-10-15,155.87,56.78
4,955e97a6-ff47-458e-a5a2-0b6dce671358,Michelle Bowen,Smartwatch,Home,3,618.93,1856.79,519.12,1337.67,FREESHIP,Credit Card,Completed,2024-06-12,338.94,320.85


test model

In [10]:
# Start the feature frame from the order table, leaving out the order id,
# customer name and order date columns.
dropped_cols = ['order_id', 'customer_name', 'order_date']
x = df.drop(columns=dropped_cols)
x.head(n=5)

Unnamed: 0,product_name,category,quantity,unit_price,total_price,discount,total_discount,coupon_code,payment_method,order_status,cost_price,profit
0,Smartwatch,Toys,9,783.62,7052.58,1319.25,5733.33,FREESHIP,Gift Card,Cancelled,434.19,1825.62
1,Camera,Beauty,1,685.14,685.14,120.16,564.98,SUMMER50,Gift Card,Refunded,470.47,94.51
2,Smartphone,Fashion,6,362.93,2177.58,321.44,1856.14,SAVE10,Bitcoin,Refunded,191.26,708.58
3,Headphones,Home,2,235.86,471.72,103.2,368.52,SAVE10,Bitcoin,Refunded,155.87,56.78
4,Smartwatch,Home,3,618.93,1856.79,519.12,1337.67,FREESHIP,Credit Card,Completed,338.94,320.85


In [13]:
# Missing values: put the 'No' placeholder only into the non-numeric columns.
# Filling the whole frame would write a string into any numeric column that
# has a gap, turning it into object dtype and breaking the arithmetic below.
non_numeric_cols = x.select_dtypes(exclude='number').columns
x[non_numeric_cols] = x[non_numeric_cols].fillna('No')

# feature eng
# Both ratios are taken against total_price. A zero total_price gives +/-inf
# (n / 0) or NaN (0 / 0); either way the ratio is undefined, so it is set to 0
# and no inf/NaN produced by a zero denominator reaches the scaler or models.
has_total = x['total_price'] != 0
x['profit_margin'] = (x['profit'] / x['total_price']).where(has_total, 0)
# Despite the name this is a fraction of total_price, not a 0-100 percentage.
x['discount_percent'] = (x['discount'] / x['total_price']).where(has_total, 0)
# Per-unit difference between unit price and cost price.
x['Price_diff'] = x['unit_price'] - x['cost_price']

In [15]:
# Preview the first five rows of the feature frame, including the new ratio columns.
x.head(n=5)

Unnamed: 0,product_name,category,quantity,unit_price,total_price,discount,total_discount,coupon_code,payment_method,order_status,cost_price,profit,profit_margin,discount_percent,Price_diff
0,Smartwatch,Toys,9,783.62,7052.58,1319.25,5733.33,FREESHIP,Gift Card,Cancelled,434.19,1825.62,0.258858,0.187059,349.43
1,Camera,Beauty,1,685.14,685.14,120.16,564.98,SUMMER50,Gift Card,Refunded,470.47,94.51,0.137943,0.17538,214.67
2,Smartphone,Fashion,6,362.93,2177.58,321.44,1856.14,SAVE10,Bitcoin,Refunded,191.26,708.58,0.325398,0.147613,171.67
3,Headphones,Home,2,235.86,471.72,103.2,368.52,SAVE10,Bitcoin,Refunded,155.87,56.78,0.120368,0.218774,79.99
4,Smartwatch,Home,3,618.93,1856.79,519.12,1337.67,FREESHIP,Credit Card,Completed,338.94,320.85,0.172798,0.279579,279.99


In [16]:
# Integer-encode each categorical column with its own LabelEncoder.
# The fitted encoders are kept in `encoders`, keyed by column name, so a code
# can be mapped back to its label later, e.g.
#   encoders['payment_method'].inverse_transform(codes)
# Re-fitting one shared encoder would keep only the mapping of the last column.
# NOTE(review): the integer codes carry no order, yet after scaling the models
# treat them as numeric magnitudes — consider one-hot encoding instead.
catcol = ['product_name','category','coupon_code','payment_method','order_status']
encoders = {}
for col in catcol:
    lbenc = LabelEncoder()
    x[col] = lbenc.fit_transform(x[col])
    encoders[col] = lbenc

In [17]:
# Standardise the model inputs with StandardScaler.
# The detection cells further down write their results ('anomaly', 'fraud',
# 'cluster', 'is_fraud_kmeans') back into `x`. Those columns are excluded here
# so that re-running this cell after them cannot feed earlier predictions
# into the models as features. On a fresh top-to-bottom run none of them exist
# yet and every column of `x` is used, exactly as before.
result_cols = ['anomaly', 'fraud', 'cluster', 'is_fraud_kmeans']
feature_cols = [col for col in x.columns if col not in result_cols]

scaler = StandardScaler()
X = scaler.fit_transform(x[feature_cols])

IsolationForest 

In [18]:
# Isolation Forest outlier detection on the scaled features.
FRAUD_SHARE = 0.03  # assumed 3% is fraud
isfor = IsolationForest(n_estimators=125, contamination=FRAUD_SHARE, random_state=0)
x['anomaly'] = isfor.fit_predict(X)
# Binary flag: 1 where the forest returned -1 for the row, otherwise 0.
x['fraud'] = x['anomaly'].eq(-1).astype('int64')

n_flagged = x['fraud'].sum()
print("Potential Fraud:", n_flagged)

Potential Fraud: 3000


KMeans

In [21]:
# K-Means with two clusters on the scaled features, used as a second fraud heuristic.
# n_init is set explicitly: scikit-learn's default changed from 10 to 'auto'
# in release 1.4, so leaving it unset makes the clustering depend on the
# installed version (and triggers a FutureWarning on 1.2/1.3).
frdcluster = KMeans(n_clusters=2, n_init=10, random_state=0) # 2 classes: fraud and fraudn't

x['cluster'] = frdcluster.fit_predict(X)

# The cluster with fewer members is taken to be the fraud cluster.
# NOTE(review): in the recorded run this flagged 39271 orders, against 3000
# from the Isolation Forest cell — the smaller cluster looks like an ordinary
# customer segment rather than a set of rare anomalies; confirm this heuristic.
fraudclust = x['cluster'].value_counts().idxmin()
x['is_fraud_kmeans'] = x['cluster'].eq(fraudclust).astype('int64')
print("Potential Fraud:", x['is_fraud_kmeans'].sum())

Potential Fraud: 39271
