In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the customer-behaviour CSV from the Kaggle input directory.
csv_path = "/kaggle/input/e-commerce-customer-behavior-dataset/E-commerce Customer Behavior - Sheet1.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
# Remove the "Customer ID" column (presumably a per-row identifier with no
# predictive value — confirm against the dataset description).
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns="Customer ID", errors="ignore")

In [None]:
# Columns that are label-encoded later on; every other column of df is
# treated as numerical.
categorical_cols = ["Gender", "City", "Membership Type", "Discount Applied", "Satisfaction Level"]
numerical_cols = [col for col in df.columns if col not in categorical_cols]

In [None]:
# Dimensions of the working frame as a (n_rows, n_columns) tuple.
df.shape

In [None]:
def plot(numerical, categorical, axes):
    """Draw a bar chart of the summed ``numerical`` column per ``categorical`` group.

    Reads the notebook-global ``df``, sums ``numerical`` within each value of
    ``categorical``, orders the groups from largest to smallest total, and
    draws the bars on ``axes`` with a value label on every bar.

    Parameters
    ----------
    numerical : str
        Name of the column in ``df`` to sum.
    categorical : str
        Name of the column in ``df`` to group by.
    axes : matplotlib.axes.Axes
        Axes the bar chart is drawn on.
    """
    totals = (
        df.groupby(categorical)[numerical]
        .sum()
        .to_frame()
        .sort_values(numerical, ascending=False)
    )
    sns.barplot(x=totals.index, y=totals[numerical], ax=axes)
    for bars in axes.containers:
        axes.bar_label(bars, size=12)

    # With four or more groups, tilt the tick labels by 45 degrees.
    if len(totals) >= 4:
        axes.set_xticklabels(axes.get_xticklabels(), rotation=45)

    axes.set_title("Sum of {} by {}".format(numerical, categorical))

In [None]:
# Pairwise plots of the listed numeric columns, coloured by "Satisfaction Level".
pairplot_vars = ["Age", "Total Spend", "Items Purchased", "Average Rating", "Days Since Last Purchase"]
sns.pairplot(df, vars=pairplot_vars, hue="Satisfaction Level")

In [None]:
# One bar chart of summed "Total Spend" per grouping column, laid out in a row.
group_columns = ["City", "Gender", "Membership Type", "Satisfaction Level"]
fig, axes = plt.subplots(nrows=1, ncols=len(group_columns), figsize=(17, 6))
for ax, column in zip(axes, group_columns):
    plot("Total Spend", column, ax)
plt.tight_layout()
plt.show()

In [None]:
# Replace every categorical column with integer codes.
# Each column gets its OWN LabelEncoder: refitting a single shared instance
# overwrites its learned classes, so the stored encoders could only decode
# the last column. `encoders` follows the order of `categorical_cols`, and
# `le` ends up bound to the encoder of the last column.
# Note: df is overwritten column by column, so re-running this cell would
# refit the encoders on the integer codes instead of the original labels.
encoders = []
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders.append(le)

In [None]:
# Rescale the numerical columns and write the result back into df.
# NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm,
# not each feature (column) — confirm that row-wise scaling is what is wanted
# here rather than a per-feature scaler.
norm = Normalizer()
normalized_values = norm.fit_transform(df[numerical_cols])
df[numerical_cols] = normalized_values

In [None]:
# Every column except the last becomes the feature matrix; the last column
# becomes the target vector.
# NOTE(review): presumably the last column is "Satisfaction Level" — confirm
# against the CSV column order, since the selection is purely positional.
feature_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]
x = feature_frame.to_numpy()
y = target_series.to_numpy()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def training(model, name):
    """Fit ``model`` on the training split and report its test-split performance.

    Uses the notebook-global ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. Prints the accuracy and the classification report, then shows
    the confusion matrix as an annotated heatmap titled ``name``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    name : str
        Title shown above the confusion-matrix heatmap.
    """
    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    accuracy = accuracy_score(y_test, pred)
    report = classification_report(y_test, pred)
    cm = confusion_matrix(y_test, pred)

    print("Accuracy: {}".format(accuracy))
    print("Report: \n{}".format(report))

    # fmt="d" prints the integer counts verbatim; the default ".2g" format
    # switches to scientific notation once a cell reaches three digits.
    ax = sns.heatmap(cm, annot=True, fmt="d")
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title(name)
    plt.show()

In [None]:
# Candidate classifiers and their display names (kept in matching order).
# The randomized ensembles are seeded with the same value as the train/test
# split so that a fresh "Restart & Run All" reproduces the reported scores.
svc = SVC(C=0.9, kernel="linear")
knn = KNeighborsClassifier(n_neighbors=5)
rfc = RandomForestClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)
abc = AdaBoostClassifier(learning_rate=0.5, random_state=42)
xgb = XGBClassifier(random_state=42)
models = [svc, knn, rfc, gbc, abc, xgb]
names = ["Support Vector Machine, C=0.9, linear kernel",
        "KNN Classifier with 5 neighbors",
        "Random Forest",
        "Gradient Boosting",
        "Ada Boost lr=0.5",
        "XGB"]

In [None]:
# Train and evaluate each classifier, passing its display name as the plot title.
for model, title in zip(models, names):
    training(model, title)