In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ETL Dataset

In [None]:
# Load the e-commerce sales workbook into a DataFrame.
df = pd.read_excel(
    "/kaggle/input/sales-ecommerce/2_Ecommerce Sales Data Analysis Excel.xlsx"
)

In [None]:
# Preview the first rows of the loaded data.
df.head()

# Recency

In [None]:
# Parse the order date into a datetime column so date arithmetic works.
df["Date"] = pd.to_datetime(df["Order Date"])

# Rank each customer's order dates in ascending order (ties share the lowest
# rank). The rank does not depend on row order, so no prior sort is needed;
# the result is aligned back to `df` by index.
df["rank"] = df.groupby("Customer ID")["Date"].rank(method="min").astype(int)

# Keep the rows carrying rank 1, i.e. each customer's earliest order date.
# NOTE(review): RFM "recency" is normally based on the *latest* purchase;
# this selects the earliest one - confirm that is intended.
# .copy() makes df_rec an independent frame so later column assignments do
# not trigger SettingWithCopyWarning or write through to `df`.
df_rec = df[df["rank"] == 1].copy()

In [None]:
# Recency = whole days between each row's date and the earliest date in df_rec.
# `assign` returns a new frame instead of writing into what may be a slice of
# `df`, and Series.min() skips NaT values, unlike the builtin min().
df_rec = df_rec.assign(recency=(df_rec["Date"] - df_rec["Date"].min()).dt.days)

# Frequency

In [None]:
# Count the non-null dates per customer within df_rec.
# NOTE(review): df_rec only holds the rank-1 rows, so this counts rows on that
# date rather than all of a customer's orders - confirm this is intended.
freq = df_rec.groupby('Customer ID')["Date"].count()

# Turn the per-customer counts into a two-column frame.
df_freq = freq.rename('frequency').reset_index()

In [None]:
# Attach the per-customer frequency to every df_rec row (inner join on customer).
rec_freq = pd.merge(df_freq, df_rec, on="Customer ID")

In [None]:
# Display the merged recency/frequency table.
rec_freq

In [None]:
def calculate_total(row):
    """Compute the monetary total for a single order row.

    Parameters
    ----------
    row : mapping
        Any object indexable by the keys ``"Discount"``, ``"Quantity"`` and
        ``"Sales"`` (for example a row passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    number
        * ``Quantity * Sales`` when ``Discount`` equals 0.
        * ``Quantity * Sales / Discount`` when ``Discount`` is not an exact
          multiple of both ``Sales`` and ``Quantity``.
        * ``NaN`` otherwise.
    """
    discount = row["Discount"]
    gross = row["Quantity"] * row["Sales"]

    if discount == 0:
        return gross

    if discount % row["Sales"] != 0 or discount % row["Quantity"] != 0:
        # NOTE(review): dividing by the discount increases the total when the
        # discount is a fraction below 1 - confirm this formula is intended.
        return gross / discount

    # Return NaN rather than a string sentinel: a string in this column forces
    # object dtype and breaks the numeric sum taken over it afterwards.
    return float("nan")
# Row-wise total for every order line.
rec_freq["total"] = rec_freq.apply(calculate_total, axis=1)

# Sum the totals per customer and expose them as a two-column frame.
m = (
    rec_freq.groupby("Customer ID")["total"]
    .sum()
    .rename("monetary_values")
    .reset_index()
)

# Calculate total and monetary value

In [None]:
# Join the per-customer monetary value onto the recency/frequency rows.
rfm = pd.merge(m, rec_freq, on="Customer ID")

In [None]:
# Keep only the customer key and the three RFM features.
# `rfm` comes from merging per-customer aggregates back onto the row-level
# rec_freq table, so a customer with several rows there appears several times
# with identical values. De-duplicate so each distinct customer/RFM
# combination is counted once in the plots and outlier statistics that follow.
finaldf = (
    rfm[["Customer ID", "recency", "frequency", "monetary_values"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# RFM feature columns to plot.
# NOTE(review): this name rebinds the built-in `list`; it is kept because a
# later cell iterates over the same name.
list = ["recency", "frequency", "monetary_values"]

# One box plot per feature to inspect its spread and spot outliers.
for i in list:
    print(f"{i}: ")
    ax = sns.boxplot(x=finaldf[i])
    plt.show()

# Remove outliers

In [None]:
from scipy import stats
# Work on the three numeric RFM features only.
new_df = finaldf.loc[:, ["recency", "frequency", "monetary_values"]]

# Standardise every column and take the absolute z-score.
z_score = stats.zscore(new_df)
abs_z = np.abs(z_score)

# A row is kept only when every one of its features lies within 4 standard
# deviations of that column's mean.
filtered = (abs_z < 4).all(axis=1)
new_df = new_df.loc[filtered]

In [None]:
# Display the RFM features that remain after outlier filtering.
new_df

In [None]:
from sklearn.preprocessing import StandardScaler
# Collapse identical RFM rows before scaling.
new_df = new_df.drop_duplicates()
col_names = ["recency", "frequency", "monetary_values"]

# Fit the scaler on the raw feature matrix and standardise it in one step.
scaler = StandardScaler()
features = scaler.fit_transform(new_df[col_names].values)

# Wrap the scaled array in a frame with the original column names.
scaled_f = pd.DataFrame(features, columns=col_names)

# Building the customer segmentation model

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
# Elbow method: fit K-Means for k = 1..7 and record each model's inertia
# (within-cluster sum of squared distances).
SSE = []
for cluster in range(1, 8):
    # A fixed random_state makes centroid initialisation repeatable, so the
    # curve is identical on every Restart & Run All.
    kmeans = KMeans(n_clusters=cluster, init='k-means++', random_state=42)
    kmeans.fit(scaled_f)
    SSE.append(kmeans.inertia_)

# Plot inertia against the number of clusters.
frame = pd.DataFrame({'cluster': range(1, 8), 'SSE': SSE})
plt.figure(figsize=(12, 6))
plt.plot(frame['cluster'], frame['SSE'], marker='o')
plt.xlabel("num of cluster")
plt.ylabel("inertia")
plt.show()

In [None]:
# Fit the final 4-cluster model. random_state is fixed so cluster labels and
# the scores derived from them are reproducible between runs.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans.fit(scaled_f)

In [None]:
# Silhouette score of the fitted clustering (Euclidean distance).
print(silhouette_score(X=scaled_f, labels=kmeans.labels_, metric='euclidean'))

In [None]:
# Cluster label for every scaled row.
predict = kmeans.predict(scaled_f)

# Attach the labels to the unscaled features. `assign` returns a new frame;
# pd.DataFrame(new_df) does not copy DataFrame input by default, so adding a
# column to it could alias `new_df`.
# Labels are matched to rows by position, which relies on scaled_f having been
# built from the current new_df.
frame = new_df.assign(cluster=predict)

In [None]:
# Display the RFM features together with their assigned cluster.
frame

In [None]:
# Mean of every feature within each cluster.
avg_df = frame.groupby(['cluster'], as_index=False).mean()

# One bar chart per RFM feature. The columns are listed explicitly so this
# cell does not depend on a global that shadows the built-in `list`.
for feature in ("recency", "frequency", "monetary_values"):
    sns.barplot(x='cluster', y=feature, data=avg_df)
    plt.show()