# Local Model

In [94]:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime, date, timedelta

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from scipy.cluster import hierarchy

import sqlite3
from sqlalchemy               import create_engine
from sqlalchemy.pool          import NullPool

# from sklearn.decomposition import PCA
# from umap.umap_ import UMAP
# from sklearn.manifold import TSNE

import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [96]:
# Location of the raw e-commerce CSV (an absolute path on the author's machine).
PATH = 'C:/Users/Alysson/Documents/Projects/E-Commerce-Clusterization/data/ecommerce.csv'

# `data_raw` keeps the file exactly as loaded; all cleaning below works on the `data` copy.
data_raw = pd.read_csv(filepath_or_buffer=PATH, encoding='iso-8859-1')
data = data_raw.copy(deep=True)

In [97]:
#pip install schedule

In [98]:
# Remove the 'Unnamed: 8' column, then discard every row without a CustomerID.
data = (
    data
    .drop(columns=['Unnamed: 8'])
    .dropna(subset=['CustomerID'])
)

In [99]:
# Parse the invoice date, store customer ids as integers and add the
# per-row total (Quantity * UnitPrice).
data = data.assign(
    InvoiceDate=pd.to_datetime(data['InvoiceDate'], format='%d-%b-%y'),
    CustomerID=data['CustomerID'].astype(int),
    Total=data['Quantity'] * data['UnitPrice'],
)

In [100]:
# Sum of Total and Quantity per customer.
# The aggregation is requested by name ('sum') instead of passing np.sum:
# recent pandas versions deprecate NumPy callables in groupby.agg, and the
# string form keeps the behaviour the notebook currently relies on.
sum_transactions_per_client = (
    data[['CustomerID', 'Total', 'Quantity']]
    .groupby('CustomerID')
    .agg({'Total': 'sum', 'Quantity': 'sum'})
    .reset_index()
)

In [101]:
# Display the per-customer sums of Total and Quantity.
sum_transactions_per_client

Unnamed: 0,CustomerID,Total,Quantity
0,12346,0.00,0
1,12347,4310.00,2458
2,12348,1797.24,2341
3,12349,1757.55,631
4,12350,334.40,197
...,...,...,...
4367,18280,180.60,45
4368,18281,80.82,54
4369,18282,176.60,98
4370,18283,2094.88,1397


In [102]:
# Customers who do not have a positive purchase balance or who owe the company
# (due to the temporal cut of the database) will be excluded.
# A customer is flagged when the summed Total is <= 0.5 or the summed Quantity is <= 1.

low_total = sum_transactions_per_client['Total'] <= 0.5
low_quantity = sum_transactions_per_client['Quantity'] <= 1
bad_clients = sum_transactions_per_client.loc[low_total | low_quantity]

In [103]:
# Drop every transaction row that belongs to a flagged customer.
list_bad_clients = bad_clients['CustomerID'].tolist()
is_bad_client = data['CustomerID'].isin(list_bad_clients)
data = data[~is_bad_client]

In [104]:
# Discard rows whose UnitPrice is below 0.04.
below_min_price = data['UnitPrice'] < 0.04
data = data.loc[~below_min_price]

In [105]:
# Stock codes that start with a letter, ordered by how often they occur.
starts_with_letter = data['StockCode'].str.contains("^[a-zA-Z]")
list_letter_stock = data.loc[starts_with_letter, 'StockCode'].value_counts().index.tolist()

In [106]:
# Remove the rows whose StockCode is one of the letter-prefixed codes.
has_letter_code = data['StockCode'].isin(list_letter_stock)
data = data.loc[~has_letter_code]

In [107]:
# Copy of the cleaned rows. Note: `transactions` is reassigned by the
# per-invoice groupby in the following cell, so this value is not used afterwards.
transactions = data.copy()

In [108]:
# Collapse the rows to one record per InvoiceNo; invoices cover both sales
# and cancellations.
invoice_aggregations = dict(
    CustomerID=('CustomerID', np.unique),
    InvoiceDate=('InvoiceDate', np.unique),
    Total=('Total', 'sum'),
    UniqueProducts=('StockCode', 'nunique'),
    Items=('Quantity', 'sum'),
    Country=('Country', np.unique),
    ProductsCode=('StockCode', np.unique),
)
transactions = data.groupby('InvoiceNo').agg(**invoice_aggregations).reset_index()

# Invoice total divided by its number of distinct products, rounded to 2 decimals.
transactions['AvarageTicket'] = (transactions['Total'] / transactions['UniqueProducts']).round(2)
#len(transactions)

In [109]:
# Reference date for recency: one day after the latest invoice date in the data.
last_day = data.InvoiceDate.max() + dt.timedelta(days = 1)

# One row per customer, built from the per-invoice table.
#   GrossRevenue  - sum of invoice totals
#   Recency       - days between last_day and the customer's latest invoice
#   Frequency     - number of invoices
#   Products      - sum of the per-invoice distinct-product counts
#   Items         - sum of the per-invoice item quantities
#   Country       - unique country value(s) seen for the customer
#   AvarageTicket - sum of the per-invoice AvarageTicket values
#                   NOTE(review): this is a sum of averages, not an average; confirm intent.
# 'sum' is passed by name instead of np.sum, which recent pandas deprecates in agg.
# reset_index() turns CustomerID back into a regular column: later cells select
# transactions_per_customer[['CustomerID', 'Cluster']] and merge on it, which
# fails with a KeyError when CustomerID is left as the index.
transactions_per_customer = transactions.groupby('CustomerID').agg(
                                                      GrossRevenue = ('Total', 'sum'),
                                                      Recency = ('InvoiceDate', lambda x: ((last_day - x.max()).days)),
                                                      Frequency = ('InvoiceNo', 'count'),
                                                      Products = ('UniqueProducts', 'sum'),
                                                      Items = ('Items', 'sum'),
                                                      Country = ('Country', np.unique),
                                                      AvarageTicket = ('AvarageTicket', 'sum')).reset_index()
                                                      #Products = ('StockCode', np.unique),

#transactions_per_customer['AvarageTicket']= round(transactions_per_customer['GrossRevenue'] / transactions_per_customer['Products'],2)

In [110]:
# Display the per-invoice table.
transactions

Unnamed: 0,InvoiceNo,CustomerID,InvoiceDate,Total,UniqueProducts,Items,Country,ProductsCode,AvarageTicket
0,536365,17850,2016-11-29,139.12,7,40,United Kingdom,"[21730, 22752, 71053, 84029E, 84029G, 84406B, ...",19.87
1,536366,17850,2016-11-29,22.20,2,12,United Kingdom,"[22632, 22633]",11.10
2,536367,13047,2016-11-29,278.73,12,83,United Kingdom,"[21754, 21755, 21777, 22310, 22622, 22623, 227...",23.23
3,536368,13047,2016-11-29,70.05,4,15,United Kingdom,"[22912, 22913, 22914, 22960]",17.51
4,536369,13047,2016-11-29,17.85,1,3,United Kingdom,21756,17.85
...,...,...,...,...,...,...,...,...,...
21700,C581470,17924,2017-12-06,-8.32,1,-4,United Kingdom,23084,-8.32
21701,C581484,16446,2017-12-07,-168469.60,1,-80995,United Kingdom,23843,-168469.60
21702,C581490,14397,2017-12-07,-32.53,2,-23,United Kingdom,"[22178, 23144]",-16.26
21703,C581568,15311,2017-12-07,-54.75,1,-5,United Kingdom,21258,-54.75


## Data Preparation

In [111]:
# Work on an independent copy so the per-customer table itself stays unscaled.
data_prep = transactions_per_customer.copy(deep=True)

In [113]:
# Drop any customer row that contains a missing value.
data_prep = data_prep.dropna(axis=0, how='any')

In [114]:
# Column groups available on the per-customer table.
numerical_features = ['GrossRevenue', 'Items', 'AvarageTicket', "Products", 'Frequency', "Recency"]
categorical_features = ['Country']

#data_prep["Country"] = [0 if i == 'Norway' else 1 for i in data_prep["Country"]]

# Remember the full column set, then keep only the features used for clustering.
default_features = data_prep.columns
selected_features = ['GrossRevenue','Recency','Frequency']

data_prep = data_prep.loc[:, selected_features].copy()

In [115]:
# Skewness of each selected feature, highest first; keep the ones above 0.75.
feature_skew = data_prep[selected_features].skew().sort_values(ascending=False)
log_columns = feature_skew[feature_skew > 0.75]
log_columns

GrossRevenue    21.526419
Frequency       11.401617
Recency          1.265656
dtype: float64

In [116]:
# The log transformations: log1p is applied to every column listed in log_columns.
data_prep = data_prep.assign(
    **{col: np.log1p(data_prep[col]) for col in log_columns.index}
)

In [120]:
#ss = StandardScaler()
#rs = RobustScaler()
#pt = PowerTransformer()

# MinMaxScaler gave the better results for this model.
# NOTE(review): min-max scaling uses each feature's observed minimum and
# maximum, so it is sensitive to outliers rather than robust to them; the
# log transform applied to the skewed columns beforehand is what limits their influence.
mms = MinMaxScaler()

# Fit the scaler once on all selected features. MinMaxScaler scales every
# column independently, so the values match a per-column fit, but `mms` now
# keeps the parameters of every feature instead of only the last one fitted.
scaled_values = np.asarray(mms.fit_transform(data_prep[selected_features]))
for position, col in enumerate(selected_features):
    data_prep[col] = scaled_values[:, position]

X = data_prep[selected_features].copy()

In [121]:
# Separate copy of the feature matrix that will also carry the cluster labels.
clusters = X.copy(deep=True)

## Model - Hierarchical Clustering

In [125]:
# Ward linkage on the scaled features, cut into flat clusters with the
# 'maxclust' criterion and k as the requested number of clusters.
k=10
#hc = AgglomerativeClustering(k, affinity='euclidean', linkage='complete', compute_full_tree=False)
hc = hierarchy.linkage(X, method='ward', metric='euclidean')
hc_labels = hierarchy.fcluster(hc, t=k, criterion='maxclust')
clusters = clusters.assign(HierarchicalCluster=hc_labels)

In [128]:
# Attach the cluster ids to the per-customer table. The labels are a plain
# array, so they are matched to rows by position (same row order as X).
transactions_per_customer = transactions_per_customer.assign(Cluster=hc_labels)

In [129]:
# Propagate each customer's cluster id onto that customer's invoices.
customer_clusters = transactions_per_customer[['CustomerID','Cluster']]
transactions = transactions.merge(customer_clusters, how='inner', on='CustomerID')

In [130]:
# Segment name for each flat-cluster id, 1 through 10.
# NOTE(review): some labels look misspelled ("Costumer", "Hinernating",
# "Atention"). They are kept verbatim here because they are written to the
# database; rename them only together with anything that reads those values.
cluster_dict = dict(enumerate(
    [
        "Potential Loyalist",
        "About to Sleep",
        "Champion",
        "Loyal Costumer",
        "Promising",
        "Hinernating",
        "Cannot Lose Them",
        "At Risk",
        "New Customers",
        "Need Atention",
    ],
    start=1,
))

In [131]:
# Replace the numeric cluster ids with their segment names in both tables.
# Run this once per kernel session: mapping already-named values again
# would not find them in cluster_dict.
for frame in (transactions_per_customer, transactions):
    frame['Cluster'] = frame['Cluster'].map(cluster_dict)

In [132]:
# Count the missing values in each column of the per-customer table.
transactions_per_customer.isna().sum()

CustomerID       0
GrossRevenue     0
Recency          0
Frequency        0
Products         0
Items            0
Country          0
AvarageTicket    0
Cluster          0
dtype: int64

## SQLite

In [161]:
# Table to be persisted: a copy of the labelled per-customer data.
insiders = transactions_per_customer.copy(deep=True)
# Print column dtypes and non-null counts before writing to the database.
insiders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4312 entries, 0 to 4311
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CustomerID     4312 non-null   int64  
 1   GrossRevenue   4312 non-null   float64
 2   Recency        4312 non-null   int64  
 3   Frequency      4312 non-null   int64  
 4   Products       4312 non-null   int64  
 5   Items          4312 non-null   int64  
 6   Country        4312 non-null   object 
 7   AvarageTicket  4312 non-null   float64
 8   Cluster        4312 non-null   object 
dtypes: float64(2), int64(5), object(2)
memory usage: 303.3+ KB


In [172]:
# SQLAlchemy URL of the SQLite database file; this one is a relative path.
endpoint = 'sqlite:///insiders_db.sqlite' #local
#endpoint = 'sqlite:////Users/Alysson/Documents/Projects/E-Commerce-Clusterization/scr/sqlite/insiders.sqlite' #local

# Engine created with NullPool as its pool class, plus one explicit connection.
# `conn` is closed by the final cell of the notebook.
db = create_engine(endpoint, poolclass=NullPool)
conn = db.connect()

In [173]:
# Display the table that is about to be written to the database.
insiders

Unnamed: 0,CustomerID,GrossRevenue,Recency,Frequency,Products,Items,Country,AvarageTicket,Cluster
0,12347,4310.00,3,7,182,2458,Iceland,161.68,Loyal Costumer
1,12348,1437.24,76,4,23,2332,Finland,308.64,At Risk
2,12349,1457.55,19,1,72,630,Italy,20.24,Promising
3,12350,294.40,311,1,16,196,Norway,18.40,Need Atention
4,12352,1265.41,37,8,84,463,Norway,113.84,About to Sleep
...,...,...,...,...,...,...,...,...,...
4307,18280,180.60,278,1,10,45,United Kingdom,18.06,Need Atention
4308,18281,80.82,181,1,7,54,United Kingdom,11.55,Need Atention
4309,18282,176.60,8,3,13,98,United Kingdom,28.44,Promising
4310,18283,2088.93,4,16,687,1395,United Kingdom,48.61,Champion


In [174]:
# Remove any insiders table left over from a previous run.
# IF EXISTS makes the statement a no-op on a fresh database; a plain
# DROP TABLE fails there because the table does not exist yet, which
# would stop a Restart & Run All at this cell.
query_drop_insiders = """
    DROP TABLE IF EXISTS insiders
"""
conn.execute( query_drop_insiders )

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1e3e84e7280>

In [175]:
#check if table exists on sqlite
check_table = """
     SELECT name FROM sqlite_master WHERE type='table' AND name='insiders';
 """
df_check = pd.read_sql_query(check_table, conn)

# An empty result means sqlite_master lists no table named 'insiders'.
if not df_check.empty:
    print('Table insiders exists!')
else:
    query_create_table_insiders = """
        CREATE TABLE insiders (
            CustomerID               INTEGER,
            GrossRevenue             REAL,
            Recency                  INTEGER,
            Frequency                INTEGER,
            Products                 INTEGER,
            Items                    INTEGER,
            Country                  TEXT,
            AvarageTicket            REAL,
            Cluster                  TEXT,
            LastTraining             TEXT
           ) """

    #conn = sqlite3.connect('insiders_db_sqlite')
    conn.execute( query_create_table_insiders )
    print('Table insiders was created!')

Table insiders was created!


In [176]:
# Write the DataFrame to the 'insiders' table without its index.
# NOTE(review): if_exists='replace' rebuilds the table from the DataFrame, so
# the columns come from `insiders` (the read-back below shows no LastTraining
# column) rather than from the CREATE TABLE statement; confirm this is intended.
insiders.to_sql(name='insiders', con=db, if_exists='replace', index=False)

In [177]:
#consult database
# Read the whole insiders table back to check what was stored.
query = """
    SELECT * FROM insiders
"""
df = pd.read_sql_query(sql=query, con=db)
df

Unnamed: 0,CustomerID,GrossRevenue,Recency,Frequency,Products,Items,Country,AvarageTicket,Cluster
0,12347,4310.00,3,7,182,2458,Iceland,161.68,Loyal Costumer
1,12348,1437.24,76,4,23,2332,Finland,308.64,At Risk
2,12349,1457.55,19,1,72,630,Italy,20.24,Promising
3,12350,294.40,311,1,16,196,Norway,18.40,Need Atention
4,12352,1265.41,37,8,84,463,Norway,113.84,About to Sleep
...,...,...,...,...,...,...,...,...,...
4307,18280,180.60,278,1,10,45,United Kingdom,18.06,Need Atention
4308,18281,80.82,181,1,7,54,United Kingdom,11.55,Need Atention
4309,18282,176.60,8,3,13,98,United Kingdom,28.44,Promising
4310,18283,2088.93,4,16,687,1395,United Kingdom,48.61,Champion


In [178]:
# Stored customers ordered from highest to lowest gross revenue.
df.sort_values('GrossRevenue', ascending=False)

Unnamed: 0,CustomerID,GrossRevenue,Recency,Frequency,Products,Items,Country,AvarageTicket,Cluster
1679,14646,278778.02,2,73,2062,196556,Netherlands,17771.91,Champion
4178,18102,259657.30,1,60,431,64124,United Kingdom,43552.74,Champion
3711,17450,189735.53,9,49,339,69041,United Kingdom,26456.40,Champion
1868,14911,128882.13,2,242,5806,76848,EIRE,5235.49,Champion
54,12415,123638.18,25,24,774,76946,Australia,4512.36,Champion
...,...,...,...,...,...,...,...,...,...
2731,16093,17.00,107,1,1,20,United Kingdom,17.00,Need Atention
3941,17763,15.00,264,1,1,12,United Kingdom,15.00,Need Atention
717,13307,15.00,121,1,1,4,United Kingdom,15.00,Need Atention
3202,16738,3.75,298,1,1,3,United Kingdom,3.75,Need Atention


In [179]:
# Close the connection that was opened with db.connect().
conn.close()