In [None]:
# Parameters
# NOTE(review): presumably a papermill-injected parameter (see the commented-out
# pm.execute_notebook call further down) — confirm. The value is not referenced
# anywhere else in the visible part of this notebook.
path_model_report_initial = "2022/10/21/11:27:58"


In [None]:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime, date, timedelta

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from scipy.cluster import hierarchy

import sqlite3
from sqlalchemy               import create_engine
from sqlalchemy.pool          import NullPool

# from sklearn.decomposition import PCA
# from umap.umap_ import UMAP
# from sklearn.manifold import TSNE

import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# import papermill as pm

# nb = pm.execute_notebook("model-deploy.ipynb", "model-deploy_2.ipynb", {"mill":"mill2"})

In [None]:
# Load the raw CSV (ISO-8859-1 encoded) and work on a copy so `data_raw` stays untouched.
# NOTE(review): PATH is an absolute, machine-specific location; the notebook will not
# run on another machine unless it is changed.
PATH = '/Users/Alysson/Documents/Projects/E-Commerce-Clusterization/data/ecommerce.csv'
data_raw = pd.read_csv(PATH, encoding='iso-8859-1')
data = data_raw.copy()

In [None]:
# Remove the 'Unnamed: 8' column and every row that has no CustomerID.
data = (
    data
    .drop(columns=['Unnamed: 8'])
    .dropna(subset=['CustomerID'])
)

In [None]:
# Parse invoice dates (day-abbreviated month-two digit year), cast customer ids
# to int and add the line total (Quantity * UnitPrice).
data = data.assign(
    InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate'], format='%d-%b-%y'),
    CustomerID=lambda frame: frame['CustomerID'].astype(int),
    Total=lambda frame: frame['Quantity'] * frame['UnitPrice'],
)

In [None]:
# Total amount and total quantity per customer, with CustomerID as a regular column.
sum_transactions_per_client = (
    data
    .groupby('CustomerID')[['Total', 'Quantity']]
    .sum()
    .reset_index()
)

In [None]:
# Customers without a positive purchase balance, or who owe the company
# (an artefact of the temporal cut of the database), will be excluded.
# A customer is flagged when the summed Total is <= 0.5 or the summed Quantity is <= 1.
low_balance = sum_transactions_per_client['Total'] <= 0.5
low_quantity = sum_transactions_per_client['Quantity'] <= 1

bad_clients = sum_transactions_per_client[low_balance | low_quantity]

In [None]:
# Drop every transaction line that belongs to a flagged customer.
list_bad_clients = bad_clients['CustomerID'].tolist()
is_bad_client = data['CustomerID'].isin(list_bad_clients)
data = data.loc[~is_bad_client]

In [None]:
# Discard lines priced below 0.04. The mask is negated (rather than using >=)
# so rows whose UnitPrice is missing are kept, exactly as before.
below_min_price = data['UnitPrice'].lt(0.04)
data = data[~below_min_price]

In [None]:
# Distinct stock codes that begin with a letter, ordered by how often they occur.
starts_with_letter = data['StockCode'].str.contains("^[a-zA-Z]")
list_letter_stock = (
    data.loc[starts_with_letter, 'StockCode']
    .value_counts()
    .index
    .tolist()
)

In [None]:
# Remove every line whose stock code is one of the letter-prefixed codes.
has_letter_code = data['StockCode'].isin(list_letter_stock)
data = data[~has_letter_code]

In [None]:
# NOTE(review): `transactions` is rebound by the per-invoice groupby in the next
# cell, so this copy appears to be unused — confirm before removing.
transactions = data.copy()

In [None]:
# Collapse the line items into one row per invoice number; invoices cover both
# sales and cancellations.
# NOTE(review): np.unique returns the distinct values of a group, not a single
# scalar. This presumably relies on every invoice having exactly one customer,
# date and country — confirm. ProductsCode holds the distinct stock codes of
# the invoice.
invoice_aggregations = {
    'CustomerID': ('CustomerID', np.unique),
    'InvoiceDate': ('InvoiceDate', np.unique),
    'Total': ('Total', 'sum'),
    'UniqueProducts': ('StockCode', 'nunique'),
    'Items': ('Quantity', 'sum'),
    'Country': ('Country', np.unique),
    'ProductsCode': ('StockCode', np.unique),
}
transactions = (
    data
    .groupby('InvoiceNo')
    .agg(**invoice_aggregations)
    .reset_index()
)

# Invoice total divided by its number of distinct products, rounded to 2 decimals.
transactions['AvarageTicket'] = (
    transactions['Total'] / transactions['UniqueProducts']
).round(2)
#len(transactions)

In [None]:
# Reference date for recency: one day after the most recent invoice in the data.
last_day = data['InvoiceDate'].max() + timedelta(days=1)


def _days_since_last_purchase(invoice_dates):
    """Return the whole days between `last_day` and the latest date in the group."""
    return (last_day - invoice_dates.max()).days


# One row per customer; CustomerID stays as the index (no reset_index here).
# AvarageTicket is the sum of the per-invoice AvarageTicket values. The author
# left an alternative commented out: round(GrossRevenue / Products, 2).
customer_aggregations = dict(
    GrossRevenue=('Total', 'sum'),
    Recency=('InvoiceDate', _days_since_last_purchase),
    Frequency=('InvoiceNo', 'count'),
    Products=('UniqueProducts', 'sum'),
    Items=('Items', 'sum'),
    Country=('Country', np.unique),
    AvarageTicket=('AvarageTicket', 'sum'),
)
transactions_per_customer = transactions.groupby('CustomerID').agg(**customer_aggregations)

## Data Preparation

In [None]:
# Work on a copy so the preparation steps below leave transactions_per_customer unchanged.
data_prep = transactions_per_customer.copy()

In [None]:
# Discard customers that have a missing value in any column.
# From here on data_prep may hold fewer rows than transactions_per_customer.
data_prep=data_prep.dropna()

In [None]:
# Column groups available after the per-customer aggregation.
categorical_features = ['Country']
numerical_features = [
    'GrossRevenue',
    'Items',
    'AvarageTicket',
    'Products',
    'Frequency',
    'Recency',
]

#data_prep["Country"] = [0 if i == 'Norway' else 1 for i in data_prep["Country"]]

# Remember the full column set before narrowing down to the modelling features.
default_features = data_prep.columns
selected_features = ['GrossRevenue', 'Recency', 'Frequency']

data_prep = data_prep.loc[:, selected_features].copy()

In [None]:
# Skewness of every selected feature, largest first; only features with
# skewness above 0.75 are kept as candidates for the log transform.
feature_skewness = data_prep[selected_features].skew().sort_values(ascending=False)
log_columns = feature_skewness[feature_skewness > 0.75]
log_columns

In [None]:
# Apply log(1 + x) to each skewed feature listed in log_columns.
data_prep = data_prep.assign(
    **{name: np.log1p(data_prep[name]) for name in log_columns.index}
)

In [None]:
# Scale every selected feature to the [0, 1] range.
# StandardScaler, RobustScaler and PowerTransformer were left commented out
# as alternatives by the author:
#ss = StandardScaler()
#rs = RobustScaler()
#pt = PowerTransformer()

# The author reports that MinMaxScaler gave the better results here.
# NOTE(review): the original comment attributed this to robustness to outliers,
# but min-max scaling depends directly on the min and max of each feature and
# is therefore sensitive to outliers — confirm the rationale.
mms = MinMaxScaler()

# Fit once on all selected features. MinMaxScaler scales each column
# independently, so the values are the same as scaling column by column, but
# `mms` now stays fitted on every feature (previously it was re-fitted in a
# loop and ended up holding only the parameters of the last column), which
# makes it reusable for transforming or inverse-transforming later.
data_prep[selected_features] = mms.fit_transform(data_prep[selected_features])

X = data_prep[selected_features].copy()

In [None]:
# Copy of the scaled feature matrix; the cluster labels are added to it later.
clusters_results = X.copy()

In [None]:
# # Dimensionality Reduction(to 2D) using technique UMAP 
# umap = UMAP(random_state=3456)
# umap_embedding = umap.fit_transform(X)

# # X,y axis representation for UMAP
# clusters_results['umap_x'] = umap_embedding[:,0]
# clusters_results['umap_y'] = umap_embedding[:,1]

In [None]:
# # Dimensionality Reduction(to 2D) using technique TSNE
# tsne = TSNE(n_components=2, init='pca', learning_rate='auto', n_jobs=-1, random_state=3456)
# tsne_embedding = tsne.fit_transform(X)

# # X,y axis representation using TSNE
# clusters_results['tsne_x'] = tsne_embedding[:,0]
# clusters_results['tsne_y'] = tsne_embedding[:,1]

## Model - Hierarchical Clustering

In [None]:
# Maximum number of flat clusters to cut the dendrogram into.
k = 10

# Agglomerative clustering with Ward linkage on Euclidean distances, then a
# flat cut that yields at most k clusters (criterion='maxclust').
#hc = AgglomerativeClustering(k, affinity='euclidean', linkage='complete', compute_full_tree=False)
hc = hierarchy.linkage(X, method='ward', metric='euclidean')
hc_labels = hierarchy.fcluster(hc, t=k, criterion='maxclust')

clusters_results = clusters_results.assign(HierarchicalCluster=hc_labels)

In [None]:
# Unscaled per-customer metrics, restricted to the customers that were clustered.
# hc_labels has one entry per row of X, and X only contains the rows that
# survived data_prep.dropna(). Selecting by X.index keeps the two aligned, so
# the positional label assignment that follows cannot fail on a length
# mismatch or attach labels to the wrong customers.
all_clusters = transactions_per_customer.loc[X.index].copy()

In [None]:
# Attach the flat-cluster label of each customer.
# hc_labels is a plain array, so it is assigned by position and needs exactly
# one entry per row of all_clusters.
all_clusters['HC'] = hc_labels

In [None]:
# Mean revenue, recency and frequency per cluster, highest revenue first.
(
    all_clusters
    .groupby('HC')[['GrossRevenue', 'Recency', 'Frequency']]
    .mean()
    .sort_values(by='GrossRevenue', ascending=False)
    .head(20)
)

In [None]:
# Per-cluster summary: revenue sum and mean, mean recency, mean frequency and
# number of customers.
report_aggregations = {
    'MonetarySum': ('GrossRevenue', 'sum'),
    'MonetaryMean': ('GrossRevenue', 'mean'),
    'Recency': ('Recency', 'mean'),
    'Frequency': ('Frequency', 'mean'),
    'Count': ('GrossRevenue', 'count'),
}
report = all_clusters.groupby('HC').agg(**report_aggregations).reset_index()

# Share of customers in each cluster, in percent with two decimals.
customer_share = report['Count'] / report['Count'].sum() * 100
report['Percentage'] = customer_share.round(2)

# Highest average revenue first.
report = report.sort_values(by='MonetaryMean', ascending=False)

In [None]:
# Segment names, applied by position to the report rows, which are ordered by
# MonetaryMean from highest to lowest. The list has exactly ten entries, so the
# report must contain ten clusters.
# The trailing numbers are presumably the HC ids observed in the author's run —
# confirm.
# NOTE(review): 'Loyal Costumer', 'Need Atention' and 'Hinernating' look like
# typos; they are left untouched because the labels are runtime data.
segment_names = [
    'Champion',            # 3
    'Loyal Costumer',      # 4
    'Potential Loyalist',  # 1
    'Cannot Lose Them',    # 7
    'New Customers',       # 9
    'Promising',           # 5
    'About to Sleep',      # 2
    'Need Atention',       # 10
    'At Risk',             # 8
    'Hinernating',         # 6
]
report['Cluster'] = segment_names

In [None]:
# Use the segment name as the index and drop the numeric cluster id.
report = report.set_index('Cluster').drop(columns='HC')

In [None]:
# Display the per-segment report.
report

## SQLite

In [None]:
dp = transactions_per_customer.copy()
dp.info()

In [None]:
#endpoint = 'sqlite:///insiders_db.sqlite' #local
endpoint = 'sqlite:////Users/Alysson/Documents/Projects/E-Commerce-Clusterization/scr/sqlite/insiders_db.sqlite' #local


#C:\Users\Alysson\Documents\Projects\E-Commerce-Clusterization\scr\models
#postgre
#endpoint = f'postgresql://{pg_user}:{pg_passwd}@{pg_host}:{pg_port}'
#C:\Users\Alysson\Documents\Projects\E-Commerce-Clusterization\notebooks]

db = create_engine(endpoint, poolclass=NullPool)
conn = db.connect()

In [None]:
#check if table exists on sqlite
check_table = """
     SELECT name FROM sqlite_master WHERE type='table' AND name='insiders';
 """
df_check = pd.read_sql_query(check_table, conn)

#0 = table does not exist, 1 = table exists
if len(df_check) == 0:  
    query_create_table_insiders = """
        CREATE TABLE insiders (
            CustomerID               INTEGER,
            GrossRevenue             REAL,
            Recency                  INTEGER,
            Products                 INTEGER,
            Cluster                  INTEGER,
            LastTraining             TEXT        
           ) """


    conn = sqlite3.connect('insiders_db_sqlite')
    conn.execute( query_create_table_insiders )
    print('Table loyals was created!')
else:
    print('Table loyals exists!')

In [None]:
db = create_engine(endpoint, poolclass=NullPool)
dp.to_sql('insiders', con=db, if_exists='append', index=False )#index=False to ignore dataframe index

In [None]:
#consult database
query = """
    SELECT * FROM insiders
"""
df = pd.read_sql_query(query, db)
df

In [None]:
conn.close()