# Deploy Model - AWS

## Data Preprocessing

In [11]:
import os
import io
import datetime as dt
from datetime import datetime, date, timedelta
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2

import s3fs
import boto3
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.cluster import hierarchy
import dotenv

#import sqlite3
from sqlalchemy               import create_engine
from sqlalchemy.pool          import NullPool

import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [12]:
#pip install python-dotenv

In [13]:
# Locate the .env file first, then load its variables into the process environment.
# The call stays last in the cell so its boolean result is displayed as the cell output.
env_file = dotenv.find_dotenv()
dotenv.load_dotenv(env_file)

True

In [14]:
# Read the AWS credentials and region from environment variables
# (each value is None when the variable is not set).
AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION = (
    os.environ.get(variable)
    for variable in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION')
)

In [15]:
# Create the S3 resource handle using the region and credentials read from the environment.
s3_settings = {
    'service_name': 's3',
    'region_name': AWS_DEFAULT_REGION,
    'aws_access_key_id': AWS_ACCESS_KEY_ID,
    'aws_secret_access_key': AWS_SECRET_ACCESS_KEY,
}
s3 = boto3.resource(**s3_settings)

In [16]:
# Print the name of every bucket returned by the S3 resource.
bucket_names = [bucket.name for bucket in s3.buckets.all()]
for bucket_name in bucket_names:
    print(bucket_name)

insidersdataset


In [17]:
# Print a summary line for every object stored in the 'insidersdataset' bucket.
dataset_bucket = s3.Bucket('insidersdataset')
for object_summary in dataset_bucket.objects.all():
    print(object_summary)

s3.ObjectSummary(bucket_name='insidersdataset', key='ecommerce.csv')


In [18]:
# Request the CSV object; the response dict is displayed as the cell output.
csv_object = s3.Bucket('insidersdataset').Object('ecommerce.csv')
csv_object.get()

{'ResponseMetadata': {'RequestId': '77AE5DHPSJ56APC5',
  'HostId': 'UmJJL3Bmae69Up5VdE24Zg22+w0oOPrAWhU0k8W9+DVtluiWZPTFew+v7bvYe+KQ5fD11qrDzew=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'UmJJL3Bmae69Up5VdE24Zg22+w0oOPrAWhU0k8W9+DVtluiWZPTFew+v7bvYe+KQ5fD11qrDzew=',
   'x-amz-request-id': '77AE5DHPSJ56APC5',
   'date': 'Tue, 24 Jan 2023 17:13:33 GMT',
   'last-modified': 'Mon, 23 Jan 2023 13:06:11 GMT',
   'etag': '"84ab4db6c8b002df9053c0ccaea4096b-3"',
   'x-amz-server-side-encryption': 'AES256',
   'accept-ranges': 'bytes',
   'content-type': 'text/csv',
   'server': 'AmazonS3',
   'content-length': '42697197'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2023, 1, 23, 13, 6, 11, tzinfo=tzutc()),
 'ContentLength': 42697197,
 'ETag': '"84ab4db6c8b002df9053c0ccaea4096b-3"',
 'ContentType': 'text/csv',
 'ServerSideEncryption': 'AES256',
 'Metadata': {},
 'Body': <botocore.response.StreamingBody at 0x270163dd6a0>}

In [19]:
# Download the CSV object, read its body fully into memory and parse it with pandas.
obj = s3.Bucket('insidersdataset').Object('ecommerce.csv').get()
csv_bytes = obj['Body'].read()
csv_buffer = io.BytesIO(csv_bytes)
data_raw = pd.read_csv(csv_buffer, encoding='iso-8859-1')

In [20]:
# Work on a copy so the downloaded frame stays untouched.
data = data_raw.copy(deep=True)

In [21]:
# Drop the 'Unnamed: 8' column and every row whose CustomerID is missing.
data = (
    data
    .drop(columns=['Unnamed: 8'])
    .dropna(subset=['CustomerID'])
)

In [22]:
# Parse invoice dates (day-month abbreviation-2 digit year), cast customer ids to int
# and add the line total (quantity times unit price).
data = data.assign(
    InvoiceDate=pd.to_datetime(data['InvoiceDate'], format='%d-%b-%y'),
    CustomerID=data['CustomerID'].astype(int),
    Total=data['Quantity'] * data['UnitPrice'],
)

In [25]:
# Customers who do not have a positive purchase balance or who owe the company
# (due to the temporal cut of the database) will be excluded: flag every customer
# whose summed Total is <= 0.5 or whose summed Quantity is <= 1.
# The aggregation uses the string alias 'sum' instead of np.sum: pandas deprecated
# passing NumPy reducers to agg() and recommends the alias to keep the same result.
sum_transactions_per_client = (
    data[['CustomerID', 'Total', 'Quantity']]
    .groupby('CustomerID')
    .agg({'Total': 'sum', 'Quantity': 'sum'})
    .reset_index()
)

low_balance = sum_transactions_per_client['Total'] <= 0.5
low_volume = sum_transactions_per_client['Quantity'] <= 1
bad_clients = sum_transactions_per_client.loc[low_balance | low_volume]

In [28]:
# Keep only the rows whose customer is not in the exclusion list.
list_bad_clients = bad_clients['CustomerID'].tolist()
is_bad_client = data['CustomerID'].isin(list_bad_clients)
data = data[~is_bad_client]

In [29]:
# Discard rows with a unit price below 0.04. The mask is negated rather than using >=,
# so rows with a missing price are kept (the comparison is False for NaN).
priced_below_threshold = data['UnitPrice'].lt(0.04)
data = data.loc[~priced_below_threshold]

In [30]:
# Copy of the cleaned line items (the name is reassigned by the invoice-level aggregation that follows).
transactions = data.copy(deep=True)

In [31]:
# Collapse the line items into one row per InvoiceNo; invoices cover both sales and cancellations.
invoice_aggregations = {
    'CustomerID': ('CustomerID', np.unique),
    'InvoiceDate': ('InvoiceDate', np.unique),
    'Total': ('Total', 'sum'),
    'UniqueProducts': ('StockCode', 'nunique'),
    'Items': ('Quantity', 'sum'),
    'Country': ('Country', np.unique),
    'ProductsCode': ('StockCode', np.unique),
}
transactions = data.groupby('InvoiceNo').agg(**invoice_aggregations).reset_index()

# Invoice total divided by its number of distinct products, rounded to two decimals.
transactions['AvarageTicket'] = (transactions['Total'] / transactions['UniqueProducts']).round(2)

In [32]:
# Reference date for recency: one day after the most recent invoice date in the data.
last_day = data.InvoiceDate.max() + dt.timedelta(days=1)

# Aggregate the invoices into one row per customer (the result is indexed by CustomerID).
# Sums use the string alias 'sum' instead of np.sum: pandas deprecated passing NumPy
# reducers to agg() and recommends the alias to keep the same result.
transactions_per_customer = transactions.groupby('CustomerID').agg(
    GrossRevenue=('Total', 'sum'),
    # Days between the reference date and the customer's latest invoice.
    Recency=('InvoiceDate', lambda x: (last_day - x.max()).days),
    Frequency=('InvoiceNo', 'count'),
    Products=('UniqueProducts', 'sum'),
    Items=('Items', 'sum'),
    Country=('Country', np.unique),
    AvarageTicket=('AvarageTicket', 'sum'),
)
# NOTE(review): AvarageTicket is the sum of the per-invoice values, not
# GrossRevenue / Products - confirm which definition is intended.

## Data Preparation

In [46]:
# Start the preparation stage from a copy of the per-customer table.
data_prep = transactions_per_customer.copy(deep=True)

In [47]:
# Remove every customer row that has at least one missing value.
data_prep = data_prep.dropna(axis=0, how='any')

In [48]:
# Column groups of the per-customer table.
categorical_features = ['Country']
numerical_features = [
    'GrossRevenue',
    'Items',
    'AvarageTicket',
    "Products",
    'Frequency',
    "Recency",
]

# All columns available before the selection, kept for reference.
default_features = data_prep.columns

# Only these three columns are passed on to the clustering step.
selected_features = ['GrossRevenue', 'Recency', 'Frequency']
data_prep = data_prep.loc[:, selected_features].copy()

In [49]:
# Skewness of each selected feature, highest first; features above 0.75 get log-transformed.
feature_skewness = data_prep[selected_features].skew().sort_values(ascending=False)
log_columns = feature_skewness[feature_skewness.gt(0.75)]
log_columns

GrossRevenue    21.575876
Frequency       11.342906
Recency          1.268542
dtype: float64

In [50]:
# Replace every skewed feature by log(1 + x).
data_prep = data_prep.assign(
    **{skewed: np.log1p(data_prep[skewed]) for skewed in log_columns.index}
)

In [51]:
# Rescale every selected feature to the [0, 1] range.
# Note: min-max scaling is NOT robust to outliers (the extreme values define the range);
# the log1p transform applied to the skewed features beforehand is what reduces their pull.
# The scaler is fitted once on all features (each column is still scaled independently),
# so `mms` stays usable afterwards for transforming new data or inverting the scaling.
mms = MinMaxScaler()
data_prep[selected_features] = mms.fit_transform(data_prep[selected_features])

X = data_prep[selected_features].copy()

In [52]:
# Separate copy of the feature matrix that will receive the cluster labels.
clusters = X.copy(deep=True)

## Model - Hierarchical Cluster

In [53]:
# Ward-linkage hierarchical clustering on the scaled features, cut into at most k flat clusters.
k = 10
hc = hierarchy.linkage(X, method='ward', metric='euclidean')
hc_labels = hierarchy.fcluster(hc, t=k, criterion='maxclust')
clusters = clusters.assign(HierarchicalCluster=hc_labels)

In [54]:
# Move CustomerID from the index back into a regular column.
transactions_per_customer = transactions_per_customer.reset_index(drop=False)

In [55]:
# Attach the labels by CustomerID instead of by position. `clusters` is indexed by
# CustomerID and only contains the customers that survived dropna(), so a positional
# assignment raises a length mismatch as soon as one customer is dropped.
# Customers without a label (if any) get NaN.
transactions_per_customer['Cluster'] = transactions_per_customer['CustomerID'].map(
    clusters['HierarchicalCluster']
)

In [56]:
# Propagate each customer's cluster to the invoice-level table (inner join on CustomerID).
customer_clusters = transactions_per_customer[['CustomerID', 'Cluster']]
transactions = pd.merge(transactions, customer_clusters, how='inner', on='CustomerID')

In [57]:
# Segment name for each cluster label; labels are numbered from 1 in the order listed.
# NOTE(review): the spellings below are kept exactly as they are because these strings
# are written to the database - confirm before correcting them.
cluster_names = [
    "Potential Loyalist",
    "About to Sleep",
    "Champion",
    "Loyal Costumer",
    "Promising",
    "Hinernating",
    "Cannot Lose Them",
    "At Risk",
    "New Customers",
    "Need Atention",
]
cluster_dict = dict(enumerate(cluster_names, start=1))

In [58]:
# Replace the numeric cluster labels by their segment names in both tables.
for labelled_frame in (transactions_per_customer, transactions):
    labelled_frame['Cluster'] = labelled_frame['Cluster'].map(cluster_dict)

In [59]:
# Count the missing values per column (displayed as the cell output).
transactions_per_customer.isna().sum(axis=0)

CustomerID       0
GrossRevenue     0
Recency          0
Frequency        0
Products         0
Items            0
Country          0
AvarageTicket    0
Cluster          0
dtype: int64

## Accessing Postgres Database on AWS

In [60]:
# Table to be persisted: a copy of the labelled per-customer data.
insiders = transactions_per_customer.copy(deep=True)

In [61]:
# Remove the Country column before writing the table.
insiders = insiders.drop(columns=['Country'])

In [72]:
# Read the Postgres connection settings from environment variables
# (each value is None when the variable is not set).
host, port, database, user, pwd = (
    os.environ.get(variable)
    for variable in (
        'POSTGRES_HOST',
        'POSTGRES_PORT',
        'POSTGRES_DATABASE',
        'POSTGRES_USER',
        'POSTGRES_PWD',
    )
)

In [87]:
# Fail with a clear message instead of building a URL that contains the text "None".
connection_settings = (
    ('POSTGRES_HOST', host),
    ('POSTGRES_PORT', port),
    ('POSTGRES_DATABASE', database),
    ('POSTGRES_USER', user),
    ('POSTGRES_PWD', pwd),
)
missing_settings = [name for name, value in connection_settings if value is None]
if missing_settings:
    raise RuntimeError(f"Missing database settings: {', '.join(missing_settings)}")

# Percent-encode the credentials: characters such as '@', ':' or '/' in the user name or
# password would otherwise be parsed as URL delimiters and break the connection string.
endpoint = f'postgresql://{quote_plus(user)}:{quote_plus(pwd)}@{host}:{port}/{database}'

# NullPool: connections are opened on demand and not kept in a pool.
db = create_engine(endpoint, poolclass=NullPool)
conn = db.connect()

### Using SQLAlchemy

In [75]:
# IF EXISTS keeps this cell runnable against a database where the table has not been
# created yet; a plain DROP TABLE raises an error in that case.
query_drop_insiders = """ DROP TABLE IF EXISTS insiders """
conn.execute( query_drop_insiders )

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x27015e78970>

In [77]:
# Schema of the insiders table: one row per customer with its metrics and segment name.
query_create_table_insiders = """
        CREATE TABLE insiders (
            CustomerID               INTEGER,
            GrossRevenue             REAL,
            Recency                  INTEGER,
            Frequency                INTEGER,
            Products                 INTEGER,
            Items                    INTEGER,
            AvarageTicket            REAL,
            Cluster                  TEXT
           ) """

# These statements must sit at the same indentation as the assignment above;
# the stray indent they had is an IndentationError.
conn.execute( query_create_table_insiders )
print('Table insiders was created!')

Table insiders was created!


In [88]:
# Check that the table exists in Postgres by looking it up in information_schema.
check_table = """ SELECT * FROM information_schema.tables WHERE table_name = 'insiders';"""
df_check = pd.read_sql_query(sql=check_table, con=conn)
df_check

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
0,frist_test_db,public,insiders,BASE TABLE,,,,,,YES,NO,


In [78]:
# Write the DataFrame to the insiders table without the index.
# NOTE(review): if_exists='replace' makes pandas drop and re-create the table from the
# DataFrame, so a previously declared schema is not retained - confirm this is intended.
insiders.to_sql(name='insiders', con=conn, if_exists='replace', index=False)

In [79]:
# Read the stored table back from the database to inspect what was written.
query = """ SELECT * FROM insiders """
df = pd.read_sql_query(sql=query, con=conn)
df

Unnamed: 0,CustomerID,GrossRevenue,Recency,Frequency,Products,Items,AvarageTicket,Cluster
0,12347,4310.00,3,7,182,2458,161.68,About to Sleep
1,12348,1797.24,76,4,27,2341,283.32,At Risk
2,12349,1757.55,19,1,73,631,24.08,Need Atention
3,12350,334.40,311,1,17,197,19.67,Hinernating
4,12352,1545.41,37,11,92,470,128.83,At Risk
...,...,...,...,...,...,...,...,...
4307,18280,180.60,278,1,10,45,18.06,Hinernating
4308,18281,80.82,181,1,7,54,11.55,Hinernating
4309,18282,176.60,8,3,13,98,28.44,Champion
4310,18283,2094.88,4,16,689,1397,48.63,About to Sleep


In [90]:
# Release the database connection opened with db.connect().
conn.close()