In [45]:
# Execute the preprocessing notebook inside this kernel; the DataFrame.info()
# summary shown below is printed by it.
# NOTE(review): later cells use names that are not defined in the cells visible
# here (e.g. `transactions_per_customer`, `np`) — presumably they are injected
# by this %run; confirm.
%run "part01_preprocessing.ipynb"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
 8   Unnamed: 8   0 non-null       float64
dtypes: float64(3), int64(1), object(5)
memory usage: 37.2+ MB


# Content

**Data Preparation**
   - Normalization
   - Standardization
   - Dimensionality Reduction
   - Feature Selection
   - Dealing with Outliers

# Data Preparation

In [46]:
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.decomposition import PCA
from umap.umap_ import UMAP
from sklearn.manifold import TSNE

In [47]:
# Work on a copy so the preparation steps below do not modify the source frame.
# NOTE(review): `transactions_per_customer` is not defined in the cells visible
# here — presumably it comes from the %run of part01_preprocessing.ipynb; confirm.
data_prep = transactions_per_customer.copy()

In [48]:
# Count missing values per column before any transformation is applied.
data_prep.isna().sum()

GrossRevenue     0
Recency          0
Frequency        0
Products         0
Items            0
Country          0
AvarageTicket    0
dtype: int64

In [49]:
# Drop every row that contains at least one missing value. The null counts
# displayed above are all zero, so this acts as a safeguard only.
data_prep = data_prep.dropna(axis=0, how='any')

## Encoding

In [51]:
# Feature groups available in the customer-level frame.
categorical_features = ['Country']
numerical_features = [
    'GrossRevenue',
    'Items',
    'AvarageTicket',
    "Products",
    'Frequency',
    "Recency",
]

# Remember the full column set before the frame is narrowed down.
default_features = data_prep.columns

# Only these three columns are carried forward. 'Country' is not among them,
# so no categorical encoding is applied in this cell.
selected_features = ['GrossRevenue', 'Recency', 'Frequency']

data_prep = data_prep.loc[:, selected_features].copy()

## Normalization

In [53]:
#sns.pairplot(data_prep, aspect=1.5);

In [54]:
# Skewness of each selected feature, largest first, keeping only the features
# whose skew exceeds 0.75; these are the candidates for the log transform.
log_columns = (
    data_prep[selected_features]
    .skew()
    .sort_values(ascending=False)
    .loc[lambda skew: skew > 0.75]
)
log_columns

GrossRevenue    21.526419
Frequency       11.401617
Recency          1.265656
dtype: float64

In [56]:
# Apply log(1 + x) to every column flagged as skewed in `log_columns`,
# replacing the original values of those columns.
data_prep = data_prep.assign(
    **{name: np.log1p(data_prep[name]) for name in log_columns.index}
)

## Standardization

In [57]:
# Alternatives that were considered: StandardScaler, RobustScaler, PowerTransformer.
#
# MinMaxScaler rescales each feature independently onto [0, 1]. It is NOT
# robust to outliers: the observed min and max define the range, so a single
# extreme value compresses all other points. The skewed columns are
# log-transformed before this step, which shortens their tails.
mms = MinMaxScaler()

# Fit once on all selected columns. Each column is still scaled on its own,
# so the values match a per-column fit, but `mms` now keeps the min/max of
# every feature and can be used with `inverse_transform` on the full set.
data_prep[selected_features] = mms.fit_transform(data_prep[selected_features])

In [58]:
# Optional pair plot of the scaled features (disabled).
# NOTE(review): `X` is first assigned in a later cell, so on a fresh kernel this
# line would need `data_prep` instead of `X` if it is re-enabled.
#sns.pairplot(X, aspect=1.5);

## Feature Selection

In [66]:
# Feature matrix used by the dimensionality-reduction steps: an independent
# copy of the selected, transformed columns.
X = data_prep.loc[:, selected_features].copy()

## Dimensionality Reduction

In [67]:
# Separate frame that collects the embedding coordinates, so that `X` itself
# keeps only the model features.
clusters_results = X.copy()

In [68]:
# Project the feature matrix to 2-D with UMAP, using a fixed random_state.
umap = UMAP(random_state=3456)
umap_embedding = umap.fit_transform(X)

# Store the first and second embedding coordinates as plot axes.
clusters_results['umap_x'], clusters_results['umap_y'] = (
    umap_embedding[:, 0],
    umap_embedding[:, 1],
)

In [69]:
# plt.figure(figsize=(7,4))
# plt.title('UMAP', fontsize='14',fontweight='bold')
#sns.scatterplot(data=clusters_results, x='umap_x', y='umap_y')

In [70]:
# Dimensionality Reduction(to 2D) using technique TSNE
tsne = TSNE(n_components=2, init='pca', learning_rate='auto', n_jobs=-1, random_state=3456)
tsne_embedding = tsne.fit_transform(X)

# X,y axis representation using TSNE
clusters_results['tsne_x'] = tsne_embedding[:,0]
clusters_results['tsne_y'] = tsne_embedding[:,1]

In [71]:
# Optional scatter plot of the t-SNE embedding (disabled).
# plt.figure(figsize=(7,4))
# plt.title('t-SNE', fontsize='14',fontweight='bold')
#sns.scatterplot(data=clusters_results, x='tsne_x', y='tsne_y')

## Feature Selection

Clustering algorithms have no built-in mechanism for choosing the best features: they simply group observations by similarity across whatever variables they are given. Whether the resulting groups are good, or meaningful for the business problem, is left to human interpretation. Because one objective of this study is to compare the machine learning model with the statistical model, we initially use only the features "GrossRevenue", "Recency" and "Frequency" so that the comparison is balanced.

## Dealing with Outliers

Later we will use a clustering algorithm, DBSCAN, which, in addition to forming clusters, also acts as an outlier detector by labelling low-density points as noise.

In [72]:
# Display the scaled features together with the UMAP and t-SNE coordinates
# added in the cells above.
clusters_results

Unnamed: 0_level_0,GrossRevenue,Recency,Frequency,umap_x,umap_y,tsne_x,tsne_y
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12347,0.626986,0.132437,0.288816,-2.299536,-3.566794,54.697453,-23.716745
12348,0.528773,0.697519,0.190897,-4.051661,8.500636,7.008690,45.856831
12349,0.530027,0.439947,0.000000,6.949862,-0.962272,9.251925,-47.448906
12350,0.387159,0.964859,0.000000,17.407061,2.067133,-53.508801,-12.480809
12352,0.517389,0.562584,0.313355,-5.185906,5.693970,33.226147,41.026989
...,...,...,...,...,...,...,...
18280,0.343631,0.943499,0.000000,16.877010,3.908441,-59.632496,-23.558453
18281,0.272299,0.861874,0.000000,14.414660,4.827289,-56.932262,-39.103912
18282,0.341638,0.287379,0.144408,-0.478079,2.491563,20.828018,-9.063582
18283,0.562208,0.175072,0.445855,-3.442676,-3.381583,63.295494,-15.541344
