# <div style="color:white;display:fill;border-radius:5px;background-color:#0E2031;letter-spacing:0.5px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Content</p></div>     
     
**Data Preprocessing**  
   - Settings
   - Loading Data
   - Glossary
   - Dealing with missing Values
   - Fixing Data Types
   - Dealing with Bad Values
   - Feature Engineering

# <div style="color:white;display:fill;border-radius:5px;background-color:#0E2031;letter-spacing:0.5px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Data Preprocessing</p></div>   

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime, date, timedelta
from tabulate import tabulate
from IPython.display import HTML
import dataframe_image as di
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def jupyter_table_settings():
    """Configure how pandas renders DataFrames in this notebook.

    Turns off wrapping of wide frames across several blocks and caps the
    rendered output at 30 columns and 30 rows.
    """
    display_options = {
        'display.expand_frame_repr': False,
        'display.max_columns': 30,
        'display.max_rows': 30,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)

In [3]:
def matplotlib_settings():
    """Placeholder for shared matplotlib plot settings.

    The function currently has no effect: every configuration statement
    below is commented out, so calling it changes nothing and returns None.
    """
    
    # Disabled settings, kept for reference. NOTE(review): `plt` is not among
    # the imports visible in this notebook, so `matplotlib.pyplot` would have
    # to be imported before these lines are re-enabled.
#     %matplotlib inline
#     plt.style.use( 'ggplot' )
#     plt.rcParams['figure.figsize'] = [12, 6]
#     plt.rcParams['font.size'] = 20

In [4]:
def summary_stats(df):
    """Display a styled table of descriptive statistics for numeric columns.

    Only the ``int64`` and ``float64`` columns of ``df`` are summarised. For
    each of them the table reports the number of non-null values, mean,
    population standard deviation (``ddof=0``, i.e. what ``np.std`` computes),
    median, sum, minimum, maximum, range and the 25th/75th percentiles.
    Every statistic skips missing values, so a column containing NaN no
    longer yields a NaN median/percentile next to a NaN-free mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise; it is not modified.

    Returns
    -------
    None
        The table is rendered with ``display``; nothing is returned.
    """
    numeric = df.select_dtypes(include=['int64', 'float64'])

    # Vectorised pandas reductions replace the per-column ``apply`` calls with
    # NumPy / builtin functions; they all share the same skip-NaN behaviour.
    col_min = numeric.min()
    col_max = numeric.max()

    summary = pd.DataFrame({
        'counts': numeric.count(),
        'mean': numeric.mean(),
        'std': numeric.std(ddof=0),
        'median': numeric.median(),
        'sum': numeric.sum(),
        'min': col_min,
        'max': col_max,
        'range': col_max - col_min,
        # Default linear interpolation, the same rule np.percentile uses.
        '25%': numeric.quantile(0.25),
        '75%': numeric.quantile(0.75),
    })

    cm = sns.light_palette("#2D7DAB", as_cmap=True)

    # The gradient is applied to every row except the first one, exactly as
    # before. NOTE(review): confirm that skipping the first row is intended.
    shaded_cells = (summary.index[1:], summary.columns[:])
    display(summary.style.format('{:.2f}').background_gradient(subset=shaded_cells, cmap=cm))

In [5]:
# Apply the pandas display options before any table is rendered below.
jupyter_table_settings()

## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Loading Data</p></div>

In [6]:
# Load the raw export; the file is decoded as ISO-8859-1 (Latin-1).
data_raw = pd.read_csv('../data/ecommerce.csv', encoding='iso-8859-1')
# (rows, columns) of the file exactly as read.
data_raw.shape

(541909, 9)

In [7]:
# Clean a copy so that `data_raw` keeps the file contents exactly as loaded.
data = data_raw.copy()

In [8]:
# Preview the working copy (first and last rows).
data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Unnamed: 8
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.0,United Kingdom,
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.0,United Kingdom,
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.0,United Kingdom,
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.0,United Kingdom,
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.0,United Kingdom,
...,...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,7-Dec-17,0.85,12680.0,France,
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,7-Dec-17,2.10,12680.0,France,
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,7-Dec-17,4.15,12680.0,France,
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,7-Dec-17,4.15,12680.0,France,


In [9]:
# Dtypes and non-null counts per column; reveals which columns have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
 8   Unnamed: 8   0 non-null       float64
dtypes: float64(3), int64(1), object(5)
memory usage: 37.2+ MB


## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Glossary</p></div>

In [10]:
# Column glossary as a list of rows; the first row is the table header
# (the commented-out `tabulate` call below renders it with headers='firstrow').
glossary = [['Columns', 'Meaning'],
            ['InvoiceNo', 'Unique Identifier of each transaction'],
            ['StockCode', 'Internal item code'],
            ['Description', 'Item description/resume'],
            ['Quantity', 'Quantity of each item per transaction'],
            ['InvoiceDate', 'The day of transaction'],
            ['UnitPrice', 'Product price per unit'],
            ['CustomerID', 'Unique Identifier of Customer'],
            ['Country', 'Customer\'s country of residence']
           ]
#print(tabulate(glossary, headers='firstrow', stralign='left', tablefmt='simple'))

## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Dealing with missing values</p></div>

In [11]:
# Number of missing values in each column.
data.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
Unnamed: 8     541909
dtype: int64

In [12]:
# 'Unnamed: 8' contains no values at all, and rows without a CustomerID cannot
# be attributed to a customer. Rows with a missing Description are kept on
# purpose. errors='ignore' makes the cell safe to re-run: a second run no
# longer fails with KeyError because the column is already gone.
data = (
    data
    .drop(columns='Unnamed: 8', errors='ignore')
    .dropna(subset=['CustomerID'])
)

Because the purpose of this project is to group customers, it makes no sense to classify unidentified customers. To simplify the study, we will initially ignore them: these are purchases whose buyer is unknown, either because the buyer was not a registered user at the time of purchase or because the customer ID was not provided at the time of purchase.

## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Fixing data types</p></div>

In [13]:
# Parse dates written like '29-Nov-16', store customer ids as integers and add
# the value of each line (quantity x unit price).
data = data.assign(
    InvoiceDate=pd.to_datetime(data['InvoiceDate'], format='%d-%b-%y'),
    CustomerID=data['CustomerID'].astype(int),
    Total=data['Quantity'] * data['UnitPrice'],
)

In [14]:
# First and last invoice date among the remaining rows.
data.InvoiceDate.min(), data.InvoiceDate.max()

(Timestamp('2016-11-29 00:00:00'), Timestamp('2017-12-07 00:00:00'))

## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Dealing with bad values</p></div>

For this project (whose objective is to classify customers), the following were considered "bad values": absurd purchases followed by cancellations, purchase totals close to or below zero, and cases where the company appears to owe the customer. The last case happens because the time window of the database contains only the cancellation and not the original purchase.

These values will therefore be excluded. They can even be useful in the EDA stage to generate insights, but for the machine learning model they significantly interfere with performance.

Another frequently observed issue is products with the same "Stock Code" but different "Description". As we do not know the correct description and this feature will not be used in this project, we will leave it as is.

In [15]:
# Net amount and net item count per customer; lines with negative quantities
# offset the purchases. The native groupby `sum` is used instead of passing
# `np.sum` to `agg`: pandas deprecated NumPy callables there, and the
# FutureWarning filter at the top of the notebook would hide that warning.
sum_transactions_per_client = (
    data
    .groupby('CustomerID')[['Total', 'Quantity']]
    .sum()
    .reset_index()
)

In [18]:
# Customers ordered from the lowest to the highest net total.
sum_transactions_per_client.sort_values(by='Total')

Unnamed: 0,CustomerID,Total,Quantity
3756,17448,-4287.63,-1
2236,15369,-1592.49,-1
1384,14213,-1192.20,-244
3870,17603,-1165.30,-31
125,12503,-1126.00,-1
...,...,...,...
55,12415,123725.45,77242
1895,14911,132572.62,77180
3758,17450,187482.17,69029
4233,18102,256438.49,64122


In [20]:
# Customers without a positive purchase balance, or who appear to owe the
# company (because the dataset's time window cuts off their history), are
# flagged for exclusion: net total of at most 0.5 or net quantity of at most 1.
low_total = sum_transactions_per_client['Total'].le(0.5)
low_quantity = sum_transactions_per_client['Quantity'].le(1)
bad_clients = sum_transactions_per_client.loc[low_total | low_quantity]

In [21]:
# Inspect the customers flagged for exclusion.
bad_clients

Unnamed: 0,CustomerID,Total,Quantity
0,12346,0.000000e+00,0
89,12454,5.684342e-14,0
125,12503,-1.126000e+03,-1
127,12505,-4.500000e+00,-1
170,12558,1.065814e-14,0
...,...,...,...
4256,18133,7.155000e+02,0
4261,18141,-3.540000e+01,-12
4350,18256,-5.010000e+01,-70
4358,18268,0.000000e+00,0


In [16]:
# Remove every transaction line that belongs to a flagged customer.
list_bad_clients = bad_clients['CustomerID'].tolist()
is_bad_client = data['CustomerID'].isin(list_bad_clients)
data = data.loc[~is_bad_client]

In [17]:
# Drop lines priced below 0.04. The filter is written as a negation, so a row
# for which the comparison is False (including a missing price) is kept.
data = data.loc[~(data['UnitPrice'] < 0.04)]

In [18]:
# Stock codes that start with a letter, with the number of lines for each.
data[data.StockCode.str.contains("^[a-zA-Z]")].StockCode.value_counts()

POST            1196
M                434
C2               134
D                 75
DOT               16
CRUK              16
BANK CHARGES      10
Name: StockCode, dtype: int64

In [19]:
# Descriptions attached to the stock codes that start with a letter.
data[data.StockCode.str.contains("^[a-zA-Z]")].Description.value_counts()

POSTAGE            1196
Manual              434
CARRIAGE            134
Discount             75
DOTCOM POSTAGE       16
CRUK Commission      16
Bank Charges         10
Name: Description, dtype: int64

In [20]:
# Distinct stock codes that start with a letter, most frequent first.
starts_with_letter = data['StockCode'].str.contains("^[a-zA-Z]")
list_letter_stock = data.loc[starts_with_letter, 'StockCode'].value_counts().index.tolist()

In [21]:
# Keep only product lines: drop the stock codes that START with a letter
# (POST, M, C2, D, DOT, CRUK, BANK CHARGES). Codes that merely end with a
# letter, such as 16156L, are regular products and stay in the data.
data = data.loc[~data['StockCode'].isin(list_letter_stock)]

In [22]:
# Stock codes whose lines do not share exactly one description. The per-code
# count is computed once and reused for the filter instead of running the
# same groupby twice.
descriptions_per_code = data.groupby("StockCode")["Description"].nunique()
descriptions_per_code[descriptions_per_code != 1]

StockCode
16156L    2
17107D    3
20622     2
20725     2
20914     2
         ..
85184C    2
85185B    2
90014A    2
90014B    2
90014C    2
Name: Description, Length: 213, dtype: int64

## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Feature Engineering</p></div>

In [31]:
# NOTE(review): this copy is replaced in the next cell, where `transactions`
# is reassigned to the per-invoice aggregation of `data`.
transactions = data.copy()

In [32]:
# Collapse the line items into one row per invoice. InvoiceNo covers both
# sales and cancellations, so totals and item counts can be negative.
#
# CustomerID, InvoiceDate and Country are taken with 'first' (first non-null
# value of the group). The previous `np.unique` aggregator only produced a
# scalar because pandas unwraps single-element arrays; an invoice with more
# than one value would have put an unhashable array in the column and broken
# the later groupby on CustomerID.
# NOTE(review): this assumes one customer, date and country per invoice.
#
# ProductsCode keeps `np.unique`: it holds the sorted distinct stock codes of
# the invoice (a bare code, not an array, when the invoice has one product).
transactions = data.groupby('InvoiceNo').agg(
    CustomerID=('CustomerID', 'first'),
    InvoiceDate=('InvoiceDate', 'first'),
    Total=('Total', 'sum'),
    UniqueProducts=('StockCode', 'nunique'),
    Items=('Quantity', 'sum'),
    Country=('Country', 'first'),
    ProductsCode=('StockCode', np.unique),
).reset_index()

# Invoice total divided by the number of distinct products, to 2 decimals.
transactions['AvarageTicket'] = (transactions['Total'] / transactions['UniqueProducts']).round(2)

In [33]:
#transactions.InvoiceNo.str.contains("C").value_counts()

In [34]:
# Preview the per-invoice table; invoice numbers prefixed with "C" carry
# negative totals and item counts.
transactions

Unnamed: 0,InvoiceNo,CustomerID,InvoiceDate,Total,UniqueProducts,Items,Country,ProductsCode,AvarageTicket
0,536365,17850,2016-11-29,139.12,7,40,United Kingdom,"[21730, 22752, 71053, 84029E, 84029G, 84406B, ...",19.87
1,536366,17850,2016-11-29,22.20,2,12,United Kingdom,"[22632, 22633]",11.10
2,536367,13047,2016-11-29,278.73,12,83,United Kingdom,"[21754, 21755, 21777, 22310, 22622, 22623, 227...",23.23
3,536368,13047,2016-11-29,70.05,4,15,United Kingdom,"[22912, 22913, 22914, 22960]",17.51
4,536369,13047,2016-11-29,17.85,1,3,United Kingdom,21756,17.85
...,...,...,...,...,...,...,...,...,...
21700,C581470,17924,2017-12-06,-8.32,1,-4,United Kingdom,23084,-8.32
21701,C581484,16446,2017-12-07,-168469.60,1,-80995,United Kingdom,23843,-168469.60
21702,C581490,14397,2017-12-07,-32.53,2,-23,United Kingdom,"[22178, 23144]",-16.26
21703,C581568,15311,2017-12-07,-54.75,1,-5,United Kingdom,21258,-54.75


In [27]:
# Reference date for recency: the day after the last invoice in the data, so
# the most recent customer gets a recency of 1 day rather than 0.
last_day = data.InvoiceDate.max() + dt.timedelta(days = 1)

# One row per customer.
#   GrossRevenue - sum of the invoice totals ('sum' replaces the deprecated
#                  use of `np.sum` as a groupby aggregator)
#   Recency      - days between `last_day` and the customer's latest invoice
#   Frequency    - number of invoices, cancellations included
#   Products     - sum of the per-invoice distinct product counts (a product
#                  bought on two invoices is counted twice)
#   Items        - net number of items
#   Country      - distinct countries of the customer's invoices.
#                  NOTE(review): a customer with several countries would get
#                  an array here instead of a single value - confirm.
transactions_per_customer = transactions.groupby('CustomerID').agg(
    GrossRevenue=('Total', 'sum'),
    Recency=('InvoiceDate', lambda x: (last_day - x.max()).days),
    Frequency=('InvoiceNo', 'count'),
    Products=('UniqueProducts', 'sum'),
    Items=('Items', 'sum'),
    Country=('Country', np.unique),
)

#transactions_per_customer['AvarageTicket']= round(transactions_per_customer['GrossRevenue'] / transactions_per_customer['Products'],2)

In [28]:
# Preview the per-customer feature table (indexed by CustomerID).
transactions_per_customer

Unnamed: 0_level_0,GrossRevenue,Recency,Frequency,Products,Items,Country
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12347,4310.00,3,7,182,2458,Iceland
12348,1437.24,76,4,23,2332,Finland
12349,1457.55,19,1,72,630,Italy
12350,294.40,311,1,16,196,Norway
12352,1265.41,37,8,84,463,Norway
...,...,...,...,...,...,...
18280,180.60,278,1,10,45,United Kingdom
18281,80.82,181,1,7,54,United Kingdom
18282,176.60,8,3,13,98,United Kingdom
18283,2088.93,4,16,687,1395,United Kingdom


In [29]:
# Descriptive statistics of the numeric customer features.
summary_stats(transactions_per_customer)

Unnamed: 0,counts,mean,std,median,sum,min,max,range,25%,75%
GrossRevenue,4312.0,1922.38,8310.57,656.0,8289313.15,2.9,278778.02,278775.12,300.9,1612.29
Recency,4312.0,91.07,99.35,50.0,392675.0,1.0,374.0,373.0,17.0,139.0
Frequency,4312.0,5.03,9.13,3.0,21705.0,1.0,242.0,241.0,1.0,5.0
Products,4312.0,91.42,226.17,41.0,394214.0,1.0,7602.0,7601.0,17.75,99.0
Items,4312.0,1135.64,4699.46,374.5,4896867.0,1.0,196556.0,196555.0,159.0,979.75
