# Data Preprocessing

In [2]:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime, date, timedelta
from tabulate import tabulate
from IPython.display import HTML

### Loading Data

In [3]:
# Read the raw export once and keep it untouched in `data_raw`;
# every cleaning step below operates on the working copy `data`.
data_raw = pd.read_csv(
    '../data/ecommerce.csv',
    encoding='iso-8859-1',
)
data = data_raw.copy()

In [4]:
# Overview of the working copy: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
 8   Unnamed: 8   0 non-null       float64
dtypes: float64(3), int64(1), object(5)
memory usage: 37.2+ MB


### Glossary

In [5]:
# Data dictionary for the dataset columns. The first row of `glossary` is the
# table header, followed by one [column, meaning] pair per column.
_glossary_header = ['Columns', 'Meaning']
_glossary_entries = {
    'InvoiceNo': 'Unique Identifier of each transaction',
    'StockCode': 'Internal item code',
    'Description': 'Item description/resume',
    'Quantity': 'Quantity of each item per transaction',
    'InvoiceDate': 'The day of transaction',
    'UnitPrice': 'Product price per unit',
    'CustomerID': 'Unique Identifier of Customer',
    'Country': "Customer's country of residence",
}
glossary = [_glossary_header] + [
    [column, meaning] for column, meaning in _glossary_entries.items()
]
#print(tabulate(glossary, headers='firstrow', stralign='left', tablefmt='simple'))

### Dealing with missing values

In [6]:
# Count missing values per column to decide what to drop.
data.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
Unnamed: 8     541909
dtype: int64

In [7]:
# 'Unnamed: 8' holds no values at all, so the column is removed; rows without a
# Description or a CustomerID are discarded as well.
data = (
    data
    .drop(columns='Unnamed: 8')
    .dropna(subset=['Description', 'CustomerID'])
)

In [8]:
# data["IsCancelled"]=np.where(data.InvoiceNo.apply(lambda l: l[0]=="C"), True, False)
# data.IsCancelled.value_counts() / data.shape[0] * 100 , data.IsCancelled.value_counts()
#data[data["InvoiceNo"].str.startswith("C")]

### Fixing Data Types

In [9]:
# InvoiceDate is parsed from day / abbreviated month / two-digit year strings.
# CustomerID was loaded as float because it contained missing values; those rows
# were dropped earlier, so the column can now be cast to int.
data = data.assign(
    InvoiceDate=pd.to_datetime(data['InvoiceDate'], format='%d-%b-%y'),
    CustomerID=data['CustomerID'].astype(int),
)

In [10]:
# Sanity check: earliest and latest invoice dates after parsing.
data.InvoiceDate.min(), data.InvoiceDate.max()

(Timestamp('2016-11-29 00:00:00'), Timestamp('2017-12-07 00:00:00'))

### Dealing with Bad Values

In [21]:
# data.loc[data['CustomerID'] == 12346] 
# data.loc[data['CustomerID'] == 16446] 

# Customers removed from the analysis outright.
# NOTE(review): the notebook does not record why these two IDs are excluded —
# presumably outliers; confirm.
excluded_customers = [12346, 16446]
# Rows with a unit price below this threshold are discarded.
# NOTE(review): 0.04 looks like a cut-off for free/placeholder items — confirm.
min_unit_price = 0.04

is_excluded_customer = data['CustomerID'].isin(excluded_customers)
is_below_min_price = data['UnitPrice'] < min_unit_price
data = data.loc[~is_excluded_customer & ~is_below_min_price]

In [22]:
#data["StockCode"].nunique()
#data.groupby("StockCode")["Description"].nunique().sort_values(ascending = False)

In [23]:
# for stack_code in data["StockCode"].unique():
#     first_description = data[data["StockCode"]==stack_code]["Description"].unique()[0]
#     data.loc[data["StockCode"]==stack_code, "Description"] = first_description

### Feature Engineering

**Gross Revenue**
- Receita bruta, calculada pela soma dos valores (Quantity × UnitPrice) de todas as compras e devoluções referentes ao mesmo cliente

**RecencyDays**
- Recência, calculada pela diferença, em dias, entre o dia seguinte à última data registrada no dataset e a última compra realizada pelo mesmo cliente

**Frequency**
- Frequência com que o cliente realiza compras, calculada pela contagem de notas distintas emitidas (InvoiceNo) referentes ao mesmo cliente

In [24]:
# Work on a copy so the per-row revenue column added to `df2` later does not
# end up in `data`.
df2 = data.copy()

In [25]:
# Split rows by the sign of Quantity: non-negative quantities are treated as
# purchases, negative quantities as returns.
quantity = data['Quantity']
df_purchase = data.loc[quantity.ge(0)]
df_returns = data.loc[quantity.lt(0)]

In [26]:
# One row per customer, used as the base table for the per-customer features.
# The key column is selected explicitly instead of dropping every other column
# by name: that keeps the result at exactly one row per CustomerID even when
# `df2` carries extra columns, e.g. if this cell is re-run after
# 'GrossRevenuePartial' has been added to `df2`.
data_client = df2[['CustomerID']].drop_duplicates(ignore_index=True)

In [27]:
# Reference date for recency: one day after the latest invoice in the dataset,
# so a purchase made on the final day gets a recency of 1 day rather than 0.
last_day = data['InvoiceDate'].max() + timedelta(days=1)

In [28]:
# Gross revenue: line value (Quantity * UnitPrice) summed per customer.
# Rows with negative quantities are included in `df2`, so they reduce the total.
df2['GrossRevenuePartial'] = df2['Quantity'] * df2['UnitPrice']
aux_revenue = (
    df2.groupby('CustomerID')['GrossRevenuePartial']
    .sum()
    .rename('GrossRevenueTotal')
    .reset_index()
)
data_client = data_client.merge(aux_revenue, how='left', on='CustomerID')
len(data_client)

4369

In [29]:
# aux_rece = df_purchase[['CustomerID','InvoiceDate','InvoiceNo']].groupby("CustomerID").agg({"InvoiceDate": lambda x: (last_day - x.max()).days,
#                                          "InvoiceNo": "nunique"}).rename(columns = {"InvoiceDate": "RecencyDays",
#                             "InvoiceNo": "Frequency"})

# #aux_rece
# data_client = pd.merge(data_client, aux_rece[['RecencyDays','Frequency']], on ='CustomerID', how='left')
# # len(data_client)
# data_client.head()

In [30]:
# Recency: whole days between the reference date `last_day` and each customer's
# most recent purchase (only rows in `df_purchase` are considered).
aux_recency = (
    df_purchase.groupby('CustomerID')['InvoiceDate']
    .max()
    .reset_index()
)
aux_recency = aux_recency.assign(
    RecencyDays=(last_day - aux_recency['InvoiceDate']).dt.days
)

# Left merge keeps every customer; those without any purchase row get NaN here.
data_client = data_client.merge(
    aux_recency[['CustomerID', 'RecencyDays']],
    on='CustomerID',
    how='left',
)
len(data_client)

4369

In [31]:
# Frequency: number of distinct purchase invoices per customer.
# Unique InvoiceNo values are counted inside each customer group rather than
# de-duplicating invoices across the whole table first; a global de-duplication
# would credit an invoice to only one customer if the same invoice number ever
# appeared under more than one CustomerID.
aux_freq = (
    df_purchase.groupby('CustomerID')['InvoiceNo']
    .nunique()
    .rename('Frequency')
    .reset_index()
)
# Left merge keeps every customer; those without any purchase row get NaN here.
data_client = pd.merge(data_client, aux_freq, on='CustomerID', how='left')
len(data_client)

4369

In [32]:
# # Avarage Ticket
# aux_ticket = df2[['CustomerID','GrossRevenuePartial']].groupby('CustomerID').mean().reset_index().rename(columns={'GrossRevenuePartial':'AvarageTicket'})
# data_client = pd.merge(data_client, aux_ticket,on='CustomerID',how='left')

In [33]:
# # Número de compras
# aux_prod = df_purchase.loc[:,['CustomerID', 'StockCode']].groupby('CustomerID').count().reset_index().rename(columns={'StockCode':'NumberProducts'})
# data_client = pd.merge(data_client, aux_prod, on='CustomerID', how='left')
# len(data_client)

In [34]:
# # Number Of Returns
# aux_return = df_returns[['CustomerID', 'Quantity']].groupby('CustomerID').sum().reset_index().rename(columns={'Quantity':'NumberReturn'})
# aux_return['NumberReturn'] = -1*aux_return['NumberReturn']
# aux_return['NumberReturn'] = aux_return['NumberReturn'].fillna(0)
# data_client = pd.merge(data_client, aux_return, on='CustomerID', how='left')

In [35]:
# Use CustomerID as the row index so only the feature columns remain as columns.
# Note: re-running this cell fails, because 'CustomerID' is no longer a column.
data_client = data_client.set_index('CustomerID')

In [38]:
#data_client.loc[data_client['NumberReturn'].isna(), 'NumberReturn'] = 0

In [39]:
# Peek at 10 random customers; no random_state is set, so the rows shown
# differ from run to run.
data_client.sample(10)

Unnamed: 0_level_0,GrossRevenueTotal,RecencyDays,Frequency
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17566,901.21,9.0,2.0
14029,467.66,64.0,2.0
13630,1995.68,6.0,7.0
14139,415.82,47.0,1.0
18268,0.0,135.0,1.0
17912,295.61,311.0,5.0
14597,883.15,12.0,2.0
17802,1251.84,83.0,3.0
15854,2974.65,5.0,12.0
16318,328.15,36.0,1.0


In [40]:
# Drop customers with any missing feature. RecencyDays and Frequency come from
# left merges on tables built from `df_purchase`, so customers with no purchase
# rows end up with NaN there.
data_client=data_client.dropna()

In [44]:
# Some customers have more returns than purchases in the data, which leaves
# their total revenue at zero or below. Per the original note this is a date
# effect — presumably the matching purchase predates the dataset window
# (TODO confirm). Customers with a total below 0.01 are removed.
has_no_positive_revenue = data_client['GrossRevenueTotal'] < 0.01
data_client = data_client.loc[~has_no_positive_revenue]

In [45]:
# Final per-customer feature table (GrossRevenueTotal, RecencyDays, Frequency).
data_client

Unnamed: 0_level_0,GrossRevenueTotal,RecencyDays,Frequency
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17850,5288.63,373.0,34.0
13047,3079.10,32.0,10.0
12583,7187.34,3.0,15.0
13748,948.25,96.0,5.0
15100,635.10,334.0,3.0
...,...,...,...
13436,196.89,2.0,1.0
15520,343.50,2.0,1.0
13298,360.00,2.0,1.0
14569,227.39,2.0,1.0
