In [351]:
# Dependencies
import numpy as np
import pandas as pd
import datetime as dt

In [352]:
# Read in df
df_main = pd.read_csv('../Resources/raw/ecommerce_customer_data_custom_ratios.csv')
df_main.head()

Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Payment Method,Customer Age,Returns,Customer Name,Age,Gender,Churn
0,46251,2020-09-08 09:38:32,Electronics,12,3,740,Credit Card,37,0.0,Christine Hernandez,37,Male,0
1,46251,2022-03-05 12:56:35,Home,468,4,2739,PayPal,37,0.0,Christine Hernandez,37,Male,0
2,46251,2022-05-23 18:18:01,Home,288,2,3196,PayPal,37,0.0,Christine Hernandez,37,Male,0
3,46251,2020-11-12 13:13:29,Clothing,196,1,3509,PayPal,37,0.0,Christine Hernandez,37,Male,0
4,13593,2020-11-27 17:55:11,Home,449,1,3452,Credit Card,49,0.0,James Grant,49,Female,1


In [353]:
# Convert Purchase Date object to Datetime
df_main['Purchase Date'] = pd.to_datetime(df_main['Purchase Date'])

In [354]:
# Explore dataset
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Customer ID            250000 non-null  int64         
 1   Purchase Date          250000 non-null  datetime64[ns]
 2   Product Category       250000 non-null  object        
 3   Product Price          250000 non-null  int64         
 4   Quantity               250000 non-null  int64         
 5   Total Purchase Amount  250000 non-null  int64         
 6   Payment Method         250000 non-null  object        
 7   Customer Age           250000 non-null  int64         
 8   Returns                202404 non-null  float64       
 9   Customer Name          250000 non-null  object        
 10  Age                    250000 non-null  int64         
 11  Gender                 250000 non-null  object        
 12  Churn                  250000 non-null  int6

In [355]:
# Aggregate customer data
df_customers = df_main.groupby('Customer ID').agg({'Product Category': 'unique', # Category Types
                                                   'Purchase Date': 'count', # For total transactions
                                                   'Product Price': 'sum',
                                                   'Quantity': 'sum',
                                                   'Payment Method': 'unique',
                                                   'Customer Age': 'mean',
                                                   'Returns': 'sum',
                                                   'Gender': 'unique',
                                                   'Churn': 'sum',}).reset_index()

In [356]:
# Rename Columns 
df_customers.columns = ['Customer ID', 
                        'Shopping Categories',
                        'Checkout Instances',
                        'Total Purchase Value',
                        'Total Items Purchased',
                        'Payment Methods',
                        'Customer Age',
                        'Return Counts',
                        'Gender',
                        'Churn Rates']

In [357]:
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49673 entries, 0 to 49672
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Customer ID            49673 non-null  int64  
 1   Shopping Categories    49673 non-null  object 
 2   Checkout Instances     49673 non-null  int64  
 3   Total Purchase Value   49673 non-null  int64  
 4   Total Items Purchased  49673 non-null  int64  
 5   Payment Methods        49673 non-null  object 
 6   Customer Age           49673 non-null  float64
 7   Return Counts          49673 non-null  float64
 8   Gender                 49673 non-null  object 
 9   Churn Rates            49673 non-null  int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 3.8+ MB


In [358]:
# Add min max dates
min_shopping = df_main.groupby('Customer ID').agg({'Purchase Date': 'min'}).reset_index()
max_shopping = df_main.groupby('Customer ID').agg({'Purchase Date': 'max'}).reset_index()

min_shopping.columns = ['Customer ID', 'Min Date']
max_shopping.columns = ['Customer ID', 'Max Date']

In [359]:
# Joins
shopping_pattern = pd.merge(min_shopping, max_shopping, on='Customer ID', how='left')
df_customers_main = pd.merge(df_customers, shopping_pattern, on='Customer ID', how='left')

In [360]:
# Add Tenure tab
df_customers_main['Tenure'] = df_customers_main['Max Date']- df_customers_main['Min Date']
df_customers_main['Tenure'] = df_customers_main['Tenure'].dt.days

In [361]:
df_customers_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49673 entries, 0 to 49672
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Customer ID            49673 non-null  int64         
 1   Shopping Categories    49673 non-null  object        
 2   Checkout Instances     49673 non-null  int64         
 3   Total Purchase Value   49673 non-null  int64         
 4   Total Items Purchased  49673 non-null  int64         
 5   Payment Methods        49673 non-null  object        
 6   Customer Age           49673 non-null  float64       
 7   Return Counts          49673 non-null  float64       
 8   Gender                 49673 non-null  object        
 9   Churn Rates            49673 non-null  int64         
 10  Min Date               49673 non-null  datetime64[ns]
 11  Max Date               49673 non-null  datetime64[ns]
 12  Tenure                 49673 non-null  int64         
dtypes

In [362]:
# Reorganize columns
df_customers_main = df_customers_main[['Customer ID', 
                                       'Gender', 
                                       'Customer Age', 
                                       'Checkout Instances', 
                                       'Churn Rates', 
                                       'Tenure', 
                                       'Return Counts',
                                       'Shopping Categories', 
                                       'Checkout Instances', 
                                       'Total Purchase Value', 
                                       'Total Items Purchased', 
                                       'Payment Methods']]

In [363]:
# Reinterpret Columns as String Type
df_customers_main = df_customers_main.convert_dtypes('string')
df_customers_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49673 entries, 0 to 49672
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Customer ID            49673 non-null  Int64 
 1   Gender                 49673 non-null  object
 2   Customer Age           49673 non-null  Int64 
 3   Checkout Instances     49673 non-null  Int64 
 4   Churn Rates            49673 non-null  Int64 
 5   Tenure                 49673 non-null  Int64 
 6   Return Counts          49673 non-null  Int64 
 7   Shopping Categories    49673 non-null  object
 8   Checkout Instances     49673 non-null  Int64 
 9   Total Purchase Value   49673 non-null  Int64 
 10  Total Items Purchased  49673 non-null  Int64 
 11  Payment Methods        49673 non-null  object
dtypes: Int64(9), object(3)
memory usage: 5.0+ MB


In [364]:
df_customers_main['Gender']

0          [Male]
1        [Female]
2        [Female]
3          [Male]
4        [Female]
           ...   
49668    [Female]
49669      [Male]
49670    [Female]
49671    [Female]
49672      [Male]
Name: Gender, Length: 49673, dtype: object

In [365]:
# Encode Gender
df_customers_main['EGender'] = np.where(df_customers_main['Gender'] == '{Female}', 0 ,1)


In [366]:
df_customers_main['EGender']

0        1
1        1
2        1
3        1
4        1
        ..
49668    1
49669    1
49670    1
49671    1
49672    1
Name: EGender, Length: 49673, dtype: int64