In [1]:
import pandas as pd
import numpy as np
import datetime as dt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load cleaned data
df = pd.read_csv('../data/cleaned_data.csv')

### RFM Features

#### 1. Recency (R)

In [3]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Create new column InvoiceDay
df['InvoiceDay'] = df['InvoiceDate'].dt.date
df['InvoiceDay'] = pd.to_datetime(df['InvoiceDay'])

# Find most recent transacted date of each customer
customer_data = df.groupby('CustomerID')['InvoiceDay'].max().reset_index()

# Find the most recent date from the whole dataset a.k.a the "now" date for recency calculation:
max_dt = df['InvoiceDay'].max()
max_dt = pd.to_datetime(max_dt)

# Calculate Recency a.k.a Days_Since_Last_Transaction
customer_data['Days_Since_Last_Transaction'] = (max_dt - customer_data['InvoiceDay']).dt.days

# Finally, drop the InvoiceDay column
customer_data.drop('InvoiceDay', axis=1, inplace=True)

# Display the first 5 rows
display(customer_data.head())

Unnamed: 0,CustomerID,Days_Since_Last_Transaction
0,12346,325
1,12347,2
2,12348,75
3,12349,18
4,12350,310


#### 2. Frequency (F)

In [4]:
# Calculate the total of each customer's number of orders (using unique InvoiceNo):
order_counts = df.groupby('CustomerID')['InvoiceNo'].nunique().reset_index()
order_counts.rename(columns={'InvoiceNo': 'Total_Orders_Count'}, inplace=True)

# Calculate the total of each customer's number of product bought:
product_counts = df.groupby('CustomerID')['Quantity'].sum().reset_index()
product_counts.rename(columns={'Quantity': 'Total_Prod_Purchased'}, inplace=True)

# Join 2 dataframes above to customer_data:
customer_data = customer_data.merge(order_counts, on='CustomerID')
customer_data = customer_data.merge(product_counts, on='CustomerID')

display(customer_data.head())

Unnamed: 0,CustomerID,Days_Since_Last_Transaction,Total_Orders_Count,Total_Prod_Purchased
0,12346,325,2,0
1,12347,2,7,2458
2,12348,75,4,2332
3,12349,18,1,630
4,12350,310,1,196


#### 3. Monetary (M)

In [5]:
# Creating new row: Total_Spent
df['Total_Spent'] = df['UnitPrice'] * df['Quantity']
customer_spent = df.groupby('CustomerID')['Total_Spent'].sum().reset_index()

# Calculate average order value = Total_Spent / Total_Orders_Count: 
aov = customer_spent.merge(order_counts, on='CustomerID')
aov['Avg_Order_Value'] = (aov['Total_Spent'] / aov['Total_Orders_Count']).round(3)

# Merge to customer_data:
customer_data = customer_data.merge(customer_spent, on='CustomerID')
customer_data = customer_data.merge(aov[['CustomerID', 'Avg_Order_Value']], on='CustomerID')

display(customer_data.head())

Unnamed: 0,CustomerID,Days_Since_Last_Transaction,Total_Orders_Count,Total_Prod_Purchased,Total_Spent,Avg_Order_Value
0,12346,325,2,0,0.0,0.0
1,12347,2,7,2458,4310.0,615.714
2,12348,75,4,2332,1437.24,359.31
3,12349,18,1,630,1457.55,1457.55
4,12350,310,1,196,294.4,294.4


### Product Diversity

#### Product Diversity in Customer's Orders:

In [6]:
# Count number of unique products bought per customer: 
unique_product_counts = df.groupby('CustomerID')['StockCode'].nunique().reset_index()
unique_product_counts.columns = ['CustomerID', 'Uniq_Prod_Purchased_Count']

# Merge to customer_data:
customer_data = customer_data.merge(unique_product_counts, on='CustomerID')

display(customer_data.head())

Unnamed: 0,CustomerID,Days_Since_Last_Transaction,Total_Orders_Count,Total_Prod_Purchased,Total_Spent,Avg_Order_Value,Uniq_Prod_Purchased_Count
0,12346,325,2,0,0.0,0.0,1
1,12347,2,7,2458,4310.0,615.714,103
2,12348,75,4,2332,1437.24,359.31,21
3,12349,18,1,630,1457.55,1457.55,72
4,12350,310,1,196,294.4,294.4,16


### Purchase Behavioral Features

#### Avg. Days between Purchases

In [9]:
# Get day of week from InvoiceDate:
df['Day_of_Week'] = df['InvoiceDate'].dt.dayofweek


In [16]:
df['InvoiceDay']

0        2010-12-01
1        2010-12-01
2        2010-12-01
3        2010-12-01
4        2010-12-01
            ...    
399568   2011-12-09
399569   2011-12-09
399570   2011-12-09
399571   2011-12-09
399572   2011-12-09
Name: InvoiceDay, Length: 399573, dtype: datetime64[ns]

In [36]:
days_btw_purchase = df.groupby('CustomerID')['InvoiceDay'].apply(lambda x: (x.diff().dropna()).apply(lambda y: y.days))
avg_days_btw_purchase = days_btw_purchase.groupby('CustomerID').mean().reset_index()

  days_btw_purchase = df.groupby('CustomerID')['InvoiceDay'].apply(lambda x: (x.diff().dropna()).apply(lambda y: y.days))


In [None]:
avg_days_btw_purchase.rename(columns={'InvoiceDay': 'Avg_Days_Btw_Purchase'}, inplace=True)
avg_days_btw_purchase['Avg_Days_Btw_Purchase'] = avg_days_btw_purchase['Avg_Days_Btw_Purchase'].round(3)

In [39]:
avg_days_btw_purchase.sort_values(by='Avg_Days_Btw_Purchase', ascending=False)

Unnamed: 0,CustomerID,Avg_Days_Btw_Purchase
548,13068,309.0
4136,18084,285.0
4133,18080,224.0
414,12875,219.0
1769,14777,177.0
...,...,...
1632,14585,0.0
1635,14589,0.0
1589,14528,0.0
3272,16850,0.0


In [41]:
df[df['CustomerID'] == 14585]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,InvoiceDay,Total_Spent,Day_of_Week
300785,572088,23295,SET OF 12 MINI LOAF BAKING CASES,2,2011-10-20 14:33:00,0.83,14585,United Kingdom,1.66,2011-10-20,1.66,3
300786,572088,23293,SET OF 12 FAIRY CAKE BAKING CASES,2,2011-10-20 14:33:00,0.83,14585,United Kingdom,1.66,2011-10-20,1.66,3
300787,572088,23483,HANGING BUTTERFLY T-LIGHT HOLDER,6,2011-10-20 14:33:00,1.25,14585,United Kingdom,7.5,2011-10-20,7.5,3
300788,572088,22367,CHILDRENS APRON SPACEBOY DESIGN,1,2011-10-20 14:33:00,1.95,14585,United Kingdom,1.95,2011-10-20,1.95,3
300789,572088,22899,CHILDREN'S APRON DOLLY GIRL,1,2011-10-20 14:33:00,2.1,14585,United Kingdom,2.1,2011-10-20,2.1,3
300790,572088,22586,FELTCRAFT HAIRBAND PINK AND BLUE,1,2011-10-20 14:33:00,0.85,14585,United Kingdom,0.85,2011-10-20,0.85,3
300791,572088,22587,FELTCRAFT HAIRBAND RED AND BLUE,1,2011-10-20 14:33:00,0.85,14585,United Kingdom,0.85,2011-10-20,0.85,3
300792,572088,22566,FELTCRAFT HAIRBAND PINK AND PURPLE,2,2011-10-20 14:33:00,0.85,14585,United Kingdom,1.7,2011-10-20,1.7,3
300793,572088,22565,FELTCRAFT HAIRBANDS PINK AND WHITE,2,2011-10-20 14:33:00,0.85,14585,United Kingdom,1.7,2011-10-20,1.7,3
300794,572088,23368,SET 12 COLOUR PENCILS DOLLY GIRL,2,2011-10-20 14:33:00,0.65,14585,United Kingdom,1.3,2011-10-20,1.3,3


#### Favorite Shopping Day


#### Favorite Shopping Hour of Day