In [78]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

master_df = pd.read_csv("clean_master_dataset.csv",
                        parse_dates=['order_purchase_timestamp'])

In [80]:
# Decide Snapshot Date
snapshot_date = master_df['order_purchase_timestamp'].max() + pd.Timedelta(days=1)
snapshot_date

Timestamp('2018-08-30 15:00:37')

In [82]:
# Create RFM Table
# Group by customer_unique_id
rfm = master_df.groupby('customer_unique_id').agg({
    'order_purchase_timestamp': lambda x: (snapshot_date - x.max()).days,
    'order_id': 'nunique',
    'payment_value': 'sum'
}).reset_index()

In [83]:
# Rename columns:
rfm.columns = ['customer_unique_id', 'recency', 'frequency', 'monetary']

In [84]:
rfm.head()

Unnamed: 0,customer_unique_id,recency,frequency,monetary
0,0000366f3b9a7992bf8c76cfdf3221e2,112,1,141.9
1,0000b849f77a49e4a4ce2b2a4ca5be3f,115,1,27.19
2,0000f46a3911fa3c0805444483337064,537,1,86.22
3,0000f6ccb0745a6a4b88665a16c9f078,321,1,43.62
4,0004aac84e0df4da2b147fca70cf8255,288,1,196.89


In [85]:
# Understand Distribution
rfm.describe()

Unnamed: 0,recency,frequency,monetary
count,93358.0,93358.0,93358.0
mean,237.941773,1.03342,165.945646
std,152.591453,0.209097,227.80868
min,1.0,1.0,0.0
25%,114.0,1.0,63.1
50%,219.0,1.0,107.9
75%,346.0,1.0,183.215
max,714.0,15.0,13664.08


In [90]:
# Create RFM Scores (1‚Äì5 Scale)
# Recency Score
#Lower recency = better customer
#So we reverse score.
rfm['R_score'] = pd.qcut(rfm['recency'], 5, labels=[5,4,3,2,1])

In [92]:
# Frequency Score
#Higher frequency = better
rfm['F_score'] = pd.qcut(rfm['frequency'].rank(method='first'), 5,
                         labels=[1,2,3,4,5])

In [94]:
# Monetary Score
# Higher spending = better
rfm['M_score'] = pd.qcut(rfm['monetary'], 5, labels=[1,2,3,4,5])

In [96]:
# Create Combined RFM Score
rfm['RFM_score'] = (
    rfm['R_score'].astype(str) +
    rfm['F_score'].astype(str) +
    rfm['M_score'].astype(str)
)

In [98]:
# Create Customer Segments
def segment(row):
    if row['R_score'] >= 4 and row['F_score'] >= 4:
        return "Champions"
    elif row['R_score'] >= 3 and row['F_score'] >= 3:
        return "Loyal Customers"
    elif row['R_score'] >= 4 and row['F_score'] <= 2:
        return "New Customers"
    elif row['R_score'] <= 2 and row['F_score'] >= 3:
        return "At Risk"
    else:
        return "Others"

rfm['segment'] = rfm.apply(segment, axis=1)

In [99]:
# Analyze Segment Distribution
rfm['segment'].value_counts()

segment
Others             22359
At Risk            22230
Loyal Customers    18824
New Customers      14984
Champions          14961
Name: count, dtype: int64

In [102]:
# Let‚Äôs calculate percentage
rfm['segment'].value_counts(normalize=True) * 100

segment
Others             23.949742
At Risk            23.811564
Loyal Customers    20.163243
New Customers      16.050044
Champions          16.025408
Name: proportion, dtype: float64

In [104]:
# Revenue by Segment
rfm.groupby('segment')['monetary'].sum().sort_values(ascending=False)

segment
At Risk            3746327.78
Others             3580648.46
Loyal Customers    3056818.66
Champions          2659245.21
New Customers      2449313.48
Name: monetary, dtype: float64

In [106]:
# Let‚Äôs Calculate Revenue Share %
segment_revenue = rfm.groupby('segment')['monetary'].sum()

(segment_revenue / segment_revenue.sum()) * 100

segment
At Risk            24.181786
Champions          17.164888
Loyal Customers    19.731144
New Customers      15.809822
Others             23.112360
Name: monetary, dtype: float64

In [108]:
rfm.to_csv("RFM_Table.csv", index=False)

üéØ Step 5 ‚Äî Strategic Business Insights
Here‚Äôs what your analysis suggests:

1Ô∏è‚É£ Retention Problem

3% repeat rate

High frequency mostly 1

Weak loyalty

2Ô∏è‚É£ Revenue Concentration Risk

At Risk customers = highest revenue contributors

If they churn ‚Üí revenue drops significantly.

3Ô∏è‚É£ Champions Segment

Champions are strong but not dominant.

Opportunity:

Create loyalty rewards

Personalized campaigns

Faster delivery for high-value customers

4Ô∏è‚É£ Marketplace Nature

Since this is Olist, a marketplace:

Customers may not return because:

They buy from different sellers

No strong brand loyalty

Competing platforms available

In [111]:
rfm.head()

Unnamed: 0,customer_unique_id,recency,frequency,monetary,R_score,F_score,M_score,RFM_score,segment
0,0000366f3b9a7992bf8c76cfdf3221e2,112,1,141.9,4,1,4,414,New Customers
1,0000b849f77a49e4a4ce2b2a4ca5be3f,115,1,27.19,4,1,1,411,New Customers
2,0000f46a3911fa3c0805444483337064,537,1,86.22,1,1,2,112,Others
3,0000f6ccb0745a6a4b88665a16c9f078,321,1,43.62,2,1,1,211,Others
4,0004aac84e0df4da2b147fca70cf8255,288,1,196.89,2,1,4,214,Others


In [113]:
final_df = master.merge(
    rfm[['customer_unique_id','recency','frequency','monetary','segment']],
    on='customer_unique_id',
    how='left'
)

NameError: name 'master' is not defined