### **Import Dependencies**

In [14]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.colors
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
pio.templates.default = 'plotly_white'

### **Data Loading**

In [2]:
df = pd.read_csv('./data/rfm_data.csv')
print(df.head())

   CustomerID PurchaseDate  TransactionAmount ProductInformation  OrderID  \
0        8814   2023-04-11             943.31          Product C   890075   
1        2188   2023-04-11             463.70          Product A   176819   
2        4608   2023-04-11              80.28          Product A   340062   
3        2559   2023-04-11             221.29          Product A   239145   
4        9482   2023-04-11             739.56          Product A   194545   

   Location  
0     Tokyo  
1    London  
2  New York  
3    London  
4     Paris  


### **Data Preprocessing**

In [3]:
print(df.isna().sum())

CustomerID            0
PurchaseDate          0
TransactionAmount     0
ProductInformation    0
OrderID               0
Location              0
dtype: int64


`Calculatig RMF Values`

In [4]:
# Convert 'PurchaseDate to Datetime format
df['PurchaseDate'] = pd.to_datetime(df['PurchaseDate']).dt.normalize()

# Calculate Recency 
today_date = datetime.now().date()
df['Recency'] = (pd.to_datetime(today_date) - df['PurchaseDate']).dt.days

# Calculate Frequency
freq = df.groupby('CustomerID')['OrderID'].count().reset_index()
freq.rename(columns={'OrderID': 'Frequency'}, inplace=True)

# Calculate Monetary Value
money = df.groupby('CustomerID')['TransactionAmount'].sum().reset_index()
money.rename(columns={'TransactionAmount': 'MonetaryValue'}, inplace=True)

# Merge frequency and monetary value while removing duplicates
df = df.merge(freq, on='CustomerID', how='left')
df = df.merge(money, on='CustomerID', how='left')

print(df.head())

   CustomerID PurchaseDate  TransactionAmount ProductInformation  OrderID  \
0        8814   2023-04-11             943.31          Product C   890075   
1        2188   2023-04-11             463.70          Product A   176819   
2        4608   2023-04-11              80.28          Product A   340062   
3        2559   2023-04-11             221.29          Product A   239145   
4        9482   2023-04-11             739.56          Product A   194545   

   Location  Recency  Frequency  MonetaryValue  
0     Tokyo      563          1         943.31  
1    London      563          1         463.70  
2  New York      563          1          80.28  
3    London      563          1         221.29  
4     Paris      563          1         739.56  


`Calculating RFM Scores`

In [5]:
# Define scoring criteria for each RFM value
recency_scores = [5, 4, 3, 2, 1]     # Higher score for lower recency (more recent)
frequency_scores = [1, 2, 3, 4, 5]
monetary_scores = [1, 2, 3, 4, 5]

# Calculate RFM scores
df['RecencyScore'] = pd.cut(df['Recency'], bins=5, labels=recency_scores)
df['FrequencyScore'] = pd.cut(df['Frequency'], bins=5, labels=frequency_scores)
df['MoneytaryScore'] = pd.cut(df['MonetaryValue'], bins=5, labels=monetary_scores)

# Convert RFM scores to numeric type
df['RecencyScore'] = df['RecencyScore'].astype(int)
df['FrequencyScore'] = df['FrequencyScore'].astype(int)
df['MoneytaryScore'] = df['MoneytaryScore'].astype(int)

print(df.head())

   CustomerID PurchaseDate  TransactionAmount ProductInformation  OrderID  \
0        8814   2023-04-11             943.31          Product C   890075   
1        2188   2023-04-11             463.70          Product A   176819   
2        4608   2023-04-11              80.28          Product A   340062   
3        2559   2023-04-11             221.29          Product A   239145   
4        9482   2023-04-11             739.56          Product A   194545   

   Location  Recency  Frequency  MonetaryValue  RecencyScore  FrequencyScore  \
0     Tokyo      563          1         943.31             1               1   
1    London      563          1         463.70             1               1   
2  New York      563          1          80.28             1               1   
3    London      563          1         221.29             1               1   
4     Paris      563          1         739.56             1               1   

   MoneytaryScore  
0               2  
1               

`RFM Value Segmentation`

In [6]:
# Calculate RMF Score by combining the individual scores
df['RMF_Score'] = df['RecencyScore'] + df['FrequencyScore'] + df['MoneytaryScore']

# Create RMF segments based on the RMF Scores
segment_labels = ['Low-Value', 'Mid-Value', 'High-Value']
df['Value_Segment'] = pd.qcut(df['RMF_Score'], q=3, labels=segment_labels)

print(df.head())

   CustomerID PurchaseDate  TransactionAmount ProductInformation  OrderID  \
0        8814   2023-04-11             943.31          Product C   890075   
1        2188   2023-04-11             463.70          Product A   176819   
2        4608   2023-04-11              80.28          Product A   340062   
3        2559   2023-04-11             221.29          Product A   239145   
4        9482   2023-04-11             739.56          Product A   194545   

   Location  Recency  Frequency  MonetaryValue  RecencyScore  FrequencyScore  \
0     Tokyo      563          1         943.31             1               1   
1    London      563          1         463.70             1               1   
2  New York      563          1          80.28             1               1   
3    London      563          1         221.29             1               1   
4     Paris      563          1         739.56             1               1   

   MoneytaryScore  RMF_Score Value_Segment  
0          

### **Data Visualization**

In [7]:
# RMF Segment Distribution
segment_counts = df['Value_Segment'].value_counts().reset_index()
segment_counts.columns = ['Value_Segment', 'Count']

pastel_colors = px.colors.qualitative.Pastel

# Create the Bar Chart
fig_segment_dist = px.bar(
    segment_counts, x='Value_Segment', y='Count',
    color='Value_Segment', color_discrete_sequence=pastel_colors,
    title='RMF Value Distrbution'
)
# Update the layout
fig_segment_dist.update_layout(
    xaxis_title='RMF Value Segment',
    yaxis_title='Count',
    showlegend=False
)
# Show the figure
fig_segment_dist.show()


These segments are determined by dividing RFM scores into distinct ranges or groups, allowing for a more granular analysis of overall customer RFM characteristics. The RFM value segment helps us understand the relative value of customers in terms of recency, frequency, and monetary aspects.

`RMF Analysis In Broader Perspective`

In [8]:
df['RMF_Customer_Segments'] = ''

# Assign RMF segments based on the RMF Score
df.loc[df['RMF_Score'] >= 9, 'RMF_Customer_Segments'] = 'Champions'
df.loc[(df['RMF_Score'] >= 6) & (df['RMF_Score'] < 9), 'RMF_Customer_Segments'] = 'Potential Loyalists'
df.loc[(df['RMF_Score'] >= 5) & (df['RMF_Score'] < 6), 'RMF_Customer_Segments'] = 'At Risk Customers'
df.loc[(df['RMF_Score'] >= 4) & (df['RMF_Score'] < 5), 'RMF_Customer_Segments'] = "Can't Lose"
df.loc[(df['RMF_Score'] >= 3) & (df['RMF_Score'] < 4), 'RMF_Customer_Segments'] = "Lost"

# Print the updated df with RFM segments
print(df[['CustomerID', 'RMF_Customer_Segments']])

     CustomerID RMF_Customer_Segments
0          8814            Can't Lose
1          2188                  Lost
2          4608                  Lost
3          2559                  Lost
4          9482            Can't Lose
..          ...                   ...
995        2970   Potential Loyalists
996        6669   Potential Loyalists
997        8836   Potential Loyalists
998        1440   Potential Loyalists
999        4759   Potential Loyalists

[1000 rows x 2 columns]


In [9]:
print(df['Value_Segment'].value_counts())
print(df['RMF_Customer_Segments'].value_counts())


Value_Segment
Low-Value     435
Mid-Value     386
High-Value    179
Name: count, dtype: int64
RMF_Customer_Segments
Potential Loyalists    503
At Risk Customers      180
Can't Lose             173
Lost                    82
Champions               62
Name: count, dtype: int64


In [10]:
segment_product_counts = df.groupby(['Value_Segment', 'RMF_Customer_Segments']).size().reset_index(name='Count')
segment_product_counts = segment_product_counts.sort_values('Count', ascending=False)

fig_treemap = px.treemap(
    segment_product_counts,
    path=['Value_Segment', 'RMF_Customer_Segments'],
    values='Count',
    color='Value_Segment', color_discrete_sequence=pastel_colors,
    title='RFM Customer Segments by Value'
)
fig_treemap.show()

In [12]:
# Champions Segment
champ_segment = df[df['RMF_Customer_Segments'] == 'Champions']

fig = go.Figure()
fig.add_trace(go.Box(y=champ_segment['RecencyScore'], name='Recency'))
fig.add_trace(go.Box(y=champ_segment['FrequencyScore'], name='Frequency'))
fig.add_trace(go.Box(y=champ_segment['MoneytaryScore'], name='Monetary'))

fig.update_layout(
    title='Distribution of RFM Values within Champions Segment',
    yaxis_title='RFM Value',
    showlegend=True
)

fig.show()

`Correlation`

In [13]:
corrr_matrix = champ_segment[['RecencyScore', 'FrequencyScore', 'MoneytaryScore']].corr()

fig_heatmap = go.Figure(
    data=go.Heatmap(
        z=corrr_matrix.values,
        x=corrr_matrix.columns,
        y=corrr_matrix.columns,
        colorscale='RdBu',
        colorbar=dict(title='Correlation')
    )
)

fig_heatmap.update_layout(
    title='Correlation Matrix of RMF Values with Champions Segment'
)
fig_heatmap.show()

In [19]:
segment_counts = df['RMF_Customer_Segments'].value_counts()

# Bar Chart
fig = go.Figure(
    data=[
        go.Bar(x=segment_counts.index,
               y=segment_counts.values,
               marker=dict(color=pastel_colors))
    ]
)

# Highlight the Champions Segment
champ_color = 'rgb(34, 139, 34)'
fig.update_traces(
    marker_color=[
        champ_color if segment == 'Champions' else pastel_colors[i]
        for i, segment in enumerate(segment_counts.index)],
    marker_line_color='rgb(8, 48, 107)',
    marker_line_width=1.5, opacity=0.6
)

# Update the Layout
fig.update_layout(
    title='Comparison of RMF Segments',
    xaxis_title='RMF Segments',
    yaxis_title='Number of Customers',
    showlegend=False
)

fig.show()

In [25]:
# Calculate the average Recency, Frequency, and Monetary scores for each segment
segment_scores = df.groupby('RMF_Customer_Segments')[['RecencyScore', 'FrequencyScore', 'MoneytaryScore']].mean().reset_index()

# Create a grouped bar chart to compare segment scores
fig = go.Figure()

# Add bars for Recency score
fig.add_trace(go.Bar(
    x=segment_scores['RMF_Customer_Segments'],
    y=segment_scores['RecencyScore'],
    name='Recency Score',
    marker_color='rgb(98, 149, 132)'
))

# Add bars for Frequency score
fig.add_trace(go.Bar(
    x=segment_scores['RMF_Customer_Segments'],
    y=segment_scores['FrequencyScore'],
    name='Frequency Score',
    marker_color='rgb(56, 116, 120)'
))

# Add bars for Monetary score
fig.add_trace(go.Bar(
    x=segment_scores['RMF_Customer_Segments'],
    y=segment_scores['MoneytaryScore'],
    name='Monetary Score',
    marker_color='rgb(36, 54, 66)'
))

# Update the layout
fig.update_layout(
    title='Comparison of RFM Segments based on Recency, Frequency, and Monetary Scores',
    xaxis_title='RFM Segments',
    yaxis_title='Score',
    barmode='group',
    showlegend=True
)

fig.show()

### **Summary**
RFM Analysis is used to understand and segment customers based on their buying behaviour. RFM stands for recency, frequency, and monetary value, which are three key metrics that provide information about customer engagement, loyalty, and value to a business.