Conduct RFM (recency, frequency, monetary value) analysis on political donors in the UK

In [91]:
#imports
import pandas as pd
import datetime as dt 
import numpy as np

# DATA PROCESSING

In [92]:
#read data
df = pd.read_csv('data/Donations accepted by political parties.csv')
df.shape

  df = pd.read_csv('data/Donations accepted by political parties.csv')


(65278, 29)

In [93]:

#drop all nans from the columns that we will be using
df = df.dropna(subset=['Value', 'DonorId', 'ReceivedDate'])
df.shape

(62492, 29)

In [94]:
#change received data from string to datetime
df['ReceivedDate'] = pd.to_datetime(df['ReceivedDate'], format='%d/%m/%Y')
df['Value'] = df['Value'].replace('[\£,]', '', regex=True).astype(float)



# RFM ANALYSIS 

In [95]:
#Compute recency values
recency_df = df.groupby(by='DonorId', as_index=False)['ReceivedDate'].max()
recency_df.columns = ['DonorId', 'LastPurchaseDate']
recent_date = recency_df['LastPurchaseDate'].max()
recency_df['Recency'] = recency_df['LastPurchaseDate'].apply(
    lambda x: (recent_date - x).days)
recency_df.head

<bound method NDFrame.head of        DonorId LastPurchaseDate  Recency
0          1.0       2001-05-14     6685
1          2.0       2003-08-11     5866
2          3.0       2015-02-18     1657
3          4.0       2010-12-31     3167
4          5.0       2009-12-15     3548
...        ...              ...      ...
42183  87473.0       2019-08-05       28
42184  87478.0       2018-11-01      305
42185  87479.0       2009-12-29     3534
42186  87480.0       2017-03-13      903
42187  87482.0       2014-02-06     2034

[42188 rows x 3 columns]>

In [96]:
#Compute Frequency 
frequency_df = df.drop_duplicates().groupby(
    by=['DonorId'], as_index=False)['ReceivedDate'].count()
frequency_df.columns = ['DonorId', 'Frequency']
frequency_df.head

<bound method NDFrame.head of        DonorId  Frequency
0          1.0          2
1          2.0          2
2          3.0          3
3          4.0         10
4          5.0          3
...        ...        ...
42183  87473.0          1
42184  87478.0          2
42185  87479.0          1
42186  87480.0          1
42187  87482.0          1

[42188 rows x 2 columns]>

In [97]:
#Compute Monetary Value
monetary_df = df.groupby(by='DonorId', as_index=False)['Value'].sum()
monetary_df.columns = ['DonorId', 'Monetary']
monetary_df.head

<bound method NDFrame.head of        DonorId   Monetary
0          1.0   53000.00
1          2.0  118680.84
2          3.0   26625.00
3          4.0   59495.89
4          5.0    7700.00
...        ...        ...
42183  87473.0    4000.00
42184  87478.0    3529.41
42185  87479.0   11520.00
42186  87480.0  157350.07
42187  87482.0   50000.00

[42188 rows x 2 columns]>

In [98]:
#merge R, F & M values into one df
rfm = pd.merge(recency_df, frequency_df, on='DonorId')
rfm = pd.merge(rfm, monetary_df, on='DonorId')

In [127]:
# Define scoring criteria for each RFM value
recency_scores = [5, 4, 3, 2, 1]
frequency_scores = [1, 2, 3, 4, 5]
monetary_scores = [1, 2, 3, 4, 5]
recency_weight = 1
frequency_weight = 1
monetary_weight = 1

In [128]:
# Calculate RFM scores
rfm['RecencyScore'] = pd.cut(rfm['Recency'], bins=len(recency_scores), labels=recency_scores)
rfm['FrequencyScore'] = pd.cut(rfm['Frequency'], bins=len(frequency_scores), labels=frequency_scores)
rfm['MonetaryScore'] = pd.cut(rfm['Monetary'], bins=len(monetary_scores), labels=monetary_scores)
# Convert RFM scores to numeric type
rfm[['RecencyScore', 'FrequencyScore', 'MonetaryScore']] = rfm[['RecencyScore', 'FrequencyScore', 'MonetaryScore']].astype(int)

In [129]:
#Calculate RFM score
rfm['RFM_Score'] = recency_weight*rfm['RecencyScore'] + frequency_weight*rfm['FrequencyScore'] + monetary_weight*rfm['MonetaryScore']

#Segment donors based on score
segment_labels = ['Low-Value', 'Mid-Value', 'High-Value']
rfm['RFM_Segment'] = pd.qcut(rfm['RFM_Score'], q=3, labels=segment_labels)

In [131]:
# Create a new column for RFM Customer Segments
rfm['RFM_Customer_Segment'] = ''
# Assign RFM segments based on the RFM score
rfm.loc[rfm['RFM_Score'] >= 9, 'RFM_Customer_Segment'] = 'Champions'
rfm.loc[(rfm['RFM_Score'] >= 6) & (rfm['RFM_Score'] < 9), 'RFM_Customer_Segment'] = 'Potential Loyalists'
rfm.loc[(rfm['RFM_Score'] >= 5) & (rfm['RFM_Score'] < 6), 'RFM_Customer_Segment'] = 'At Risk Customers'
rfm.loc[(rfm['RFM_Score'] >= 4) & (rfm['RFM_Score'] < 5), 'RFM_Customer_Segment'] = "Can't Lose"
rfm.loc[(rfm['RFM_Score'] >= 3) & (rfm['RFM_Score'] < 4), 'RFM_Customer_Segment'] = "Lost"

In [132]:
rfm

Unnamed: 0,DonorId,LastPurchaseDate,Recency,Frequency,Monetary,RecencyScore,FrequencyScore,MonetaryScore,RFM_Score,RFM_Segment,RFM_Customer_Segment
0,1.0,2001-05-14,6685,2,53000.00,1,1,1,3,Low-Value,Lost
1,2.0,2003-08-11,5866,2,118680.84,1,1,1,3,Low-Value,Lost
2,3.0,2015-02-18,1657,3,26625.00,4,1,1,6,High-Value,Potential Loyalists
3,4.0,2010-12-31,3167,10,59495.89,3,1,1,5,Mid-Value,At Risk Customers
4,5.0,2009-12-15,3548,3,7700.00,3,1,1,5,Mid-Value,At Risk Customers
...,...,...,...,...,...,...,...,...,...,...,...
42183,87473.0,2019-08-05,28,1,4000.00,5,1,1,7,High-Value,Potential Loyalists
42184,87478.0,2018-11-01,305,2,3529.41,5,1,1,7,High-Value,Potential Loyalists
42185,87479.0,2009-12-29,3534,1,11520.00,3,1,1,5,Mid-Value,At Risk Customers
42186,87480.0,2017-03-13,903,1,157350.07,5,1,1,7,High-Value,Potential Loyalists
