Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
210 lines (157 sloc) 7.58 KB
# __________________________________________________________________
# //////////////////////////////////////////////////////////////////
#
# Author - Anupama Rajaram
# Program Description - Marketing related functions, including
# a. Compute RFM (recency, frequency and monetary value)
# b. Dendograms & Statistical/ hierarchical segmentation
# c. Managerial (customer) segmentation into categories
# like inactive, warm, cold etc. based on recency
# __________________________________________________________________
# //////////////////////////////////////////////////////////////////
# To clean up the memory of your current R session run the following line
rm(list=ls(all=TRUE))
#===================================================================#
#=========== Section 1: Import data and clean/re-arrange============#
# Load text file into local variable called 'data'
# this dataset had 51243 rows and 3 columns
data = read.delim(file = 'purchases.txt', header = FALSE,
sep = '\t', dec = '.')
# Rename the column headers from generic V1, V2,.. to meaningful titles
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
# this commnad
# interpret the last column as a date, and extract year of purchase
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
# the as.Date specifies date in a particular format
# now we can calculate individual components (date, month or year)
# to be used at a later time.
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
data$days_since = as.numeric(difftime(time1 = "2016-01-01",
time2 = data$date_of_purchase,
units = "days"))
# Display the data after transformation
head(data)
summary(data)
#===================================================================#
#=========== Section 2: Compute RFM, with SQL help==================#
# Compute key marketing indicators - recency, frequency and monetary
# value, using SQL queries within R
# please make sure the slqdf library is installed already
library(sqldf)
# Compute recency, frequency, and average purchase amount
customers = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data GROUP BY 1")
# Note, new dataset has 18417 rows and 4 columns, since some users
# purchased products more than once
# Explore the data - optional
head(customers)
summary(customers)
# Graphical visualization of RFM
hist(customers$recency)
hist(customers$frequency)
hist(customers$amount)
hist(customers$amount, breaks = 100)
#===================================================================#
#=========== Section 3: Statistical segmentation ===================#
# Preparing the data
# First, Copy customer data into new data frame
new_data = customers
# Remove customer id as a variable, store it as row names
head(new_data)
row.names(new_data) = new_data$customer_id
new_data$customer_id = NULL
head(new_data)
# Take the log-transform of the amount, and plot
new_data$amount = log(new_data$amount)
hist(new_data$amount)
# Standardize variables
new_data = scale(new_data)
head(new_data)
# Second, run statistical segmentation
# most laptops do not have enough memory to compute distance metrics
# for the entire dataset.
# to check try the command below (commented here)
# d = dist(new_data)
# If your computer throws an error message use the 30% sample below
# If not, replace "by = 3" with "by = 1"
# We take a 30% sample - hence dividing the original set by 3
# For 10% sample, divide by 10
# For 50% sample, divide by 2, and so on.
sample = seq(1, 18417, by = 3)
head(sample)
customers_sample = customers[sample, ]
new_data_sample = new_data[sample, ]
# Compute distance metrics on standardized data
d = dist(new_data_sample)
# Perform hierarchical clustering on distance metrics
c = hclust(d, method="ward.D2")
# Plot dendogram
plot(c)
# Divide customers into 15 segments
members = cutree(c, k = 15)
# Show 100 first customers and their segment number
# this is an exploratory step, hence optional
members[1:100]
table(members)
# Show average/ aggregrated profile (average revenue) of each segment
aggregate(customers_sample[, 2:4], by = list(members), mean)
#===================================================================#
#=========== Section 4: Managerial/ customer segmentation ==========#
# statistical segmentations are not viable for businesses since they
# become obsolete very quickly, with changes in data
# Managerial segmentation is a much better alternative.
# First copy customers data
customers_2015 = customers
# ------------Simple 2-segment solution based on recency alone------------#
# adding a column called segment.
# if customers bought products >3 years, then mark as inactive.
customers_2015$segment = ifelse(test = customers_2015$recency > 365*3,
yes = "inactive", no = "NA")
# exploratory step (optional)
table(customers_2015$segment)
# note, table() uses cross-classifying factors to build a contingency table
# of the counts at each combination of factor levels.
# In this case, the classification factor = segment => inactive or NA
# ------------ Alternative 6-segment solution based on ----------#
# ------------ recency & monetary value -------------------------#
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 &
customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 &
customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" &
customers_2015$amount < 100)] =
"warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" &
customers_2015$amount >= 100)] =
"warm high value"
customers_2015$segment[which(customers_2015$segment == "active"
& customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" &
customers_2015$amount < 100)] =
"active low value"
customers_2015$segment[which(customers_2015$segment == "active" &
customers_2015$amount >= 100)] =
"active high value"
# Re-order factor in a way that makes sense
customers_2015$segment = factor(x = customers_2015$segment,
levels = c("inactive", "cold",
"warm high value",
"warm low value",
"new warm", "active high value",
"active low value", "new active"
))
# exploring these new segments
table(customers_2015$segment)
aggregate(x = customers_2015[, 2:4], by = list(customers_2015$segment), mean)
# using a colored pie chart to view the segments
pie(table(customers_2015$segment), col = rainbow(24))
aggregate(x = customers_2015[, 2:4], by = list(customers_2015$segment),
mean)