# Marketing campaign optimization/ predicting coupon redemption

American Express and Analytics Vidhya hosted a hackathon to predict coupon redemption. My full solution with a 10 CV mean AUC of 0.99+ is posted at https://github.com/balawillgetyou/dy/blob/master/AmexAV20191006Annotated.ipynb

The notebook below uses this example to show how to train a model &gt; create a docker container &gt; deploy as a webservice on Azure &gt; make predict calls to the the webservice

# Train

## Access configured AzureML workspace 

In [None]:
import azureml.core
from azureml.core import Experiment, Workspace

# Check core SDK version number
print("This notebook was created using version 1.0.2 of the Azure ML SDK")
print("You are currently using version", azureml.core.VERSION, "of the Azure ML SDK")
print("")


ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep='\n')

## Data loading and pre-processing 

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1) #to prevent cell display truncation
from collections import Counter

import joblib #to serialize/ de-serialize trained model and save/ load
import json #json of data needed to test webservice
import requests #to make calls to the webservice

#EDA, model performance and feature importance visualization
import matplotlib.pyplot as plt
import seaborn as sns

#missing values
from sklearn.impute import SimpleImputer
#from fancyimpute import KNN

#data final prep, classifier and performance checks
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.metrics.scorer import make_scorer

In [3]:
temp_1 = open("train.csv", 'r', encoding='latin-1') 
train = pd.read_csv(temp_1)
train.drop(['id'], axis=1, inplace=True)
train = train.apply(pd.Categorical)

temp_1 = open("customer_demographics.csv", 'r', encoding='latin-1') 
customer_demographics = pd.read_csv(temp_1)

temp_1 = open("customer_transaction_data.csv", 'r', encoding='latin-1') 
customer_transaction_data = pd.read_csv(temp_1)

temp_1 = open("campaign_data.csv", 'r', encoding='latin-1') 
campaign_data = pd.read_csv(temp_1)

temp_1 = open("coupon_item_mapping.csv", 'r', encoding='latin-1') 
coupon_item_mapping = pd.read_csv(temp_1)

temp_1 = open("item_data.csv", 'r', encoding='latin-1') 
item_data = pd.read_csv(temp_1)

In [4]:
campaign_data.start_date = pd.to_datetime(campaign_data.start_date) 
campaign_data.end_date = pd.to_datetime(campaign_data.end_date) 
campaign_data['couponValidityDays'] = (campaign_data.end_date - campaign_data.start_date)/ np.timedelta64(1, 'D')
campaign_data = campaign_data[campaign_data.couponValidityDays > 0 ]
print(campaign_data.describe())
print(campaign_data.shape)
print(campaign_data.head(23))

       campaign_id  couponValidityDays
count  23.000000    23.000000         
mean   16.000000    63.652174         
std    9.332251     41.477934         
min    1.000000     31.000000         
25%    8.500000     32.000000         
50%    17.000000    56.000000         
75%    23.500000    76.000000         
max    30.000000    182.000000        
(23, 5)
    campaign_id campaign_type start_date   end_date  couponValidityDays
0   24           Y            2013-10-21 2013-12-20  60.0              
1   25           Y            2013-10-21 2013-11-22  32.0              
2   20           Y            2013-07-09 2013-11-16  130.0             
3   23           Y            2013-08-10 2013-11-15  97.0              
4   21           Y            2013-09-16 2013-10-18  32.0              
5   22           X            2013-09-16 2013-10-18  32.0              
7   19           Y            2013-08-26 2013-09-27  32.0              
8   17           Y            2013-07-29 2013-08-30  32.0        

In [5]:
train.campaign_id = train.campaign_id.astype(str) 
campaign_data.campaign_id = campaign_data.campaign_id.astype(str) 

In [6]:
trainCampaign = pd.merge(train, campaign_data, on=['campaign_id'])
print(trainCampaign.isnull().sum())
print(trainCampaign.dtypes)
print(trainCampaign.shape)
print(trainCampaign.head())

campaign_id           0
coupon_id             0
customer_id           0
redemption_status     0
campaign_type         0
start_date            0
end_date              0
couponValidityDays    0
dtype: int64
campaign_id           object        
coupon_id             category      
customer_id           category      
redemption_status     category      
campaign_type         object        
start_date            datetime64[ns]
end_date              datetime64[ns]
couponValidityDays    float64       
dtype: object
(47815, 8)
  campaign_id coupon_id customer_id redemption_status campaign_type  \
0  9           635       205         0                 Y              
1  9           444       590         0                 Y              
2  9           575       108         0                 Y              
3  9           705       712         0                 Y              
4  9           19        608         0                 Y              

  start_date   end_date  couponValidityDays  
0

In [7]:
trainCampaign.coupon_id = train.coupon_id.astype(str) 
coupon_item_mapping.coupon_id = coupon_item_mapping.coupon_id.astype(str) 

In [8]:
trainCampaignCoupon = pd.merge(trainCampaign, coupon_item_mapping, on=['coupon_id'])
trainCampaignCoupon.sort_values('start_date')
print(trainCampaignCoupon.isnull().sum())
print(trainCampaignCoupon.dtypes)
print(trainCampaignCoupon.shape)
print(trainCampaignCoupon.head())

campaign_id           0
coupon_id             0
customer_id           0
redemption_status     0
campaign_type         0
start_date            0
end_date              0
couponValidityDays    0
item_id               0
dtype: int64
campaign_id           object        
coupon_id             object        
customer_id           category      
redemption_status     category      
campaign_type         object        
start_date            datetime64[ns]
end_date              datetime64[ns]
couponValidityDays    float64       
item_id               int64         
dtype: object
(4072163, 9)
  campaign_id coupon_id customer_id redemption_status campaign_type  \
0  9           27        205         0                 Y              
1  9           27        205         0                 Y              
2  9           27        205         0                 Y              
3  9           27        205         0                 Y              
4  9           27        205         0                 Y

In [9]:
trainCampaignCoupon.customer_id = trainCampaignCoupon.customer_id.astype(str) 
trainCampaignCoupon.item_id = trainCampaignCoupon.item_id.astype(str) 
#trainCampaignCoupon.start_date = pd.datetime(trainCampaignCoupon.start_date)
customer_transaction_data.customer_id = customer_transaction_data.customer_id.astype(str) 
customer_transaction_data.item_id = customer_transaction_data.item_id.astype(str) 
customer_transaction_data["date"] = pd.to_datetime(customer_transaction_data["date"])

In [10]:
trainCampaignCouponTrans = pd.merge(customer_transaction_data, trainCampaignCoupon, 'inner', on =['customer_id', 'item_id'] )

In [11]:
print(trainCampaignCouponTrans.shape)
print(trainCampaignCouponTrans.isnull().sum())
print(trainCampaignCouponTrans.dtypes)
display(trainCampaignCouponTrans.head())

(80291, 14)
date                  0
customer_id           0
item_id               0
quantity              0
selling_price         0
other_discount        0
coupon_discount       0
campaign_id           0
coupon_id             0
redemption_status     0
campaign_type         0
start_date            0
end_date              0
couponValidityDays    0
dtype: int64
date                  datetime64[ns]
customer_id           object        
item_id               object        
quantity              int64         
selling_price         float64       
other_discount        float64       
coupon_discount       float64       
campaign_id           object        
coupon_id             object        
redemption_status     category      
campaign_type         object        
start_date            datetime64[ns]
end_date              datetime64[ns]
couponValidityDays    float64       
dtype: object


Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,campaign_id,coupon_id,redemption_status,campaign_type,start_date,end_date,couponValidityDays
0,2012-01-02,679,19522,2,156.02,0.0,0.0,30,23,0,X,2012-11-19,2013-04-01,133.0
1,2012-07-18,679,19522,1,78.01,0.0,0.0,30,23,0,X,2012-11-19,2013-04-01,133.0
2,2013-02-01,679,19522,1,78.01,0.0,0.0,30,23,0,X,2012-11-19,2013-04-01,133.0
3,2012-01-02,679,31235,1,177.74,-35.62,0.0,30,23,0,X,2012-11-19,2013-04-01,133.0
4,2012-03-16,679,31235,1,224.05,0.0,0.0,30,23,0,X,2012-11-19,2013-04-01,133.0


In [12]:
le_age_range = LabelEncoder()
customer_demographics['age_range'] = le_age_range.fit_transform(customer_demographics['age_range'].astype(str))

le_marital_status = LabelEncoder()
customer_demographics['marital_status'] = le_marital_status.fit_transform(customer_demographics['marital_status'].astype(str))

le_family_size = LabelEncoder()
customer_demographics['family_size'] = le_family_size.fit_transform(customer_demographics['family_size'].astype(str))

le_no_of_children = LabelEncoder()
customer_demographics['no_of_children'] = le_no_of_children.fit_transform(customer_demographics['no_of_children'].astype(str))


In [13]:
customer_demographicsSimImp = customer_demographics.apply(pd.Categorical)
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
customer_demographicsSimImp = pd.DataFrame(imp.fit_transform(customer_demographicsSimImp), columns=customer_demographicsSimImp.columns, index=customer_demographicsSimImp.index)
print(customer_demographicsSimImp.describe())
print(customer_demographicsSimImp.isnull().sum())
customer_demographicsSimImp.head()

customer_demographics_knn = customer_demographicsSimImp#while KNN has not been used for imputation, the object name is reused here for ease of reuse

        customer_id  age_range  marital_status  rented  family_size  \
count   760          760        760             760     760           
unique  760          6          3               2       5             
top     1023         3          2               0       1             
freq    1            271        329             719     303           

        no_of_children  income_bracket  
count   760             760             
unique  4               12              
top     3               5               
freq    538             187             
customer_id       0
age_range         0
marital_status    0
rented            0
family_size       0
no_of_children    0
income_bracket    0
dtype: int64


In [14]:
customer_demographics_knnMod = customer_demographics_knn.apply(pd.Categorical)
customer_demographics_knnMod.customer_id  = customer_demographics_knnMod.customer_id.astype(int)
customer_demographics_knnMod.dtypes
customer_demographics_knnMod.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,5,0,0,1,3,4
1,6,3,0,0,1,3,5
2,7,1,2,0,2,0,3
3,8,1,2,0,3,1,6
4,10,3,1,0,0,3,5


In [15]:
customer_demographics_knnMod.customer_id = customer_demographics_knnMod.customer_id.astype(str) 
trainCampaignCouponTrans.customer_id = trainCampaignCouponTrans.customer_id.astype(str) 
train.customer_id = customer_demographics_knnMod.customer_id.astype(str) 

In [16]:
trainCampaignCouponTransCust = pd.merge(trainCampaignCouponTrans, customer_demographics_knnMod,  'inner', on =['customer_id'] )

In [17]:
print(trainCampaignCouponTransCust.shape)
print(trainCampaignCouponTransCust.isnull().sum())
print(trainCampaignCouponTransCust.dtypes)
display(trainCampaignCouponTransCust.head())

(52319, 20)
date                  0
customer_id           0
item_id               0
quantity              0
selling_price         0
other_discount        0
coupon_discount       0
campaign_id           0
coupon_id             0
redemption_status     0
campaign_type         0
start_date            0
end_date              0
couponValidityDays    0
age_range             0
marital_status        0
rented                0
family_size           0
no_of_children        0
income_bracket        0
dtype: int64
date                  datetime64[ns]
customer_id           object        
item_id               object        
quantity              int64         
selling_price         float64       
other_discount        float64       
coupon_discount       float64       
campaign_id           object        
coupon_id             object        
redemption_status     category      
campaign_type         object        
start_date            datetime64[ns]
end_date              datetime64[ns]
couponValidity

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,campaign_id,coupon_id,redemption_status,campaign_type,start_date,end_date,couponValidityDays,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,2012-01-02,679,19522,2,156.02,0.0,0.0,30,23,0,X,2012-11-19,2013-04-01,133.0,2,1,1,1,3,4
1,2012-07-18,679,19522,1,78.01,0.0,0.0,30,23,0,X,2012-11-19,2013-04-01,133.0,2,1,1,1,3,4
2,2013-02-01,679,19522,1,78.01,0.0,0.0,30,23,0,X,2012-11-19,2013-04-01,133.0,2,1,1,1,3,4
3,2012-01-02,679,31235,1,177.74,-35.62,0.0,30,23,0,X,2012-11-19,2013-04-01,133.0,2,1,1,1,3,4
4,2012-03-16,679,31235,1,224.05,0.0,0.0,30,23,0,X,2012-11-19,2013-04-01,133.0,2,1,1,1,3,4


In [18]:
trainCampaignCouponTransCust.item_id = trainCampaignCouponTransCust.item_id.astype(str) 
item_data.item_id = item_data.item_id.astype(str) 
trainCampaignCouponTransCustItem = pd.merge(trainCampaignCouponTransCust, item_data, 'left', on =['item_id'] )
print(trainCampaignCouponTransCustItem.shape)
print(trainCampaignCouponTransCustItem.isnull().sum())
print(trainCampaignCouponTransCustItem.dtypes)
display(trainCampaignCouponTransCustItem.head())

(52319, 23)
date                  0
customer_id           0
item_id               0
quantity              0
selling_price         0
other_discount        0
coupon_discount       0
campaign_id           0
coupon_id             0
redemption_status     0
campaign_type         0
start_date            0
end_date              0
couponValidityDays    0
age_range             0
marital_status        0
rented                0
family_size           0
no_of_children        0
income_bracket        0
brand                 0
brand_type            0
category              0
dtype: int64
date                  datetime64[ns]
customer_id           object        
item_id               object        
quantity              int64         
selling_price         float64       
other_discount        float64       
coupon_discount       float64       
campaign_id           object        
coupon_id             object        
redemption_status     category      
campaign_type         object        
start_date      

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,campaign_id,coupon_id,redemption_status,...,couponValidityDays,age_range,marital_status,rented,family_size,no_of_children,income_bracket,brand,brand_type,category
0,2012-01-02,679,19522,2,156.02,0.0,0.0,30,23,0,...,133.0,2,1,1,1,3,4,686,Established,Pharmaceutical
1,2012-07-18,679,19522,1,78.01,0.0,0.0,30,23,0,...,133.0,2,1,1,1,3,4,686,Established,Pharmaceutical
2,2013-02-01,679,19522,1,78.01,0.0,0.0,30,23,0,...,133.0,2,1,1,1,3,4,686,Established,Pharmaceutical
3,2012-01-02,679,31235,1,177.74,-35.62,0.0,30,23,0,...,133.0,2,1,1,1,3,4,57,Established,Pharmaceutical
4,2012-03-16,679,31235,1,224.05,0.0,0.0,30,23,0,...,133.0,2,1,1,1,3,4,57,Established,Pharmaceutical


In [19]:
trainCampaignCouponTransCustItem.campaign_id = trainCampaignCouponTransCustItem.campaign_id.astype('category') 
trainCampaignCouponTransCustItem.coupon_id = trainCampaignCouponTransCustItem.coupon_id.astype('category') 
trainCampaignCouponTransCustItem.brand = trainCampaignCouponTransCustItem.brand.astype('category') 

In [20]:
le_campaign_type = LabelEncoder()
trainCampaignCouponTransCustItem['campaign_type'] = le_campaign_type.fit_transform(trainCampaignCouponTransCustItem['campaign_type'].astype(str))
trainCampaignCouponTransCustItem.campaign_type = trainCampaignCouponTransCustItem.campaign_type.astype('category') 

le_brand_type = LabelEncoder()
trainCampaignCouponTransCustItem['brand_type'] = le_brand_type.fit_transform(trainCampaignCouponTransCustItem['brand_type'].astype(str))
trainCampaignCouponTransCustItem.brand_type = trainCampaignCouponTransCustItem.brand_type.astype('category') 

le_category = LabelEncoder()
trainCampaignCouponTransCustItem['category'] = le_category.fit_transform(trainCampaignCouponTransCustItem['category'].astype(str))
trainCampaignCouponTransCustItem.category = trainCampaignCouponTransCustItem.category.astype('category') 

In [21]:
trainX = trainCampaignCouponTransCustItem.drop(['redemption_status', 'date', 'start_date', 'end_date'], axis=1)
trainY = trainCampaignCouponTransCustItem.redemption_status

X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size=0.15, random_state=66)

In [22]:
#y_train.values.reshape(-1,)[0:9]
#display(y_train.head())
#display(X_train.head())
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(44471, 19)
(44471,)
(7848, 19)
(7848,)


##  Model training

### Single model training 

In [24]:
# Get an experiment object from Azure Machine Learning
experiment = Experiment(workspace=ws, name="AmexMarketing3")

# Create a run object in the experiment
run =  experiment.start_logging()

numTrees = 100

#train classifier
rfc = RandomForestClassifier(n_estimators=numTrees, oob_score=True)
rfc.fit(X_train,y_train.values.ravel())

# Coupon redemption prediction
y_predict = rfc.predict(X_test)

# Probability of coupon redemption  produced by the model
y_prob = [probs[1] for probs in rfc.predict_proba(X_test)]

# Confusion matrix test set
pd.DataFrame(
            confusion_matrix(y_test, y_predict),
            columns=['Predicted Not', 'Predicted Redeemed Coupon'],
            index=['Predicted Not', 'Predicted Redeemed Coupon']
        )

# Compute area under the curve, print to notebook and add to the run
fpr, tpr, _ = roc_curve(y_test.values.ravel(), y_prob)
roc_auc = auc(fpr, tpr)
        
# log trees, AUC in run history
run.log(name="tree", value=numTrees)
run.log(name="AUC", value=roc_auc)
        
# Save the model to the outputs directory for capture
joblib.dump(value=rfc, filename='AmexMarketingModel.pkl')

['AmexMarketingModel.pkl']

## Viewing run results 

In [25]:
run

Experiment,Id,Type,Status,Details Page,Docs Page
AmexMarketing3,ff61a698-9212-4780-ae6e-aae3d2b7941d,,Running,Link to Azure Portal,Link to Documentation


# Deploy

## Register a model

In [30]:
from azureml.core.model import Model
# Register the model with the workspace
ws = Workspace.from_config()
model = Model.register(workspace = ws, model_name='AmexMarketingbest_model', model_path='AmexMarketingModel.pkl')

Registering model AmexMarketingbest_model


In [31]:
# Find all models called xxxx and display their version numbers
models = Model.list(ws, name='AmexMarketingbest_model')
for m in models:
    print(m.name, m.version)

AmexMarketingbest_model 1


## Scoring file: load a model using init() and then apply the model to new data using run()

In [32]:
%%writefile score.py
import pickle
import json
import numpy as np
from sklearn.externals import joblib
from azureml.core.model import Model

def init():
    global model
    # this call should return the path to the serialized model
    model_path = Model.get_model_path(model_name="AmexMarketingbest_model")
    # deserialize the model file back into a sklearn model
    model = joblib.load(model_path)
    
# note you can pass in multiple rows for scoring
def run(raw_data):
    try:
        data = json.loads(raw_data)['data']
        data = np.array(data)
        result = model.predict(data)

        # you can return any data type as long as it is JSON-serializable
        return result.tolist()
    except Exception as e:
        result = str(e)
        return result

Overwriting score.py


## myenv.yml: Instructions to contstruct a docker image 

In [33]:
from azureml.core.conda_dependencies import CondaDependencies 
from azureml.core.image import ContainerImage

# Create an empty conda environment and add the scikit-learn package
env = CondaDependencies()
env.add_conda_package("scikit-learn")

# Display the environment
print(env.serialize_to_string())

# Write the environment to disk
with open("myenv.yml","w") as f:
    f.write(env.serialize_to_string())

# Create a configuration object indicating how our deployment container needs to be created
image_config = ContainerImage.image_configuration(execution_script="score.py", 
                                    runtime="python", 
                                    conda_file="myenv.yml")

# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
    # Required packages for AzureML execution, history, and data preparation.
  - azureml-defaults

- scikit-learn
channels:
- conda-forge



## The target compute¶

In [34]:
from azureml.core.webservice import AciWebservice

aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, 
                                               memory_gb=1, 
                                               tags={'name': 'Amex Marketing'}, 
                                               description='Amex Marketing coupon redemption prediction.')

## Deploy the webservice 

In [35]:
%%time
from azureml.core.webservice import Webservice

# Create the webservice using all of the precreated configurations and our best model
service = Webservice.deploy_from_model(name='my-amex-v2',
                                       deployment_config=aciconfig,
                                       models=[model],
                                       image_config=image_config,
                                       workspace=ws)

# Wait for the service deployment to complete while displaying log output
service.wait_for_deployment(show_output=True)

Creating image
Running...............................
Succeeded
Image creation operation finished for image my-amex-v2:2, operation "Succeeded"
Creating service
Running.............
SucceededACI service creation operation finished, operation "Succeeded"
CPU times: user 952 ms, sys: 532 ms, total: 1.48 s
Wall time: 4min 8s


## Test the webservice 

### Access the HTTP endpoint of the service and call it using standard POST operations. 

In [41]:
#subset of records from the minority class y=1
indices1 = y_test.index
display(X_test.loc[X_test.index.isin(indices1)].head())
testSampleY1 = X_test.loc[X_test.index == 41950]
testSampleY1Json = json.dumps({"data": testSampleY1.as_matrix().tolist()})
display(testSampleY1Json)#view sample of minority class y=1 as JSON
#view sample of majority class y=0
testSampleY0 = X_test.iloc[0:1,]
testSampleY0Json = json.dumps({"data": testSampleY0.as_matrix().tolist()})
display(testSampleY0Json)

Unnamed: 0,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,campaign_id,coupon_id,campaign_type,couponValidityDays,age_range,marital_status,rented,family_size,no_of_children,income_bracket,brand,brand_type,category
11451,1367,22850,1,44.52,-18.88,0.0,30,21,0,133.0,2,2,0,0,3,5,56,1,4
3512,1192,46016,2,27.78,0.0,0.0,30,21,0,133.0,5,2,0,0,3,2,56,1,4
24579,1490,13440,1,195.55,-42.74,0.0,10,22,1,62.0,1,1,0,0,3,5,1558,0,4
36382,1203,27942,1,113.63,-39.18,0.0,30,705,0,133.0,2,1,0,0,3,3,714,0,4
34401,952,4535,1,71.24,-13.89,0.0,8,438,0,77.0,3,0,0,3,1,5,132,0,4




'{"data": [["8", "30304", 1, 95.82, 0.0, 0.0, "8", "6", 0, 77.0, 1, 2, 0, 3, 1, 6, 119, 0, 4]]}'



'{"data": [["1367", "22850", 1, 44.52, -18.88, 0.0, "30", "21", 0, 133.0, 2, 2, 0, 0, 3, 5, 56, 1, 4]]}'

In [42]:
# create the required header
headers = {'Content-Type':'application/json'}

service = ws.webservices['my-amex-v2']

# post the request to the service and display the result for a minority class sample
resp = requests.post(service.scoring_uri, testSampleY1Json, headers = headers)
print(resp.text)

# post the request to the service and display the result for a majority class sample
resp = requests.post(service.scoring_uri, testSampleY0Json, headers = headers)
print(resp.text)

[1]
[0]
