In [1]:
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt

# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

In [2]:
portfolio['offer_id']=portfolio.index

map_offer = dict()
map_offer_type = dict()
for old_id, new_id, _type in zip(portfolio['id'],portfolio['offer_id'],portfolio['offer_type']):
    map_offer[old_id]=new_id
    map_offer_type[new_id]=_type

In [3]:
gender_non = profile['gender'].isnull().sum()
gender_o = profile['gender'].value_counts()['O']
gender_m = profile['gender'].value_counts()['M']
gender_f = profile['gender'].value_counts()['F']
print('#Male:{}, \n#Female:{}, \n#Other:{}, \n#Unknown:{}'.format(gender_m,gender_f,gender_o,gender_non))

#Fill the 'None' in gender column with string 'U', meaning gender not provided
#This is not necessarily the same group as 'O', assuming it means self gender identification as non-binary
profile['gender'].fillna('U', inplace=True)

#Male:8484, 
#Female:6129, 
#Other:212, 
#Unknown:2175


In [4]:
def age_group(x):
    if x<20:
        grp = '<20'
    elif (x>=20) & (x<25):
        grp = '20-25'
    elif (x>=25) & (x<30):
        grp = '25-30'
    elif (x>=30) & (x<35):
        grp = '30-35'
    elif (x>=35) & (x<40):
        grp = '35-40'        
    elif (x>=40) & (x<45):
        grp = '40-45'
    elif (x>=45) & (x<50):
        grp = '45-50'        
    elif (x>=50) & (x<55):
        grp = '50-55'
    elif (x>=55) & (x<60):
        grp = '55-60'    
    elif (x>=60) & (x<65):
        grp = '60-65'
    elif (x>=65) & (x<70):
        grp = '65-70'
    elif (x>=70) & (x<80):
        grp = '70-80'
    elif (x>=80) & (x<=101):
        grp = '80-101'
    else:
        grp = '>101'
    return grp

profile['age_group'] = profile['age'].apply(lambda x: age_group(x))

In [5]:
#define membership group

def membership_group(x):
    if x<20140101:
        grp = 'before_2014'
    elif (x>=20140101) & (x<20150101):
        grp = 'since_2014'
    elif (x>=20150101) & (x<20160101):
        grp = 'since_2015'
    elif (x>=20160101) & (x<20170101):
        grp = 'since_2016'
    elif (x>=20170101) & (x<20180101):
        grp = 'since_2017'
    else:
        grp = 'since_2018'
    return grp

profile['membership_group'] = profile['became_member_on'].apply(lambda x: membership_group(x))

In [6]:
def income_group(x):
    if x<40000:
        grp = '<40k'
    elif (x>=40000) & (x<60000):
        grp = '40-60k'
    elif (x>=60000) & (x<80000):
        grp = '60-80k'
    elif (x>=80000) & (x<100000):
        grp = '80-100k'
    elif x>=100000:
        grp = '>100k'
    else:
        grp = 'unknown'
    return grp

profile['income_group'] = profile['income'].apply(lambda x: income_group(x))

In [7]:
profile['user_id']=profile.index

# create id mapping to clean up id columns in transcript dataframe
map_user = dict()
for old_id, new_id in zip(profile['id'],profile['user_id']):
    map_user[old_id]=new_id

In [8]:
def event_value(x):
    if x.event == 'transaction':
        value = x['value'].get('amount')
    elif x.event == 'offer completed':
        value = x['value'].get('offer_id')
    else:
        value = x['value'].get('offer id')
    return value

transcript['event_value']=transcript.apply(lambda x: event_value(x), axis=1)

transcript['person'] = transcript['person'].apply(lambda x: map_user[x])

transcript_offer = transcript[transcript['event']!='transaction'].copy()
transcript_offer['offer_id'] = transcript_offer['event_value'].apply(lambda x: map_offer[x])
transcript_offer['offer_type'] = transcript_offer['offer_id'].apply(lambda x: map_offer_type[x])

In [49]:
#group informational offer type together

offer27_user_id = transcript_offer[transcript_offer['offer_id'].isin([2,7])]['person'].unique()

only_27 = list()
for user in offer27_user_id:
    offer_received = set(transcript_offer[transcript_offer['person']==user]['offer_id'])
    other_offer = {0,1,3,4,5,6,8,9}
    if len(offer_received.intersection(other_offer))==0:
        only_27.append(user)

[898,
 1118,
 1140,
 1597,
 1815,
 2670,
 6157,
 4165,
 9604,
 7603,
 15932,
 8299,
 8836,
 8941,
 2815,
 11356,
 11367,
 11046,
 12124,
 13262,
 13456,
 14059,
 5374,
 15900,
 16402,
 186,
 799,
 2113,
 2166,
 13853,
 3039,
 4924,
 5605,
 6470,
 10301,
 11220,
 11771,
 12388,
 12422,
 15782,
 16152,
 16155,
 16262,
 4402,
 5464,
 6110,
 7978,
 12133,
 12666,
 12391,
 13405,
 13671,
 13873,
 14607,
 16221,
 5296,
 6749,
 8174,
 9854,
 14142,
 14578,
 4546,
 4429,
 5885,
 14321,
 12537]

In [None]:
## method to define 1/0/NAN for offer type 2,7

offer2_record = transcript_offer[transcript_offer['offer_id']==2]
offer2_user_id = offer2_record['person'].unique()
response_list = []

for user in offer2_user_id:

    record_2_user = offer2_record[offer2_record['person']==user] 
    record_trans = transcript[(transcript['person']==user) & (transcript['event']=='transaction')]
    
    t_received = record_2_user[record_2_user['event']=='offer received']['time'].values

    tot_viewed = 0
    tot_trans = 0
    
    for i in range(len(t_received)):
        t = t_received[i]
        viewed = record_2_user[(record_2_user['event']=='offer viewed')
                              & (record_2_user['time']>t) 
                              & (record_2_user['time']<(t+96))]
        
        transaction = record_trans[(record_trans['time']>t) & (record_trans['time']<(t+96))]
        
        if len(viewed)>0:
            tot_viewed+=1
            
        if len(transaction)>0:
            tot_trans+=1
                
    if tot_viewed==0:
        response_list.append(np.nan)
    elif (tot_viewed>0) & (tot_trans==0):
        response_list.append(0)
    else:
        response_list.append(1)        
        

In [224]:
response_list

[1]

In [186]:
transcript[transcript['person']==22]

Unnamed: 0,person,event,value,time,event_value
13,22,offer received,{'offer id': '3f207df678b143eea3cee63160fa8bed'},0,3f207df678b143eea3cee63160fa8bed
20285,22,offer viewed,{'offer id': '3f207df678b143eea3cee63160fa8bed'},18,3f207df678b143eea3cee63160fa8bed
20286,22,transaction,{'amount': 13.25},18,13.25
35540,22,transaction,{'amount': 18.9},72,18.9
49503,22,transaction,{'amount': 20.2},144,20.2
53188,22,offer received,{'offer id': '5a8bc65990b245e5a138643cd4eb9837'},168,5a8bc65990b245e5a138643cd4eb9837
77214,22,offer viewed,{'offer id': '5a8bc65990b245e5a138643cd4eb9837'},192,5a8bc65990b245e5a138643cd4eb9837
81399,22,transaction,{'amount': 25.27},204,25.27
110841,22,offer received,{'offer id': '3f207df678b143eea3cee63160fa8bed'},336,3f207df678b143eea3cee63160fa8bed
130153,22,offer viewed,{'offer id': '3f207df678b143eea3cee63160fa8bed'},348,3f207df678b143eea3cee63160fa8bed


In [9]:
all_offer_id = [0,1,3,4,5,6,8,9]
all_user_id = transcript_offer['person'].unique()
offer_df = pd.DataFrame(all_user_id, columns=['user_id'])

# optimise this code below:
# loop thru user but build a list for each offer type, then add to a dataframe by columns instead of "append"
# loop offer type first, then loop user --> each loop create a list, add each list to dataframe

test_user_list = []
for offer in all_offer_id:
    
    offer_record = transcript_offer[transcript_offer['offer_id']==offer]
    response_list = []
    
    for person in all_user_id:
        
        record = offer_record[offer_record['person']==person]
        received = len(record[record['event']=='offer received'])
        if received==0:
            response_list.append(np.nan)
            continue
        
        viewed = len(record[record['event']=='offer viewed'])
        completed = len(record[record['event']=='offer completed'])
        
        if (received>0) & (completed>0) & (viewed>=completed):
            response_list.append(1)
        
        else:
            response_list.append(0)
        
        
    offer_df[offer] = response_list
    
    print('Complete creating records for offer id {}'.format(offer))

Complete creating records for offer id 0
Complete creating records for offer id 1
Complete creating records for offer id 2
Complete creating records for offer id 3
Complete creating records for offer id 4
Complete creating records for offer id 5
Complete creating records for offer id 6
Complete creating records for offer id 7
Complete creating records for offer id 8
Complete creating records for offer id 9


In [56]:
del offer_df[2]
del offer_df[7]

In [58]:
offer_df.dropna(how='all', inplace=True)
offer_df.set_index('user_id', inplace=True)

In [106]:
X_user = pd.get_dummies(profile[['gender','age_group','membership_group','income_group']])
X_user.index = profile['user_id']

user_with_record = set(X_user.index).intersection(offer_df.index)
X_user = X_user.loc[user_with_record]
offer_df = offer_df.loc[user_with_record]

from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X_user, offer_df, random_state=1)

In [115]:
def FunkSVD(input_mat, latent_features=4, learning_rate=0.01, iters=200):
    '''
    This function performs matrix factorization using a basic form of FunkSVD with no regularization
    
    INPUT:
    input_mat - (numpy array) a matrix with users as rows, movies as columns, and ratings as values
    latent_features - (int) the number of latent features used
    learning_rate - (float) the learning rate 
    iters - (int) the number of iterations
    
    OUTPUT:
    user_mat - (numpy array) a user by latent feature matrix
    response_mat - (numpy array) a latent feature by movie matrix
    '''
    
    # Set up useful values to be used through the rest of the function
    n_users = input_mat.shape[0]
    n_response = input_mat.shape[1]
    num_record = np.count_nonzero(~np.isnan(input_mat))
    
    # initialize the user and response matrices with random values
    user_mat = np.random.rand(n_users,latent_features)
    response_mat = np.random.rand(latent_features,n_response)
    
    # initialize sse at 0 for first iteration
    sse_accum = 0
    
    # header for running results
    print("Optimization Statistics")
    print("Iterations | Mean Squared Error ")
    
    # for each iteration
    for _n in range(iters):
        # update our sse
        old_sse = sse_accum
        sse_accum = 0
        for x in range(n_users):
            for y in range(n_response):
                if input_mat[x, y] >= 0:
                    error = input_mat[x,y]-np.dot(user_mat[x,:], response_mat[:,y])
                    sse_accum += error**2
                    for i in range(latent_features):
                        user_mat[x,i] += 2*learning_rate*error*response_mat[i,y]
                        response_mat[i,y] += 2*learning_rate*error*user_mat[x,i]
                else:
                    continue
        if _n%20 ==0:
            print('{}, {}'.format(_n, sse_accum))
        
    return user_mat, response_mat 

In [116]:
Y_train_mat = np.array(Y_train)
user_mat, response_mat = FunkSVD(Y_train_mat, latent_features=8, iters=400)
Y_train_filled = np.around(np.dot(user_mat,response_mat))
Y_train_filled[Y_train_filled<0] = 0
Y_train_filled[Y_train_filled>1] = 1
Y_train_filled_df = pd.DataFrame(Y_train_filled, index=X_train.index)


Optimization Statistics
Iterations | Mean Squared Error 
0, 9563.25535590397
20, 7572.307250070272
40, 5705.052689330767
60, 3548.1116599319566
80, 2154.2037747313825
100, 1288.962904886553
120, 736.1653405719981
140, 415.637980130353
160, 240.55157957680396
180, 143.7169396654755
200, 88.35282008151054
220, 55.84035308287259
240, 36.34826395763349
260, 24.42473901404937
280, 16.967435459875134
300, 12.187196909365312
320, 9.04033331750929
340, 6.910062472364153
360, 5.426369843287197
380, 4.363491360689605


In [177]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier


# 3,4 offer type --> random forest 
# 0,1,5,6,8,9 offer type --> SGD with log
# 2,7 --> send to everybody

clf1 = MultiOutputClassifier(RandomForestClassifier(n_estimators=300, min_samples_split=4, 
                                                   random_state=1, class_weight='balanced'))

clf2 = MultiOutputClassifier(MultinomialNB())
#clf3 = MultiOutputClassifier(SGDClassifier(loss='log'))

clf1.fit(X_train,Y_train_filled_df)
clf2.fit(X_train,Y_train_filled_df)

Y_pred_1 = clf1.predict(X_test)
Y_pred_2 = clf2.predict(X_test)

for i in range(8):
    y_test = np.array(Y_test.iloc[:,i])
    y_pred1 = Y_pred_1[:,i]
    y_pred2 = Y_pred_2[:,i]
    idx = np.where(y_test>-1)[0]

    y_test = y_test[idx]
    y_pred1 = y_pred1[idx]
    y_pred2 = y_pred2[idx]
    
    print('Randomforest Classifier - test output for offer type {}'.format(i))
    print(f1_score(y_test,y_pred1))
    
    print('SGD Classifier - test output for offer type {}'.format(i))
    print(f1_score(y_test,y_pred2))

Randomforest Classifier - test output for offer type 0
0.6680384087791496
SGD Classifier - test output for offer type 0
0.6915887850467289
Randomforest Classifier - test output for offer type 1
0.688723835246455
SGD Classifier - test output for offer type 1
0.7190868738110336
Randomforest Classifier - test output for offer type 2
0.51285930408472
SGD Classifier - test output for offer type 2
0.46525679758308164
Randomforest Classifier - test output for offer type 3
0.41658440276406716
SGD Classifier - test output for offer type 3
0.16541353383458646
Randomforest Classifier - test output for offer type 4
0.7103707684040839
SGD Classifier - test output for offer type 4
0.8233820459290189
Randomforest Classifier - test output for offer type 5
0.7431472081218274
SGD Classifier - test output for offer type 5
0.824
Randomforest Classifier - test output for offer type 6
0.7287946428571429
SGD Classifier - test output for offer type 6
0.7730933207010896
Randomforest Classifier - test output fo

In [146]:
y_train = Y_train[0].dropna()
idx_ = y_train.index
x_train = X_train.loc[idx_]

clf = RandomForestClassifier(n_estimators=300, random_state=1, class_weight='balanced')

#clf = MultiOutputClassifier(MultinomialNB())
# SGD or multinomialNB has very good performance for offer 5,6,8 (80% test score) but very bad on all others
#clf = MultiOutputClassifier(SGDClassifier())
#clf = MultiOutputClassifier(LinearSVC())

clf.fit(x_train,y_train)

y_pred_train = clf.predict(x_train)
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

         0.0       0.80      0.71      0.75      2712
         1.0       0.67      0.77      0.72      2094

    accuracy                           0.74      4806
   macro avg       0.74      0.74      0.74      4806
weighted avg       0.75      0.74      0.74      4806



In [162]:
y_test = Y_test[4].dropna()
idx_ = y_test.index
x_test = X_test.loc[idx_]
y_pred_test = clf.predict(x_test)
print(f1_score(y_test,y_pred_test))

0.4110091743119266


In [178]:
portfolio

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id,offer_id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,0
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed,2
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,3
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,4
5,3,"[web, email, mobile, social]",7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,5
6,2,"[web, email, mobile, social]",10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,6
7,0,"[email, mobile, social]",0,3,informational,5a8bc65990b245e5a138643cd4eb9837,7
8,5,"[web, email, mobile, social]",5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,8
9,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5,9
