In [98]:
import pandas as pd
import numpy as np
import sklearn.model_selection
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB

In [2]:
portfolio = pd.read_csv("data/updated_portfolio.csv", index_col=0)
influenced = pd.read_csv("data/influenced.csv", header=0, index_col=0)
received = pd.read_csv("data/received.csv", header=0, index_col=0)
identified = pd.read_csv("data/identified.csv", index_col=0)

### use dummy variables for promotion type and days since first join for became_member_on

In [3]:
portfolio = portfolio.join(pd.get_dummies(portfolio.offer_type), how='inner')
portfolio.drop(['offer_type'], axis=1, inplace=True)

In [4]:
portfolio.head(2)

Unnamed: 0_level_0,reward,difficulty,duration,email,mobile,social,web,bogo,discount,informational
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ae264e3637204a6fb9bb56bc8210ddfd,10,10,7,1,1,1,0,1,0,0
4d5c57ea9a6940dd891ad53e9dbe8da0,10,10,5,1,1,1,1,1,0,0


In [5]:
identified.became_member_on = pd.to_datetime(identified.became_member_on)
earliest = identified.became_member_on.min()
time_delta_series = identified.became_member_on.apply(lambda x: (x - earliest))
identified['days_since_member'] = time_delta_series.dt.days
identified.drop(['became_member_on'], axis=1, inplace=True)

### Construct the output Y

In [6]:
received = received.T
# only include individuals with data
received = received.loc[received.index.isin(identified.index)]
influenced = influenced.loc[influenced.index.isin(identified.index)]

In [7]:
# if received has some one and influenced doesn't, add a row of 0 in influenced
missing_set = set(received.index) - set(influenced.index)
for customer in missing_set:
    influenced.loc[customer] = 0

In [8]:
def create_output(receive, influence):
    if receive == 1 and influence == 1:
        return 1
    if receive == 1 and influence == 0:
        return 0
    else:
        return -1
    
create_vec = np.vectorize(create_output)

interim_Y = pd.DataFrame(create_vec(received, influenced), columns=received.columns, index=received.index)

In [9]:
def create_data(portfolio, identified, interim_Y):
    collection_rows = []
    for customer, y_row in interim_Y.iterrows():
        for index, val in y_row.items():
            if val > -1:
                promo = portfolio.loc[portfolio.index == index]
                person = identified.loc[identified.index == customer]
                
                promo.reset_index(drop=True, inplace=True)
                person.reset_index(drop=True, inplace=True)
                
                Y = pd.Series([val])
                
                row = pd.concat([promo, person, Y], axis=1)
                collection_rows.append(row)
    complete_df = pd.concat(collection_rows, axis=0)
    columns = list(portfolio.columns.append(identified.columns))
    columns.append('Y')
    complete_df.columns = columns
    return complete_df

In [10]:
complete_df = create_data(portfolio, identified, interim_Y)

In [11]:
complete_df.head(3)
# drop email, always true
complete_df.drop('email', axis=1, inplace=True)

In [58]:
temp_df = shuffle(complete_df.copy())

In [60]:
good = temp_df[temp_df.Y == 0].iloc[:10194, :]
bad = temp_df[temp_df.Y == 1]
balanced_data = good.append(bad)

In [18]:
X

Unnamed: 0,reward,difficulty,duration,mobile,social,web,bogo,discount,informational,age,income,F,M,O,days_since_member
0,10,10,7,1,1,0,1,0,0,59,55000.0,1,0,0,323
0,0,0,3,1,1,0,0,0,1,59,55000.0,1,0,0,323
0,10,10,5,1,1,1,1,0,0,74,85000.0,1,0,0,1576
0,0,0,4,1,0,1,0,0,1,74,85000.0,1,0,0,1576
0,5,20,10,0,0,1,0,1,0,74,85000.0,1,0,0,1576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0,0,4,1,0,1,0,0,1,73,82000.0,1,0,0,1488
0,2,10,7,1,0,1,0,1,0,73,82000.0,1,0,0,1488
0,10,10,5,1,1,1,1,0,0,53,70000.0,1,0,0,1736
0,2,10,10,1,1,1,0,1,0,53,70000.0,1,0,0,1736


In [70]:
# create test and train sets
X = balanced_data.drop(['Y'], axis=1)
Y = balanced_data['Y']

X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.33)

In [13]:
complete_df.corr()

Unnamed: 0,reward,difficulty,duration,mobile,social,web,bogo,discount,informational,age,income,F,M,O,days_since_member,Y
reward,1.0,0.465321,0.159424,-0.077473,0.288067,-0.120764,0.793036,-0.288757,-0.617848,-0.004831,-0.003889,-0.003697,0.004916,-0.005178,-0.003074,0.070462
difficulty,0.465321,1.0,0.808283,-0.742514,-0.155425,0.243415,-0.029684,0.597746,-0.695839,-0.002453,-0.001303,-0.000939,0.00284,-0.007978,-0.005799,-0.001075
duration,0.159424,0.808283,1.0,-0.530886,-0.185316,0.340135,-0.18581,0.742157,-0.681462,-0.000708,0.000775,-0.000598,0.001622,-0.004299,-0.002662,0.019093
mobile,-0.077473,-0.742514,-0.530886,1.0,0.410981,-0.168426,0.274101,-0.411057,0.167716,-0.002614,0.000852,-0.002705,0.001716,0.004089,0.006631,0.089643
social,0.288067,-0.155425,-0.185316,0.410981,1.0,-0.409815,0.249665,-0.166435,-0.101994,-0.000608,0.002153,-0.003222,0.003648,-0.001846,-0.00029,0.16973
web,-0.120764,0.243415,0.340135,-0.168426,-0.409815,1.0,-0.105612,0.409739,-0.372521,-0.002148,-0.002678,-0.001987,0.002158,-0.000753,0.006803,-0.024946
bogo,0.793036,-0.029684,-0.18581,0.274101,0.249665,-0.105612,1.0,-0.666821,-0.408295,-0.003616,-0.004621,-0.002809,0.002828,-0.000131,-0.000204,0.056861
discount,-0.288757,0.597746,0.742157,-0.411057,-0.166435,0.409739,-0.666821,1.0,-0.408012,-0.001218,0.003627,6.1e-05,0.000684,-0.003117,-0.000434,0.006122
informational,-0.617848,-0.695839,-0.681462,0.167716,-0.101994,-0.372521,-0.408295,-0.408012,1.0,0.005922,0.001218,0.003367,-0.004302,0.003979,0.000782,-0.07716
age,-0.004831,-0.002453,-0.000708,-0.002614,-0.000608,-0.002148,-0.003616,-0.001218,0.005922,1.0,0.305109,0.144296,-0.144998,0.005738,-0.009975,0.005341


In [95]:
mnb = MultinomialNB().fit(X_train, Y_train)
preds = mnb.predict(X_test)

In [84]:
def evaluation_metrics(preds, actual, verbose=True):
    tp = np.logical_and(actual, preds).sum()
    fp = np.logical_and(1-actual, preds).sum()
    tn = np.logical_and(1-actual, 1-preds).sum()
    fn = np.logical_and(actual, 1-preds).sum()
  
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    precision = tp / (tp + fp)
    f1 = tp / (tp + 0.5*(fp + fn))
    
    # printing a table of metrics
    if verbose:
        print(pd.crosstab(actual, preds, rownames=['actual (row)'], colnames=['prediction (col)']))
        print("\n{:<11} {:.3f}".format('Recall:', recall))
        print("{:<11} {:.3f}".format('Precision:', precision))
        print("{:<11} {:.3f}".format('F1:', f1))
        print("{:<11} {:.3f}".format('Accuracy:', accuracy))
        print()
        
    return {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn, 
            'Precision': precision, 'Recall': recall, 'Accuracy': accuracy, 'F1': f1}
    
    
    return recall, f1, accuracy

In [96]:
_ = evaluation_metrics(preds, Y_test)

prediction (col)     0     1
actual (row)                
0                 1625  1787
1                 1509  1808

Recall:     0.545
Precision:  0.503
F1:         0.523
Accuracy:   0.510



In [88]:
complete_df.to_csv("data/complete_df.csv")
X_train.to_csv("data/X_train.csv")
X_test.to_csv("data/X_test.csv")
Y_train.to_csv("data/Y_train.csv")
Y_test.to_csv("data/Y_test.csv")