In [4]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

In [141]:
from datetime import datetime

In [179]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import os, random, datetime

# load data

## load

In [87]:
q = '''select id as org, state, segment from organization'''
orgs = redshift_query_read(q, schema='production')

In [88]:
print("{:,} entries".format(len(orgs)))
print("{:,} unique org's".format(len(orgs['org'].unique())))

14,600 entries
14,600 unique org's


In [89]:
q = '''select
            org,
            date_trunc('week', date) as week,
            count(id) as count_all,
            sum(amount) as vol_all,
            count(distinct(case when recurring=0 then id else null end)) as count_onetime,
            count(distinct(case when recurring_origin=1 then id else null end)) as count_recurring
        from transactions
        where status='A' and year >= 2018
        group by org, date_trunc('week', date)'''
trans = redshift_query_read(q, schema='production')

In [90]:
print("{:,} entries".format(len(trans)))
print("{:,} unique org's".format(len(trans['org'].unique())))
print("{} - {}".format(trans['week'].min(), trans['week'].max()))

568,985 entries
7,500 unique org's
2018-01-01 00:00:00 - 2024-03-11 00:00:00


In [91]:
q = '''select
            org,
            date_trunc('week', date) as week,
            sum(views) as pageviews
        from ga
        where date >= 2018
        group by org, date_trunc('week', date)'''
traffic = redshift_query_read(q, schema='production')

In [92]:
print("{:,} traffic entries".format(len(traffic)))
print("{:,} unique org's".format(len(traffic['org'].unique())))
print("{} - {}".format(traffic['week'].min(), traffic['week'].max()))

646,071 traffic entries
8,644 unique org's
2018-01-01 00:00:00 - 2024-02-26 00:00:00


## merge

In [106]:
df = traffic.merge(trans, on=['org', 'week'], how='left').merge(orgs, on='org').fillna(0)
df['conversion_ot'] = df['count_onetime'] / df['pageviews']
df['conversion_rec'] = df['count_recurring'] / df['pageviews']

In [107]:
len(trans), len(traffic.merge(trans, on=['org', 'week'])), len(traffic.merge(trans, on=['org', 'week'], how='left'))

(568985, 393393, 646071)

In [108]:
print("{:,} entries".format(len(df)))
print("{:,} unique org's".format(len(df['org'].unique())))
print("{} - {}".format(df['week'].min(), df['week'].max()))

645,669 entries
8,612 unique org's
2018-01-01 00:00:00 - 2024-02-26 00:00:00


In [109]:
df.drop('org', axis=1).describe()

Unnamed: 0,pageviews,count_all,vol_all,count_onetime,count_recurring,conversion_ot,conversion_rec
count,645669.0,645669.0,645669.0,645669.0,645669.0,644509.0,644220.0
mean,248.794799,19.56064,2395.276,11.60069,0.47772,inf,inf
std,2193.703746,160.673246,14482.27,79.461898,10.534647,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,23.0,2.0,100.0,1.0,0.0,0.005479452,0.0
75%,102.0,9.0,1146.0,5.0,0.0,0.1,0.0
max,473098.0,30760.0,1880078.0,24073.0,3034.0,inf,inf


In [117]:
conv_ot = df['conversion_ot'].replace([np.inf, -np.inf], np.nan).dropna().describe()
conv_rec = df['conversion_rec'].replace([np.inf, -np.inf], np.nan).dropna().describe()

pd.DataFrame([conv_ot, conv_rec]).transpose()

Unnamed: 0,conversion_ot,conversion_rec
count,644186.0,644186.0
mean,0.513355,0.015985
std,6.794323,0.603654
min,0.0,0.0
25%,0.0,0.0
50%,0.005405,0.0
75%,0.1,0.0
max,2320.5,379.25


In [114]:
count_zero = len(df[df['count_all']==0])
traffic_zero = len(df[df['pageviews']==0])
onetime_zero = len(df[df['count_onetime']==0])
rec_zero = len(df[df['count_recurring']==0])

trans_gt_traff = len(df[df['count_all']>df['pageviews']])
rec_gt_ot = len(df[df['count_onetime']<df['count_recurring']])
ottrans_gt_traff = len(df[df['count_onetime']>df['pageviews']])
rectrans_gt_traff = len(df[df['count_recurring']>df['pageviews']])
rectrans_gt_std = len(df[df['count_recurring']>13])

conv_ot_mean = df['conversion_ot'].replace([np.inf, -np.inf], np.nan).dropna().mean()
conv_ot_median = df['conversion_ot'].replace([np.inf, -np.inf], np.nan).dropna().median()
conv_rec_mean = df['conversion_rec'].replace([np.inf, -np.inf], np.nan).dropna().mean()
conv_rec_median = df['conversion_rec'].replace([np.inf, -np.inf], np.nan).dropna().median()
conv_mean = conv_ot_mean + conv_rec_mean
conv_median = conv_ot_median + conv_rec_median

len_all = len(df)

print("Zero transactions: {:,} ({:.1f}%)".format(count_zero, (count_zero / len_all) * 100.))
print("Zero traffic: {:,} ({:.1f}%)".format(traffic_zero, (traffic_zero / len_all) * 100.))
print("Zero one time: {:,} ({:.1f}%)".format(onetime_zero, (onetime_zero / len_all) * 100.))
print("Zero recurring: {:,} ({:.1f}%)".format(rec_zero, (rec_zero / len_all) * 100.))
print()
print("Trans > traffic: {:,} ({:.1f}%)".format(trans_gt_traff, (trans_gt_traff / len_all) * 100.))
print("One time trans > traffic: {:,} ({:.1f}%)".format(ottrans_gt_traff, (ottrans_gt_traff / len_all) * 100.))
print("Recurring trans > traffic: {:,} ({:.1f}%)".format(rectrans_gt_traff, (rectrans_gt_traff / len_all) * 100.))
print("Recurring trans > one time: {:,} ({:.1f}%)".format(rec_gt_ot, (rec_gt_ot / len_all) * 100.))
print("Recurring trans > std: {:,} ({:.1f}%)".format(rectrans_gt_std, (rectrans_gt_std / len_all) * 100.))
print()
print("Conversion mean: {:.2f}%".format(conv_mean * 100.))
print("Conversion median: {:.2f}%".format(conv_median * 100.))
print("Conversion one time mean: {:.2f}%".format(conv_ot_mean * 100.))
print("Conversion one time median: {:.2f}%".format(conv_ot_median * 100.))
print("Conversion recurring mean: {:.2f}%".format(conv_rec_mean * 100.))
print("Conversion recurring median: {:.2f}%".format(conv_rec_median * 100.))

Zero transactions: 252,277 (39.1%)
Zero traffic: 1,483 (0.2%)
Zero one time: 311,798 (48.3%)
Zero recurring: 572,131 (88.6%)

Trans > traffic: 55,247 (8.6%)
One time trans > traffic: 32,260 (5.0%)
Recurring trans > traffic: 1,115 (0.2%)
Recurring trans > one time: 8,511 (1.3%)
Recurring trans > std: 2,122 (0.3%)

Conversion mean: 52.93%
Conversion median: 0.54%
Conversion one time mean: 51.34%
Conversion one time median: 0.54%
Conversion recurring mean: 1.60%
Conversion recurring median: 0.00%


In [131]:
segments = ['E', 'R', 'O', 'P', 'A', 'S', 'D', 'G', 'W', 'X', 'B', 'L',
            'C', 'K', 'F', 'I', 'T', 'Q', 'U', 'H', 'J', 'N', 'M', 'V', 
            'Y', 'D', 'T', 'R', 'Y', 'Z']
for segment in segments:
    df["segment_{}".format(segment)] = df['segment'].str.contains('{} - '.format(segment))
    df["segment_{}".format(segment)] = df["segment_{}".format(segment)].fillna(False)

In [134]:
df[[c for c in df.columns if 'segment' in c]].head(2)

Unnamed: 0,segment,segment_E,segment_R,segment_O,segment_P,segment_A,segment_S,segment_D,segment_G,segment_W,...,segment_T,segment_Q,segment_U,segment_H,segment_J,segment_N,segment_M,segment_V,segment_Y,segment_Z
0,A - Arts; Culture; and Humanities,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A - Arts; Culture; and Humanities,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [135]:
df[[c for c in df.columns if 'segment' in c]].sum()

  """Entry point for launching an IPython kernel.


segment_E    31617
segment_R     9056
segment_O    56740
segment_P    92517
segment_A    40142
segment_S    18803
segment_D    20966
segment_G    21669
segment_W     9023
segment_X    79796
segment_B    73777
segment_L    22541
segment_C    18507
segment_K    13247
segment_F    15251
segment_I     8527
segment_T    21604
segment_Q    14146
segment_U     1250
segment_H     4413
segment_J     4016
segment_N    11731
segment_M     4501
segment_V     1743
segment_Y     1981
segment_Z    14698
dtype: int64

In [101]:
print("Segment observations:")
df.groupby('segment')['org'].count().agg(['mean', 'median'])

Segment observations:


mean      20828.032258
median    14146.000000
Name: org, dtype: float64

In [102]:
grpd = df.groupby(['segment', 'week']).count().reset_index()

In [103]:
grpd['org'].agg(['mean', 'median', 'min', 'max'])

mean       72.12567
median     46.00000
min         1.00000
max       442.00000
Name: org, dtype: float64

In [104]:
len_sml = len(grpd[grpd['org']<20])
print("{:,} ({:.1f}%) entries with fewer than 20 observations".format(len_sml, (len_sml / len(grpd)) * 100.))

2,689 (30.0%) entries with fewer than 20 observations


In [105]:
print(df.columns)
df.head(2)

Index(['org', 'week', 'pageviews', 'count_all', 'vol_all', 'count_onetime',
       'count_recurring', 'state', 'segment', 'segment_E', 'segment_R',
       'segment_O', 'segment_P', 'segment_A', 'segment_S', 'segment_D',
       'segment_G', 'segment_W', 'segment_X', 'segment_B', 'segment_L',
       'segment_C', 'segment_K', 'segment_F', 'segment_I', 'segment_T',
       'segment_Q', 'segment_U', 'segment_H', 'segment_J', 'segment_N',
       'segment_M', 'segment_V', 'segment_Y', 'segment_Z'],
      dtype='object')


Unnamed: 0,org,week,pageviews,count_all,vol_all,count_onetime,count_recurring,state,segment,segment_E,...,segment_T,segment_Q,segment_U,segment_H,segment_J,segment_N,segment_M,segment_V,segment_Y,segment_Z
0,445377,2021-02-15,339,20.0,229.0,20.0,0.0,Kentucky,A - Arts; Culture; and Humanities,False,...,False,False,False,False,False,False,False,False,False,False
1,445377,2021-01-18,270,1.0,10.5,1.0,0.0,Kentucky,A - Arts; Culture; and Humanities,False,...,False,False,False,False,False,False,False,False,False,False


## dataset

In [171]:
drop_cols = ['segment', 'org', 'week', 'count_all', 'vol_all', 'count_onetime', 'count_recurring', 'pageviews']

train_df, test_df = train_test_split(df.drop(drop_cols, axis=1), test_size=0.2)

`train_df` and `test_df` contain both target columns `conversion_ot` and `conversion_rec`. This will need to be accounted for in constructing the `Dataset`'s such that we are not training on one target to model the other.

In [172]:
train_df.shape, test_df.shape

((516535, 29), (129134, 29))

In [173]:
train_df.columns

Index(['state', 'conversion_ot', 'conversion_rec', 'segment_E', 'segment_R',
       'segment_O', 'segment_P', 'segment_A', 'segment_S', 'segment_D',
       'segment_G', 'segment_W', 'segment_X', 'segment_B', 'segment_L',
       'segment_C', 'segment_K', 'segment_F', 'segment_I', 'segment_T',
       'segment_Q', 'segment_U', 'segment_H', 'segment_J', 'segment_N',
       'segment_M', 'segment_V', 'segment_Y', 'segment_Z'],
      dtype='object')

In [174]:
train_df.head(2)

Unnamed: 0,state,conversion_ot,conversion_rec,segment_E,segment_R,segment_O,segment_P,segment_A,segment_S,segment_D,...,segment_T,segment_Q,segment_U,segment_H,segment_J,segment_N,segment_M,segment_V,segment_Y,segment_Z
143450,NY,0.064516,0.0,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
115034,California,0.0,0.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
targets = ['conversion_rec', 'conversion_ot']

train_dataset = TensorDataset(torch.tensor(train_df.drop(targets, axis=1).values.astype(np.float32)), df['conversion_ot'].values)
test_dataset = TensorDataset(test_df.drop(targets, axis=1), df['conversion_ot'].values)

# modeling

In [163]:
class EmbedNet(nn.Module):
    def __init__(self, embed_dims, linear_layer_sizes, output_size, embed_dropout, linear_layer_dropouts):
        super().__init__()
        
        # embedding layers
        self.embedding_layers = nn.ModuleList([nn.Embedding(X, y) for X, y in embed_dims])
        self.num_embeddings = sum([y for _, y in embed_dims])
        
        # linear layers
        linear1 = nn.Linear(self.num_embeddings, linear_layer_sizes[0])
        self.linear_layers = nn.ModuleList([linear1] + [nn.Linear(linear_layer_sizes[i], 
                                                                  linear_layer_sizes[i + 1]) for i in range(len(linear_layer_sizes) - 1)])
        
        for linear_layer in self.linear_layers:
            nn.init.kaiming_normal_(linear_layer.weight.data)
            
        # output layer
        self.output_layer = nn.Linear(linear_layer_sizes[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)
        
        # batch norm layers
        self.batchnorm1 = nn.BatchNorm1d(self.num_embeddings)
        self.batchnorm_layers = nn.ModuleList([nn.BatchNorm1d(size) for size in linear_layer_sizes])
        
        # dropout layers
        self.embedding_dropout_layer = nn.Dropout(embed_dropout)
        self.dropout_layers = nn.ModuleList([nn.Dropout(size) for size in linear_layer_dropouts])
        
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, cat_data):
        if self.num_embeddings != 0:
            x = [embedding_layer(cat_data[:, i]) for i, embedding_layer in enumerate(self.embedding_layers)]
            x = torch.cat(x, 1)
            x = self.batchnorm1(x)
            x = self.embedding_dropout_layer(x)
            
        for linear_layer, dropout_layer, batchnorm_layer in zip(self.linear_layers, self.dropout_layers, self.batchnorm_layers):
            x = F.relu(linear_layer(x))
            x = dropout_layer(x)
            x = batchnorm_layer(x)
            
        x = self.output_layer(x)
        x = self.softmax(x)
        
        return x

In [158]:
def train_eval_loop(model: nn.Module, train_dataset: Dataset, test_dataset: Dataset, accuracy_fn: nn.Module,
                    lr=1e-4, num_epochs=5, batch_size=1024, device=None, l2_reg_alpha=0, shuffle_train=True):
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)    
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_reg_alpha)
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle_train)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    for epoch in range(num_epochs):
        epoch_start = datetime.now()
        print("epoch {}".format(epoch))
        
        model.train()
        mean_train_loss, train_batches_n = 0, 0
        for batch, (y, X) in enumerate(train_dataloader):
            X, y = X.to(device), y.to(device)
            
            # forward pass
            y_pred = model(X)[:, 1]
            loss = accuracy_fn(y_pred, y)
            
            # zero gradient
            model.zero_grad()
            # back propogation
            loss.backward()
            
            # step optimizer
            optimizer.step()
            
            mean_train_loss += float(loss)
            train_batches_n += 1
            
        mean_train_loss /= train_batches_n
        print("epoch: {}, {:.4f} sec".format(train_batches_n, (datetime.datetime.now() - epoch_start).total_seconds()))
        print("mean loss on train: {:.4f}".format(mean_train_loss))
        
        model.eval()
        mean_val_loss, val_batches_n = 0, 0
        with torch.no_grad():
            for batch, (y, X) in enumerate(test_dataloader):
                X, y = X.to(device), y.to(device)
                
                y_pred = model(X)[:, 1]
                loss = accuracy_fn(y_pred, y)
                
                mean_val_loss += float(loss)
                auc = roc_auc_score(y.data.cpu(), y_pred.data.cpu().numpy())
                
                val_batches_n += 1
                
        mean_val_loss /= val_batches_n
        print("mean loss on validation: {:.4f}".format(mean_val_loss))
        print("valid batch auc: {:.5f}".format(auc))
        
        

In [166]:
embed_dims = [[35, 35]]
linear_layer_sizes = [50] * 3
linear_dropout_layers = [0.2] * 3

embd = EmbedNet(embed_dims, linear_layer_sizes, 1, 0.2, linear_dropout_layers)
embd

EmbedNet(
  (embedding_layers): ModuleList(
    (0): Embedding(35, 35)
  )
  (linear_layers): ModuleList(
    (0): Linear(in_features=35, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=50, bias=True)
    (2): Linear(in_features=50, out_features=50, bias=True)
  )
  (output_layer): Linear(in_features=50, out_features=1, bias=True)
  (batchnorm1): BatchNorm1d(35, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm_layers): ModuleList(
    (0): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (embedding_dropout_layer): Dropout(p=0.2, inplace=False)
  (dropout_layers): ModuleList(
    (0): Dropout(p=0.2, inplace=False)
    (1): Dropout(p=0.2, inplace=False)
    (2): Dropout(p=0.2, inplace=False)
  )
  (softmax): Softmax(dim=1)
)