# Test build 1

- based on __org, state, NTEE__
- using __fastai__ library

In [145]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

import pandas as pd
from fastai.tabular.all import *
from fastai.losses import L1LossFlat, MSELossFlat
import torch.nn.functional as F

In [146]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [147]:
device

device(type='cpu')

# modeling conversion

## load & prep dataset

In [148]:
# pull data
df = pd.read_csv("seg_state.csv")

# if one hot encoding slips in, remove
if 'segment_E' in df.columns:
    for c in [col for col in df.columns if 'segment_' in col]:
        df.drop(c, axis=1, inplace=True)

df.head(2)

Unnamed: 0,org,week,pageviews,count_all,vol_all,count_onetime,count_recurring,avg_onetime,avg_recurring,state,segment,conversion_ot,conversion_rec
0,442289,2021-02-08,118,4.0,300.0,4.0,0.0,75.0,0.0,CA,B - Educational Institutions,0.033898,0.0
1,442289,2023-01-23,3464,0.0,0.0,0.0,0.0,0.0,0.0,CA,B - Educational Institutions,0.0,0.0


In [327]:
print("{:,} observations".format(len(df)))

644,186 observations


In [149]:
targets = ['conversion_ot', 'conversion_rec', 'avg_onetime', 'avg_recurring']
targets = ['conversion_rec', 'avg_onetime', 'avg_recurring']
target = 'conversion_ot'

drop_cols = ['org', 'week', 'pageviews', 'count_all', 'vol_all', 'count_onetime', 'count_recurring']

In [150]:
df_final = df.drop(drop_cols, axis=1)

In [151]:
df_final.head(2)

Unnamed: 0,avg_onetime,avg_recurring,state,segment,conversion_ot,conversion_rec
0,75.0,0.0,CA,B - Educational Institutions,0.033898,0.0
1,0.0,0.0,CA,B - Educational Institutions,0.0,0.0


In [152]:
df_final.isna().sum()

avg_onetime           0
avg_recurring         0
state             21303
segment               0
conversion_ot         0
conversion_rec        0
dtype: int64

In [153]:
print(len(df_final[df_final['state'].isna()]) / len(df_final))
df_final[df_final['state'].isna()].head(2)

0.033069641376869414


Unnamed: 0,avg_onetime,avg_recurring,state,segment,conversion_ot,conversion_rec
2918,0.0,0.0,,0,0.0,0.0
2919,0.0,0.0,,0,0.0,0.0


In [154]:
# fill NA with 0's
#df_final.fillna(0, inplace=True)

# drop NA states
df_final = df_final[~df_final['state'].isna()]

In [155]:
df_final = df_final.drop(targets, axis=1)
df_final.head(2)

Unnamed: 0,state,segment,conversion_ot
0,CA,B - Educational Institutions,0.033898
1,CA,B - Educational Institutions,0.0


In [328]:
print("{:,} observations".format(len(df_final)))
df_final[['state', 'segment']].nunique()

622,883 observations


state      64
segment    31
dtype: int64

## modeling

### embedding [250, 250]

Trainings:
- 25 epochs, (1e-5, 1e-3): train: 47.82 - 23.87; valid: 38.30 - 38.22
- 25 epochs, (1e-5, 1e-3): train: 33.34 - 30.01; valid: 25.49 - 25.42
- 50 epochs, (1e-5, 1e-3): train: 42.66 - 36.16; valid: 29.76 - 29.66
- 100 epochs, (1e-5, 1e-3): train: 19.49 - 25.25; valid: 50.65 - 50.57
- 25 epochs, (1e-4, 1e-2): train: 59.44 - 22.13; valid: 23.50 - 49.36 (fell to 23.38 before spiking to 777 near the end of training, learning rate too aggressive)
- 25 epochs, (1e-3, 1e-6): train: 26.52 - 36.12; valid: 18.28 - 18.26
- 100 epochs, (1e-5, 1e-3): train: 56.53 - 19.64; valid: 22.95 - 22.89

Notes:
- while the train loss varies wildly, occasionally swinging as high as 182 and as low as 16, the validation loss does appear to consistently fall, albeit very slowly; this behavior is consistent across all training cycles
- validation loss doesn't fall to the same region regardless of starting point, it appears to fall at approximately the same rate; it seems that starting in a good spot is critical to achieving a good validation score
    - i believe the most likely cause of this is the training/validation split rather than the random state of the embedding; the cases of worst validation scores is most likely caused by a sufficient portion of the validation set consisting of outliers in the dataset

### embedding [20]

Trainings:
- 25 epochs, (1e-5, 1e-3): train 67.86 - 41.46; valid: 32.19 - 32.07
- 25 epochs, (1e-6, 1e-3): train: 27.28 - 30.46; valid: 34.32 - 24.25
- 150 epochs, (1e-6, 1e-3): train: 31.26 - 36.86; valid: 23.05 - 23.04

Notes:
- reducing the embedding dimensions did not meaningfully impact accuracy but reduced training time by about 40%
- again, increasing the number of epochs does not appear to meaningfully change training outcomes; training accuracy varies wildly while the validation steadily falls, but only to a point

### embedding [20,20] x 250

Trainings:
- 25 epochs, (1e-6, 1e-3): train: 33.90 - 29.26; valid: 65.79 - 65.65
- 25 epochs, (1e-6, 1e-3): train: 38.65 - 39.37; valid: 32.64 - 32.58
- 25 epochs, (1e-6, 1e-3): train: 34.08 - 67.61; valid: 24.44 - 24.36

### embedding [10, 5] x 250

Trainings:
- 25 epochs, (1e-6, 1e-3): train: 62.24 - 19.51; valid: 43.38 - 43.29


In [299]:
# prep dataset
cont, cat = cont_cat_split(df_final, dep_var=target)
procs = [Categorify, Normalize]
splits = RandomSplitter()(df_final)

In [300]:
to_nn = TabularPandas(df_final, procs, cat, cont,
                      splits=splits, y_names=target)

In [301]:
dls = to_nn.dataloaders(512, device=device)

In [302]:
learn = tabular_learner(dls, layers=[250], emb_szs={'state': 10, 'segment': 5},  n_out=1, 
                        loss_func=F.mse_loss, metrics=F.l1_loss)

In [303]:
learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,l1_loss,time
0,62.236107,43.381557,0.868157,00:10


In [304]:
learn.fit_one_cycle(25, slice(1e-6, 1e-3))

epoch,train_loss,valid_loss,l1_loss,time
0,31.765942,43.373096,0.87395,00:10
1,23.445301,43.35577,0.87516,00:10
2,21.452023,43.362991,0.850853,00:10
3,220.242691,43.381699,0.836783,00:10
4,54.521069,43.421326,0.909946,00:10
5,106.326447,43.380466,0.924804,00:10
6,21.536793,43.357868,0.861895,00:10
7,27.256626,43.342564,0.875719,00:10
8,80.422218,43.344486,0.885862,00:10
9,29.525326,43.364082,0.851929,00:10


In [81]:
to_nn

        state  segment  conversion_ot
441541     45       22       0.000000
193233     14       23       0.000000
595225     69        3       0.200000
241492     76       23       0.000000
604956     16       18       0.120000
...       ...      ...            ...
65504      46       14       0.047639
551805     24       17       2.666667
446638     40       28       0.000000
490694     87        3       0.000000
502167     33        3       0.166667

[645669 rows x 3 columns]

## extract embeddings

The embeddings can now be extracted and examined as feature vectors. These can be used independently to deploy models targeting states, NTEE's, or the combination of the two.

In [318]:
from sklearn.ensemble import RandomForestRegressor

In [309]:
learn.dls.cat_names

(#2) ['state','segment']

In [307]:
learn.model.embeds

ModuleList(
  (0): Embedding(65, 10)
  (1): Embedding(32, 5)
)

In [308]:
embed(learn.model.embeds[1])

Embedding(32, 5)

In [315]:
# extract feature vectors
def embed_features(learner, xs):
    xs = xs.copy()
    for i, col in enumerate(learn.dls.cat_names):
        # get matrix containing each rows embedding vector
        embed = learn.model.embeds[i]
        embed_data = embed(tensor(xs[col], dtype=torch.int64))
        embed_names = [f"{col}_{j}" for j in range(embed_data.shape[1])]
        
        # join embedded category and drop old feature column
        feature_df = pd.DataFrame(data=embed_data, index=xs.index,
                                  columns=embed_names)
        xs = xs.drop(col, axis=1)
        xs = xs.join(feature_df)
        
    return xs

In [316]:
embed_xs = embed_features(learn, to_nn.train.xs)
embed_valid_xs = embed_features(learn, to_nn.valid.xs)

In [317]:
embed_xs

Unnamed: 0,state_0,state_1,state_2,state_3,state_4,state_5,state_6,state_7,state_8,state_9,segment_0,segment_1,segment_2,segment_3,segment_4
611311,-0.011846,-0.123364,0.259603,0.121549,0.036781,0.197872,-0.206547,0.059092,0.051700,-0.123717,-0.075598,0.067019,0.041926,-0.252073,0.001269
212363,-0.158456,-0.284497,0.017337,-0.038429,-0.193495,0.048342,-0.034891,-0.005847,-0.170809,-0.049058,0.154464,-0.074095,0.243728,-0.358695,0.150056
390362,0.017886,0.181342,0.178255,-0.089012,-0.103411,0.046742,0.111108,0.085520,-0.119589,0.319634,0.120172,-0.117866,0.161961,-0.403044,0.157002
434961,0.057060,0.028104,-0.044237,0.038866,-0.081878,-0.000968,0.054063,0.042011,-0.136742,0.005524,0.137893,-0.177939,0.068862,-0.370478,0.219140
28995,-0.093847,-0.043996,0.237923,0.246155,-0.096813,0.297484,-0.047941,0.019421,0.004506,-0.005959,-0.109405,-0.052204,-0.173895,-0.346847,0.049326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
630870,-0.178587,0.070367,-0.017216,-0.078315,-0.273307,0.087803,0.098873,-0.070163,-0.196333,0.145273,-0.075598,0.067019,0.041926,-0.252073,0.001269
566385,-0.023177,-0.215017,-0.152210,-0.182119,0.105180,0.378175,0.146743,0.276516,0.030818,0.053094,-0.030546,0.004196,0.063070,-0.030061,-0.114330
196939,-0.104725,-0.168313,-0.179705,-0.010252,-0.007058,0.387437,0.225153,0.226210,-0.014997,0.161421,-0.040214,0.020944,-0.232964,-0.362123,-0.064898
56210,-0.098020,-0.092968,-0.108004,-0.141455,-0.185084,0.206683,0.280680,0.139412,-0.174212,0.458103,-0.075598,0.067019,0.041926,-0.252073,0.001269


In [319]:
rf = RandomForestRegressor()
rf.fit(embed_xs, to_nn.train.y)

RandomForestRegressor()

In [320]:
rf.feature_importances_

array([0.0919876 , 0.06232346, 0.03996753, 0.04273717, 0.04785761,
       0.06992517, 0.034316  , 0.05330741, 0.0465455 , 0.06563766,
       0.12412338, 0.06826368, 0.06640641, 0.10977912, 0.0768223 ])

In [321]:
# most important features: state_0, segment_0
len(embed_xs)

498307