# Introdution

We're building a model to estimate transition probailities between segments.

# Data 

Data source is the loyalty data. Specifically, we're using the transaction data as the core basis of the final training data.

## Training data schema

Since we are to estimate transition probabilities of customers across different segments throughout time intervals, the training data would be like below:

|   customer_id    |   datetime  |  segment(t)|  segment(t-1)| dynamic features| fixed features |
|:-----------------|:------------|:-----------|:-------------|:----------------|:---------------|
|     12345        | 2021-01-01  |  segment1  |    segment1  |  feature value  |  feature value |
|     12345        | 2021-02-01  |  segment1  |    segment1  |  feature value  |  feature value |
|     12345        | 2021-03-01  |  segment1  |    segment1  |  feature value  |  feature value |
|     56789        | 2021-01-01  |  segment2  |    segment1  |  feature value  |  feature value |
|     56789        | 2021-02-01  |  segment5  |    segment2  |  feature value  |  feature value |
|     56789        | 2021-03-01  |  segment1  |    segment5  |  feature value  |  feature value |

Considering the size of the data, ideally all the data generating process should happen in spark or Hive.

## Features

- RFM-based features
- customer-vec
- metrics based on amount/number of transaction
- specific product purchases

## Model

we can directly estimate transition probability as a function of some time-dependent/independent covariates. It can be modeled as log odds of transitioning from one segment to another, given the current segment:

$$\pi_{ss^\prime} = \log\frac{p(s_t = s | s_{t-1} = s^\prime)}{p(s_t = s^\prime | s_{t-1} = s^\prime)} = \alpha_i + f(\mathbf{X}_t) + g(\mathbf{Z}) + \varepsilon$$

Considering how we create our training data, this is simply a softmax function.


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [5]:
import sys

sys.path.insert(0, '.')
sys.path.insert(0, '..')

In [8]:
import setenv
setenv.set_env()

In [50]:
import matplotlib.pyplot as plt
import glob
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import column_or_1d
# from catboost import CatBoostClassifier, Pool
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix
# from aiutils.utils import Utils
import setenv

## Reading raw data

In [51]:
file_extension = '.csv'
all_filenames = [i for i in glob.glob(f"finalDF_train/*{file_extension}")]

In [None]:
colnames = []  # List of column names
df = pd.concat([pd.read_csv(f, names=colnames, header=None) for f in all_filenames])
df.head()

In [62]:
dummy_cols = ["state_mostCommon", "psaname_mostCommon", "RFMScore", "customer_segment"]
df_ = pd.get_dummies(df, columns=dummy_cols, drop_first=True)

In [63]:
df_ = df_.fillna(0)

In [67]:
drop_cols = ["memberid", "chunkid", "segment_prev", "city_mostCommon", "categoryname_mostCommon"]
dftmp = df_.drop(drop_cols, axis=1, errors='ignore').copy()
dftmp.to_csv("train_df.csv", index=False)

In [69]:
# another way of splitting

memberid_lst = list(df_["memberid"].unique())

test_size = 0.3
split_obs = int(len(memberid_lst)*(1-test_size))
train_lst = memberid_lst[:split_obs]
test_lst = memberid_lst[split_obs:]

train = df_.loc[df_["memberid"].isin(train_lst)].copy()
test = df_.loc[df_["memberid"].isin(test_lst)].copy()

drop_cols = ["memberid", "chunkid", "segment_prev", "city_mostCommon", "categoryname_mostCommon"]
train = train.drop(drop_cols, axis=1, errors='ignore').copy()
test = test.drop(drop_cols, axis=1, errors='ignore').copy()

In [70]:
xcols = [col for col in train.columns if col != 'segment']
ycol = ['segment']
X_train = train[xcols].copy()
y_train = train[ycol].copy()
X_test = test[xcols].copy()
y_test = test[ycol].copy()

## LR

In [72]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train, y_train)

In [74]:
ypred = lr.predict_proba(X_test,)[:, 1]

In [77]:
roc_auc_score(y_test.values.reshape(-1,), ypred)

0.7828138843008257

In [78]:
feat_coef = dict(zip(list(X_train.columns), list(lr.coef_[0])))

## GB

In [80]:
gb = GradientBoostingClassifier()

In [81]:
gb.fit(X_train, column_or_1d(y_train))

GradientBoostingClassifier()

In [82]:
ypredgb = gb.predict_proba(X_test)

In [83]:
roc_auc_score(y_test.values.reshape(-1,), ypredgb[:,1])

0.9720453990645426

In [None]:
plt.figure(figsize=(15,10))
feat_importances = pd.Series(gb.feature_importances_, index=X_train.columns)
feat_importances.nlargest(10).plot(kind='barh')

In [None]:
imp_feat = list(feat_importances.nlargest(10).index)
for col in imp_feat:
    print(col, feat_coef[col])

## RF

In [None]:
model = RandomForestClassifier(max_depth=5, n_estimators=100)

In [None]:
model.fit(X_train, y_train)

In [None]:
yprob = model.predict_proba(X_test)
ypred = model.predict(X_test)

In [None]:
roc_auc_score(y_test.values.reshape(-1,), yprob[:,1])

In [None]:
ypred1 = model.predict(X_test)
cnf_matrix = confusion_matrix(y_test.values, ypred1)
class_names = ["stay_1", "leave_1"]
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

## Catboost

In [None]:
train_pool = Pool(X_train, y_train)
m = CatBoostClassifier()
m.fit(X_train, y_train)

In [None]:
yprob = m.predict_proba(X_test)
ypred = m.predict(X_test)

In [None]:
roc_auc_score(y_test.values.reshape(-1,), yprob[:,1])

In [None]:
cnf_matrix = confusion_matrix(y_test.values, ypred)
class_names = ["stay_1", "leave_1"]
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

## LightGBM

In [None]:
import lightgbm 

In [None]:
lgbm = lightgbm.LGBMClassifier(boosting_type= 'gbdt', 
                               num_leaves=300, 
                               n_estimators=30, 
                               learning_rate=0.2, 
                               is_unbalance=True)

In [None]:
lgbm.fit(X_train, y_train)

In [None]:
yprob = lgbm.predict_proba(X_test)
ypred = lgbm.predict(X_test)

In [None]:
roc_auc_score(y_test.values.reshape(-1,), yprob[:,1])

In [None]:
cnf_matrix = confusion_matrix(y_test.values, ypred)
class_names = ["stay_1", "leave_1"]
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

In [None]:
plt.figure(figsize=(15,5))
feat_importances = pd.Series(lgbm.feature_importances_, index=X_train.columns)
feat_importances.nlargest(20).plot(kind='barh')

In [None]:
# visit time
print("Visit time for staying in segment 1", np.mean(df.loc[df["segment"] == 0, "visit_time"]))
print("Visit time for leaving in segment 1", np.mean(df.loc[df["segment"] == 1, "visit_time"]))

In [None]:
# visit time
print("Visit time for staying in segment 1", np.std(df.loc[df["segment"] == 0, "visit_time"]))
print("Visit time for leaving in segment 1", np.std(df.loc[df["segment"] == 1, "visit_time"]))

In [None]:
# avg basket size
print("Avg basket size for staying in segment 1", np.mean(df.loc[df["segment"] == 0, "avg_basket_size"]))
print("Avg basket size for leaving in segment 1", np.mean(df.loc[df["segment"] == 1, "avg_basket_size"]))

In [None]:
# avg basket size
print("Avg basket size for staying in segment 1", np.std(df.loc[df["segment"] == 0, "avg_basket_size"]))
print("Avg basket size for leaving in segment 1", np.std(df.loc[df["segment"] == 1, "avg_basket_size"]))

In [None]:
# avg smoke basket size
print("Smoke basket size for staying in segment 1", np.mean(df.loc[df["segment"] == 0, "smoke_basket_size"]))
print("Smoke basket size for leaving in segment 1", np.mean(df.loc[df["segment"] == 1, "smoke_basket_size"]))

In [None]:
# avg smoke basket size
print("Smoke basket size for staying in segment 1", np.std(df.loc[df["segment"] == 0, "smoke_basket_size"]))
print("Smoke basket size for leaving in segment 1", np.std(df.loc[df["segment"] == 1, "smoke_basket_size"]))

In [None]:
# ecig basket size
print("Ecig basket size for staying in segment 1", np.mean(df.loc[df["segment"] == 0, "ecig_basket_size"]))
print("Ecig basket size for leaving in segment 1", np.mean(df.loc[df["segment"] == 1, "ecig_basket_size"]))

In [None]:
# ecig basket size
print("Ecig basket size for staying in segment 1", np.std(df.loc[df["segment"] == 0, "ecig_basket_size"]))
print("Ecig basket size for leaving in segment 1", np.std(df.loc[df["segment"] == 1, "ecig_basket_size"]))

In [None]:
# most common non-smoke PSA
df.loc[df["segment"] == 0, "psaname_mostCommon"].value_counts(normalize=True)[:10]

In [None]:
# most common non-smoke PSA
df.loc[df["segment"] == 1, "psaname_mostCommon"].value_counts(normalize=True)[:10]

In [None]:
# most common non-smoke category
df.loc[df["segment"] == 0, "categoryname_mostCommon"].value_counts(normalize=True)[:10]

In [None]:
# most common non-smoke category
df.loc[df["segment"] == 1, "categoryname_mostCommon"].value_counts(normalize=True)[:10]

In [None]:
# most common city
df.loc[df["segment"] == 0, "city_mostCommon"].value_counts(normalize=True)[:10]

In [None]:
# most common city
df.loc[df["segment"] == 1, "city_mostCommon"].value_counts(normalize=True)[:10]

# Evaluation

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.figure(figsize=(15,5))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
ypredgb1 = gb.predict(X_test)
cnf_matrix = confusion_matrix(y_test.values, ypredgb1)
class_names = ["stay_1", "leave_1"]
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')