# Loading all the model results in atoti & exploring them.

## 1. Reading the dataset

In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("./train.csv")

In [3]:
# converting vehicle age to an integer code


def process_vehicule_age(df):
    """Return a copy of ``df`` whose ``Vehicle_Age`` column is an integer code.

    "< 1 Year" becomes 0, "1-2 Year" becomes 1 and every other value
    becomes 2. The frame passed in is not modified.
    """
    known_codes = {"< 1 Year": 0, "1-2 Year": 1}
    encoded = df.copy()
    # Values missing from the lookup come back as NaN and fall through to 2.
    encoded["Vehicle_Age"] = (
        encoded["Vehicle_Age"].map(known_codes).fillna(2).astype("int")
    )
    return encoded

In [4]:
data = process_vehicule_age(data)

In [5]:
# Cast each column to its target dtype, one column at a time.
# Note: the code columns that were read as floats keep their decimal part once
# cast to "str" (Region_Code shows up as "28.0" in the preview below).

cols_types = {
    "str": [
        "Gender",
        "Driving_License",
        "Region_Code",
        "Previously_Insured",
        "Vehicle_Damage",
        "Policy_Sales_Channel",
    ],
    "float": ["Age", "Annual_Premium", "Vehicle_Age", "Vintage"],
    "int": ["id", "Response"],
}

for target_dtype, column_names in cols_types.items():
    for column_name in column_names:
        data[column_name] = data[column_name].astype(target_dtype)

In [6]:
data.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44.0,1,28.0,0,2.0,Yes,40454.0,26.0,217.0,1
1,2,Male,76.0,1,3.0,0,1.0,No,33536.0,26.0,183.0,0
2,3,Male,47.0,1,28.0,0,2.0,Yes,38294.0,26.0,27.0,1
3,4,Male,21.0,1,11.0,1,0.0,No,28619.0,152.0,203.0,0
4,5,Female,29.0,1,41.0,1,0.0,No,27496.0,152.0,39.0,0


In [7]:
# Move the "Response" label to the first position; the other columns keep their order.

cols = list(data.columns)
cols.remove("Response")
cols.insert(0, "Response")
data = data.reindex(columns=cols)

In [8]:
# Splitting the labels from the data: the first column is the label
# ("Response" was moved to the front in the previous cell), the rest are features.

y = data.iloc[:, 0]
X_raw = data.iloc[:, 1:]

In [9]:
# Split X_raw / y into train and test parts: 10% of the rows are held out as
# the test set, and random_state pins the split.

X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.1, random_state=42
)

In [10]:
# Further splitting X_train into X_train_subset (for training XGB etc.) and
# X_val: 10% of the training rows become the validation set.
X_train_subset, X_val, y_train_subset, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [11]:
# Report the shape of every split produced above.

for split_label, split in (
    ("Train data", X_train),
    ("Train label", y_train),
    ("Test data", X_test),
    ("Test label", y_test),
    ("Train subset data", X_train_subset),
    ("Train subset label", y_train_subset),
    ("Val data", X_val),
    ("Val label", y_val),
):
    print("{} size: {}\n".format(split_label, split.shape))

Train data size: (342998, 11)

Train label size: (342998,)

Test data size: (38111, 11)

Test label size: (38111,)

Train subset data size: (308698, 11)

Train subset label size: (308698,)

Val data size: (34300, 11)

Val label size: (34300,)



### Now we are only concerned with the test data

Because we used the train and validation data for training and validating our model.

In [12]:
# Re-attach the test labels to the test features, then renumber the rows from 0
# so the model result frames (which use a default index) line up by position.
master_data = X_test.join(y_test).reset_index(drop=True)

## Loading the results of the various models

In [13]:
# Each CSV holds the predictions of one model.

# Extra tree with one-hot encoding
model_results_extra_tree_OH = pd.read_csv("./model_results_extra_tree_OH.csv")
# Predictions from the neural network
model_results_nn = pd.read_csv("./model_results_nn.csv")
# Predictions from Partial Least Squares + Bayesian Mixture Model
model_results_pls_bgm = pd.read_csv("./model_results_pls_bgm.csv")
# Predictions from Partial Least Squares + XGBoost
model_results_pls_xgb = pd.read_csv("./model_results_pls_xgb.csv")
# Predictions from feature encoding + random forest
model_results_rf_glmm = pd.read_csv("./model_results_rf_glmm.csv")
# XGBoost with one-hot encoding
model_results_xgb_OH = pd.read_csv("./model_results_xgb_OH.csv")
# Predictions from feature encoding + XGBoost
model_results_xgb = pd.read_csv("./model_results_xgb.csv")

In [14]:
# Rename the single result column of every model frame to the same name.
for model_predictions in (
    model_results_nn,
    model_results_extra_tree_OH,
    model_results_pls_bgm,
    model_results_pls_xgb,
    model_results_rf_glmm,
    model_results_xgb_OH,
    model_results_xgb,
):
    model_predictions.columns = ["predicted_response"]

In [15]:
master_data.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,200223,Female,21.0,1,3.0,1,0.0,No,20408.0,160.0,72.0,0
1,49767,Male,55.0,1,15.0,0,1.0,Yes,37498.0,26.0,102.0,0
2,172202,Female,41.0,1,3.0,0,1.0,Yes,2630.0,26.0,90.0,0
3,160714,Female,26.0,1,11.0,0,0.0,No,2630.0,151.0,268.0,0
4,53273,Male,51.0,1,40.0,0,1.0,Yes,34006.0,124.0,265.0,0


In [16]:
# The deep-learning predictions serve as the baseline, hence the plain
# "predicted_response" column name. The join is on the (reset) row index.

master_data2 = master_data.join(model_results_nn).astype({"predicted_response": int})

In [17]:
master_data2

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,predicted_response
0,200223,Female,21.0,1,3.0,1,0.0,No,20408.0,160.0,72.0,0,0
1,49767,Male,55.0,1,15.0,0,1.0,Yes,37498.0,26.0,102.0,0,1
2,172202,Female,41.0,1,3.0,0,1.0,Yes,2630.0,26.0,90.0,0,1
3,160714,Female,26.0,1,11.0,0,0.0,No,2630.0,151.0,268.0,0,0
4,53273,Male,51.0,1,40.0,0,1.0,Yes,34006.0,124.0,265.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38106,210207,Female,42.0,1,28.0,0,1.0,Yes,47708.0,26.0,298.0,0,1
38107,220715,Female,20.0,1,3.0,0,0.0,Yes,77251.0,160.0,278.0,0,0
38108,250135,Male,22.0,1,12.0,0,0.0,Yes,2630.0,152.0,33.0,0,0
38109,248993,Male,72.0,1,48.0,0,1.0,Yes,2630.0,15.0,121.0,0,0


# calculate the F1 and AUC for all the models, and print it here.

In [18]:
master_data2.dtypes

id                        int32
Gender                   object
Age                     float64
Driving_License          object
Region_Code              object
Previously_Insured       object
Vehicle_Age             float64
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel     object
Vintage                 float64
Response                  int32
predicted_response        int32
dtype: object

# Loading this data into atoti

In [19]:
import atoti as tt

Welcome to atoti 0.5.1!

By using this community edition, you agree with the license available at https://www.atoti.io/eula.
Browse the official documentation at https://docs.atoti.io.
Join the community at https://www.atoti.io/register.

You can hide this message by setting the ATOTI_HIDE_EULA_MESSAGE environment variable to True.


In [20]:
# Creating a session spins up an in-memory database (similar to Apache Spark),
# ready to slice'n'dice a big data set.
# In addition to that, it launches a dashboarding Tableau-like web-app.

from atoti.config import create_config

# NOTE(review): metadata_db presumably names the file where atoti persists its
# dashboard metadata between runs - confirm against the atoti 0.5 documentation.
config = create_config(metadata_db="./metadata.db")
session = tt.create_session(config=config)

# data dictionary

**Gender** -  Gender of the customer

**Age** -  Age of the customer

**Driving_License** - 0 : Customer does not have DL, 1 : Customer already has DL

**Region_Code** - Unique code for the region of the customer

**Previously_Insured** - 1 : Customer already has Vehicle Insurance, 0 : Customer doesn't have Vehicle Insurance

**Vehicle_Age** - Age of the vehicle, encoded in this notebook as 0 : less than 1 year, 1 : 1-2 years, 2 : anything else

**Vehicle_Damage** - Yes : Customer got his/her vehicle damaged in the past. No : Customer didn't get his/her vehicle damaged in the past.

**Annual_Premium** - The amount customer needs to pay as premium in the year

**Policy_Sales_Channel** - Anonymized code for the channel used to reach out to the customer, i.e. different agents, over mail, over phone, in person, etc.

**Vintage** - Number of Days, Customer has been associated with the company

**Response** - 1 : Customer is interested, 0 : Customer is not interested

# Column name and corresponding model dictionary:

 **'extra_tree'** - Extra tree classifier after one hot coding on raw data
 
 **'nn_test'** - Deep learning model built using ktrain
 
 **'pls_bgm'** - One-hot encoding, then Partial Least Squares, and then a Bayesian Mixture Model on top of that
 
 **'pls_xgb'** - One-hot encoding, then Partial Least Squares, and then XGBoost on top.
 
 **'rf_glmm'** - Generalized Linear Mixed Model Encoder then feature encoding and random forest classifier to make predictions
 
 **'xgb_oh'** - One hot coding then XGBoost on top.
 
 **'xgb_glmm'** - Generalized Linear Mixed Model Encoder then feature encoding and XGBoost to make predictions

In [21]:
# Load the baseline frame into an atoti store keyed by customer `id`.
# Policy_Sales_Channel and Region_Code are explicitly typed as STRING.
customer_store = session.read_pandas(
    master_data2,
    store_name="customer_store",
    types={"Policy_Sales_Channel": tt.type.STRING, "Region_Code": tt.type.STRING},
    keys=["id"],
)
customer_store.head()

The store has been sampled because there are more than 10000 lines in the files to load. Call Session.load_all_data() to trigger the full load of the data.


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,predicted_response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
200223,Female,21.0,1,3.0,1,0.0,No,20408.0,160.0,72.0,0,0
49767,Male,55.0,1,15.0,0,1.0,Yes,37498.0,26.0,102.0,0,1
172202,Female,41.0,1,3.0,0,1.0,Yes,2630.0,26.0,90.0,0,1
160714,Female,26.0,1,11.0,0,0.0,No,2630.0,151.0,268.0,0,0
53273,Male,51.0,1,40.0,0,1.0,Yes,34006.0,124.0,265.0,0,1


In [22]:
# creating a cube from this store

cube = session.create_cube(customer_store, "model_cube")

# Short aliases for the cube's hierarchies, measures and levels; the cells
# below use them to define and query measures.
h = cube.hierarchies
m = cube.measures
l = cube.levels
cube

We create a simple function that helps change the dimension of our hierarchies as we group them into logical categories.

In [23]:
def set_dim(hier_name: str, dim_name: str) -> None:
    """Assign the hierarchy ``hier_name`` (looked up in ``h``) to dimension ``dim_name``."""
    h[hier_name].dimension = dim_name

In [24]:
# Group the id / label / prediction hierarchies under the "Customer" dimension.
customer_hierarchy = ["id", "Response", "predicted_response"]
for hier_name in customer_hierarchy:
    set_dim(hier_name, "Customer")
cube

# Model F1-score
Let's look at the F1-score of the models as we compute the number of correctly predicted responses in the cube below.
Looking at just the customers who were actually interested, the baseline neural network detects about 66% of them (Recall of 0.66).
But, at the same time, only about 30% of its positive predictions are correct (Precision of 0.30). Which means that roughly 70% of the time it is predicting interest wrongly!

As a consequence, we will focus on the F1-score to compare our classifiers in the following paragraphs, as it takes into account Precision and Recall at the same time.

In [25]:
customer_store.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,predicted_response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
181713,Male,60.0,1,11.0,1,1.0,No,45324.0,124.0,233.0,0,0
285193,Female,31.0,1,46.0,1,0.0,No,32639.0,152.0,62.0,0,0
249394,Male,71.0,1,39.0,1,1.0,No,2630.0,13.0,250.0,0,0
351908,Female,44.0,1,28.0,0,1.0,Yes,58818.0,124.0,43.0,0,1
254637,Male,24.0,1,8.0,1,0.0,No,46899.0,152.0,197.0,0,0


In [26]:
session.load_all_data()

In [27]:
def _outcome_count(predicted, actual):
    """Build a measure that sums 1 for each `id` whose (predicted_response,
    Response) pair equals (predicted, actual) and 0 otherwise."""
    is_match = (l["predicted_response"] == predicted) & (l["Response"] == actual)
    return tt.agg.sum(tt.where(is_match, 1, 0), scope=tt.scope.origin(l["id"]))


# The four cells of the confusion matrix.
m["true positive"] = _outcome_count(1, 1)
m["true negative"] = _outcome_count(0, 0)
m["false positive"] = _outcome_count(1, 0)
m["false negative"] = _outcome_count(0, 1)

What proportion of positive identifications was actually correct? Precision is defined as follows:

In [28]:
# precision = TP / (TP + FP)
predicted_positives = m["true positive"] + m["false positive"]
m["precision"] = m["true positive"] / predicted_positives

What proportion of actual positives was identified correctly? Recall, also known as True Positive Rate (TPR), is defined as follows:

In [29]:
# recall = TP / (TP + FN)
actual_positives = m["true positive"] + m["false negative"]
m["recall"] = m["true positive"] / actual_positives

In [30]:
session.visualize("Confusion matrix")

In [31]:
# accuracy = (TP + TN) / contributors.COUNT
correct_predictions = m["true positive"] + m["true negative"]
m["accuracy score"] = correct_predictions / m["contributors.COUNT"]

F1 score conveys the balance between the precision and the recall. It is defined as follows:  
__F1 Score = 2*((precision*recall)/(precision+recall))__

In [32]:
# F1 = 2 * (recall * precision) / (recall + precision)
recall_times_precision = m["recall"] * m["precision"]
recall_plus_precision = m["recall"] + m["precision"]
m["f1 score"] = 2 * (recall_times_precision / recall_plus_precision)

Specificity: When the actual value is negative, how often is the prediction correct? It is defined by  
__specificity = TN / (TN + FP)__

False Positive Rate: When the actual value is negative, how often is the prediction incorrect? It is defined by  
__false positive rate = FP / (TN + FP)__

In [33]:
# false positive rate = FP / (TN + FP)
actual_negatives = m["true negative"] + m["false positive"]
m["false positive rate"] = m["false positive"] / actual_negatives

In [34]:
cube.query(
    m["accuracy score"],
    m["precision"],
    m["recall"],
    m["f1 score"],
    m["false positive rate"],
)

Unnamed: 0,accuracy score,precision,recall,f1 score,false positive rate
0,0.76,0.3,0.66,0.41,0.23


In [35]:
# Cross-check the cube measures against scikit-learn on the same baseline frame.
actual_labels = master_data2["Response"]
predicted_labels = master_data2["predicted_response"]

for caption, scorer in (
    ("accuracy score: ", metrics.accuracy_score),
    ("precision score: ", metrics.precision_score),
    ("recall score: ", metrics.recall_score),
    ("F1 score: ", metrics.f1_score),
):
    print(caption, scorer(actual_labels, predicted_labels))

accuracy score:  0.760331662774527
precision score:  0.295670141823988
recall score:  0.6581643320091992
F1 score:  0.40803629293583926


In [36]:
# https://stackoverflow.com/questions/50848163/manually-calculate-auc
# 1/2 - FPR/2 + TPR/2, i.e. (TPR + (1 - FPR)) / 2, with "recall" used as the TPR.
# This is computed from the hard 0/1 predictions, i.e. a single operating point.
m["AUC"] = 0.5 - (m["false positive rate"] / 2) + (m["recall"] / 2)

In [37]:
cube.query(m["AUC"])

Unnamed: 0,AUC
0,0.72


In [38]:
print(
    "AUC: ",
    metrics.roc_auc_score(master_data2["Response"], master_data2["predicted_response"]),
)

AUC:  0.7165791655245228


In [39]:
session.visualize()

In [40]:
session.visualize()

In [41]:
master_data

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,200223,Female,21.0,1,3.0,1,0.0,No,20408.0,160.0,72.0,0
1,49767,Male,55.0,1,15.0,0,1.0,Yes,37498.0,26.0,102.0,0
2,172202,Female,41.0,1,3.0,0,1.0,Yes,2630.0,26.0,90.0,0
3,160714,Female,26.0,1,11.0,0,0.0,No,2630.0,151.0,268.0,0
4,53273,Male,51.0,1,40.0,0,1.0,Yes,34006.0,124.0,265.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
38106,210207,Female,42.0,1,28.0,0,1.0,Yes,47708.0,26.0,298.0,0
38107,220715,Female,20.0,1,3.0,0,0.0,Yes,77251.0,160.0,278.0,0
38108,250135,Male,22.0,1,12.0,0,0.0,Yes,2630.0,152.0,33.0,0
38109,248993,Male,72.0,1,48.0,0,1.0,Yes,2630.0,15.0,121.0,0


In [42]:
# 1. model_results_extra_tree_OH
# Attach the extra-tree predictions to the test rows (joined on the row index).
# Only the prediction column is selected before joining so that this cell is
# safe to re-run: after the first run the name holds the joined frame, and
# joining that frame again as-is would fail on the overlapping feature columns.
model_results_extra_tree_OH = master_data.join(
    model_results_extra_tree_OH[["predicted_response"]]
)
model_results_extra_tree_OH["predicted_response"] = model_results_extra_tree_OH[
    "predicted_response"
].astype(int)

In [43]:
model_results_extra_tree_OH

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,predicted_response
0,200223,Female,21.0,1,3.0,1,0.0,No,20408.0,160.0,72.0,0,0
1,49767,Male,55.0,1,15.0,0,1.0,Yes,37498.0,26.0,102.0,0,0
2,172202,Female,41.0,1,3.0,0,1.0,Yes,2630.0,26.0,90.0,0,1
3,160714,Female,26.0,1,11.0,0,0.0,No,2630.0,151.0,268.0,0,0
4,53273,Male,51.0,1,40.0,0,1.0,Yes,34006.0,124.0,265.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38106,210207,Female,42.0,1,28.0,0,1.0,Yes,47708.0,26.0,298.0,0,1
38107,220715,Female,20.0,1,3.0,0,0.0,Yes,77251.0,160.0,278.0,0,0
38108,250135,Male,22.0,1,12.0,0,0.0,Yes,2630.0,152.0,33.0,0,0
38109,248993,Male,72.0,1,48.0,0,1.0,Yes,2630.0,15.0,121.0,0,0


In [44]:
customer_store.scenarios["Extra tree"].load_pandas(model_results_extra_tree_OH)

In [45]:
session.visualize()

In [46]:
# Per-customer confusion-matrix breakdown for the default scenario (no
# `scenario` argument). The measure list previously repeated "true negative"
# and left out "false positive"; all four outcomes are now queried once each.
# The result gets a descriptive name instead of the single letter `x`.
base_breakdown = cube.query(
    m["contributors.COUNT"],
    m["true negative"],
    m["true positive"],
    m["false negative"],
    m["false positive"],
    m["false positive rate"],
    levels=[l["id"], l["Response"], l["predicted_response"]],
)

In [47]:
# Same per-customer breakdown for the "Extra tree" scenario.
# The measure list previously repeated "true negative" and left out
# "false positive"; all four outcomes are now queried once each.
# The result is no longer bound to `y`, which holds the label Series used by
# the train/test split cells and would otherwise be silently replaced.
extra_tree_breakdown = cube.query(
    m["contributors.COUNT"],
    m["true negative"],
    m["true positive"],
    m["false negative"],
    m["false positive"],
    m["false positive rate"],
    levels=[l["id"], l["Response"], l["predicted_response"]],
    scenario="Extra tree",
)

In [48]:
# Creating the frames for the other scenarios.


def _attach_predictions(predictions):
    """Return ``master_data`` plus the integer ``predicted_response`` of one model.

    Only the ``predicted_response`` column of ``predictions`` is joined (on the
    row index). This keeps the cell re-runnable: once a name below has been
    rebound to a joined frame, passing it in again yields the same result
    instead of failing on overlapping feature columns.
    """
    joined = master_data.join(predictions[["predicted_response"]])
    joined["predicted_response"] = joined["predicted_response"].astype(int)
    return joined


# 2. model_results_pls_bgm
model_results_pls_bgm = _attach_predictions(model_results_pls_bgm)

# 3. model_results_pls_xgb
model_results_pls_xgb = _attach_predictions(model_results_pls_xgb)

# 4. model_results_rf_glmm
model_results_rf_glmm = _attach_predictions(model_results_rf_glmm)

# 5. model_results_xgb_OH
model_results_xgb_OH = _attach_predictions(model_results_xgb_OH)

# 6. model_results_xgb
model_results_xgb = _attach_predictions(model_results_xgb)

In [49]:
# The first scenario has been checked, so the remaining ones are loaded the same way.

scenario_frames = {
    "PLS BGM": model_results_pls_bgm,
    "PLS XGB": model_results_pls_xgb,
    "GLMM RF": model_results_rf_glmm,
    "OH XGB": model_results_xgb_OH,
    "GLMM XGB": model_results_xgb,
}
for scenario_name, scenario_frame in scenario_frames.items():
    customer_store.scenarios[scenario_name].load_pandas(scenario_frame)

In [50]:
session.visualize()

In [51]:
# F1 and AUC from sklearn, one report per model.

scored_models = [
    ("Base", master_data2),
    ("Extra Tree", model_results_extra_tree_OH),
    ("GLMM RF", model_results_rf_glmm),
    ("GLMM XGB", model_results_xgb),
    ("OH XGB", model_results_xgb_OH),
    ("PLS BGM", model_results_pls_bgm),
    ("PLS XGB", model_results_pls_xgb),
]

for position, (model_label, scored_frame) in enumerate(scored_models, start=1):
    actual = scored_frame["Response"]
    predicted = scored_frame["predicted_response"]
    # Every report except the last one ends with an extra " \n" separator.
    separator = (" \n",) if position < len(scored_models) else ()
    print(model_label)
    print(
        "F1:",
        f1_score(actual, predicted),
        "AUC:",
        roc_auc_score(actual, predicted),
        *separator,
    )

Base
F1: 0.40803629293583926 AUC: 0.7165791655245228  

Extra Tree
F1: 0.2391274333839569 AUC: 0.5666151305095288  

GLMM RF
F1: 0.17178304437728645 AUC: 0.5419876128539337  

GLMM XGB
F1: 0.2911420366688756 AUC: 0.5934775478774491  

OH XGB
F1: 0.2843464707192162 AUC: 0.5896566093267793  

PLS BGM
F1: 0.3788514566598729 AUC: 0.7629677072814193  

PLS XGB
F1: 0.2790268649832814 AUC: 0.5862832178174643


In [53]:
session.visualize("Slicing Data on Gender")

In [54]:
session.visualize("Slicing Data on Driving License")

In [55]:
# age group buckets: one lookup row per integer age, labelled with its group
age_groups_store = session.read_pandas(
    pd.DataFrame(
        data=[("0-30Y", i) for i in range(30)]
        + [("30Y - 40Y", i) for i in range(30, 40)]
        + [("40Y - 50Y", i) for i in range(40, 50)]
        + [("50Y+", i) for i in range(50, 200)],
        columns=["age group", "age"],
    ),
    keys=["age"],
    store_name="Age Groups",
)

# The lookup key is named "age" while the customer store column is "Age", so
# the mapping is spelled out instead of relying on identically named columns.
# NOTE(review): "Age" is a float column in the customer store and "age" is
# built from ints here - confirm atoti joins them as expected.
customer_store.join(age_groups_store, mapping={"Age": "age"})

# annual premium buckets: one lookup row per integer premium, labelled with its group

premium_store = session.read_pandas(
    pd.DataFrame(
        data=[("0 - 10030", i) for i in range(10030)]
        # Starts at 10030 (it was 2630): the old range overlapped the first
        # bucket, producing two rows with different labels for every key
        # from 2630 to 10029.
        + [("10030 - 24258", i) for i in range(10030, 24258)]
        + [("24258 - 31560", i) for i in range(24258, 31560)]
        + [("31560 - 39310", i) for i in range(31560, 39310)]
        + [("39310 - 55098", i) for i in range(39310, 55098)]
        + [("55098+", i) for i in range(55098, 448156)],
        columns=["Annual_Premium group", "Annual_Premium"],
    ),
    keys=["Annual_Premium"],
    store_name="Premium Groups",
)

# Both stores share the "Annual_Premium" column name, so no mapping is needed.
customer_store.join(premium_store)

In [56]:
session.visualize("Slicing Data on Age")

# this slicing, in particular, does not seem to make a difference.

In [57]:
session.visualize("Slicing Data on Premium")

# *********** The End **************