# Insurance Claim Prediciton

In this notbook:
- loading data
- data prepration
- building feautre store
- building training set from FS
- build Pure Premium Modeling using AWS xgboost algo
- differnt ways of inference

What we can predict in this dataset?
1. __Claim Amount:__ total claims amount per policy holder.
1. __Claim Frequency:__ Number of claims per policy holder per exposure unit `Claim Frequency = Claim Count / Exposure`.
1. __Claim Severity:__ the average claim amount per claim for each policy holder per exposure unit `Claim Severity = Claim Cost / Claim Frequency`.
1. __Avg Claim amount:__ `Avg Claim amount = Claim Amount / Claim Count`
1. __Loss Cost:__ `Loss Cost = Claim Frequency x Claim Severity`
1. __Pure Premium:__ the mean of the total claim amount per exposure unit (the average loss per exposure) `PurePremium  = Claim Amount / Exposure`.

In [1]:
# !conda update scikit-learn -y
!pip install -U scikit-learn



In [2]:
import sklearn
sklearn.__version__ 

'0.24.2'

In [3]:
print(__doc__)

from functools import partial

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import GammaRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import mean_tweedie_deviance
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer

from sklearn.metrics import mean_absolute_error, mean_squared_error, auc

Automatically created module for IPython interactive environment


We have 6780013 individual car insurance policies and for each policy we have 12 variables
1. IDpol: policy number (unique identifier);
2. ClaimNb: number of claims on the given policy;
3. Exposure: total exposure in yearly units;
4. Area: area code (categorical, ordinal);
5. VehPower: power of the car (categorical, ordinal);
6. VehAge: age of the car in years;
7. DrivAge: age of the (most common) driver in years;
8. BonusMalus: bonus-malus level between 50 and 230 (with reference level 100);
9. VehBrand: car brand (categorical, nominal);
10. VehGas: diesel or regular fuel car (binary);
11. Density: density of inhabitants per km2
in the city of the living place of the driver;
12. Region: regions in France (prior to 2016), these are illustrated in Figure 1 (categorical).

In [4]:
df_freq = fetch_openml(data_id=41214, as_frame=True)['data']
df_freq.head(10)

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region
0,1.0,1.0,0.1,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82
1,3.0,1.0,0.77,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82
2,5.0,1.0,0.75,B,6.0,2.0,52.0,50.0,B12,Diesel,54.0,R22
3,10.0,1.0,0.09,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72
4,11.0,1.0,0.84,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72
5,13.0,1.0,0.52,E,6.0,2.0,38.0,50.0,B12,Regular,3003.0,R31
6,15.0,1.0,0.45,E,6.0,2.0,38.0,50.0,B12,Regular,3003.0,R31
7,17.0,1.0,0.27,C,7.0,0.0,33.0,68.0,B12,Diesel,137.0,R91
8,18.0,1.0,0.71,C,7.0,0.0,33.0,68.0,B12,Diesel,137.0,R91
9,21.0,1.0,0.15,B,7.0,0.0,41.0,50.0,B12,Diesel,60.0,R52


In [5]:
df_sev = fetch_openml(data_id=41215, as_frame=True)['data']
df_sev.head()

Unnamed: 0,IDpol,ClaimAmount
0,1552.0,995.2
1,1010996.0,1128.12
2,4024277.0,1851.11
3,4007252.0,1204.0
4,4046424.0,1204.0


In [6]:
def load_mtpl2(n_samples=100000):
    """Fetch the French Motor Third-Party Liability Claims dataset.

    Parameters
    ----------
    n_samples: int, default=100000
      number of samples to select (for faster run time). Full dataset has
      678013 samples.
    """
    # freMTPL2freq dataset from https://www.openml.org/d/41214
    df_freq = fetch_openml(data_id=41214, as_frame=True)['data']
    df_freq['IDpol'] = df_freq['IDpol'].astype(np.int)
    df_freq.set_index('IDpol', inplace=True)

    # freMTPL2sev dataset from https://www.openml.org/d/41215
    df_sev = fetch_openml(data_id=41215, as_frame=True)['data']

    # sum ClaimAmount over identical IDs
    df_sev = df_sev.groupby('IDpol').sum()

    df = df_freq.join(df_sev, how="left")
    df["ClaimAmount"].fillna(0, inplace=True)

    # unquote string fields
    for column_name in df.columns[df.dtypes.values == np.object]:
        df[column_name] = df[column_name].str.strip("'")
    return df.iloc[:n_samples]

### Loading datasets, basic feature extraction and target definitions¶
We construct the freMTPL2 dataset by joining the freMTPL2freq table, containing the number of claims (ClaimNb), with the freMTPL2sev table, containing the claim amount (ClaimAmount) for the same policy ids (IDpol).

In [7]:
df = load_mtpl2(n_samples=60000)

# Note: filter out claims with zero amount, as the severity model
# requires strictly positive target values.
df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0
df.head()

Unnamed: 0_level_0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,ClaimAmount
IDpol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.0,0.1,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82,0.0
3,0.0,0.77,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82,0.0
5,0.0,0.75,B,6.0,2.0,52.0,50.0,B12,Diesel,54.0,R22,0.0
10,0.0,0.09,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72,0.0
11,0.0,0.84,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72,0.0


In [8]:
# Correct for unreasonable observations (that might be data error)
# and a few exceptionally large claim amounts
df["ClaimNb"] = df["ClaimNb"].clip(upper=4)
df["Exposure"] = df["Exposure"].clip(upper=1)
df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)


In [9]:
df.reset_index(inplace=True)

In [10]:
df.shape

(60000, 13)

In [11]:
df.head()

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,ClaimAmount
0,1,0.0,0.1,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82,0.0
1,3,0.0,0.77,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82,0.0
2,5,0.0,0.75,B,6.0,2.0,52.0,50.0,B12,Diesel,54.0,R22,0.0
3,10,0.0,0.09,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72,0.0
4,11,0.0,0.84,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72,0.0


In [12]:
log_scale_transformer = make_pipeline(
    FunctionTransformer(func=np.log),
    StandardScaler()
)

column_trans = ColumnTransformer(
    [
        ("binned_numeric", KBinsDiscretizer(n_bins=3),["VehAge", "DrivAge"]),
        ("onehot_categorical", OneHotEncoder(), ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
        ("passthrough_numeric", "passthrough",["IDpol","VehAge", "DrivAge","VehBrand", "VehPower", "VehGas", "Region", "Area", "ClaimNb","Exposure", "BonusMalus", "ClaimAmount"]),
    ],
    remainder="drop",
    sparse_threshold=0.0,

)
X = column_trans.fit_transform(df)
X.shape

(60000, 72)

In [13]:
print(X[0,:])

[1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 1.0 0.0 0.0 0.6971986019415635 1 0.0 55.0 'B12' 5.0 'Regular'
 'R82' 'D' 0.0 0.1 50.0 0.0]


In [13]:
bins=[]
for j,f in enumerate(["VehAge","DrivAge"]):
    for idx, val in enumerate(column_trans.transformers_[0][1].bin_edges_[j]):
        edge1= column_trans.transformers_[0][1].bin_edges_[j][idx-1]
        if idx>0:
            bins.append(f+"_bin"+str(edge1)+"_"+str(val))
column_trans.transformers_[1][1].get_feature_names(["VehBrand", "VehPower", "VehGas", "Region", "Area"])
feature_names = bins+\
    column_trans.transformers_[1][1].get_feature_names(["VehBrand", "VehPower", "VehGas", "Region", "Area"]).tolist()+\
    ["Density"]+\
    ["IDpol","VehAge", "DrivAge","VehBrand", "VehPower", "VehGas", "Region", "Area","ClaimNb","Exposure","BonusMalus","ClaimAmount"]


len(feature_names)

72

In [14]:
X.shape

(60000, 72)

In [15]:
feature_names = [x.replace('.','_') for x in feature_names]
feature_names

['VehAge_bin0_0_4_0',
 'VehAge_bin4_0_10_0',
 'VehAge_bin10_0_100_0',
 'DrivAge_bin18_0_36_0',
 'DrivAge_bin36_0_50_0',
 'DrivAge_bin50_0_99_0',
 'VehBrand_B1',
 'VehBrand_B10',
 'VehBrand_B11',
 'VehBrand_B12',
 'VehBrand_B13',
 'VehBrand_B14',
 'VehBrand_B2',
 'VehBrand_B3',
 'VehBrand_B4',
 'VehBrand_B5',
 'VehBrand_B6',
 'VehPower_4_0',
 'VehPower_5_0',
 'VehPower_6_0',
 'VehPower_7_0',
 'VehPower_8_0',
 'VehPower_9_0',
 'VehPower_10_0',
 'VehPower_11_0',
 'VehPower_12_0',
 'VehPower_13_0',
 'VehPower_14_0',
 'VehPower_15_0',
 'VehGas_Diesel',
 'VehGas_Regular',
 'Region_R11',
 'Region_R21',
 'Region_R22',
 'Region_R23',
 'Region_R24',
 'Region_R25',
 'Region_R26',
 'Region_R31',
 'Region_R41',
 'Region_R42',
 'Region_R43',
 'Region_R52',
 'Region_R53',
 'Region_R54',
 'Region_R72',
 'Region_R73',
 'Region_R74',
 'Region_R82',
 'Region_R83',
 'Region_R91',
 'Region_R93',
 'Region_R94',
 'Area_A',
 'Area_B',
 'Area_C',
 'Area_D',
 'Area_E',
 'Area_F',
 'Density',
 'IDpol',
 'VehAge'

In [16]:
df_transformed = pd.DataFrame(data=X, columns= feature_names)
#df_transformed.columns= feature_names
df_transformed.head()

Unnamed: 0,VehAge_bin0_0_4_0,VehAge_bin4_0_10_0,VehAge_bin10_0_100_0,DrivAge_bin18_0_36_0,DrivAge_bin36_0_50_0,DrivAge_bin50_0_99_0,VehBrand_B1,VehBrand_B10,VehBrand_B11,VehBrand_B12,...,DrivAge,VehBrand,VehPower,VehGas,Region,Area,ClaimNb,Exposure,BonusMalus,ClaimAmount
0,1,0,0,0,0,1,0,0,0,1,...,55,B12,5,Regular,R82,D,0,0.1,50,0
1,1,0,0,0,0,1,0,0,0,1,...,55,B12,5,Regular,R82,D,0,0.77,50,0
2,1,0,0,0,0,1,0,0,0,1,...,52,B12,6,Diesel,R22,B,0,0.75,50,0
3,1,0,0,0,1,0,0,0,0,1,...,46,B12,7,Diesel,R72,B,0,0.09,50,0
4,1,0,0,0,1,0,0,0,0,1,...,46,B12,7,Diesel,R72,B,0,0.84,50,0


Insurances companies are interested in modeling the Pure Premium, that is the expected total claim amount per unit of exposure for each policyholder in their portfolio:

In [17]:
df_transformed["PurePremium"] = df_transformed["ClaimAmount"] / df_transformed["Exposure"]

This can be indirectly approximated by a 2-step modeling: the product of the Frequency times the average claim amount per claim:

In [18]:
df_transformed["Frequency"] = df_transformed["ClaimNb"] / df_transformed["Exposure"]
df_transformed["AvgClaimAmount"] = df_transformed["ClaimAmount"] / np.fmax(df_transformed["ClaimNb"], 1)

In [19]:
df_transformed[df_transformed.ClaimNb>0].head(20)

Unnamed: 0,VehAge_bin0_0_4_0,VehAge_bin4_0_10_0,VehAge_bin10_0_100_0,DrivAge_bin18_0_36_0,DrivAge_bin36_0_50_0,DrivAge_bin50_0_99_0,VehBrand_B1,VehBrand_B10,VehBrand_B11,VehBrand_B12,...,VehGas,Region,Area,ClaimNb,Exposure,BonusMalus,ClaimAmount,PurePremium,Frequency,AvgClaimAmount
66,1,0,0,0,0,1,0,0,0,1,...,Regular,R11,F,1,0.75,50,303.0,404.0,1.33333,303.0
93,0,1,0,0,0,1,0,0,0,1,...,Diesel,R25,B,1,0.14,60,1981.84,14156.0,7.14286,1981.84
199,1,0,0,0,1,0,0,0,0,1,...,Regular,R11,E,1,0.14,85,1456.55,10403.9,7.14286,1456.55
205,1,0,0,0,0,1,0,0,0,1,...,Regular,R11,F,2,0.62,100,10834.0,17474.2,3.22581,5417.0
223,1,0,0,0,1,0,0,0,0,1,...,Regular,R73,A,1,0.31,50,3986.67,12860.2,3.22581,3986.67
287,0,1,0,0,0,1,0,0,0,1,...,Diesel,R93,D,1,0.84,50,1840.14,2190.64,1.19048,1840.14
295,1,0,0,1,0,0,0,0,0,1,...,Regular,R31,D,1,0.75,64,1397.97,1863.96,1.33333,1397.97
388,1,0,0,0,1,0,0,0,0,1,...,Regular,R93,E,1,0.76,50,971.98,1278.92,1.31579,971.98
396,0,0,1,1,0,0,1,0,0,0,...,Regular,R11,E,1,0.68,105,1442.75,2121.69,1.47059,1442.75
468,1,0,0,0,0,1,0,0,0,1,...,Regular,R93,D,1,0.73,50,637.41,873.164,1.36986,637.41


## Feature store

In [20]:
df_transformed.dtypes

VehAge_bin0_0_4_0       object
VehAge_bin4_0_10_0      object
VehAge_bin10_0_100_0    object
DrivAge_bin18_0_36_0    object
DrivAge_bin36_0_50_0    object
                         ...  
BonusMalus              object
ClaimAmount             object
PurePremium             object
Frequency               object
AvgClaimAmount          object
Length: 75, dtype: object

### 1- Define dataset

In [21]:
df_data = df_transformed.copy()

In [23]:
#df_data.reset_index(inplace=True)

In [22]:
import boto3
import sagemaker
from sagemaker.session import Session


region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

#### S3 Bucket Setup For The OfflineStore

SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a S3 bucket owned by you. To be able to write to your S3 bucket, SageMaker FeatureStore assumes an IAM role which has access to it. The role is also owned by you.
Note that the same bucket can be re-used across FeatureGroups. Data in the bucket is partitioned by FeatureGroup.

Set the default s3 bucket name and it will be referenced throughout the notebook.

In [23]:
# You can modify the following to use a bucket of your choosing
default_s3_bucket_name = feature_store_session.default_bucket()
prefix = 'sagemaker-featurestore-insurance'

print(default_s3_bucket_name)

sagemaker-us-east-2-115272120974


Set up the IAM role. This role gives SageMaker FeatureStore access to your S3 bucket. 

<div class="alert alert-block alert-warning">
<b>Note:</b> In this example we use the default SageMaker role, assuming it has both <b>AmazonSageMakerFullAccess</b> and <b>AmazonSageMakerFeatureStoreAccess</b> managed policies. If not, please make sure to attach them to the role before proceeding.
</div>

In [24]:
from sagemaker import get_execution_role

# You can modify the following to use a role of your choosing. See the documentation for how to create this.
role = get_execution_role()
print (role)

arn:aws:iam::115272120974:role/service-role/AmazonSageMaker-ExecutionRole-20210503T135106


### 2- Define Feature Group

In [25]:
from time import gmtime, strftime, sleep

insurance_policy_feature_group_name = 'insurance-policy-feature-group-' + strftime('%d-%H-%M-%S', gmtime())

In [26]:
from sagemaker.feature_store.feature_group import FeatureGroup

insurance_policy_feature_group = FeatureGroup(name=insurance_policy_feature_group_name, sagemaker_session=feature_store_session)

In [27]:
import time

current_time_sec = int(round(time.time()))

def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label].name in ['object','category']:
            data_frame[label] = data_frame[label].astype("str").astype("string")

# cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
cast_object_to_string(df_data)



In [28]:
df_data.dtypes

VehAge_bin0_0_4_0       string
VehAge_bin4_0_10_0      string
VehAge_bin10_0_100_0    string
DrivAge_bin18_0_36_0    string
DrivAge_bin36_0_50_0    string
                         ...  
BonusMalus              string
ClaimAmount             string
PurePremium             string
Frequency               string
AvgClaimAmount          string
Length: 75, dtype: object

In [29]:
# record identifier and event time feature names
record_identifier_feature_name = "IDpol"
event_time_feature_name = "EventTime"

# append EventTime feature
df_data[event_time_feature_name] = pd.Series([current_time_sec]*len(df_data), dtype="float64")

In [30]:
df_data.head()

Unnamed: 0,VehAge_bin0_0_4_0,VehAge_bin4_0_10_0,VehAge_bin10_0_100_0,DrivAge_bin18_0_36_0,DrivAge_bin36_0_50_0,DrivAge_bin50_0_99_0,VehBrand_B1,VehBrand_B10,VehBrand_B11,VehBrand_B12,...,Region,Area,ClaimNb,Exposure,BonusMalus,ClaimAmount,PurePremium,Frequency,AvgClaimAmount,EventTime
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,R82,D,0.0,0.1,50.0,0.0,0.0,0.0,0.0,1622726000.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,R82,D,0.0,0.77,50.0,0.0,0.0,0.0,0.0,1622726000.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,R22,B,0.0,0.75,50.0,0.0,0.0,0.0,0.0,1622726000.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,R72,B,0.0,0.09,50.0,0.0,0.0,0.0,0.0,1622726000.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,R72,B,0.0,0.84,50.0,0.0,0.0,0.0,0.0,1622726000.0


In [31]:

# load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
insurance_policy_feature_group.load_feature_definitions(data_frame=df_data); # output is suppressed


#### 3- Deploy FeatureGroups in SageMaker FeatureStore

In [32]:
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

insurance_policy_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True
)

wait_for_feature_group_creation_complete(feature_group=insurance_policy_feature_group)


Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup insurance-policy-feature-group-03-13-08-26 successfully created.


In [33]:
insurance_policy_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-2:115272120974:feature-group/insurance-policy-feature-group-03-13-08-26',
 'FeatureGroupName': 'insurance-policy-feature-group-03-13-08-26',
 'RecordIdentifierFeatureName': 'IDpol',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'VehAge_bin0_0_4_0',
   'FeatureType': 'String'},
  {'FeatureName': 'VehAge_bin4_0_10_0', 'FeatureType': 'String'},
  {'FeatureName': 'VehAge_bin10_0_100_0', 'FeatureType': 'String'},
  {'FeatureName': 'DrivAge_bin18_0_36_0', 'FeatureType': 'String'},
  {'FeatureName': 'DrivAge_bin36_0_50_0', 'FeatureType': 'String'},
  {'FeatureName': 'DrivAge_bin50_0_99_0', 'FeatureType': 'String'},
  {'FeatureName': 'VehBrand_B1', 'FeatureType': 'String'},
  {'FeatureName': 'VehBrand_B10', 'FeatureType': 'String'},
  {'FeatureName': 'VehBrand_B11', 'FeatureType': 'String'},
  {'FeatureName': 'VehBrand_B12', 'FeatureType': 'String'},
  {'FeatureName': 'VehBrand_B13', 'FeatureType': 'String'},
  {'Fe

In [36]:
#sagemaker_client.list_feature_groups()

In [36]:
!aws s3 ls s3://sagemaker-us-east-2-115272120974/sagemaker-featurestore-insurance/115272120974/sagemaker/us-east-2/offline-store/

                           PRE insurance-policy-feature-group-02-19-18-41-1622661562/
                           PRE insurance-policy-feature-group-03-13-08-26-1622725747/
                           PRE insurance-policy-feature-group-28-00-40-28-1622162458/
                           PRE insurance-policy-feature-group-28-14-12-16-1622211163/
                           PRE insurance-policy-feature-group-28-14-37-40-1622212678/
                           PRE insurance-policy-feature-group-28-15-47-20-1622216861/
                           PRE insurance-policy-feature-group-28-15-54-22-1622217274/


#### 4- Ingest data into FeatureStore

#### PutRecords into FeatureGroup

After the FeatureGroups have been created, we can put data into the FeatureGroups by using the PutRecord API. This API can handle high TPS and is designed to be called by different streams. The data from all of these Put requests is buffered and written to S3 in chunks. The files will be written to the offline store within a few minutes of ingestion. For this example, to accelerate the ingestion process, we are specifying multiple workers to do the job simultaneously. It will take ~1min to ingest data to the 2 FeatureGroups, respectively.

In [37]:
insurance_policy_feature_group.ingest(
    data_frame=df_data, max_workers=5, wait=True
)

IngestionManagerPandas(feature_group_name='insurance-policy-feature-group-03-13-08-26', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f129e7cf5c0>, max_workers=5, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7f126f9e8dd8>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

To confirm that data has been ingested, we can quickly retrieve a record from the online store:

In [37]:
!aws s3 ls s3://sagemaker-us-east-2-115272120974/sagemaker-featurestore-insurance/115272120974/sagemaker/us-east-1/offline-store/insurance_policy_feature_group

In [38]:
insurance_policy_feature_group

FeatureGroup(name='insurance-policy-feature-group-03-13-08-26', sagemaker_session=<sagemaker.session.Session object at 0x7f129ebe45f8>, feature_definitions=[FeatureDefinition(feature_name='VehAge_bin0_0_4_0', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='VehAge_bin4_0_10_0', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='VehAge_bin10_0_100_0', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='DrivAge_bin18_0_36_0', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='DrivAge_bin36_0_50_0', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='DrivAge_bin50_0_99_0', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='VehBrand_B1', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='VehBrand_B10', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature

In [39]:
record_identifier_value = str(15)

#featurestore_runtime.get_record(FeatureGroupName=insurance_policy_feature_group, RecordIdentifierValueAsString=record_identifier_value)

The SageMaker Python SDK’s FeatureStore class also provides the functionality to generate Hive DDL commands. Schema of the table is generated based on the feature definitions. Columns are named after feature name and data-type are inferred based on feature type.

In [40]:
print(insurance_policy_feature_group.as_hive_ddl())

CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.insurance-policy-feature-group-03-13-08-26 (
  VehAge_bin0_0_4_0 STRING
  VehAge_bin4_0_10_0 STRING
  VehAge_bin10_0_100_0 STRING
  DrivAge_bin18_0_36_0 STRING
  DrivAge_bin36_0_50_0 STRING
  DrivAge_bin50_0_99_0 STRING
  VehBrand_B1 STRING
  VehBrand_B10 STRING
  VehBrand_B11 STRING
  VehBrand_B12 STRING
  VehBrand_B13 STRING
  VehBrand_B14 STRING
  VehBrand_B2 STRING
  VehBrand_B3 STRING
  VehBrand_B4 STRING
  VehBrand_B5 STRING
  VehBrand_B6 STRING
  VehPower_4_0 STRING
  VehPower_5_0 STRING
  VehPower_6_0 STRING
  VehPower_7_0 STRING
  VehPower_8_0 STRING
  VehPower_9_0 STRING
  VehPower_10_0 STRING
  VehPower_11_0 STRING
  VehPower_12_0 STRING
  VehPower_13_0 STRING
  VehPower_14_0 STRING
  VehPower_15_0 STRING
  VehGas_Diesel STRING
  VehGas_Regular STRING
  Region_R11 STRING
  Region_R21 STRING
  Region_R22 STRING
  Region_R23 STRING
  Region_R24 STRING
  Region_R25 STRING
  Region_R26 STRING
  Region_R31 STRING
  Region

Now let's wait for the data to appear in our offline store before moving forward to creating a dataset. This will take approximately 5 minutes.

In [41]:
account_id = boto3.client('sts').get_caller_identity()["Account"]
print(account_id)

insurance_policy_feature_group_s3_prefix = prefix + '/' + account_id + '/sagemaker/' + region + '/offline-store/' + insurance_policy_feature_group_name + '/data'
print(insurance_policy_feature_group_s3_prefix)

115272120974
sagemaker-featurestore-insurance/115272120974/sagemaker/us-east-2/offline-store/insurance-policy-feature-group-03-13-08-26/data


In [42]:
prefix

'sagemaker-featurestore-insurance'

In [43]:
default_s3_bucket_name

'sagemaker-us-east-2-115272120974'

In [None]:
s3_client = boto3.client('s3', region_name=region)
account_id = boto3.client('sts').get_caller_identity()["Account"]
print(account_id)

insurance_policy_feature_group_s3_prefix = prefix + '/' + account_id + '/sagemaker/' + region + '/offline-store/' + insurance_policy_feature_group_name + '/data'

offline_store_contents = None
while (offline_store_contents is None):
    objects_in_bucket = s3_client.list_objects(Bucket=default_s3_bucket_name,Prefix=insurance_policy_feature_group_s3_prefix)
    if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):
        offline_store_contents = objects_in_bucket['Contents']
    else:
        print('Waiting for data in offline store...\n')
        sleep(60)
    
print('Data available.')

115272120974
Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...



SageMaker FeatureStore adds metadata for each record that's ingested into the offline store.

## Build Training Dataset

SageMaker FeatureStore automatically builds the Glue Data Catalog for FeatureGroups (you can optionally turn it on/off while creating the FeatureGroup). In this example, we want to create one training dataset with FeatureValues from both identity and transaction FeatureGroups. This is done by utilizing the auto-built Catalog. We run an Athena query that joins the data stored in the offline store in S3 from the 2 FeatureGroups. 

In [44]:
insurance_policy_query = insurance_policy_feature_group.athena_query()

insurance_policy_table = insurance_policy_query.table_name

query_string = 'SELECT * FROM "'+insurance_policy_table+'"' #+insurance_policy_table
print('Running ' + query_string)

# run Athena query. The output is loaded to a Pandas dataframe.
#dataset = pd.DataFrame()
insurance_policy_query.run(query_string=query_string, output_location='s3://'+default_s3_bucket_name+'/'+prefix+'/query_results/')
insurance_policy_query.wait()
dataset = insurance_policy_query.as_dataframe()

dataset

Running SELECT * FROM "insurance-policy-feature-group-03-13-08-26-1622725747"


Unnamed: 0,vehage_bin0_0_4_0,vehage_bin4_0_10_0,vehage_bin10_0_100_0,drivage_bin18_0_36_0,drivage_bin36_0_50_0,drivage_bin50_0_99_0,vehbrand_b1,vehbrand_b10,vehbrand_b11,vehbrand_b12,...,exposure,bonusmalus,claimamount,purepremium,frequency,avgclaimamount,eventtime,write_time,api_invocation_time,is_deleted
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.75,50.0,37.87,50.493333,1.333333,37.87,1.622726e+09,2021-06-03 13:19:55.154,2021-06-03 13:14:54.000,False
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.61,64.0,0.00,0.000000,0.000000,0.00,1.622726e+09,2021-06-03 13:19:55.154,2021-06-03 13:14:54.000,False
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.58,50.0,0.00,0.000000,0.000000,0.00,1.622726e+09,2021-06-03 13:19:55.154,2021-06-03 13:14:54.000,False
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.00,50.0,0.00,0.000000,0.000000,0.00,1.622726e+09,2021-06-03 13:19:55.154,2021-06-03 13:14:54.000,False
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.00,50.0,0.00,0.000000,0.000000,0.00,1.622726e+09,2021-06-03 13:19:55.154,2021-06-03 13:14:54.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.43,76.0,0.00,0.000000,0.000000,0.00,1.622726e+09,2021-06-03 13:12:24.848,2021-06-03 13:12:12.000,False
59996,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.72,50.0,0.00,0.000000,0.000000,0.00,1.622726e+09,2021-06-03 13:12:24.848,2021-06-03 13:12:13.000,False
59997,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.41,50.0,0.00,0.000000,0.000000,0.00,1.622726e+09,2021-06-03 13:12:24.848,2021-06-03 13:12:13.000,False
59998,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.13,50.0,0.00,0.000000,0.000000,0.00,1.622726e+09,2021-06-03 13:12:24.848,2021-06-03 13:12:13.000,False


In [45]:
# Prepare query results for training.
query_execution = insurance_policy_query.get_query_execution()
query_result = 's3://'+default_s3_bucket_name+'/'+prefix+'/query_results/'+query_execution['QueryExecution']['QueryExecutionId']+'.csv'
print(query_result)

s3://sagemaker-us-east-2-115272120974/sagemaker-featurestore-insurance/query_results/3502be6d-a383-4424-812b-91c5511ceb13.csv


In [46]:
!aws s3 ls s3://sagemaker-us-east-2-115272120974/sagemaker-featurestore-insurance/query_results/

2021-05-28 00:47:32   31004270 27d95578-9dc5-4012-99ee-0e00f3a515ec.csv
2021-05-28 00:47:32       4317 27d95578-9dc5-4012-99ee-0e00f3a515ec.csv.metadata
2021-06-03 13:39:25   32879176 3502be6d-a383-4424-812b-91c5511ceb13.csv
2021-06-03 13:39:25       4317 3502be6d-a383-4424-812b-91c5511ceb13.csv.metadata
2021-05-28 15:59:46   31004288 c7b4e818-1ab9-411d-b2a6-ac936469e40d.csv
2021-05-28 15:59:47       4317 c7b4e818-1ab9-411d-b2a6-ac936469e40d.csv.metadata


In [47]:
df_features = pd.read_csv(query_result)

In [49]:
df_features.head()

Unnamed: 0,vehage_bin0_0_4_0,vehage_bin4_0_10_0,vehage_bin10_0_100_0,drivage_bin18_0_36_0,drivage_bin36_0_50_0,drivage_bin50_0_99_0,vehbrand_b1,vehbrand_b10,vehbrand_b11,vehbrand_b12,...,exposure,bonusmalus,claimamount,purepremium,frequency,avgclaimamount,eventtime,write_time,api_invocation_time,is_deleted
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.75,50.0,37.87,50.493333,1.333333,37.87,1622726000.0,2021-06-03 13:19:55.154,2021-06-03 13:14:54.000,False
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.61,64.0,0.0,0.0,0.0,0.0,1622726000.0,2021-06-03 13:19:55.154,2021-06-03 13:14:54.000,False
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.58,50.0,0.0,0.0,0.0,0.0,1622726000.0,2021-06-03 13:19:55.154,2021-06-03 13:14:54.000,False
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,50.0,0.0,0.0,0.0,0.0,1622726000.0,2021-06-03 13:19:55.154,2021-06-03 13:14:54.000,False
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,50.0,0.0,0.0,0.0,0.0,1622726000.0,2021-06-03 13:19:55.154,2021-06-03 13:14:54.000,False


In [50]:
df_features.columns = feature_names +['PurePremium','Frequency','AvgClaimAmount','eventtime','write_time','api_invocation_time','is_deleted']

In [51]:
s3_client = boto3.client('s3', region_name=region)
# Select useful columns for training with target column as the first.
dataset = df_features.iloc[:,np.r_[df_features.columns.get_loc('PurePremium'), 0:60]]

# Write to csv in S3 without headers and index column.
dataset.to_csv('dataset.csv', header=False, index=False)
s3_client.upload_file('dataset.csv', default_s3_bucket_name, prefix+'/training_input/dataset.csv')
dataset_uri_prefix = 's3://'+default_s3_bucket_name+'/'+prefix+'/training_input/';

dataset

Unnamed: 0,PurePremium,VehAge_bin0_0_4_0,VehAge_bin4_0_10_0,VehAge_bin10_0_100_0,DrivAge_bin18_0_36_0,DrivAge_bin36_0_50_0,DrivAge_bin50_0_99_0,VehBrand_B1,VehBrand_B10,VehBrand_B11,...,Region_R91,Region_R93,Region_R94,Area_A,Area_B,Area_C,Area_D,Area_E,Area_F,Density
0,50.493333,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.045355
1,0.000000,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.253593
2,0.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.310218
3,0.000000,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.100507
4,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.972742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.233478
59996,0.000000,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.544094
59997,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.972742
59998,0.000000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.413101


In [52]:
dataset.shape

(60000, 61)

# Pure Premium Modeling

#### Pure Premium Modeling using xgboost

In [53]:
training_image=sagemaker.image_uris.retrieve("xgboost", region, "1.0-1")
training_image

'257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3'

In [52]:
#!conda install -c conda-forge xgboost -y

In [54]:
training_output_path='s3://' + default_s3_bucket_name+'/'+prefix + '/training_output'

from sagemaker.estimator import Estimator
training_model = Estimator(training_image,
                           role, 
                           instance_count=1, 
                           instance_type='ml.m5.2xlarge',
                           volume_size = 5,
                           max_run = 7200,
                           input_mode= 'File',
                           output_path=training_output_path,
                           sagemaker_session=feature_store_session)

In [55]:
training_model.set_hyperparameters(objective = "reg:tweedie",
                                   num_round = 50)

In [56]:
train_data = sagemaker.inputs.TrainingInput(dataset_uri_prefix, distribution='FullyReplicated', 
                                            content_type='text/csv', s3_data_type='S3Prefix')
data_channels = {'train': train_data}

In [57]:
training_model.fit(inputs=data_channels, logs=True)

2021-06-03 13:43:07 Starting - Starting the training job...
2021-06-03 13:43:30 Starting - Launching requested ML instancesProfilerReport-1622727787: InProgress
......
2021-06-03 13:44:30 Starting - Preparing the instances for training......
2021-06-03 13:45:30 Downloading - Downloading input data
2021-06-03 13:45:30 Training - Downloading the training image...
2021-06-03 13:46:03 Uploading - Uploading generated training model[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:tweedie to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[13:45:59] 60000x60 matrix with 360000

## Set up Hosting for the Model

Once the training is done, we can deploy the trained model as an Amazon SageMaker real-time hosted endpoint. This will allow us to make predictions (or inference) from the model. Note that we don't have to host on the same instance (or type of instance) that we used to train. The endpoint deployment can be accomplished as follows. This takes 8-10 minutes to complete.

### Using existing model object in python

In [58]:
predictor = training_model.deploy(initial_instance_count = 1, instance_type = 'ml.m5.xlarge')

-------------!

### Using BYOM(Bring your own model) from a csv file in a bucket
We can also deploy an endpoint given an external model data(CSV in a bucket). 
1. Update the location of your model data in the model_path variable. 
2. Please note the EndpointName is going to be the deployed endpoint name we will use for inference
3. Make sure the endpoint config name is matching the config name provided in the create_endpoint API


#### Step 1 - Option1 - create a model from the trained model data

In [None]:
import boto3
from sagemaker import get_execution_role

client = boto3.client('sagemaker')
model_path = 's3://sagemaker-us-east-2-115272120974/sagemaker-featurestore-insurance/training_output/sagemaker-xgboost-2021-06-03-13-43-07-175/output/model.tar.gz'
model_name = 'my-sagemaker-model'
role = get_execution_role()

create_model_api_response = client.create_model(ModelName=model_name,
                                    PrimaryContainer={
                                        'Image': '257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3',
                                        'ModelDataUrl': model_path
                                    },
                                    ExecutionRoleArn=role)
print(create_model_api_response)

#### Step 1 - Option2 - create a model from CSV Data

In [None]:
model_data = 's3://sagemaker-us-east-2-115272120974/sagemaker-featurestore-insurance/training_input/dataset.csv'
#TODO: train a new model based on the CSV and get a training_model object

#### Step 2 - Create endpoint configuration

In [None]:
endpoint_config_name = 'sagemaker-neomxnet-endpoint-configuration'
create_endpoint_config_api_response = client.create_endpoint_config(
                                            EndpointConfigName=endpoint_config_name,
                                            ProductionVariants=[
                                                {
                                                    'VariantName': 'varian1',
                                                    'ModelName': model_name,
                                                    'InitialInstanceCount': 1,
                                                    'InstanceType': 'ml.m5.xlarge'
                                                }]
                                       )
print(create_endpoint_config_api_response)

#### Step 3 - Create the endpoint

In [73]:
endpoint_name = 'my-custom-endpoint-name'
create_endpoint_api_response = client.create_endpoint(
                                    EndpointName=endpoint_name,
                                    EndpointConfigName=endpoint_config_name
                                )
print ("create_endpoint API response", create_endpoint_api_response)  

## Using the endpoint to predict results
To use the endpoint and get prediction, we need to submit a web service call to the endpoint.
This can be done by using the python provided SDK or through a standard web service call.
In the example below, we call the endpoint created above and get the prediction for a random record in our dataset.


In [92]:
import boto3
import json
runtime = boto3.Session().client('sagemaker-runtime')
payload=['1.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.6971986019415635']

response = runtime.invoke_endpoint(EndpointName=endpoint_name, ContentType='text/csv', Body=','.join(payload))

result = json.loads(response['Body'].read().decode())
print(result)

87.7158432006836
60


In [99]:
import pandas as pd
import boto3
s3 = boto3.client('s3') 
obj = s3.get_object(Bucket= 'sagemaker-us-east-2-115272120974', Key='sagemaker-featurestore-insurance/training_input/dataset.csv') 
records_df = pd.read_csv(obj['Body'])
random_record = records_df.sample().columns[range(60)]
#print(random_record.tolist())

['50.49333333333333', '0.0', '1.0', '0.0.1', '0.0.2', '0.0.3', '1.0.1', '0.0.4', '0.0.5', '0.0.6', '0.0.7', '1.0.2', '0.0.8', '0.0.9', '0.0.10', '0.0.11', '0.0.12', '0.0.13', '1.0.3', '0.0.14', '0.0.15', '0.0.16', '0.0.17', '0.0.18', '0.0.19', '0.0.20', '0.0.21', '0.0.22', '0.0.23', '0.0.24', '0.0.25', '1.0.4', '0.0.26', '0.0.27', '0.0.28', '0.0.29', '1.0.5', '0.0.30', '0.0.31', '0.0.32', '0.0.33', '0.0.34', '0.0.35', '0.0.36', '0.0.37', '0.0.38', '0.0.39', '0.0.40', '0.0.41', '0.0.42', '0.0.43', '0.0.44', '0.0.45', '0.0.46', '0.0.47', '0.0.48', '1.0.6', '0.0.49', '0.0.50', '0.0.51']


In [100]:
runtime = boto3.Session().client('sagemaker-runtime')
response = runtime.invoke_endpoint(EndpointName=endpoint_name, ContentType='text/csv', Body=','.join(random_record.tolist()))
result = json.loads(response['Body'].read().decode())
print(result)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (415) from model with message "Loading csv data failed with Exception, please ensure data is in csv format:
 <class 'ValueError'>
 could not convert string to float: '0.0.1'". See https://us-east-2.console.aws.amazon.com/cloudwatch/home?region=us-east-2#logEventViewer:group=/aws/sagemaker/Endpoints/my-custom-endpoint-name in account 115272120974 for more information.

#### 1- inference using estimator

In [69]:
# Incoming inference request.
df_transformed['IDpol'][0:100]

0       1
1       3
2       5
3      10
4      11
     ... 
95    194
96    195
97    196
98    197
99    198
Name: IDpol, Length: 100, dtype: object

In [76]:
inference_request=[]

test_y=[]
for IDpol in df_transformed['IDpol'][0:100]:
    #print(IDpol)
    IDpol = str(IDpol)
    transaction_response = featurestore_runtime.get_record(FeatureGroupName=insurance_policy_feature_group_name, RecordIdentifierValueAsString=IDpol)
    transaction_record = transaction_response['Record']
    testrecord = []
    column_names=[]
    #print(transaction_record)
    for f in transaction_record:
        testrecord.append(f['ValueAsString'])
        column_names.append(f['FeatureName'])
        if f['FeatureName'] == 'PurePremium':
            test_y.append(float(f['ValueAsString']))
    #print(testrecord[0:60])
    inference_request.append(testrecord[0:60])
print(inference_request[7])

['1.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '-0.49434435026400536']


In [60]:
y_pred_xgboost=[]
import json
for inf_rec in inference_request:
    #print(inf_rec)
    results = predictor.predict(','.join(inf_rec), initial_args = {"ContentType": "text/csv"})
    y_pred_xgboost.append(json.loads(results))
print(y_pred_xgboost)

[55.76900863647461, 55.76900863647461, 77.2425537109375, 61.05708694458008, 61.05708694458008, 140.96527099609375, 140.96527099609375, 34.81033706665039, 34.81033706665039, 69.87076568603516, 69.87076568603516, 42.15306091308594, 17.21091651916504, 17.21091651916504, 71.26097106933594, 2.3110408782958984, 27.887907028198242, 27.887907028198242, 12.534825325012207, 12.534825325012207, 12.534825325012207, 98.09772491455078, 98.09772491455078, 591.037109375, 1.4619827270507812, 1.4619827270507812, 1.4619827270507812, 72.86361694335938, 72.86361694335938, 72.86361694335938, 45.2734489440918, 33.839874267578125, 12.074952125549316, 12.074952125549316, 17.871440887451172, 17.871440887451172, 1.8744076490402222, 1.8744076490402222, 2.207972764968872, 2.207972764968872, 40.82759475708008, 40.82759475708008, 6.499734401702881, 6.499734401702881, 7.19212007522583, 7.19212007522583, 74.86333465576172, 74.86333465576172, 79.13360595703125, 69.67892456054688, 69.67892456054688, 1.6788733005523682, 

In [102]:
from sklearn.metrics import mean_squared_error

mean_squared_error(test_y, y_pred_xgboost)

1792635.451253769

#### 2- inference using endpoint

https://runtime.sagemaker.us-east-1.amazonaws.com/endpoints/sagemaker-xgboost-2021-02-14-05-22-25-014/invocations

In [137]:
len(inference_request)

100

In [62]:
import json 
import boto3 
client = boto3.client('runtime.sagemaker')
for inf_rec in inference_request:
    #print(inf_rec)
    data = ','.join(inf_rec)

    response = client.invoke_endpoint(EndpointName='sagemaker-xgboost-2021-05-28-16-03-39-703', Body=data, ContentType='text/csv')
    response_body = response['Body'] 
    print(response_body.read())

b'55.76900863647461'
b'55.76900863647461'
b'77.2425537109375'
b'61.05708694458008'
b'61.05708694458008'
b'140.96527099609375'
b'140.96527099609375'
b'34.81033706665039'
b'34.81033706665039'
b'69.87076568603516'
b'69.87076568603516'
b'42.15306091308594'
b'17.21091651916504'
b'17.21091651916504'
b'71.26097106933594'
b'2.3110408782958984'
b'27.887907028198242'
b'27.887907028198242'
b'12.534825325012207'
b'12.534825325012207'
b'12.534825325012207'
b'98.09772491455078'
b'98.09772491455078'
b'591.037109375'
b'1.4619827270507812'
b'1.4619827270507812'
b'1.4619827270507812'
b'72.86361694335938'
b'72.86361694335938'
b'72.86361694335938'
b'45.2734489440918'
b'33.839874267578125'
b'12.074952125549316'
b'12.074952125549316'
b'17.871440887451172'
b'17.871440887451172'
b'1.8744076490402222'
b'1.8744076490402222'
b'2.207972764968872'
b'2.207972764968872'
b'40.82759475708008'
b'40.82759475708008'
b'6.499734401702881'
b'6.499734401702881'
b'7.19212007522583'
b'7.19212007522583'
b'74.86333465576172'
b'7

In [63]:
data

'1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.6306555534609535'

In [64]:
account_info = boto3.client('sts').get_caller_identity()
print(account_info)

{'UserId': 'AROARVVWA7KHK4IVCCZPP:SageMaker', 'Account': '115272120974', 'Arn': 'arn:aws:sts::115272120974:assumed-role/AmazonSageMaker-ExecutionRole-20210503T135106/SageMaker', 'ResponseMetadata': {'RequestId': 'a5078a00-6895-491b-a6f0-f478d2902780', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'a5078a00-6895-491b-a6f0-f478d2902780', 'content-type': 'text/xml', 'content-length': '470', 'date': 'Fri, 28 May 2021 16:46:03 GMT'}, 'RetryAttempts': 0}}


#### 3- inference using local model artifact

In [65]:
import tarfile
import pickle as pkl

# download the model artifact from AWS S3
!aws s3 cp $training_model.model_data .

#opens the downloaded model artifcat and loads it as 'model' variable
tar = tarfile.open('model.tar.gz')
tar.extractall()
tar.close()


download: s3://sagemaker-us-east-2-115272120974/sagemaker-featurestore-insurance/training_output/sagemaker-xgboost-2021-05-28-16-00-27-530/output/model.tar.gz to ./model.tar.gz


In [None]:
import sagemaker
input_data_path = 's3://{}/{}/{}'.format(default_s3_bucket_name, 'sagemaker-featurestore-insurance/training_output/', 'file_name')
output_data_path = 's3://{}/{}'.format(default_s3_bucket_name, 'sagemaker-featurestore-insurance/batch')
transform_job = sagemaker.transformer.Transformer(
    model_name = model_name,
    instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    strategy = 'SingleRecord',
    assemble_with = 'Line',
    output_path = output_data_path,
    base_transform_job_name='inference-pipelines-batch',
    sagemaker_session=sagemaker.Session(),
    accept = CONTENT_TYPE_CSV)
transform_job.transform(data = input_data_path, 
                        content_type = CONTENT_TYPE_CSV, 
                        split_type = 'Line')

In [69]:
default_s3_bucket_name

'sagemaker-us-east-2-115272120974'

In [69]:
import xgboost
model = pkl.load(open('xgboost-model', 'rb'))

In [117]:
map_names = dict(zip(model.feature_names, column_names[0:60]))
model.feature_names = list(map_names.values())

In [118]:
#model.get_score(importance_type='weight')

In [120]:
#test_ds = X_train[0:100]
test_ds = pd.DataFrame(data=inference_request, columns=column_names[0:60])
for col in test_ds.columns:
    test_ds[col] = pd.to_numeric(test_ds[col])
from xgboost import DMatrix

y_pred_local_xgboost=model.predict(DMatrix(test_ds),output_margin=True)

In [121]:
mean_squared_error(test_y, y_pred_xgboost)

1792635.451253769