In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sdv.single_table import TVAESynthesizer
from sdv.metadata import Metadata
from collections import Counter
from sdv.evaluation.single_table import evaluate_quality
from sklearn.ensemble import RandomForestClassifier
from sdv.sampling import Condition
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
)

In [81]:
# Load data
df = pd.read_csv(
    "../../data/Angoss Knowledge Seeker - carclaims.txt/carclaims_original.csv"
)
# Drop row with missing data
df.drop(df[df["DayOfWeekClaimed"] == "0"].index, inplace=True)
# Drop ID column
df.drop(columns="PolicyNumber", inplace=True)

# Train-test split
carclaims_train, carclaims_test = train_test_split(df, test_size=0.2, random_state=141)

# Load SDV metadata
metadata = Metadata.load_from_json(filepath="carclaims_metadata.json")

In [None]:
print('Original dataset shape %s' % Counter(carclaims_train['FraudFound']))
rus = RandomUnderSampler(random_state=42)    
X_rus, y_rus = rus.fit_resample(carclaims_train.drop('FraudFound', axis=1), carclaims_train['FraudFound'])

print('Undersampled dataset shape %s' % Counter(y_rus))
# Create synthesizer
synthesizer = TVAESynthesizer(
    metadata, cuda=True,
    epochs=30000,  # 300
    compress_dims=[256, 256],  # (256, 256)
    decompress_dims=[256, 256],  # (256, 256),
    embedding_dim=512,  # 512
)
synthesizer.fit(pd.concat([X_rus, y_rus], axis=1))

synthesizer.save(filepath='my_synthesizer_3.pkl')

synthetic_data = synthesizer.sample(
    num_rows=10_000,
    batch_size=1_000
)

quality_report = evaluate_quality(
    real_data=df,
    synthetic_data=synthetic_data,
    metadata=metadata)

Original dataset shape Counter({'No': 11597, 'Yes': 738})
Undersampled dataset shape Counter({'No': 738, 'Yes': 738})


Sampling rows: 100%|██████████| 10000/10000 [00:00<00:00, 12155.89it/s]


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 32/32 [00:00<00:00, 329.17it/s]|
Column Shapes Score: 92.46%

(2/2) Evaluating Column Pair Trends: |██████████| 496/496 [00:02<00:00, 211.48it/s]|
Column Pair Trends Score: 87.03%

Overall Score (Average): 89.75%



20000 epochs + OrdinalEncoder

Accuracy: 0.7455 Precision: 0.1411 Recall: 0.6378 F1 Score: 0.2311

2000 epochs + OrdinalEncoder

Accuracy: 0.8100 Precision: 0.1607 Recall: 0.5135 F1 Score: 0.2448

In [4]:
train_and_predict(
    tvae_epochs=20000, # 300
    tvae_compress_depth=5, # 2
    tvae_compress_width=64, # 128
    tvae_decompress_depth=5, # 2
    tvae_decompress_width=512, # 128
    tvae_embedding_dim=32, # 128
    rf_n_estimators=500,
    rf_criterion='gini',
    rf_max_depth=21,
    rf_min_samples_split=13,
    rf_min_samples_leaf=2,
)

Checking: tvae_epochs 20000, tvae_batch_size default, tvae_compress_depth 5, tvae_compress_width 64, tvae_decompress_depth 5, tvae_decompress_width 512, tvae_embedding_dim 32, rf_n_estimators 500, rf_criterion gini, rf_max_depth 21, rf_min_samples_split 13, rf_min_samples_leaf 2,
Original dataset shape Counter({'No': 11597, 'Yes': 738})
Undersampled dataset shape Counter({'No': 738, 'Yes': 738})


Sampling conditions: : 11000it [00:01, 5566.21it/s]                         


Balanced dataset shape Counter({'No': 11597, 'Yes': 11597})
Accuracy: 0.9176 Precision: 0.1584 Recall: 0.0865 F1 Score: 0.1119


Checking: tvae_epochs 30000, tvae_batch_size default, tvae_compress_depth 5, tvae_compress_width 512, tvae_decompress_depth 5, tvae_decompress_width 512, tvae_embedding_dim 512, rf_n_estimators 115, rf_criterion gini, rf_max_depth 10, rf_min_samples_split 30, rf_min_samples_leaf 16,

Original dataset shape Counter({'No': 11597, 'Yes': 738})

Undersampled dataset shape Counter({'No': 738, 'Yes': 738})

Sampling conditions: : 51000it [00:12, 4123.75it/s]                         

Balanced dataset shape Counter({'No': 31597, 'Yes': 31597})

Accuracy: 0.6530 Precision: 0.1352 Recall: 0.8865 F1 Score: 0.2346

Checking: tvae_epochs 30000, tvae_batch_size default, tvae_compress_depth 5, tvae_compress_width 512, tvae_decompress_depth 5, tvae_decompress_width 512, tvae_embedding_dim 512, rf_n_estimators 115, rf_criterion gini, rf_max_depth 10, rf_min_samples_split 30, rf_min_samples_leaf 16,

Original dataset shape Counter({'No': 11597, 'Yes': 738})

Undersampled dataset shape Counter({'No': 738, 'Yes': 738})

Sampling conditions: : 11000it [00:02, 4874.63it/s]                         

Balanced dataset shape Counter({'No': 11597, 'Yes': 11597})

Accuracy: 0.6702 Precision: 0.1408 Recall: 0.8811 F1 Score: 0.2427

In [82]:
synthesizer = TVAESynthesizer.load(
    filepath='my_synthesizer_92_8.pkl'
)

print(synthesizer.get_parameters())
major_cnt = carclaims_train['FraudFound'].value_counts()['No']
minor_cnt = carclaims_train['FraudFound'].value_counts()['Yes']

balance_cnt = major_cnt - minor_cnt
# Conditions for balancing the data
fraud_samples = Condition(
    # num_rows=20_000 + balance_cnt,
    num_rows=balance_cnt,
    column_values={'FraudFound': 'Yes'}
)
# non_fraud_samples = Condition(
#     num_rows=20_000,
#     column_values={'FraudFound': 'No'}
# )
# Create balanced synthetic data
synthetic_data = synthesizer.sample_from_conditions(
    conditions=[fraud_samples],
    # conditions=[fraud_samples, non_fraud_samples],
    batch_size=1_000
)

balanced_data = pd.concat([carclaims_train, synthetic_data], axis=0).reset_index(drop=True)
carclaims_test.reset_index(drop=True)
print('Balanced dataset shape %s' % Counter(balanced_data['FraudFound']))

# X y split
X_train = balanced_data.drop('FraudFound', axis=1)
y_train = balanced_data['FraudFound']
X_test = carclaims_test.drop('FraudFound', axis=1)
y_test = carclaims_test['FraudFound']
# Encode target variable
y_train = y_train.map({'Yes': 1, 'No': 0})
y_test = y_test.map({'Yes': 1, 'No': 0})
y_train = y_train.astype(int)
y_test = y_test.astype(int)
# Lebel Encode features
column_labels = {
    'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    'DayOfWeek': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'DayOfWeekClaimed': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'MonthClaimed': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    'AgeOfPolicyHolder': ['16 to 17', '18 to 20', '21 to 25', '26 to 30', '31 to 35', '36 to 40', '41 to 50', '51 to 65', 'over 65'],
    'NumberOfSuppliments': ['none', '1 to 2', '3 to 5', 'more than 5'],
    'AddressChange-Claim': ['no change', 'under 6 months', '1 year', '2 to 3 years', '4 to 8 years'],
    'NumberOfCars': ['1 vehicle', '2 vehicles', '3 to 4', '5 to 8', 'more than 8'],
    'VehiclePrice': ['less than 20,000', '20,000 to 29,000', '30,000 to 39,000', '40,000 to 59,000', '60,000 to 69,000', 'more than 69,000'],
    'Days:Policy-Accident': ['none', '1 to 7', '15 to 30', '8 to 15', 'more than 30'],
    'Days:Policy-Claim': ['15 to 30', '8 to 15', 'more than 30'],
    'PastNumberOfClaims': ['none', '1', '2 to 4', 'more than 4'],
    'AgeOfVehicle': ['new', '2 years', '3 years', '4 years', '5 years', '6 years', '7 years', 'more than 7'],
    'Make': ['Accura', 'BMW', 'Chevrolet', 'Dodge', 'Ferrari', 'Ford', 'Honda', 'Jaguar', 'Lexus', 'Mazda', 'Mecedes', 'Mercury', 'Nisson', 'Pontiac', 'Porche', 'Saab', 'Saturn', 'Toyota', 'VW']

    }
for column, labels in column_labels.items():
    oe = OrdinalEncoder(categories=[labels], handle_unknown='error')
    X_train[column] = oe.fit_transform(X_train[[column]])
    X_test[column] = oe.transform(X_test[[column]])
# one hot encode
columns_one_hot = {
    'AccidentArea': ['Rural', 'Urban'],
    'Sex': ['Female', 'Male'],
    'MaritalStatus': ['Divorced', 'Married', 'Single', 'Widow'],
    'PoliceReportFiled': ['No', 'Yes'],
    'WitnessPresent': ['No', 'Yes'],
    'AgentType': ['External', 'Internal'],
    'BasePolicy': ['All Perils', 'Collision', 'Liability'],
    'Fault': ['Policy Holder', 'Third Party'],
    'PolicyType': ['Sedan - All Perils', 'Sedan - Collision', 'Sedan - Liability','Sport - All Perils', 'Sport - Collision', 'Sport - Liability', 'Utility - All Perils', 'Utility - Collision', 'Utility - Liability'],
    'VehicleCategory': ['Sedan', 'Sport', 'Utility'],
    
}
for column, labels in columns_one_hot.items():
    ohe = OneHotEncoder(sparse_output=False, categories=[labels], drop='first', handle_unknown='error')
    encoded_nominal = ohe.fit_transform(X_train[[column]])
    X_train = pd.concat([X_train, pd.DataFrame(encoded_nominal, columns=ohe.get_feature_names_out([column]), index=X_train.index)], axis=1)
    encoded_nominal = ohe.transform(X_test[[column]])
    X_test = pd.concat([X_test, pd.DataFrame(encoded_nominal, columns=ohe.get_feature_names_out([column]), index=X_test.index)], axis=1)
X_test.drop(columns=columns_one_hot.keys(), axis=1, inplace=True)
X_train.drop(columns=columns_one_hot.keys(), axis=1, inplace=True)


{'enforce_min_max_values': True, 'enforce_rounding': True, 'embedding_dim': 512, 'compress_dims': [256, 256, 256, 256, 256], 'decompress_dims': [1024, 1024, 1024, 1024, 1024], 'l2scale': 1e-05, 'batch_size': 500, 'verbose': False, 'epochs': 20000, 'loss_factor': 2, 'cuda': True}


Sampling conditions: : 11000it [00:05, 2086.90it/s]                         


Balanced dataset shape Counter({'No': 11597, 'Yes': 11597})


In [57]:
rf_classifier = RandomForestClassifier(
    n_estimators=200,
    # criterion='entropy',
    # max_depth=32,
    # min_samples_split=rf_min_samples_split,
    # min_samples_leaf=rf_min_samples_leaf,
    random_state=42,
)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f} Precision: {precision:.4f} Recall: {recall:.4f} F1 Score: {f1:.4f}")

Accuracy: 0.8797 Precision: 0.2057 Recall: 0.3514 F1 Score: 0.2595


synthesizer = TVAESynthesizer(
    metadata, cuda=True,
    epochs=20000,  # 300
    compress_dims=[1024, 512, 256, 64],  # (256, 256)
    decompress_dims=[64, 256, 512, 1024],  # (256, 256),
    embedding_dim=64,  # 128
)
synthesizer.fit(pd.concat([X_rus, y_rus], axis=1))

synthesizer.save(filepath='my_synthesizer_2.pkl')

256 256 256, 1024 1024 1024 Overall Score (Average): 90.94%

In [None]:
from xgboost import XGBClassifier
from skopt import BayesSearchCV

# xgb = XGBClassifier(
#     # n_estimators=2, 
#     # max_depth=10, 
#     # learning_rate=1, 
#     # objective="binary:logistic"
# )
def scorer(estimator, X, y):
    return f1_score(y_test, estimator.predict(X_test))
    
opt = BayesSearchCV(
    XGBClassifier(),
    {
        'n_estimators': (2, 200),
        'max_depth': (2, 20),
        'gamma': (1e-6, 1e+1, 'log-uniform'),
        'reg_lambda': (1, 8, 'uniform'), 
        # 'sampling_method': ['uniform', 'gradient_based'],
        'booster': ['gbtree'],
        'learning_rate': (0.001, 1, 'uniform'),
        'subsample': (0.5, 1, 'uniform')
    },
    n_iter=100,
    cv=5,
    scoring=scorer
)

opt.fit(X_train, y_train)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_test, y_test))
# xgb.fit(X_train, y_train)
# y_pred = xgb.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# print(f"Accuracy: {accuracy:.4f} Precision: {precision:.4f} Recall: {recall:.4f} F1 Score: {f1:.4f}")

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not used.

Parameters: { "gamma", "max_depth", "tree_method" } are not us

val. score: 0.3000105922468458
test score: 0.28703703703703703


In [76]:
opt.best_params_

OrderedDict([('booster', 'gbtree'),
             ('gamma', 6.3588020322349665),
             ('max_depth', 35),
             ('n_estimators', 199),
             ('reg_lambda', 8),
             ('tree_method', 'exact')])

In [79]:
from xgboost import XGBClassifier
from skopt import BayesSearchCV

# xgb = XGBClassifier(
#     n_estimators=15, 
#     gamma=2e-6,
#     max_depth=33, 
#     # learning_rate=1,
#     booster='gbtree',
#     reg_lambda=8,
#     objective="binary:logistic"
# )
xgb = XGBClassifier(
    n_estimators=199, 
    gamma=2e-6,
    max_depth=35, 
    # learning_rate=1,
    booster='gbtree',
    reg_lambda=8,
    tree_method='exact'
    # objective="binary:logistic"
)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f} Precision: {precision:.4f} Recall: {recall:.4f} F1 Score: {f1:.4f}")

Accuracy: 0.8979 Precision: 0.2441 Recall: 0.3351 F1 Score: 0.2825


In [55]:
synthesizer = TVAESynthesizer.load(
    filepath='my_synthesizer_3.pkl'
)

synthetic_data = synthesizer.sample(
    num_rows=10_000,
    batch_size=1_000
)

quality_report = evaluate_quality(
    real_data=df,
    synthetic_data=synthetic_data,
    metadata=metadata)

Sampling rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Sampling rows: 100%|██████████| 10000/10000 [00:00<00:00, 12976.76it/s]


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 32/32 [00:00<00:00, 332.09it/s]|
Column Shapes Score: 92.46%

(2/2) Evaluating Column Pair Trends: |██████████| 496/496 [00:02<00:00, 208.76it/s]|
Column Pair Trends Score: 87.03%

Overall Score (Average): 89.75%

