In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import *

In [2]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [3]:
vc_categorical_cols = ['Maker', 'Reg_year', ' Genmodel', 'Color', 'Bodytype', 'Engin_size', 'Gearbox', 'Fuel_type',
                        'Seat_num', 'Door_num', 'issue', 'issue_id', 'repair_complexity']
vi_categorical_cols = ['Make', 'AccidentArea',	'Sex',	'MaritalStatus', 'Age',	'Fault', 'PolicyType',
                    	'VehicleCategory',	'Deductible',	'Days_Policy_Accident',	'Days_Policy_Claim',
            	        'AgeOfVehicle',  'AgeOfPolicyHolder', 'PoliceReportFiled',	'WitnessPresent',
                	   'AgentType',	'NumberOfSuppliments',	'AddressChange_Claim', 'VehiclePrice',
                       'PastNumberOfClaims', 'NumberOfCars', 'BasePolicy', 'Month', 'MonthClaimed',
                       'DayOfWeek', 'DayOfWeekClaimed']

ci_categorical_cols = ['policy_state', 'umbrella_limit', 'insured_sex', 'insured_education_level',
    	'insured_occupation', 'insured_hobbies', 'insured_relationship', 'incident_type',
        'collision_type', 'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city',	
        'property_damage', 'police_report_available', 'auto_make', 'auto_model']
ic_categorical_cols = ['ps_ind_02_cat','ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat',
       'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
       'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
       'ps_car_11_cat']

vc_cols_to_remove = [' Genmodel_ID', 'Adv_year', 'Adv_month', 'Adv_day', 'breakdown_date', 'repair_date', 'category_anomaly']
ci_cols_to_remove = ['policy_number', 'policy_bind_date', 'policy_csl', 'incident_location', 'incident_date', '_c39']
ic_cols_to_remove = ['id', 'target']



**Isolation Forest** 

In [4]:
def isolation_forest(train_data, train_label, test_data, test_label, n_estimators = 400):
    clf = IsolationForest(random_state=0, n_estimators=n_estimators).fit(train_data)
    result = clf.predict(test_data)
    result = pd.DataFrame(result)
    result.replace(to_replace=1, value=0, inplace=True)
    result.replace(to_replace=-1, value=1, inplace=True)
    p, r, f, a = get_scores(result, test_label)
    return p,r,f,a
    

**LOF**

In [5]:
def local_outlier_factor(train_data, train_label, test_data, test_label, n_neighbors = 100):
    clf = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=True).fit(train_data)
    result = clf.predict(test_data)
    result = pd.DataFrame(result)
    result.replace(to_replace=1, value=0, inplace=True)
    result.replace(to_replace=-1, value=1, inplace=True)
    p, r, f, a = get_scores(result, test_label)
    return p,r,f,a

**One Class SVM**

In [6]:
def one_class_svm(train_data, train_label, test_data, test_label, kernel = 'rbf'):
    clf = OneClassSVM(kernel=kernel, gamma='auto', verbose=True).fit(train_data)
    result = clf.predict(test_data)
    result = pd.DataFrame(result)
    result.replace(to_replace=1, value=0, inplace=True)
    result.replace(to_replace=-1, value=1, inplace=True)
    p, r, f, a = get_scores(result, test_label)
    return p,r,f,a

**Gradient Boosting**

In [7]:
def gradient_boosting(train_data, train_label, test_data, test_label, n_estimators = 100):
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
            max_depth=1, random_state=0).fit(train_data, train_label)
    result = clf.predict(test_data)
    result = pd.DataFrame(result)
    p, r, f, a = get_scores(result, test_label)
    return p,r,f,a

**Random Forest**

In [8]:
def random_forest(train_data, train_label, test_data, test_label, max_depth = 2):
    clf = RandomForestClassifier(max_depth=max_depth, random_state=0)
    clf.fit(train_data, train_label)
    result = clf.predict(test_data)
    result = pd.DataFrame(result)
    p, r, f, a = get_scores(result, test_label)
    return p,r,f,a

**Label Encoding**

In [4]:
data_vc = load_data('../../data/vehicle_claims/train_label.csv')
data_vi = load_data('../../data/vehicle_insurance/train_label.csv')
data_ci = load_data('../../data/car_insurance/train_label.csv')
data_ic = load_data('../../data/insurance_claim_kaggle/train_label.csv')

In [5]:
data_ci_test = load_data('../../data/car_insurance/test_label.csv')
data_vi_test = load_data('../../data/vehicle_insurance/test_label.csv')
data_vc_test = load_data('../../data/vehicle_claims/test_label.csv')
data_ic_test = load_data('../../data/insurance_claim_kaggle/test_label.csv')

In [6]:
label_vc = load_data('../../data/vehicle_claims/train_label_Y.csv')
label_vi = load_data('../../data/vehicle_insurance/train_label_Y.csv')
label_ci = load_data('../../data/car_insurance/train_label_Y.csv')
label_ic = load_data('../../data/insurance_claim_kaggle/train_label_Y.csv')

In [7]:
label_vc_test = load_data('../../data/vehicle_claims/test_label_Y.csv')
label_vi_test = load_data('../../data/vehicle_insurance/test_label_Y.csv')
label_ci_test = load_data('../../data/car_insurance/test_label_Y.csv')
label_ic_test = load_data('../../data/insurance_claim_kaggle/test_label_Y.csv')

In [62]:
clf = IsolationForest(random_state=0, n_estimators=400).fit(data_ci)

In [70]:
score = clf.score_samples(data_ci_test)

In [71]:
thresh = np.quantile(score, 0.825)
print(thresh)

-0.5076192066824403


In [73]:
auc = metrics.roc_auc_score(label_ci_test['Label'], score)

In [65]:
y = [0 if scores > thresh else 1 for scores in score]

In [68]:
p, r, f, a = get_scores(label_ci_test['Label'], y)

In [69]:
p, r, f, a

(0.6257938023578541,
 0.3566666666666667,
 0.3241724636220048,
 0.5129860209304102)

In [59]:
fpr, tpr, thresholds = metrics.roc_curve(y, label_ci_test['Label'], pos_label=1)
auc = metrics.roc_auc_score(y, label_ci_test['Label'])

In [60]:
auc

0.5129860209304102

In [50]:
result = clf.predict(data_ci_test)
result = pd.DataFrame(result)
result.replace(to_replace=1, value=0, inplace=True)
result.replace(to_replace=-1, value=1, inplace=True)
p, r, f, a = get_scores(result,label_ci_test['Label'] )

In [51]:
p,r,f,a

(0.8645227272727273, 0.32, 0.4018023976849938, 0.5164772727272727)

In [52]:
fpr, tpr, thresholds = metrics.roc_curve(result, label_ci_test['Label'], pos_label=1)
auc = metrics.auc(fpr, tpr)

In [53]:
auc

0.540707467714767

In [141]:
isof = isolation_forest(data_ci, label_ci, data_ci_test, label_ci_test, 1000)
lof = local_outlier_factor(data_ci, label_ci, data_ci_test, label_ci_test, 2)
ocsvm = one_class_svm(data_ci, label_ci, data_ci_test, label_ci_test, 'linear')
gb = gradient_boosting(data_ci, label_ci, data_ci_test, label_ci_test, 400)
rf = random_forest(data_ci, label_ci, data_ci_test, label_ci_test, 200)

  y = column_or_1d(y, warn=True)


[LibSVM]

  clf.fit(train_data, train_label)


In [142]:
ci_label_encoding = [isof, lof, ocsvm, gb, rf]

In [143]:
isof = isolation_forest(data_vi, label_vi,data_vi_test, label_vi_test, 1000)
lof = local_outlier_factor(data_vi, label_vi,data_vi_test, label_vi_test, 3)
ocsvm = one_class_svm(data_vi, label_vi,data_vi_test, label_vi_test, 'linear')
gb = gradient_boosting(data_vi, label_vi,data_vi_test, label_vi_test, 400)
rf = random_forest(data_vi, label_vi, data_vi_test, label_vi_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [144]:
vi_label_encoding = [isof, lof, ocsvm, gb, rf]

In [145]:
ci_label_encoding

[(0.8434090909090909,
  0.32666666666666666,
  0.39977142857142856,
  0.5170454545454546),
 (0.8403636363636363, 0.68, 0.7480893300248139, 0.47954545454545455),
 (0.43100000000000005, 0.46, 0.4291599020125847, 0.42500000000000004),
 (0.7980454545454545,
  0.7533333333333333,
  0.7703824226506951,
  0.6329545454545454),
 (0.8389090909090909, 0.72, 0.7662295081967213, 0.5465909090909091)]

In [146]:
vi_label_encoding

[(0.669522600957939,
  0.7518374405533939,
  0.6936515966073963,
  0.5219001200305532),
 (0.9874037504087234,
  0.9332036316472114,
  0.959287304379996,
  0.5021532753791874),
 (0.5229450930157767,
  0.5045395590142672,
  0.38677509518644176,
  0.522908861649632),
 (0.9957771168593704,
  0.9366623432771293,
  0.9652118283303335,
  0.5007177584597291),
 (0.9997845890125227,
  0.9386078685689581,
  0.9681169719246988,
  0.5017543859649123)]

In [196]:
isof = isolation_forest(data_vc, label_vc,data_vc_test, label_vc_test, 1000)
lof = local_outlier_factor(data_vc, label_vc,data_vc_test, label_vc_test, 3)
ocsvm = one_class_svm(data_vc, label_vc,data_vc_test, label_vc_test, 'linear')
gb = gradient_boosting(data_vc, label_vc,data_vc_test, label_vc_test, 400)
rf = random_forest(data_vc, label_vc, data_vc_test, label_vc_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [197]:
vc_label_encoding = [isof, lof, ocsvm, gb, rf]

In [198]:
vc_label_encoding

[(0.7711953903529698,
  0.7517054562173043,
  0.7605439335538443,
  0.5963912813609704),
 (0.8754488838195098,
  0.7653491059557389,
  0.8113900611525643,
  0.5309201685209731),
 (0.5197466467461956,
  0.511239236055022,
  0.4669443664608268,
  0.5196522689042293),
 (0.9717205887206912,
  0.967705058588168,
  0.9685833253204285,
  0.9269682158741501),
 (0.9970224692691712,
  0.996980503746412,
  0.9969883431368645,
  0.9929030879022537)]

In [147]:
isof = isolation_forest(data_ic, label_ic,data_ic_test, label_ic_test, 1000)
lof = local_outlier_factor(data_ic, label_ic,data_ic_test, label_ic_test, 3)
ocsvm = one_class_svm(data_ic, label_ic,data_ic_test, label_ic_test, 'linear')
gb = gradient_boosting(data_ic, label_ic,data_ic_test, label_ic_test, 400)
rf = random_forest(data_ic, label_ic, data_ic_test, label_ic_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [148]:
ic_label_encoding = [isof, lof, ocsvm, gb, rf]

In [149]:
ic_label_encoding

[(0.6728455330416216,
  0.7277576274802645,
  0.6899885847425222,
  0.5515382561802552),
 (0.9925772217798615,
  0.885214422871773,
  0.9355732927865607,
  0.5003801591115383),
 (0.5121895718005051,
  0.5051561055401464,
  0.4174329722673293,
  0.5121966697939787),
 (0.998287409447594,
  0.8877035772704645,
  0.9395590589564005,
  0.5012268979352112),
 (0.9996976059883607,
  0.8878458146646754,
  0.9404576612070575,
  0.4999199102995355)]

**One Hot Encoding**

In [9]:
data_vc = load_data('../../data/vehicle_claims/train_OH.csv')
data_vi = load_data('../../data/vehicle_insurance/train_OH.csv')
data_ci = load_data('../../data/car_insurance/train_OH.csv')
data_ic = load_data('../../data/insurance_claim_kaggle/train_OH.csv')

In [10]:
data_ci_test = load_data('../../data/car_insurance/test_OH.csv')
data_vi_test = load_data('../../data/vehicle_insurance/test_OH.csv')
data_vc_test = load_data('../../data/vehicle_claims/test_OH.csv')
data_ic_test = load_data('../../data/insurance_claim_kaggle/test_OH.csv')

In [11]:
label_vc = load_data('../../data/vehicle_claims/train_OH_Y.csv')
label_vi = load_data('../../data/vehicle_insurance/train_OH_Y.csv')
label_ci = load_data('../../data/car_insurance/train_OH_Y.csv')
label_ic = load_data('../../data/insurance_claim_kaggle/train_OH_Y.csv')

In [12]:
label_vc_test = load_data('../../data/vehicle_claims/test_OH_Y.csv')
label_vi_test = load_data('../../data/vehicle_insurance/test_OH_Y.csv')
label_ci_test = load_data('../../data/car_insurance/test_OH_Y.csv')
label_ic_test = load_data('../../data/insurance_claim_kaggle/test_OH_Y.csv')

In [156]:
isof = isolation_forest(data_ci, label_ci, data_ci_test, label_ci_test, 1000)
lof = local_outlier_factor(data_ci, label_ci, data_ci_test, label_ci_test, 2)
ocsvm = one_class_svm(data_ci, label_ci, data_ci_test, label_ci_test, 'linear')
gb = gradient_boosting(data_ci, label_ci, data_ci_test, label_ci_test, 400)
rf = random_forest(data_ci, label_ci, data_ci_test, label_ci_test, 200)

  y = column_or_1d(y, warn=True)


[LibSVM]

  clf.fit(train_data, train_label)


In [157]:
ci_one_hot_encoding = [isof, lof, ocsvm, gb, rf]

In [158]:
ci_one_hot_encoding

[(1.0, 0.7333333333333333, 0.846153846153846, 0.5),
 (0.8403636363636363, 0.68, 0.7480893300248139, 0.47954545454545455),
 (0.43100000000000005, 0.46, 0.4291599020125847, 0.42500000000000004),
 (0.7751136363636364, 0.75, 0.7602102492693302, 0.646590909090909),
 (0.808125, 0.7366666666666667, 0.7638315789473684, 0.5937500000000001)]

In [159]:
isof = isolation_forest(data_vi, label_vi,data_vi_test, label_vi_test, 1000)
lof = local_outlier_factor(data_vi, label_vi,data_vi_test, label_vi_test, 3)
ocsvm = one_class_svm(data_vi, label_vi,data_vi_test, label_vi_test, 'linear')
gb = gradient_boosting(data_vi, label_vi,data_vi_test, label_vi_test, 400)
rf = random_forest(data_vi, label_vi, data_vi_test, label_vi_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [160]:
vi_one_hot_encoding = [isof, lof, ocsvm, gb, rf]

In [161]:
vi_one_hot_encoding

[(1.0, 0.9383916990920882, 0.9682167949146872, 0.5),
 (0.999553518652473,
  0.9381755296152183,
  0.967892443557971,
  0.4998848191660908),
 (0.5229450930157767,
  0.5045395590142672,
  0.38677509518644176,
  0.522908861649632),
 (0.9962219931657642,
  0.9368785127539991,
  0.9655345652768497,
  0.5008329392936384),
 (0.9997845890125227,
  0.9386078685689581,
  0.9681169719246988,
  0.5017543859649123)]

In [203]:
isof = isolation_forest(data_vc, label_vc,data_vc_test, label_vc_test, 1000)
lof = local_outlier_factor(data_vc, label_vc,data_vc_test, label_vc_test, 3)
ocsvm = one_class_svm(data_vc, label_vc,data_vc_test, label_vc_test, 'linear')
gb = gradient_boosting(data_vc, label_vc,data_vc_test, label_vc_test, 400)
rf = random_forest(data_vc, label_vc, data_vc_test, label_vc_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [204]:
vc_one_hot_encoding = [isof, lof, ocsvm, gb, rf]

In [205]:
vc_one_hot_encoding

[(1.0, 0.7885482808752812, 0.8817746653049597, 0.5),
 (0.735171870492404,
  0.7107372292704748,
  0.722096371412523,
  0.5321382298574971),
 (0.5197839800223155,
  0.5113013656075649,
  0.4670102051240022,
  0.5196916637986511),
 (0.985384090513331,
  0.9843185009381562,
  0.9845308749265943,
  0.9636290458211629),
 (0.9712994543220382,
  0.9657914683698448,
  0.9669299381272497,
  0.9191963150769994)]

In [13]:
isof = isolation_forest(data_ic, label_ic,data_ic_test, label_ic_test, 1000)
lof = local_outlier_factor(data_ic, label_ic,data_ic_test, label_ic_test, 3)
ocsvm = one_class_svm(data_ic, label_ic,data_ic_test, label_ic_test, 'linear')
gb = gradient_boosting(data_ic, label_ic,data_ic_test, label_ic_test, 400)
rf = random_forest(data_ic, label_ic, data_ic_test, label_ic_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [14]:
ic_one_hot_encoding = [isof, lof, ocsvm, gb, rf]

In [15]:
ic_one_hot_encoding

[(1.0, 0.8884498109525552, 0.9409302866295516, 0.5),
 (1.0, 0.8884498109525552, 0.9409302866295516, 0.5),
 (0.5072508907936114,
  0.504329796316624,
  0.4160961438634655,
  0.5072649912307322),
 (0.9993214247446774,
  0.8885961702646664,
  0.9405362298889576,
  0.5013252906321305),
 (0.9999237696372455,
  0.8884498109525552,
  0.9408828773414721,
  0.50009560944865)]

**GEL Encoding**

In [206]:
data_vc = load_data('../../data/vehicle_claims/train_gel.csv')
data_vi = load_data('../../data/vehicle_insurance/train_gel.csv')
data_ci = load_data('../../data/car_insurance/train_gel.csv')
data_ic = load_data('../../data/insurance_claim_kaggle/train_gel.csv')

In [207]:
data_ci_test = load_data('../../data/car_insurance/test_gel.csv')
data_vi_test = load_data('../../data/vehicle_insurance/test_gel.csv')
data_vc_test = load_data('../../data/vehicle_claims/test_gel.csv')
data_ic_test = load_data('../../data/insurance_claim_kaggle/test_gel.csv')

In [208]:
label_vc = load_data('../../data/vehicle_claims/train_gel_Y.csv')
label_vi = load_data('../../data/vehicle_insurance/train_gel_Y.csv')
label_ci = load_data('../../data/car_insurance/train_gel_Y.csv')
label_ic = load_data('../../data/insurance_claim_kaggle/train_gel_Y.csv')

In [209]:
label_vc_test = load_data('../../data/vehicle_claims/test_gel_Y.csv')
label_vi_test = load_data('../../data/vehicle_insurance/test_gel_Y.csv')
label_ci_test = load_data('../../data/car_insurance/test_gel_Y.csv')
label_ic_test = load_data('../../data/insurance_claim_kaggle/test_gel_Y.csv')

In [167]:
isof = isolation_forest(data_ci, label_ci, data_ci_test, label_ci_test, 1000)
lof = local_outlier_factor(data_ci, label_ci, data_ci_test, label_ci_test, 2)
ocsvm = one_class_svm(data_ci, label_ci, data_ci_test, label_ci_test, 'linear')
gb = gradient_boosting(data_ci, label_ci, data_ci_test, label_ci_test, 400)
rf = random_forest(data_ci, label_ci, data_ci_test, label_ci_test, 200)

  y = column_or_1d(y, warn=True)


[LibSVM]

  clf.fit(train_data, train_label)


In [168]:
ci_gel_encoding = [isof, lof, ocsvm, gb, rf]

In [169]:
ci_gel_encoding

[(0.7418674242424242,
  0.6433333333333333,
  0.6862928391909223,
  0.4664772727272727),
 (0.8403636363636363, 0.68, 0.7480893300248139, 0.47954545454545455),
 (0.43100000000000005, 0.46, 0.4291599020125847, 0.42500000000000004),
 (0.7863636363636364, 0.7, 0.7347966252829412, 0.5409090909090909),
 (0.9294772727272729, 0.73, 0.8094178278297385, 0.5215909090909091)]

In [170]:
isof = isolation_forest(data_vi, label_vi,data_vi_test, label_vi_test, 1000)
lof = local_outlier_factor(data_vi, label_vi,data_vi_test, label_vi_test, 3)
ocsvm = one_class_svm(data_vi, label_vi,data_vi_test, label_vi_test, 'linear')
gb = gradient_boosting(data_vi, label_vi,data_vi_test, label_vi_test, 400)
rf = random_forest(data_vi, label_vi, data_vi_test, label_vi_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [171]:
vi_gel_encoding = [isof, lof, ocsvm, gb, rf]

In [172]:
vi_gel_encoding

[(0.8537291167676179,
  0.869433635970601,
  0.8613712348224821,
  0.5075158525200355),
 (0.989364002555758,
  0.933852140077821,
  0.9606298987124169,
  0.5008596127499121),
 (0.5229450930157767,
  0.5045395590142672,
  0.38677509518644176,
  0.522908861649632),
 (0.9960145654495018,
  0.937094682230869,
  0.9654492902529932,
  0.5025873252585507),
 (1.0, 0.9383916990920882, 0.9682167949146872, 0.5)]

In [210]:
isof = isolation_forest(data_vc, label_vc,data_vc_test, label_vc_test, 1000)
lof = local_outlier_factor(data_vc, label_vc,data_vc_test, label_vc_test, 3)
ocsvm = one_class_svm(data_vc, label_vc,data_vc_test, label_vc_test, 'linear')
gb = gradient_boosting(data_vc, label_vc,data_vc_test, label_vc_test, 400)
rf = random_forest(data_vc, label_vc, data_vc_test, label_vc_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [211]:
vc_gel_encoding = [isof, lof, ocsvm, gb, rf]

In [212]:
vc_gel_encoding

[(0.9470496608338274,
  0.7902133528834325,
  0.8544792728948711,
  0.5331388611018205),
 (0.7320470462837501,
  0.7090473054413062,
  0.719779360972978,
  0.5314322465876943),
 (0.5197839800223155,
  0.5113013656075649,
  0.4670102051240022,
  0.5196916637986511),
 (0.9290629884928705,
  0.8526162754575841,
  0.8774181597808889,
  0.6728484428754168),
 (0.9524308173144002,
  0.9367148377797383,
  0.9404917789843109,
  0.8563119695787795)]

In [173]:
isof = isolation_forest(data_ic, label_ic,data_ic_test, label_ic_test, 1000)
lof = local_outlier_factor(data_ic, label_ic,data_ic_test, label_ic_test, 3)
ocsvm = one_class_svm(data_ic, label_ic,data_ic_test, label_ic_test, 'linear')
gb = gradient_boosting(data_ic, label_ic,data_ic_test, label_ic_test, 400)
rf = random_forest(data_ic, label_ic, data_ic_test, label_ic_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [174]:
ic_gel_encoding = [isof, lof, ocsvm, gb, rf]

In [175]:
ic_gel_encoding

[(0.8196468640535114,
  0.8184168801073302,
  0.8190299604328836,
  0.5386043179507299),
 (0.9999237696372455,
  0.8884498109525552,
  0.9408828773414721,
  0.50009560944865),
 (0.5071145833648478,
  0.5042566166605683,
  0.4160118670030334,
  0.5071281978822415),
 (0.9988280322700596,
  0.8883278448591292,
  0.9401816749648784,
  0.5010786735507311),
 (0.9995429780587658,
  0.8884498109525552,
  0.9406464700995993,
  0.5005736566918995)]

**Numerical features**

In [213]:
data_vc = load_data('../../data/vehicle_claims/train.csv')
data_vi = load_data('../../data/vehicle_insurance/train.csv')
data_ci = load_data('../../data/car_insurance/train.csv')
data_ic = load_data('../../data/insurance_claim_kaggle/train.csv')

In [214]:
data_ci = remove_cols(data_ci, ci_categorical_cols)
data_vi = remove_cols(data_vi, vi_categorical_cols)
data_vc = remove_cols(data_vc, vc_categorical_cols)
data_ic = remove_cols(data_ic, ic_categorical_cols)

In [215]:
data_ci_test = load_data('../../data/car_insurance/test.csv')
data_vi_test = load_data('../../data/vehicle_insurance/test.csv')
data_vc_test = load_data('../../data/vehicle_claims/test.csv')
data_ic_test = load_data('../../data/insurance_claim_kaggle/test.csv')

In [216]:
data_ci_test = remove_cols(data_ci_test, ci_categorical_cols)
data_vi_test = remove_cols(data_vi_test, vi_categorical_cols)
data_vc_test = remove_cols(data_vc_test, vc_categorical_cols)
data_ic_test = remove_cols(data_ic_test, ic_categorical_cols)

In [217]:
label_vc = load_data('../../data/vehicle_claims/train_Y.csv')
label_vi = load_data('../../data/vehicle_insurance/train_Y.csv')
label_ci = load_data('../../data/car_insurance/train_Y.csv')
label_ic = load_data('../../data/insurance_claim_kaggle/train_Y.csv')

In [218]:
label_vc_test = load_data('../../data/vehicle_claims/test_Y.csv')
label_vi_test = load_data('../../data/vehicle_insurance/test_Y.csv')
label_ci_test = load_data('../../data/car_insurance/test_Y.csv')
label_ic_test = load_data('../../data/insurance_claim_kaggle/test_Y.csv')

In [183]:
isof = isolation_forest(data_ci, label_ci, data_ci_test, label_ci_test, 1000)
lof = local_outlier_factor(data_ci, label_ci, data_ci_test, label_ci_test, 2)
ocsvm = one_class_svm(data_ci, label_ci, data_ci_test, label_ci_test, 'linear')
gb = gradient_boosting(data_ci, label_ci, data_ci_test, label_ci_test, 400)
rf = random_forest(data_ci, label_ci, data_ci_test, label_ci_test, 200)

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


[LibSVM]

In [184]:
ci_numerical = [isof, lof, ocsvm, gb, rf]

In [185]:
ci_numerical

[(0.4695757575757576,
  0.4266666666666667,
  0.3991666666666668,
  0.45795454545454545),
 (0.8403636363636363, 0.68, 0.7480893300248139, 0.47954545454545455),
 (0.43100000000000005, 0.46, 0.4291599020125847, 0.42500000000000004),
 (0.8489583333333334,
  0.6966666666666667,
  0.7592592592592592,
  0.5028409090909091),
 (0.970340909090909,
  0.7299999999999999,
  0.8294574528840664,
  0.5056818181818181)]

In [186]:
isof = isolation_forest(data_vi, label_vi,data_vi_test, label_vi_test, 1000)
lof = local_outlier_factor(data_vi, label_vi,data_vi_test, label_vi_test, 3)
ocsvm = one_class_svm(data_vi, label_vi,data_vi_test, label_vi_test, 'linear')
gb = gradient_boosting(data_vi, label_vi,data_vi_test, label_vi_test, 400)
rf = random_forest(data_vi, label_vi, data_vi_test, label_vi_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [187]:
vi_numerical = [isof, lof, ocsvm, gb, rf]

In [188]:
vi_numerical

[(0.7643999570234774,
  0.1805015131863381,
  0.13146154789661912,
  0.49942045854096195),
 (0.9794743743991807,
  0.9293125810635539,
  0.9535279428128297,
  0.5000800203688212),
 (0.5229450930157767,
  0.5045395590142672,
  0.38677509518644176,
  0.522908861649632),
 (0.999553518652473,
  0.9381755296152183,
  0.967892443557971,
  0.4998848191660908),
 (0.9876553728434808,
  0.9336359706009512,
  0.9595508169730057,
  0.504022842178009)]

In [219]:
isof = isolation_forest(data_vc, label_vc,data_vc_test, label_vc_test, 1000)
lof = local_outlier_factor(data_vc, label_vc,data_vc_test, label_vc_test, 3)
ocsvm = one_class_svm(data_vc, label_vc,data_vc_test, label_vc_test, 'linear')
gb = gradient_boosting(data_vc, label_vc,data_vc_test, label_vc_test, 400)
rf = random_forest(data_vc, label_vc, data_vc_test, label_vc_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [220]:
vc_numerical = [isof, lof, ocsvm, gb, rf]

In [221]:
vc_numerical

[(0.9210959547648231,
  0.8071498689066441,
  0.8503386917774254,
  0.5835946954734148),
 (0.7314918834548403,
  0.7087242317680829,
  0.7193548331743467,
  0.5312488965401406),
 (0.5197839800223155,
  0.5113013656075649,
  0.4670102051240022,
  0.5196916637986511),
 (0.945708679769342,
  0.8559588453843956,
  0.8845010407397714,
  0.6660439757680261),
 (0.9094450314562023,
  0.8718888626564112,
  0.8839140931597926,
  0.737924104778631)]

In [189]:
isof = isolation_forest(data_ic, label_ic,data_ic_test, label_ic_test, 1000)
lof = local_outlier_factor(data_ic, label_ic,data_ic_test, label_ic_test, 3)
ocsvm = one_class_svm(data_ic, label_ic,data_ic_test, label_ic_test, 'linear')
gb = gradient_boosting(data_ic, label_ic,data_ic_test, label_ic_test, 400)
rf = random_forest(data_ic, label_ic, data_ic_test, label_ic_test, 200)



[LibSVM]

  y = column_or_1d(y, warn=True)
  clf.fit(train_data, train_label)


In [190]:
ic_numerical = [isof, lof, ocsvm, gb, rf]

In [191]:
ic_numerical

[(0.6538124254347899,
  0.7146717872128583,
  0.6747398503450756,
  0.5225315972937309),
 (0.9988711027267394,
  0.8875613398762535,
  0.9398929587906114,
  0.5000371463658345),
 (0.5121895718005051,
  0.5051561055401464,
  0.4174329722673293,
  0.5121966697939787),
 (0.9989515080980292,
  0.8877035772704645,
  0.9399689955283843,
  0.500394651533527),
 (0.9998487972983052,
  0.8879169333617808,
  0.9405644604133684,
  0.49995995514976777)]

**Explore data**