In [2]:
import numpy as np
import pandas as pd
from utils import *
import os
from sklearn.model_selection import train_test_split

In [24]:
# Locations of the four raw CSV files, relative to the notebook's working directory.
VEHICLE_CLAIMS_CSV = '../../data/vehicle_claims/vehicle_claims_labeled.csv'
VEHICLE_INSURANCE_CSV = '../../data/vehicle_insurance/fraud_oracle.csv'
CAR_INSURANCE_CSV = '../../data/car_insurance/insurance_claims.csv'
INSURANCE_CLAIMS_CSV = '../../data/insurance_claim_kaggle/ic_claims.csv'

# Read every dataset with load_data (made available by the `utils` star import).
vehicle_claims = load_data(VEHICLE_CLAIMS_CSV)
vehicle_insurance = load_data(VEHICLE_INSURANCE_CSV)
car_insurance = load_data(CAR_INSURANCE_CSV)
insurance_claims = load_data(INSURANCE_CLAIMS_CSV)

In [20]:
# Build the label object for each dataset. get_labels comes from the `utils`
# star import and receives the loaded frame plus the dataset's name.
# NOTE(review): presumably the name selects the dataset-specific label column —
# confirm in utils.get_labels.
label_vc = get_labels(vehicle_claims, "vehicle_claims")
label_vi = get_labels(vehicle_insurance, "vehicle_insurance")
label_ci = get_labels(car_insurance, "car_insurance")
# The Kaggle dataset's labels are read straight from its 'target' column. This has
# to run before the remove_cols cell, whose removal list for insurance_claims
# includes 'target'.
label_ic = insurance_claims['target']

In [21]:
# Columns handed to the encoders as categorical features, one list per dataset.

# vehicle_claims — note that ' Genmodel' is spelled with a leading space.
vc_categorical_cols = [
    'Maker',
    'Reg_year',
    ' Genmodel',
    'Color',
    'Bodytype',
    'Engin_size',
    'Gearbox',
    'Fuel_type',
    'Seat_num',
    'Door_num',
    'issue',
    'issue_id',
    'repair_complexity',
]

# vehicle_insurance
vi_categorical_cols = [
    'Make',
    'AccidentArea',
    'Sex',
    'MaritalStatus',
    'Fault',
    'PolicyType',
    'VehicleCategory',
    'Deductible',
    'Days_Policy_Accident',
    'Days_Policy_Claim',
    'AgeOfVehicle',
    'AgeOfPolicyHolder',
    'PoliceReportFiled',
    'WitnessPresent',
    'AgentType',
    'NumberOfSuppliments',
    'AddressChange_Claim',
    'VehiclePrice',
    'PastNumberOfClaims',
    'NumberOfCars',
    'BasePolicy',
    'Month',
    'MonthClaimed',
    'DayOfWeek',
    'DayOfWeekClaimed',
]

# car_insurance
ci_categorical_cols = [
    'policy_state',
    'umbrella_limit',
    'insured_sex',
    'insured_education_level',
    'insured_occupation',
    'insured_hobbies',
    'insured_relationship',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'incident_state',
    'incident_city',
    'property_damage',
    'police_report_available',
    'auto_make',
    'auto_model',
]

# insurance_claims (Kaggle): ps_ind_{02,04,05}_cat followed by ps_car_01_cat .. ps_car_11_cat.
ic_categorical_cols = (
    ['ps_ind_%02d_cat' % number for number in (2, 4, 5)]
    + ['ps_car_%02d_cat' % number for number in range(1, 12)]
)

In [22]:
# Columns passed to remove_cols for each dataset (vehicle_insurance has no such list).

# vehicle_claims — ' Genmodel_ID' is spelled with a leading space.
vc_cols_to_remove = [
    ' Genmodel_ID',
    'Adv_year',
    'Adv_month',
    'Adv_day',
    'breakdown_date',
    'repair_date',
    'category_anomaly',
]

# car_insurance
ci_cols_to_remove = [
    'policy_number',
    'policy_bind_date',
    'policy_csl',
    'incident_location',
    'incident_date',
    '_c39',
]

# insurance_claims (Kaggle): the row id and the 'target' label column.
ic_cols_to_remove = [
    'id',
    'target',
]

In [16]:
# Pass each frame and its *_cols_to_remove list to remove_cols (from the `utils`
# star import) and rebind the dataset name to the result.
# NOTE(review): this cell overwrites its own inputs, so re-running it feeds the
# already-reduced frames back in; whether that raises depends on remove_cols — confirm.
# NOTE(review): vehicle_insurance is not passed through remove_cols — confirm that
# its label column is not left among the features.
car_insurance = remove_cols(car_insurance, ci_cols_to_remove)
vehicle_claims = remove_cols(vehicle_claims, vc_cols_to_remove)
insurance_claims = remove_cols(insurance_claims, ic_cols_to_remove)

In [8]:
def _split_and_save(features, labels, path, suffix, test_size, random_state):
    """Split ``features``/``labels`` once and write the four parts as CSV files.

    The files are written into ``path`` as ``train<suffix>.csv``,
    ``test<suffix>.csv``, ``train<suffix>_Y.csv`` and ``test<suffix>_Y.csv``,
    all without the index column.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    outputs = (
        ("train" + suffix + ".csv", X_train),
        ("test" + suffix + ".csv", X_test),
        ("train" + suffix + "_Y.csv", Y_train),
        ("test" + suffix + "_Y.csv", Y_test),
    )
    for file_name, part in outputs:
        part.to_csv(os.path.join(path, file_name), index=False)


def save_train_test_data(data, labels, path, cat_cols, test_size=0.3, random_state=42):
    """Write train/test CSV splits of the raw and the label-encoded data.

    Two splits are written into ``path``:

    * the unmodified ``data``  -> ``train.csv``, ``test.csv``, ``train_Y.csv``,
      ``test_Y.csv``
    * ``label_encoding(data, cat_cols)`` -> ``train_label.csv``,
      ``test_label.csv``, ``train_label_Y.csv``, ``test_label_Y.csv``

    Both splits use the same ``test_size`` and ``random_state``.

    Parameters
    ----------
    data : feature table accepted by ``train_test_split``; the split parts must
        provide ``to_csv``.
    labels : labels aligned with ``data``; the split parts must provide ``to_csv``.
    path : directory the CSV files are written to.
    cat_cols : column names forwarded to ``label_encoding``.
    test_size : forwarded to ``train_test_split`` (default 0.3, the value that
        was previously hard-coded).
    random_state : forwarded to ``train_test_split`` (default 42, the value that
        was previously hard-coded).

    Returns
    -------
    None. The function is called for its file-writing side effect.
    """
    # Raw features are written first, before the encoder sees the frame.
    _split_and_save(data, labels, path, "", test_size, random_state)

    # NOTE(review): label_encoding comes from the `utils` star import; whether it
    # modifies `data` in place is not visible here — confirm before reusing `data`.
    data_label = label_encoding(data, cat_cols)
    _split_and_save(data_label, labels, path, "_label", test_size, random_state)

In [9]:
def save_train_test_data_OH(data, labels, path, cat_cols, test_size=0.3, random_state=42):
    """Write a train/test CSV split of the one-hot-encoded data.

    ``data`` is encoded with ``one_hot_encoding(data, cat_cols)`` (from the
    `utils` star import), split together with ``labels``, and the four parts are
    written into ``path`` as ``train_OH.csv``, ``test_OH.csv``,
    ``train_OH_Y.csv`` and ``test_OH_Y.csv`` without the index column.

    Parameters
    ----------
    data : feature table passed to ``one_hot_encoding``.
    labels : labels aligned with ``data``; the split parts must provide ``to_csv``.
    path : directory the CSV files are written to.
    cat_cols : column names forwarded to ``one_hot_encoding``.
    test_size : forwarded to ``train_test_split`` (default 0.3, the value that
        was previously hard-coded).
    random_state : forwarded to ``train_test_split`` (default 42, the value that
        was previously hard-coded).

    Returns
    -------
    None. The function is called for its file-writing side effect.
    """
    data_OH = one_hot_encoding(data, cat_cols)
    train, test, Y_train, Y_test = train_test_split(
        data_OH, labels, test_size=test_size, random_state=random_state
    )
    outputs = (
        ("train_OH.csv", train),
        ("test_OH.csv", test),
        ("train_OH_Y.csv", Y_train),
        ("test_OH_Y.csv", Y_test),
    )
    for file_name, part in outputs:
        part.to_csv(os.path.join(path, file_name), index=False)

In [10]:
def save_train_test_data_gel(data, labels, path, cat_cols, test_size=0.3, random_state=42):
    """Write a train/test CSV split of the GEL-encoded data.

    ``data`` is encoded with ``gel_encoding(data, cat_cols)`` (from the `utils`
    star import), split together with ``labels``, and the four parts are written
    into ``path`` as ``train_gel.csv``, ``test_gel.csv``, ``train_gel_Y.csv`` and
    ``test_gel_Y.csv`` without the index column.

    Parameters
    ----------
    data : feature table passed to ``gel_encoding``.
    labels : labels aligned with ``data``; the split parts must provide ``to_csv``.
    path : directory the CSV files are written to.
    cat_cols : column names forwarded to ``gel_encoding``.
    test_size : forwarded to ``train_test_split`` (default 0.3, the value that
        was previously hard-coded).
    random_state : forwarded to ``train_test_split`` (default 42, the value that
        was previously hard-coded).

    Returns
    -------
    None. The function is called for its file-writing side effect.
    """
    data_gel = gel_encoding(data, cat_cols)
    train, test, Y_train, Y_test = train_test_split(
        data_gel, labels, test_size=test_size, random_state=random_state
    )
    outputs = (
        ("train_gel.csv", train),
        ("test_gel.csv", test),
        ("train_gel_Y.csv", Y_train),
        ("test_gel_Y.csv", Y_test),
    )
    for file_name, part in outputs:
        part.to_csv(os.path.join(path, file_name), index=False)

In [99]:
# Write the raw and label-encoded train/test splits for all four datasets.
save_train_test_data(
    data=car_insurance,
    labels=label_ci,
    path='../../data/car_insurance',
    cat_cols=ci_categorical_cols,
)
save_train_test_data(
    data=vehicle_insurance,
    labels=label_vi,
    path='../../data/vehicle_insurance',
    cat_cols=vi_categorical_cols,
)
save_train_test_data(
    data=vehicle_claims,
    labels=label_vc,
    path='../../data/vehicle_claims',
    cat_cols=vc_categorical_cols,
)
save_train_test_data(
    data=insurance_claims,
    labels=label_ic,
    path='../../data/insurance_claim_kaggle',
    cat_cols=ic_categorical_cols,
)

In [107]:
# Write the one-hot-encoded train/test splits for all four datasets.
save_train_test_data_OH(
    data=car_insurance,
    labels=label_ci,
    path='../../data/car_insurance',
    cat_cols=ci_categorical_cols,
)
save_train_test_data_OH(
    data=vehicle_insurance,
    labels=label_vi,
    path='../../data/vehicle_insurance',
    cat_cols=vi_categorical_cols,
)
save_train_test_data_OH(
    data=vehicle_claims,
    labels=label_vc,
    path='../../data/vehicle_claims',
    cat_cols=vc_categorical_cols,
)
save_train_test_data_OH(
    data=insurance_claims,
    labels=label_ic,
    path='../../data/insurance_claim_kaggle',
    cat_cols=ic_categorical_cols,
)

In [113]:
# Write the GEL-encoded train/test splits for all four datasets.
save_train_test_data_gel(
    data=car_insurance,
    labels=label_ci,
    path='../../data/car_insurance',
    cat_cols=ci_categorical_cols,
)
save_train_test_data_gel(
    data=vehicle_insurance,
    labels=label_vi,
    path='../../data/vehicle_insurance',
    cat_cols=vi_categorical_cols,
)
save_train_test_data_gel(
    data=vehicle_claims,
    labels=label_vc,
    path='../../data/vehicle_claims',
    cat_cols=vc_categorical_cols,
)
save_train_test_data_gel(
    data=insurance_claims,
    labels=label_ic,
    path='../../data/insurance_claim_kaggle',
    cat_cols=ic_categorical_cols,
)

In [None]:
def save_normal_data(data, path, name):
    """Write the frame returned by ``get_normal_data`` to ``normal_data.csv``.

    ``get_normal_data(data, name)`` (from the `utils` star import) produces the
    frame to save; its index is reset to a fresh 0..n-1 range and the result is
    written to ``<path>/normal_data.csv`` without the index column.

    Parameters
    ----------
    data : table forwarded to ``get_normal_data``.
    path : directory the CSV file is written to.
    name : dataset name forwarded to ``get_normal_data``.

    Returns
    -------
    None. The function is called for its file-writing side effect.

    NOTE(review): presumably ``get_normal_data`` keeps only the non-anomalous
    rows of the named dataset — confirm in utils, including which column it
    relies on.
    """
    # The former bare `repr(data)` statement was dropped: it rendered the whole
    # frame to a string and discarded the result.
    normal_data = get_normal_data(data, name).reset_index(drop=True)
    normal_data.to_csv(os.path.join(path, "normal_data.csv"), index=False)

In [None]:
# Export normal_data.csv for the vehicle_claims dataset.
# NOTE(review): at this point vehicle_claims has already been passed through
# remove_cols — confirm get_normal_data does not need one of the removed columns.
save_normal_data(
    data=vehicle_claims,
    path='../../data/vehicle_claims',
    name="vehicle_claims",
)

In [17]:
# Repeats the earlier vehicle_insurance call with identical arguments; the same
# output files are written again.
save_train_test_data(
    data=vehicle_insurance,
    labels=label_vi,
    path='../../data/vehicle_insurance',
    cat_cols=vi_categorical_cols,
)

In [23]:
# Repeats the earlier one-hot vehicle_insurance call with identical arguments;
# the same output files are written again.
save_train_test_data_OH(
    data=vehicle_insurance,
    labels=label_vi,
    path='../../data/vehicle_insurance',
    cat_cols=vi_categorical_cols,
)

In [25]:
# Repeats the earlier GEL vehicle_insurance call with identical arguments; the
# same output files are written again.
save_train_test_data_gel(
    data=vehicle_insurance,
    labels=label_vi,
    path='../../data/vehicle_insurance',
    cat_cols=vi_categorical_cols,
)