In [305]:
socar_df = pd.read_csv('insurance_fraud_detect_data.csv')

socar = socar_df.copy()
socar.tail()

Unnamed: 0,fraud_YN,car_model,sharing_type,age_group,has_previous_accident,cumulative_use_count,b2b,accident_ratio,pf_type,socarpass,...,insure_cost,accident_location,car_part1,car_part2,repair_cnt,acc_type1,insurance_site_aid_YN,police_site_aid_YN,total_prsn_cnt,test_set
15995,0,2,0,2,0,2,0,100,2,0,...,0,1,1,0,1,0,0,0,-1,0
15996,0,2,0,2,1,4,0,100,1,1,...,0,2,1,0,1,0,0,0,-1,0
15997,0,2,1,2,0,4,0,100,1,0,...,0,0,0,0,1,0,0,0,-1,1
15998,0,2,0,2,0,2,0,100,2,0,...,0,4,1,0,1,0,0,0,-1,0
15999,0,2,0,2,0,1,0,100,1,0,...,1188750,2,0,0,1,0,0,0,-1,0


In [306]:
socar_tr = socar[socar.test_set == 0].drop(['test_set'], axis=1)
socar_test = socar[socar.test_set == 1].drop(['test_set'], axis=1)

In [307]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler)
from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)

class MachineRunning:
    
    def __init__(self, random_state):
        self.random_state = random_state
        
        self.num_attribs = ['accident_ratio', 'repair_cost', 'insure_cost']
        
        self.lg_clf = LogisticRegression(random_state=random_state)
        self.dt_clf = DecisionTreeClassifier(random_state=random_state, max_depth=2)
        self.rf_clf = RandomForestClassifier(random_state=random_state, n_jobs=-1, n_estimators=100)
        self.lgbm_clf = LGBMClassifier(random_state=random_state, n_estimators=200, num_leaves=16, n_jobs=-1, boost_from_average=False)
        self.svm_clf = LinearSVC(random_state=random_state)
        
        self.lg_params=[{'C':[0.1, 0.5, 1]}]
        self.dt_params= [{'max_depth': [2, 4, 8, 16]}]
        self.rf_params = [{'n_estimators': [50, 100, 200, 400]}]
        self.lgbm_params = [{'n_estimators': [50, 100, 200, 400], 'num_leaves': [4, 8, 16, 32]}]
        self.svm_params = [{'C':[0.1, 0.5, 1]}]
        
        self.models = [(self.lg_clf, self.lg_params), (self.dt_clf, self.dt_params), (self.rf_clf, self.rf_params), 
                       (self.lgbm_clf, self.lgbm_params), (self.svm_clf, self.svm_params)]
        self.model_names = ['LogisticRegression','LogisticRegression(test)','DecisionTree','DecisionTree(test)',
                   'RandomForest','RandomForest(test)','LightGBM','LightGBM(test)', 'SVM', 'SVM(test)']
        self.col_names = ['accuracy','precision','recall','f1','roc_auc']    
        
        self.tmp = []
        
    def drop_cols(self, columns, train_data, test_data):
        train_data = train_data.drop(columns, axis=1) 
        test_data = test_data.drop(columns, axis=1)
        
        return train_data, test_data
        
    def split_dataset(self, train_data, test_data):
        X = train_data.drop('fraud_YN', axis=1)
        y = train_data.fraud_YN
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=self.random_state, stratify=y)

        X_eval = test_data.drop('fraud_YN', axis=1)
        y_eval = test_data.fraud_YN
        
        return X_train, X_test, y_train, y_test, X_eval, y_eval
        
    def scaling(self, scaler, columns, X_train, X_test, X_eval):
        num_attribs = [attrib for attrib in self.num_attribs if attrib not in columns]

        scaler_obj = scaler
        scaler_obj.fit(X_train[num_attribs])

        X_train[num_attribs] = scaler_obj.transform(X_train[num_attribs])
        X_test[num_attribs] = scaler_obj.transform(X_test[num_attribs])
        X_eval[num_attribs] = scaler_obj.transform(X_eval[num_attribs])
        
        return X_train, X_test, X_eval
        
    def one_hot_encoding(self, columns, X_train, X_test, X_eval):
        cat_attribs = [attrib for attrib in X_train.columns if attrib not in columns + self.num_attribs]
                
        X_train = pd.get_dummies(X_train, columns=cat_attribs)
        X_test = pd.get_dummies(X_test, columns=cat_attribs)
        X_eval = pd.get_dummies(X_eval, columns=cat_attribs)
        
        return X_train, X_test, X_eval
                       
    def sampling(self, sampling, X_train, y_train):
        spl = sampling(random_state=self.random_state)
        X_train, y_train = spl.fit_sample(X_train, y_train)  
        
        return X_train, y_train
    
    def get_pca_data(self, data, random_state, n_components=2):
        pca = PCA(n_components=n_components, random_state=self.random_state)
        pca.fit(data)

        return pca.transform(data), pca
    
    def pca(self, n_components, X_train, X_test, X_eval):
        X_train, pca1 = self.get_pca_data(X_train, self.random_state, n_components)
        X_test, pca2 = self.get_pca_data(X_test, self.random_state, n_components)
        X_eval, pca3 = self.get_pca_data(X_eval, self.random_state, n_components)
        
        return X_train, X_test, X_eval

    def get_clf_eval(self, y_test, pred):
        acc = accuracy_score(y_test, pred)
        pre = precision_score(y_test, pred)
        re = recall_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        auc = roc_auc_score(y_test, pred)

        return acc, pre, re, f1, auc
    
    def get_result_pd(self, X_train, y_train, X_test, y_test, test_data_X, test_data_y):
        for model, param in self.models:
            # Modeling
            grid = GridSearchCV(model, param, cv=5, scoring = 'recall')
            grid.fit(X_train, y_train)
            pred = grid.predict(X_test)
            self.tmp.append(self.get_clf_eval(y_test, pred))
            # predict test dataset
            pred_test = grid.predict(test_data_X)
            self.tmp.append(self.get_clf_eval(test_data_y, pred_test))

        df = pd.DataFrame(self.tmp, columns=self.col_names, index=self.model_names)
        df = df.style.applymap(self.color)

        return df
    
    def color(self, val):
        color = 'orange' if val > 0.6 else 'black'
        return 'color: %s' % color

In [308]:
test = MachineRunning(random_state=13)

In [309]:
train_data, test_data = test.drop_cols(['repair_cost'], socar_tr, socar_test)
X_train, X_test, y_train, y_test, X_eval, y_eval = test.split_dataset(train_data, test_data)
X_train, X_test, X_eval = test.scaling(StandardScaler(), ['repair_cost'], X_train, X_test, X_eval)
X_train, X_test, X_eval = test.one_hot_encoding(['repair_cost'], X_train, X_test, X_eval)
X_train, y_train = test.sampling(SMOTE, X_train, y_train)
X_train, X_test, X_eval = test.pca(2, X_train, X_test, X_eval)
df = test.get_result_pd(X_train, y_train, X_test, y_test, X_eval, y_eval)
df

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
LogisticRegression,0.728649,0.007123,0.714286,0.014104,0.721487
LogisticRegression(test),0.611343,0.001653,0.285714,0.003287,0.448894
DecisionTree,0.187888,0.002387,0.714286,0.004757,0.45037
DecisionTree(test),0.623839,0.001708,0.285714,0.003396,0.455156
RandomForest,0.923137,0.0,0.0,0.0,0.462826
RandomForest(test),0.9372,0.005236,0.142857,0.010101,0.540921
LightGBM,0.687888,0.001252,0.142857,0.002481,0.416115
LightGBM(test),0.723486,0.002326,0.285714,0.004614,0.505092
SVM,0.716227,0.006812,0.714286,0.013495,0.715259
SVM(test),0.597885,0.002392,0.428571,0.004758,0.513419
