In [55]:
from model.DRST import DRST

import os
import warnings
import pandas as pd
from pathlib import Path

from sklearn import tree, metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [56]:
# Working directory of the running kernel; the cells below prefix it to every
# output path they build.
current_dir = os.getcwd()
# Disabled: earlier per-model result-file setup. `model` and `pat` are not
# defined in any visible cell, so these lines cannot simply be re-enabled.
# Path('%s\\output\\%s\\%s' % (current_dir, model, pat)).mkdir(parents=True, exist_ok=True)
# file1 = open('%s\\output\\%s\\%s\\result.txt' % (current_dir, model, pat), 'w+')

In [57]:
# Registry of benchmark datasets, keyed by display name. Each entry starts with
# static settings only; the loading cells later add the actual frames
# ('data', 'data_original', 'data_drst') through dict.update().
datasets = dict()

datasets['Health Insurance'] = dict(
    path='datasets\\health_insurance.csv',
    outputPath='output\\Health Insurance',
    nameregex='health_insurance',
    dataset_size='normal',
    # Replaced with 'Response' when the Health Insurance frame is registered.
    class_name='class',
)

datasets['D2'] = dict(
    path='datasets\\d2.csv',
    outputPath='output\\D2',
    nameregex='d2',
    dataset_size='normal',
    class_name='fraud_reported',
)

# Display names in insertion order: index 0 is Health Insurance, index 1 is D2.
datasets_names = [*datasets]

# Split label -> [test %, train %]; the Test_* functions read element 0 as the test share.
test_ration = {
    label: [int(part) for part in label.split('-')]
    for label in ('20-80', '30-70', '40-60')
}

In [58]:
# Columns that encoder() leaves untouched (the continuous Health Insurance columns).
ex_c_c = ['Annual_Premium', 'Vintage', 'Age']
def encoder(d):
    '''
        Return a label-encoded copy of a DataFrame.

        Every column whose name is not listed in ``ex_c_c`` is replaced by the
        integer codes produced by a LabelEncoder fitted on that column. The
        input frame ``d`` is not modified.

        Parameters
        ----------
        d : pandas.DataFrame
            Frame to encode.

        Returns
        -------
        pandas.DataFrame
            Copy of ``d`` with the non-excluded columns label-encoded.
    '''
    data_encoded = d.copy()

    for col in data_encoded.columns:
        if col in ex_c_c:
            continue
        # A fresh encoder per column: the previous version refit one shared
        # instance and kept it in a never-returned dict, so every stored entry
        # pointed at the encoder state of the last column.
        column_encoder = LabelEncoder()
        data_encoded[col] = column_encoder.fit(data_encoded[col]).transform(data_encoded[col])
    return data_encoded

In [59]:

# Load the Health Insurance CSV; `data` keeps the raw frame, `df` is the working copy.
data = pd.read_csv(datasets['Health Insurance']['path'])
df = data.copy()
# Work on a 350-row sample; the fixed random_state makes the sample repeatable.
df = df.sample(350, random_state=41) # 41
# df.drop(['Vintage', 'Age'], axis=1, inplace=True)

# Create the dataset's output folder under the current working directory.
Path('%s\\%s' % (current_dir, datasets['Health Insurance']['outputPath'])).mkdir(parents=True, exist_ok=True)

# Run DRST on the sample with 'Response' as the decision column and three
# columns declared continuous. fit() returns a frame plus a collection of
# column names. NOTE(review): the exact contents of both come from model.DRST,
# which is not visible here - confirm there.
drst = DRST(comb_max_depth=1, decision_column_name='Response', output_loction=datasets['Health Insurance']['outputPath'])
drst_fit, continuous_columns = drst.fit(df, continous_columns=['Annual_Premium', 'Vintage', 'Age'])
# Drop the columns named by `continuous_columns` from the DRST result, in place.
drst_fit.drop(continuous_columns, axis=1, inplace=True)

# Register the frames the Test_* functions read ('data_original' and
# 'data_drst') and switch the class column from the placeholder 'class' to 'Response'.
# 'data' and 'drop_column' are stored too but are not read by any Test_* function visible here.
datasets['Health Insurance'].update({
    'drop_column': ['Vintage', 'Age'],
    'class_name': 'Response',
    'data': df,
    'data_original': encoder(df),
    'data_drst': drst_fit
    })

Annual_Premium    3
Vintage           3
Age               4
dtype: int64


In [60]:
# Load the D2 CSV and remove three columns from it in place before any processing.
D2 = pd.read_csv(datasets['D2']['path'])
D2.drop(['months_as_customer', 'age', 'total_claim_amount'], axis=1, inplace=True)
# Run DRST with 'fraud_reported' as the decision column and a single column
# declared continuous. This rebinds the global `continuous_columns` set by the
# Health Insurance cell.
drst1 = DRST(comb_max_depth=1, decision_column_name='fraud_reported', output_loction=datasets['D2']['outputPath'])
drst_fit1, continuous_columns= drst1.fit(D2, continous_columns=['policy_annual_premium'])
# Overwrite the original column with the 'policy_annual_premium_AFTER' column of
# the DRST result, then remove the '_AFTER' column so the name stays the same.
# NOTE(review): presumably '_AFTER' holds DRST's processed values - confirm in model.DRST.
drst_fit1['policy_annual_premium'] = drst_fit1['policy_annual_premium_AFTER']
drst_fit1.drop('policy_annual_premium_AFTER', axis=1, inplace=True)

# Register the frames read by the Test_* functions.
# NOTE(review): encoder() only skips the columns in `ex_c_c`, which does not
# include 'policy_annual_premium', so that column is label-encoded in
# 'data_original' - confirm this is intended.
datasets['D2'].update({
            'drop_column': [],
            'data': D2,'data_original': encoder(D2),
            'data_drst': drst_fit1
            })

19
policy_annual_premium    7
dtype: int64


In [61]:
def Test_RFC(test_name, test_split='20-80', print_flag=False):
    '''
        Train and score a RandomForestClassifier on both stored variants of a dataset.

        The 'data_original' and 'data_drst' frames of ``datasets[test_name]`` are
        each split, fitted and evaluated. For every variant a classification
        report (CSV) and a plot of the forest's first tree (PNG) are written to
        the RandomForestClassifier/<test_split> folder of the dataset's output
        directory, and one accuracy line is appended to the shared
        accuracy_result.text log.

        Parameters
        ----------
        test_name : str
            Key of the dataset in ``datasets``.
        test_split : str
            Key of ``test_ration``; the first number of the entry is the test
            percentage, the remainder is used for training.
        print_flag : bool
            When True, also print the rule count, accuracy and report.
    '''
    class_name = datasets[test_name]['class_name']
    regex_name = datasets[test_name]['nameregex']
    test_fraction = test_ration[test_split][0] / 100

    out_dir = '%s\\output\\%s\\RandomForestClassifier\\%s' % (current_dir, test_name, test_split)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # Open the log once for both variants; the context manager closes it even
    # if training or plotting raises (the old code reopened it per variant and
    # closed only the last handle).
    with open('%s\\output\\accuracy_result.text' % (current_dir), 'a+') as file:
        for mode in ['data_original', 'data_drst']:
            data = datasets[test_name][mode].copy()

            feature_names = [i for i in data.columns if i != class_name]
            X, y = data[feature_names], data[class_name]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction, random_state=0)

            clf = RandomForestClassifier(n_estimators=100, random_state=21)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            accuracy = metrics.accuracy_score(y_test, y_pred)

            # Only the first tree of the forest is inspected and plotted.
            first_tree = clf.estimators_[0]
            # One rule per root-to-leaf path. The previous value was the
            # character length of the exported tree text, not a rule count.
            rules_number = int(first_tree.get_n_leaves())

            report = classification_report(y_test, y_pred, output_dict=True)
            report.update({'Rules Number': rules_number})
            report_df = pd.DataFrame(report).transpose()
            report_df.to_csv('%s\\report-CLFTester-%s-%s.csv' % (out_dir, regex_name, mode))
            file.write("data %s:ACCURACY for data %s in model RandomForestClassifier with split ration %s === %s\n" % (test_name,mode,test_split,accuracy))

            if print_flag:
                print('------------------------')
                print(test_name)
                print('------------------------')
                print("Number of rules Extracted from the model: %s" % rules_number)
                print("ACCURACY: %s" % (accuracy))
                print("CLASSIFICATION REPORT \n %s" % report)

            fig, ax = plt.subplots()
            tree.plot_tree(first_tree,
                        feature_names=feature_names,
                        # plot_tree expects one label per class; passing the
                        # class column name (a plain string) was a bug.
                        class_names=[str(label) for label in clf.classes_],
                        filled=True,
                        ax=ax)
            fig.savefig('%s\\CLFTester-%s-%s.png' % (out_dir, regex_name, mode))
            plt.close(fig)
        file.write('\n')

In [62]:
def Test_DT(test_name, test_split='20-80', print_flag=False):
    '''
        Train and score a DecisionTreeClassifier on both stored variants of a dataset.

        The 'data_original' and 'data_drst' frames of ``datasets[test_name]`` are
        each split, fitted and evaluated. For every variant a classification
        report (CSV) and a plot of the fitted tree (PNG) are written to the
        DecisionTrees/<test_split> folder of the dataset's output directory, and
        one accuracy line is appended to the shared accuracy_result.text log.

        Parameters
        ----------
        test_name : str
            Key of the dataset in ``datasets``.
        test_split : str
            Key of ``test_ration``; the first number of the entry is the test
            percentage, the remainder is used for training.
        print_flag : bool
            When True, also print the rule count, accuracy and report.
    '''
    class_name = datasets[test_name]['class_name']
    regex_name = datasets[test_name]['nameregex']
    test_fraction = test_ration[test_split][0] / 100

    out_dir = '%s\\output\\%s\\DecisionTrees\\%s' % (current_dir, test_name, test_split)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # Open the log once for both variants; the context manager closes it even
    # if training or plotting raises (the old code reopened it per variant and
    # closed only the last handle).
    with open('%s\\output\\accuracy_result.text' % (current_dir), 'a+') as file:
        for mode in ['data_original', 'data_drst']:
            data = datasets[test_name][mode].copy()

            feature_names = [i for i in data.columns if i != class_name]
            X, y = data[feature_names], data[class_name]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction, random_state=1)

            # splitter='random' draws random thresholds; seeding the tree makes
            # the reported numbers repeatable between runs.
            clf = tree.DecisionTreeClassifier(criterion='entropy', splitter='random', random_state=1)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            accuracy = metrics.accuracy_score(y_test, y_pred)
            # One rule per root-to-leaf path. The previous value was the
            # character length of the exported tree text, not a rule count.
            rules_number = int(clf.get_n_leaves())

            report = classification_report(y_test, y_pred, output_dict=True)
            report.update({'Rules Number': rules_number})
            report_df = pd.DataFrame(report).transpose()
            report_df.to_csv('%s\\report-CLFTester-%s-%s.csv' % (out_dir, regex_name, mode))
            file.write("data %s:ACCURACY for data %s in model DecisionTrees with split ration %s === %s\n" % (test_name,mode,test_split,accuracy))

            if print_flag:
                # The old branch read clf.estimators_[0], which a single
                # decision tree does not have, so print_flag=True crashed.
                print('------------------------')
                print(test_name)
                print('------------------------')
                print("Number of rules Extracted from the model: %s" % rules_number)
                print("ACCURACY: %s" % (accuracy))
                print("CLASSIFICATION REPORT \n %s" % report)

            fig, ax = plt.subplots()
            tree.plot_tree(clf,
                        feature_names=feature_names,
                        # plot_tree expects one label per class; passing the
                        # class column name (a plain string) was a bug.
                        class_names=[str(label) for label in clf.classes_],
                        filled=True,
                        ax=ax)
            fig.savefig('%s\\CLFTester-%s-%s.png' % (out_dir, regex_name, mode))
            plt.close(fig)
        file.write('\n')

In [63]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

def Test_XG(test_name, test_split='20-80', print_flag=False):
    '''
        Train and score an XGBRFClassifier on both stored variants of a dataset.

        The 'data_original' and 'data_drst' frames of ``datasets[test_name]`` are
        each split, fitted and evaluated. For every variant a classification
        report (CSV) and a plot of one tree of the model (PNG) are written to the
        XGBRFClassifier/<test_split> folder of the dataset's output directory,
        and one accuracy line is appended to the shared accuracy_result.text log.

        Parameters
        ----------
        test_name : str
            Key of the dataset in ``datasets``.
        test_split : str
            Key of ``test_ration``; the first number of the entry is the test
            percentage, the remainder is used for training.
        print_flag : bool
            When True, also print the rule count, accuracy and report.
    '''
    class_name = datasets[test_name]['class_name']
    regex_name = datasets[test_name]['nameregex']
    test_fraction = test_ration[test_split][0] / 100

    out_dir = '%s\\output\\%s\\XGBRFClassifier\\%s' % (current_dir, test_name, test_split)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # Open the log once for both variants; the context manager closes it even
    # if training or plotting raises (the old code reopened it per variant and
    # closed only the last handle).
    with open('%s\\output\\accuracy_result.text' % (current_dir), 'a+') as file:
        for mode in ['data_original', 'data_drst']:
            data = datasets[test_name][mode].copy()

            feature_names = [i for i in data.columns if i != class_name]
            X, y = data[feature_names], data[class_name]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction, random_state=0)

            xg_reg = xgb.XGBRFClassifier()
            xg_reg.fit(X_train, y_train)
            y_pred = xg_reg.predict(X_test)

            accuracy = metrics.accuracy_score(y_test, y_pred)
            # Row count of the booster's tree table, read through the public
            # accessor instead of the private _Booster attribute.
            # NOTE(review): this counts every row of trees_to_dataframe(),
            # presumably one per node across all trees - confirm that is the
            # intended meaning of "rules".
            rules_number = xg_reg.get_booster().trees_to_dataframe().shape[0]

            report = classification_report(y_test, y_pred, output_dict=True)
            report.update({'Rules Number': rules_number})
            report_df = pd.DataFrame(report).transpose()
            report_df.to_csv('%s\\report-CLFTester-%s-%s.csv' % (out_dir, regex_name, mode))
            file.write("data %s:ACCURACY for data %s in model XGBRFClassifier with split ration %s === %s\n" % (test_name,mode,test_split,accuracy))

            if print_flag:
                # The old branch passed the integer count to tree.export_text,
                # which only accepts a fitted sklearn tree, so it crashed.
                print('------------------------')
                print(test_name)
                print('------------------------')
                print("Number of rules Extracted from the model: %s" % rules_number)
                print("ACCURACY: %s" % (accuracy))
                print("CLASSIFICATION REPORT \n %s" % report)

            # Large canvas so the plotted tree stays readable.
            fig, ax = plt.subplots(figsize=(30, 30))
            xgb.plot_tree(xg_reg, num_trees=4, ax=ax)
            fig.savefig('%s\\CLFTester-%s-%s.png' % (out_dir, regex_name, mode))
            plt.close(fig)
        file.write('\n')

In [64]:
from sklearn.naive_bayes import BernoulliNB

def Test_bnb(test_name, test_split='20-80', print_flag=False):
    '''
        Train and score a BernoulliNB classifier on both stored variants of a dataset.

        The 'data_original' and 'data_drst' frames of ``datasets[test_name]`` are
        each split, fitted and evaluated. For every variant a classification
        report (CSV) is written to the BernoulliNB/<test_split> folder of the
        dataset's output directory, and one accuracy line is appended to the
        shared accuracy_result.text log.

        Parameters
        ----------
        test_name : str
            Key of the dataset in ``datasets``.
        test_split : str
            Key of ``test_ration``; the first number of the entry is the test
            percentage, the remainder is used for training.
        print_flag : bool
            When True, also print the accuracy and report. The flag used to be
            accepted but ignored.
    '''
    class_name = datasets[test_name]['class_name']
    regex_name = datasets[test_name]['nameregex']
    test_fraction = test_ration[test_split][0] / 100

    out_dir = '%s\\output\\%s\\BernoulliNB\\%s' % (current_dir, test_name, test_split)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # Open the log once for both variants; the context manager closes it even
    # if training raises (the old code reopened it per variant and closed only
    # the last handle).
    with open('%s\\output\\accuracy_result.text' % (current_dir), 'a+') as file:
        for mode in ['data_original', 'data_drst']:
            data = datasets[test_name][mode].copy()

            feature_names = [i for i in data.columns if i != class_name]
            X, y = data[feature_names], data[class_name]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction, random_state=11)

            # NOTE(review): BernoulliNB binarizes features at its default
            # threshold, so multi-valued encoded columns are presumably reduced
            # to zero / non-zero - confirm this is intended.
            bnb = BernoulliNB()
            bnb.fit(X_train, y_train)
            y_pred = bnb.predict(X_test)

            accuracy = metrics.accuracy_score(y_test, y_pred)

            report = classification_report(y_test, y_pred, output_dict=True)
            report_df = pd.DataFrame(report).transpose()
            report_df.to_csv('%s\\report-CLFTester-%s-%s.csv' % (out_dir, regex_name, mode))
            file.write("data %s:ACCURACY for data %s in model BernoulliNB with split ration %s === %s\n" % (test_name,mode,test_split,accuracy))

            if print_flag:
                print('------------------------')
                print(test_name)
                print('------------------------')
                print("ACCURACY: %s" % (accuracy))
                print("CLASSIFICATION REPORT \n %s" % report)
        file.write('\n')


In [65]:
# Append a section header to the shared accuracy log. The context manager
# closes the handle before Test_RFC reopens the same file; open(..., 'a+')
# creates the file if it does not exist yet.
with open('%s\\output\\accuracy_result.text' % (current_dir), 'a+') as file:
    file.write('___________________________________________________\n')
    file.write('All RandomForestClassifier Results\n')
    file.write('___________________________________________________\n')

# Run the RandomForestClassifier benchmark for every configured split.
for tp in test_ration.keys():
    # Health Insurance dataset
    Test_RFC(datasets_names[0], test_split=tp)

    # D2 dataset
    Test_RFC(datasets_names[1], test_split=tp)

In [66]:
# Append a section header to the shared accuracy log. The context manager
# closes the handle before Test_DT reopens the same file; open(..., 'a+')
# creates the file if it does not exist yet.
with open('%s\\output\\accuracy_result.text' % (current_dir), 'a+') as file:
    file.write('___________________________________________________\n')
    file.write('All DecisionTrees Results\n')
    file.write('___________________________________________________\n')

# Run the DecisionTreeClassifier benchmark for every configured split.
for tp in test_ration.keys():
    # Health Insurance dataset
    Test_DT(datasets_names[0], test_split=tp)

    # D2 dataset
    Test_DT(datasets_names[1], test_split=tp)

In [67]:
# Append a section header to the shared accuracy log. The context manager
# closes the handle before Test_XG reopens the same file; open(..., 'a+')
# creates the file if it does not exist yet.
with open('%s\\output\\accuracy_result.text' % (current_dir), 'a+') as file:
    file.write('___________________________________________________\n')
    file.write('All XGBRFClassifier Results\n')
    file.write('___________________________________________________\n')

# Run the XGBRFClassifier benchmark for every configured split.
for tp in test_ration.keys():
    # Health Insurance dataset
    Test_XG(datasets_names[0], test_split=tp)

    # D2 dataset
    Test_XG(datasets_names[1], test_split=tp)

In [39]:
# Path('%s\\output\\accuracy_result.text' % (current_dir)).touch(exist_ok=True)
# file = open('%s\\output\\accuracy_result.text' % (current_dir), 'a+')
# file.write('___________________________________________________\n')
# file.write('All BernoulliNB Results\n')
# file.write('___________________________________________________\n')
# file.close()
# for tp in test_ration.keys():
#     # Health Insurance Data Tester
#     Test_bnb(datasets_names[0], test_split=tp)

#     # D2 Data Tester
#     Test_bnb(datasets_names[1], test_split=tp)
    
#     # Third dataset tester (disabled: `datasets` defines only two entries)
#     # Test_RFC(datasets_names[2], test_split=tp)