In [1]:
import os
import sys
import json
import math
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn import tree, metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from model.DRST import DRST

# https://github.com/tatsumiw/ChiMerge/blob/master/ChiMerge.py
# https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf
from model.Algorithms.ChiMerge import ChiMerge

# https://github.com/MengChiehLiu/Entropy-Based-Binning/blob/main/entropy.ipynb
from model.Algorithms.EntropyBasedBinning import cal_entropy, get_bin

In [9]:
# Root directory for every artefact the notebook writes under output/.
current_dir = os.getcwd()

# Discretised copies of the data, keyed by method name; filled in by later cells.
disc_datasets = dict()

# Split label -> [test %, train %]; element 0 is used as the test share.
test_ration = {
    '20-80': [20, 80],
    '30-70': [30, 70],
    '40-60': [40, 60],
}

# Load the raw table and keep a fixed-seed subsample of 1000 rows.
data = pd.read_csv('datasets/health_insurance.csv').sample(n=1000, random_state=41)

In [16]:
# Columns that must NOT be label-encoded (they are discretised separately).
ex_c_c = ['Annual_Premium']


def encoder(d):
    """Return a label-encoded copy of ``d``.

    Every column whose name is not listed in the module-level ``ex_c_c`` is
    replaced by the integer codes produced by a ``LabelEncoder`` fitted on
    that column. Columns listed in ``ex_c_c`` are passed through unchanged.
    The input frame is never modified.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        A copy of ``d`` with the non-excluded columns replaced by their codes.
    """
    data_encoded = d.copy()

    for col in data_encoded.columns:
        if col in ex_c_c:
            continue
        # A fresh encoder per column: re-fitting one shared instance would
        # leave every stored reference pointing at the last column's classes.
        data_encoded[col] = LabelEncoder().fit_transform(data_encoded[col])

    return data_encoded

In [17]:
# DRST discretisation: work on a private copy so `data` stays untouched.
drst_test_data = data.copy()

drst = DRST(
    comb_max_depth=3,
    decision_column_name='Response',
    save_output=False,
)
# NOTE: `continous_columns` (sic) is the keyword spelling the DRST API expects.
drst_fit, continuous_columns = drst.fit(
    drst_test_data,
    continous_columns=['Annual_Premium'],
)

# Remove the columns reported back in `continuous_columns` before encoding.
drst_fit.drop(columns=continuous_columns, inplace=True)
disc_datasets['drst'] = encoder(drst_fit)

Annual_Premium    3
dtype: int64


In [18]:
# ChiMerge discretisation of Annual_Premium against the Response label.
chi_test_data = data.copy()

# The `interval` attribute of the ChiMerge result is flattened to a plain list
# and used as the bin edges for np.digitize below.
chi_test_bins = ChiMerge(
    chi_test_data[['Annual_Premium', 'Response']],
    'Annual_Premium',
    'Response',
    confidenceVal=1.4,
    bin=4,
    sample=None,
).interval.values.tolist()

# Replace each premium by the index of the bin it falls into.
chi_test_data['Annual_Premium'] = np.digitize(chi_test_data['Annual_Premium'], chi_test_bins)

print("bins result: %s" % ' '.join(map(str, chi_test_bins)))
print(len(chi_test_bins))
disc_datasets['chi'] = encoder(chi_test_data)

bins result: 0.0 23595.0 27284.0 27707.0
4


In [19]:
# Entropy-based discretisation of Annual_Premium against the Response label.
ent_test_data = data.copy()

part_LeafNodes, part_entropy_list, LeafNodes, entropy_list = cal_entropy(
    ent_test_data,
    'Annual_Premium',
    'Response',
)
# Element 1 of get_bin's result is used as the bin edges for np.digitize.
ent_test_bins = get_bin(part_LeafNodes)[1]

# Replace each premium by the index of the bin it falls into.
ent_test_data['Annual_Premium'] = np.digitize(ent_test_data['Annual_Premium'], ent_test_bins)

print("bins result: %s" % ' '.join(map(str, ent_test_bins)))
print(len(ent_test_bins))
disc_datasets['ent'] = encoder(ent_test_data)

bins result: 0 34388.0 34489.0 34813.0 127772.0
5


In [20]:
def Test_RFC(test_split='20-80', print_flag=False):
    '''
        Train and evaluate a RandomForestClassifier on every dataset in ``disc_datasets``.

        For each entry the data is split into train/test, a 100-tree forest is
        fitted, and the following artefacts are written below
        ``<current_dir>/output/compare``:

        * one line per dataset appended to ``accuracy_result.txt``;
        * ``RandomForestClassifier/<test_split>/report-CLFTester-<name>-<name>.csv``
          with the classification report plus a ``Rules Number`` row;
        * ``RandomForestClassifier/<test_split>/CLFTester-<name>-<name>.png``
          with a plot of the first tree of the forest.

        test_split is a key of ``test_ration`` (e.g. '20-80'); the first number of
        the pair is the test percentage, so train_split = 100 - test_split.
        print_flag, when True, also prints the accuracy, rule count and report.

        Raises KeyError when test_split is not a key of ``test_ration``.
    '''
    class_name = 'Response'
    compare_dir = Path(current_dir) / 'output' / 'compare'
    report_dir = compare_dir / 'RandomForestClassifier' / test_split
    # Create the directories up front so the results file can always be opened.
    report_dir.mkdir(parents=True, exist_ok=True)
    test_fraction = test_ration[test_split][0] / 100

    # Open the results file once for the whole run; the context manager
    # guarantees it is closed even if a dataset fails part-way through.
    with open(compare_dir / 'accuracy_result.txt', 'a+') as file:
        for name, dataset in disc_datasets.items():
            frame = dataset.copy()

            feature_names = [col for col in frame.columns if col != class_name]
            X, y = frame[feature_names], frame[class_name]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_fraction, random_state=0
            )

            # Fit the forest on the training part and predict the held-out part.
            clf = RandomForestClassifier(n_estimators=100, random_state=21)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            first_tree = clf.estimators_[0]
            # One rule per root-to-leaf path, i.e. the number of leaves.
            # (len() of the export_text() string would count characters.)
            n_rules = first_tree.get_n_leaves()
            accuracy = metrics.accuracy_score(y_test, y_pred)

            report = classification_report(y_test, y_pred, output_dict=True)
            report.update({'Rules Number': n_rules})
            report_df = pd.DataFrame(report).transpose()
            report_df.to_csv(report_dir / ('report-CLFTester-%s-%s.csv' % (name, name)))
            file.write("ACCURACY for data %s in model RandomForestClassifier with split ration %s === %s\n" % (name, test_split, accuracy))

            if print_flag:
                print('------------------------')
                print(name)
                print('------------------------')
                print("Number of rules Extracted from the model: %s" % n_rules)
                print("ACCURACY: %s" % (accuracy))
                print("CLASSIFICATION REPORT \n %s" % report)

            # plot_tree expects one label per class; passing the bare string
            # 'Response' would label the classes with its single characters.
            fig, ax = plt.subplots()
            tree.plot_tree(first_tree,
                           feature_names=feature_names,
                           class_names=[str(label) for label in clf.classes_],
                           filled=True,
                           ax=ax)
            fig.savefig(report_dir / ('CLFTester-%s-%s.png' % (name, name)))
            # Release the figure so repeated calls do not accumulate open figures.
            plt.close(fig)

        # Blank separator line between the blocks of successive split ratios.
        file.write('\n')

In [21]:
# Make sure the shared output directory exists before anything is written;
# opening in append mode then creates accuracy_result.txt if it is missing.
compare_dir = Path(current_dir) / 'output' / 'compare'
compare_dir.mkdir(parents=True, exist_ok=True)

# Write the section header; the context manager closes the file before
# Test_RFC re-opens it to append its own lines.
with open(compare_dir / 'accuracy_result.txt', 'a+') as file:
    file.write('___________________________________________________\n')
    file.write('All RandomForestClassifier Results\n')
    file.write('___________________________________________________\n')

for tp in test_ration.keys():
    # Health Insurance Data Tester
    Test_RFC(test_split=tp)