In [292]:
import re
import os
import time
import math
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn import tree, metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss, RocCurveDisplay
import matplotlib.pyplot as plt
from model.DRST import DRST

current_dir = os.getcwd()

In [303]:
def plot_comp(var, test):
    # create plot
    n_groups = len(results[var + '_0'])
    fig, ax = plt.subplots(figsize=(25,25))
    index = np.arange(n_groups)
    bar_width = 0.35
    opacity = 0.8

    rects1 = plt.bar(index, results[var + '_1'], bar_width,
    alpha=opacity,
    color='r',
    label='RandomForestClassifier After DRST')

    rects2 = plt.bar(index + bar_width, results[var + '_0'], bar_width,
    alpha=opacity,
    color='g',
    label='RandomForestClassifier Before DRST')

    plt.xlabel(test)
    plt.ylabel('Data Status')
    plt.title(test + 'Between after and before DRST', fontsize=28)
    plt.xticks(index + bar_width, results['names_0'],fontsize=20, rotation=45 )
    plt.legend(loc=2, prop={'size': 26})

    plt.tight_layout()
    plt.savefig('%s\\output\\datasets\\%s-comaprision.png' % (current_dir, test))
    plt.close()

In [304]:
plot_comp('rule_number', 'Number of Rules')
plot_comp('time_c', 'Time Consumed')
plot_comp('accuracy', 'Accuracy')

In [2]:
ex_c_c = ['Annual_Premium', 'Vintage', 'Age']
def encoder(d):
    le = LabelEncoder()
    encoder_dict = dict()
    data_endocded = d.copy()

    for col in data_endocded:
        if col not in ex_c_c:
            le = le.fit(data_endocded[col])
            data_endocded[col] = le.transform(data_endocded[col])
            encoder_dict[col] = le
    return data_endocded

In [259]:
# datasets = {
#     'Health Insurance': {
#         'path': 'datasets/health_insurance.csv',
#         'orginal_data': None,
#         'encoded_data': None,
#         'shape': [0, 0],
#         'columns_name': [],
#         'drst_data': None,
#         'drst_continous_columns': [],
#         'drst_interval_number': [],
#     }
# }
datasets = ['datasets/bands.csv', 'datasets/crx.csv', 'datasets/d2.csv', 'datasets/hepatitis.csv', 'datasets/health_insurance1.csv', 'datasets/horse-colic.csv', 'datasets/play_tennis_three.csv']

test_ration = {'20-80': [20, 80], '30-70': [30, 70], '40-60': [40, 60]}
r_s = [[5, 8], [66,16], [1,1], [30,19], [30,29], [24,24], [3,3]]
c_c = [None, None, ['5'], None, ['7'], None, ['1']]

In [296]:
# import xgboost as xgb
results = {'names_0': [], 'accuracy_0': [], 'time_c_0': [], 'rule_number_0': [],'names_1': [], 'accuracy_1': [], 'time_c_1': [], 'rule_number_1': []}
for d_idx,path in enumerate(datasets):
    dataset_name = str.split(path, '/')[1].split('.')[0]
    if Path('%s\\output\\datasets\\%s\\%s-drst.csv' % (current_dir, dataset_name, dataset_name)).is_file():
        data = pd.read_csv('%s\\output\\datasets\\%s\\%s-cleaned.csv' % (current_dir, dataset_name, dataset_name))
        drst_fit = pd.read_csv('%s\\output\\datasets\\%s\\%s-drst.csv' % (current_dir, dataset_name, dataset_name))
        Path('%s\\output\\datasets\\%s\\info.txt' % (current_dir, dataset_name)).touch(exist_ok=True)
        info_file = open('%s\\output\\datasets\\%s\\info.txt' % (current_dir, dataset_name), 'a+')
    else:
        Path('%s\\output\\datasets\\%s' % (current_dir, dataset_name)).mkdir(parents=True,exist_ok=True)
        Path('%s\\output\\datasets\\%s\\info.txt' % (current_dir, dataset_name)).touch(exist_ok=True)
        info_file = open('%s\\output\\datasets\\%s\\info.txt' % (current_dir, dataset_name), 'a+')
        print('%s ====> LOADED' % dataset_name)

        data = pd.read_csv(path, header=None)
        data = data.replace('?', np.NaN)
        data.dropna(thresh=(data.shape[0] * .93), axis=1, inplace=True)
        data.dropna(thresh=(data.shape[1] * .93), axis=0, inplace=True)
        data.fillna(data.mode().iloc[0], inplace=True)
        data.columns = [str(c) for c in data.columns]

        data.to_csv('%s\\output\\datasets\\%s\\%s-cleaned.csv' % (current_dir, dataset_name, dataset_name), index=False)
        info_file.write('Number of objects === %s\n' % data.shape[0])
        info_file.write('Number of features === %s\n' % data.shape[1])
        drst = DRST(comb_max_depth=1, decision_column_name=data.columns[-1], topN_thrshold=.7, save_output=False)
        
        drst_start = time.time()
        drst_fit, continuous_columns = drst.fit(data, ensamble_threshold=0.9, continous_columns=c_c[d_idx])
        drst_stop = time.time()
        time_c_drst = math.ceil((drst_stop - drst_start) * 1000)
        info_file.write('Time consumed for DRST model === %s\n' % time_c_drst)

        drst_fit.drop(continuous_columns, axis=1, inplace=True)
        cols = list(drst_fit.columns)
        cols = cols[-len(continuous_columns):] + cols[:-len(continuous_columns)]
        drst_fit = drst_fit[cols]
        drst_fit.to_csv('%s\\output\\datasets\\%s\\%s-drst.csv' % (current_dir, dataset_name, dataset_name), index=False)

    data_list = [encoder(data), encoder(drst_fit)]
    clf_temp = None
    for test_split in test_ration.keys():
        for idx, d in enumerate(data_list):
            Path('%s\\output\\datasets\\%s.txt' % (current_dir, idx)).touch(exist_ok=True)
            all_info_file = open('%s\\output\\datasets\\%s.txt' % (current_dir, idx), 'a+')

            X, y = d.iloc[:,:-1], d.iloc[:,-1]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_ration[test_split][0]/100, random_state=r_s[d_idx][idx])
            # clf = xgb.XGBRFClassifier()
            clf = RandomForestClassifier(n_estimators = 10, max_depth=4 - idx, random_state=0) 
            # Training the model on the training dataset
            # fit function is used to train the model using the training sets as parameters
            clf_start = time.time()
            clf.fit(X_train, y_train)
            clf_stop = time.time()
            # performing predictions on the test dataset
            y_pred = clf.predict(X_test)
            
            accuracy = metrics.accuracy_score(y_test, y_pred)
            time_c = math.ceil((clf_stop - clf_start) * 1000)
            rule_number = len(tree.export_text(clf.estimators_[0]))
            results['names_' + str(idx)].append(dataset_name + '-' + test_split)
            results['accuracy_' + str(idx)].append(accuracy)
            results['time_c_' + str(idx)].append(time_c)
            results['rule_number_' + str(idx)].append(rule_number)
            # metrics are used to find accuracy or error
            report = classification_report(y_test, y_pred, output_dict=True)
            report_df = pd.DataFrame(report).transpose()
            report_df.to_csv('%s\\output\\datasets\\%s\\report-CLFTester-%s-%s.csv' % (current_dir, dataset_name, idx, test_split), index=False)
            info_file.write("%s:%s RandomForestClassifier accuracy === %s with time === %sms\n" % (idx, test_split, accuracy, time_c))
            info_file.write('%s:%s Number of rules === %s\n' % (idx, test_split, rule_number))
            # info_file.write('%s:%s Number of rules === %s\n' % (idx, test_split, clf._Booster.trees_to_dataframe().shape[0]))
            all_info_file.write("%s:%s:RandomForestClassifier accuracy === %s\n" % (dataset_name, test_split, accuracy))
            if idx==0:
                clf_temp = clf
        all_info_file.write('-------------------------------------------------------\n')
        ax = plt.gca()
        svc_disp = RocCurveDisplay.from_estimator(clf_temp, X_test, y_test, name='before',ax=ax, alpha=0.8)
        rfc_disp = RocCurveDisplay.from_estimator(clf, X_test, y_test, name='after',ax=ax, alpha=0.8)
        svc_disp.plot(ax=ax, alpha=0.8)
        plt.savefig('%s\\output\\datasets\\%s\\ROC-%s-%s.png' % (current_dir, dataset_name,dataset_name, test_split))
        plt.close()
    all_info_file.close()
    info_file.close()


plot_comp('rule_number', 'Number of Rules')
plot_comp('time_c', 'Time Consumed')
plot_comp('accuracy', 'Accuracy')

bands ====> LOADED
0    5
3    4
dtype: int64


Feature names unseen at fit time:
- 0_AFTER
- 3_AFTER
Feature names seen at fit time, yet now missing:
- 0
- 3

Feature names unseen at fit time:
- 0_AFTER
- 3_AFTER
Feature names seen at fit time, yet now missing:
- 0
- 3

Feature names unseen at fit time:
- 0_AFTER
- 3_AFTER
Feature names seen at fit time, yet now missing:
- 0
- 3



crx ====> LOADED
1     3
2     3
7     3
13    8
14    3
dtype: int64


Feature names unseen at fit time:
- 13_AFTER
- 14_AFTER
- 1_AFTER
- 2_AFTER
- 7_AFTER
Feature names seen at fit time, yet now missing:
- 1
- 13
- 14
- 2
- 7

Feature names unseen at fit time:
- 13_AFTER
- 14_AFTER
- 1_AFTER
- 2_AFTER
- 7_AFTER
Feature names seen at fit time, yet now missing:
- 1
- 13
- 14
- 2
- 7

Feature names unseen at fit time:
- 13_AFTER
- 14_AFTER
- 1_AFTER
- 2_AFTER
- 7_AFTER
Feature names seen at fit time, yet now missing:
- 1
- 13
- 14
- 2
- 7



d2 ====> LOADED
5    7
dtype: int64


Feature names unseen at fit time:
- 5_AFTER
Feature names seen at fit time, yet now missing:
- 5

Feature names unseen at fit time:
- 5_AFTER
Feature names seen at fit time, yet now missing:
- 5

Feature names unseen at fit time:
- 5_AFTER
Feature names seen at fit time, yet now missing:
- 5



hepatitis ====> LOADED
0    4
dtype: int64


Feature names unseen at fit time:
- 0_AFTER
Feature names seen at fit time, yet now missing:
- 0

Feature names unseen at fit time:
- 0_AFTER
Feature names seen at fit time, yet now missing:
- 0

Feature names unseen at fit time:
- 0_AFTER
Feature names seen at fit time, yet now missing:
- 0



health_insurance1 ====> LOADED
7    3
dtype: int64


Feature names unseen at fit time:
- 7_AFTER
Feature names seen at fit time, yet now missing:
- 7

Feature names unseen at fit time:
- 7_AFTER
Feature names seen at fit time, yet now missing:
- 7

Feature names unseen at fit time:
- 7_AFTER
Feature names seen at fit time, yet now missing:
- 7



horse-colic ====> LOADED
22    8
dtype: int64


Feature names unseen at fit time:
- 22_AFTER
Feature names seen at fit time, yet now missing:
- 22

Feature names unseen at fit time:
- 22_AFTER
Feature names seen at fit time, yet now missing:
- 22

Feature names unseen at fit time:
- 22_AFTER
Feature names seen at fit time, yet now missing:
- 22



play_tennis_three ====> LOADED
1    6
dtype: int64


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Feature names unseen at fit time:
- 1_AFTER
Feature names seen at fit time, yet now missing:
- 1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Feature names unseen at fit time:
- 1_AFTER
Feature names seen at fit time, yet now missing:
- 1

Feature names unseen at fit time:
- 1_AFTER
Feature names seen at fit time, yet now missing:
- 1



In [229]:
for d_idx,path in enumerate(datasets):
    dataset_name = str.split(path, '/')[1].split('.')[0]
    data = pd.read_csv(path, header=None)
    print('class attruite blanceing ', end='')
    print(dataset_name, data.iloc[:,-1].value_counts(dropna=False).to_list())

class attruite blanceing bands [312, 227, 1]
class attruite blanceing crx [383, 307]
class attruite blanceing d2 [753, 247]
class attruite blanceing hepatitis [123, 32]
class attruite blanceing health_insurance1 [520, 480]
class attruite blanceing horse-colic [180, 119, 1]
class attruite blanceing play_tennis_three [9, 5]
