In [1]:
import re
import os
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn import tree, metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from model.DRST import DRST

# Kernel working directory when this cell runs; the benchmark cell below
# builds every output path (cleaned CSV, DRST CSV, info.txt, reports) from it.
current_dir = os.getcwd()

In [2]:
# Column labels that `encoder` leaves untouched by default.
ex_c_c = ['Annual_Premium', 'Vintage', 'Age']


def encoder(d, excluded_columns=None):
    """Return a copy of ``d`` with every non-excluded column label-encoded.

    Parameters
    ----------
    d : pandas.DataFrame
        Table to encode. It is copied first, so the caller's frame is not
        modified.
    excluded_columns : collection of column labels, optional
        Columns to pass through unchanged. Defaults to the module-level
        ``ex_c_c`` list.

    Returns
    -------
    pandas.DataFrame
        Copy of ``d`` in which each remaining column has been replaced by the
        integer codes produced by ``LabelEncoder``.

    Notes
    -----
    NOTE(review): the loading cells read their CSVs with ``header=None`` and
    then stringify the positional labels, so the names in ``ex_c_c`` never
    match there and every column ends up encoded -- confirm this is intended.
    """
    if excluded_columns is None:
        excluded_columns = ex_c_c

    encoded = d.copy()
    for col in encoded.columns:
        if col in excluded_columns:
            continue
        # A fresh encoder per column: the previous version re-fitted one shared
        # instance and stored that same object under every key of a dict that
        # was never returned, so the per-column mappings were lost anyway.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

In [5]:
# datasets = {
#     'Health Insurance': {
#         'path': 'datasets/health_insurance.csv',
#         'orginal_data': None,
#         'encoded_data': None,
#         'shape': [0, 0],
#         'columns_name': [],
#         'drst_data': None,
#         'drst_continous_columns': [],
#         'drst_interval_number': [],
#     }
# }
# CSV files to benchmark, relative to the notebook's working directory.
datasets = [
    'datasets/bands.csv',
    'datasets/crx.csv',
    'datasets/d2.csv',
    'datasets/hepatitis.csv',
    'datasets/health_insurance.csv',
    'datasets/horse-colic.csv',
    'datasets/imports-85.csv',
]
# Split label -> [first %, complement %]. The benchmark loop reads only the
# first element and uses it (divided by 100) as the test-set fraction.
test_ration = {
    '%d-%d' % (first_pct, 100 - first_pct): [first_pct, 100 - first_pct]
    for first_pct in (20, 30, 40)
}

In [8]:

# Benchmark every dataset: clean it, discretise it with DRST, then train a
# RandomForest on both the cleaned table (idx 0) and the DRST table (idx 1)
# for each configured test ratio. Results go to output/datasets/<name>/.
output_root = Path(current_dir) / 'output' / 'datasets'

for path in datasets:
    # Dataset name = file name up to its first dot (e.g. 'imports-85').
    dataset_name = Path(path).name.split('.')[0]

    # Build paths with pathlib instead of hard-coded '\\' separators so the
    # notebook also runs outside Windows.
    dataset_dir = output_root / dataset_name
    cleaned_path = dataset_dir / ('%s-cleaned.csv' % dataset_name)
    drst_path = dataset_dir / ('%s-drst.csv' % dataset_name)
    info_path = dataset_dir / 'info.txt'
    dataset_dir.mkdir(parents=True, exist_ok=True)

    # The context manager closes info.txt even if a step below raises.
    with open(info_path, 'a+') as info_file:
        if cleaned_path.is_file() and drst_path.is_file():
            # Cached run. Both files were written with a header row, so they
            # are read back with the default header handling, and each one is
            # bound to its own name (previously `data` was overwritten by the
            # DRST table and `drst_fit` was never assigned in this branch).
            # NOTE(review): dtypes are re-inferred from CSV here and may differ
            # from the freshly cleaned frame, which can change label order.
            data = pd.read_csv(cleaned_path)
            drst_fit = pd.read_csv(drst_path)
        else:
            print('%s ====> LOADED' % dataset_name)

            data = pd.read_csv(path, header=None)
            # np.nan: the np.NaN alias was removed in NumPy 2.0.
            data = data.replace('?', np.nan)
            # Drop columns, then rows, that are less than 93% populated; the
            # row threshold is computed from the already-reduced column count.
            data = data.dropna(thresh=(data.shape[0] * .93), axis=1)
            data = data.dropna(thresh=(data.shape[1] * .93), axis=0)
            # Fill the remaining gaps with each column's most frequent value.
            data = data.fillna(data.mode().iloc[0])
            data.columns = [str(c) for c in data.columns]

            data.to_csv(cleaned_path, index=False)
            info_file.write('Number of objects = %s\n' % data.shape[0])
            info_file.write('Number of features = %s\n' % data.shape[1])

            drst = DRST(comb_max_depth=1, decision_column_name=data.columns[-1], topN_thrshold=.7, save_output=False)
            drst_fit, continuous_columns = drst.fit(data, ensamble_threshold=0.9)
            drst_fit = drst_fit.drop(continuous_columns, axis=1)
            # Rotate the last len(continuous_columns) columns to the front.
            # Presumably these are the '<col>_AFTER' columns DRST appends, so
            # the decision column ends up last again -- TODO confirm that DRST
            # always appends exactly one column per continuous column.
            cols = list(drst_fit.columns)
            cols = cols[-len(continuous_columns):] + cols[:-len(continuous_columns)]
            drst_fit = drst_fit[cols]
            drst_fit.to_csv(drst_path, index=False)

        data_list = [encoder(data), encoder(drst_fit)]
        for test_split, split_pcts in test_ration.items():
            for idx, d in enumerate(data_list):
                # Last column is the decision (target) column.
                X, y = d.iloc[:, :-1], d.iloc[:, -1]
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=split_pcts[0] / 100, random_state=1
                )
                clf = RandomForestClassifier(n_estimators=100, random_state=21)
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)

                report = classification_report(y_test, y_pred, output_dict=True)
                report_df = pd.DataFrame(report).transpose()
                # Keep the index: it holds the class labels and the
                # 'accuracy' / 'macro avg' / 'weighted avg' row names, without
                # which the rows of the saved report cannot be told apart.
                report_df.to_csv(
                    dataset_dir / ('report-CLFTester-%s-%s.csv' % (idx, test_split)),
                    index=True,
                )
                info_file.write("%s:%s RandomForestClassifier accuracy === %s\n" % (idx, test_split, metrics.accuracy_score(y_test, y_pred)))
                # One rule per leaf of the forest's first tree. The previous
                # value was len(export_text(...)), i.e. the number of
                # characters in the printed tree, not a rule count.
                info_file.write('%s:%s Number of rules === %s\n' % (idx, test_split, clf.estimators_[0].get_n_leaves()))


bands ====> LOADED
0    5
3    4
dtype: int64
crx ====> LOADED
1     3
2     3
7     3
13    8
14    3
dtype: int64
d2 ====> LOADED
0     3
1     3
5     7
14    8
18    3
20    8
dtype: int64
hepatitis ====> LOADED
0    4
dtype: int64
health_insurance ====> LOADED
0     3
2     3
4     8
8     3
10    3
dtype: int64
horse-colic ====> LOADED
22    8
dtype: int64
imports-85 ====> LOADED
8     8
9     7
10    6
11    7
12    3
15    8
dtype: int64


In [36]:
# Scratch cell: run the same cleaning steps as the benchmark loop on a single
# dataset and inspect the DRST output with topN_thrshold=1.
data = pd.read_csv('datasets/imports-85.csv', header=None)
# np.nan: the np.NaN alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)
# Drop columns, then rows, that are less than 93% populated. Reassigning
# instead of mutating in place keeps the cell idempotent on re-run.
data = data.dropna(thresh=(data.shape[0] * .93), axis=1)
data = data.dropna(thresh=(data.shape[1] * .93), axis=0)
# Fill the remaining gaps with each column's most frequent value.
data = data.fillna(data.mode().iloc[0])
data.columns = [str(c) for c in data.columns]

drst = DRST(comb_max_depth=1, decision_column_name=data.columns[-1], topN_thrshold=1, save_output=False)
drst_fit, continuous_columns = drst.fit(data, ensamble_threshold=0.9)
# Remove the original continuous columns from the DRST result.
drst_fit = drst_fit.drop(continuous_columns, axis=1)
drst_fit

8     8
9     7
10    6
11    7
12    3
15    8
19    8
22    8
23    8
dtype: int64


Unnamed: 0,0,2,3,4,5,6,7,13,14,16,...,25,8_AFTER,9_AFTER,10_AFTER,11_AFTER,12_AFTER,15_AFTER,19_AFTER,22_AFTER,23_AFTER
0,3,9,0,1,1,2,0,5,0,4,...,1,6,3,3,0,2,6,2,3,0
1,2,9,0,0,3,2,0,5,0,4,...,1,3,3,3,0,2,6,2,3,0
2,2,7,0,1,3,2,0,5,5,4,...,1,7,0,3,6,2,6,6,3,0
3,5,9,0,1,0,2,0,5,0,4,...,1,4,4,3,4,2,3,5,3,0
4,1,9,0,0,3,2,0,5,0,4,...,1,6,3,3,0,2,3,5,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,2,18,0,0,2,1,0,3,2,2,...,0,4,5,2,5,0,7,3,0,7
195,2,18,0,0,3,1,0,3,2,1,...,1,4,5,2,5,0,0,0,0,7
196,3,12,0,1,3,1,0,3,2,2,...,0,0,5,2,2,0,7,1,4,7
197,4,3,0,1,2,1,0,2,4,1,...,1,5,6,5,5,0,0,4,4,7
